def count_seqs_in_file(pth):
    """
    Return the number of sequences that `mcda.read_seqs` reads from `pth`.

    Mostly a sanity check that the data is being processed as expected.
    """
    return len(mcda.read_seqs(pth))
def get_names_of_seqs_in_file(pth):
    """
    Return a list of the `name` attribute of every sequence that
    `mcda.read_seqs` reads from `pth`, in the order they are read.
    """
    seq_names = []
    for seq in mcda.read_seqs(pth):
        seq_names.append(seq.name)
    return seq_names
"there are duplicate sequence names in the control set" overlap = exp_seq_names.intersection (cntrl_seq_names) assert len (overlap) == 0, \ "some sequences appear in experimental and control sets: %s" % \ ', '.join (overlap) ## Main: # create required directories snakemake.utils.makedirs (BUILD_DIR) snakemake.utils.makedirs (RESULTS_DIR) snakemake.utils.makedirs (SEQ_WORK_DIR) snakemake.utils.makedirs (COMP_SEQ_WORK_DIR) # copy & rename controls, sample first 100 & make non-discovery set all_control_seqs = mcda.read_seqs (input.all_cntrl_seqs) mcda.write_seqs (all_control_seqs[:MEME_SEQ_CNT], output.disc_cntrl_seqs) mcda.write_seqs (all_control_seqs[MEME_SEQ_CNT:], output.nondisc_control_seqs) # copy & rename experimental, take top 100 & make non-discovery set # NOTE: assumes exp seqs are in order of effect all_exp_seqs = mcda.read_seqs (input.all_exp_seqs) mcda.write_seqs (all_exp_seqs[:MEME_SEQ_CNT], output.disc_exp_seqs) mcda.write_seqs (all_exp_seqs[MEME_SEQ_CNT:], output.nondisc_exp_seqs) # copy all comparative seqs across shell ("cp {input.comp_seq_data_dir}/*.fasta {output.comp_seq_work_dir}")