예제 #1
0
def count_seqs_in_file (pth):
	"""
	Return how many sequences are stored in the file at `pth`.

	The file is loaded via `mcda.read_seqs` and the length of the result
	is returned. Mainly a sanity check that data is being processed
	correctly.
	"""
	return len (mcda.read_seqs (pth))
예제 #2
0
def get_names_of_seqs_in_file (pth):
	"""
	Return a list of the `name` attribute of each sequence in the file at `pth`.

	Sequences are loaded via `mcda.read_seqs`; names are returned in the
	order the sequences are yielded.
	"""
	return [seq_rec.name for seq_rec in mcda.read_seqs (pth)]
예제 #3
0
			"there are duplicate sequence names in the control set"

		overlap = exp_seq_names.intersection (cntrl_seq_names)
		assert len (overlap) == 0, \
			"some sequences appear in experimental and control sets: %s" % \
			', '.join (overlap)

		## Main:
		# create required directories
		snakemake.utils.makedirs (BUILD_DIR)
		snakemake.utils.makedirs (RESULTS_DIR)
		snakemake.utils.makedirs (SEQ_WORK_DIR)
		snakemake.utils.makedirs (COMP_SEQ_WORK_DIR)

		# copy & rename controls, sample first 100 & make non-discovery set
		all_control_seqs = mcda.read_seqs (input.all_cntrl_seqs)
		mcda.write_seqs (all_control_seqs[:MEME_SEQ_CNT],
			output.disc_cntrl_seqs)
		mcda.write_seqs (all_control_seqs[MEME_SEQ_CNT:],
			output.nondisc_control_seqs)

		# copy & rename experimental, take top 100 & make non-discovery set
		# NOTE: assumes exp seqs are in order of effect
		all_exp_seqs = mcda.read_seqs (input.all_exp_seqs)
		mcda.write_seqs (all_exp_seqs[:MEME_SEQ_CNT],
			output.disc_exp_seqs)
		mcda.write_seqs (all_exp_seqs[MEME_SEQ_CNT:],
			output.nondisc_exp_seqs)

		# copy all comparative seqs across
		shell ("cp {input.comp_seq_data_dir}/*.fasta {output.comp_seq_work_dir}")