def test_3_generate_mixed_dataset(self):
    """Generate a mixed synthetic FASTQ dataset from two organisms.

    The output file contains reads[0] reads simulated from eschColi_K12
    followed by reads[1] reads simulated from dm3. It is written to
    self.synthetic_fastq and is rebuilt from scratch on every call.

    Raises:
        ValueError: if a reference FASTA file contains no header lines.
    """
    orgs = [os.path.join(self.reference, "eschColi_K12/seq/eschColi_K12.fa"),
            os.path.join(self.reference, "dm3/seq/dm3.fa")]
    reads = [3000, 6000]

    dst = os.path.join(
        self.synthetic_fastq,
        "simngs.mixed_{org1}_{org2}_{reads1}vs{reads2}.fastq".format(
            org1='eschColi_K12', org2='dm3',
            reads1=reads[0], reads2=reads[1]))

    # Running total of reads that must be in dst after each organism.
    expected_total = 0
    for idx, (org, read) in enumerate(zip(orgs, reads)):
        # The simulator emits reads per FASTA entry, so the per-entry count
        # has to be derived from the number of header lines.
        with open(org, 'r') as fasta:
            fa_entries = sum(1 for line in fasta if line.startswith('>'))
        if not fa_entries:
            raise ValueError("No FASTA headers found in {0}".format(org))

        # int() keeps the argument integral ("3000", not "3000.0") on
        # Python 2, where ceil() returns a float.
        n = str(int(ceil(read / float(fa_entries))))
        cl1 = [self.simlib, "--seed", self.sim_seed, "-n", n, org]
        cl2 = [self.simngs, "-s", self.sim_seed, "-o", "fastq", self.runfile]

        # Truncate on the first organism so leftovers from a previous
        # (possibly interrupted) run never end up in the mix; append after.
        mode = 'w' if idx == 0 else 'a'
        with open(dst, mode) as fh:
            p1 = subprocess.Popen(cl1, stdout=subprocess.PIPE)
            p2 = subprocess.Popen(cl2, stdin=p1.stdout, stdout=fh)
            # Close our copy of the pipe first so p1 gets SIGPIPE if p2
            # exits early, as the subprocess pipeline recipe prescribes.
            p1.stdout.close()
            p2.communicate()
            # Reap the first process so it does not linger as a zombie.
            p1.wait()

        # Rounding up per FASTA entry can overshoot the requested count.
        # Trimming to the running total after every organism keeps the mix
        # exact for each of them, not only for the last one.
        # NOTE(review): assumes helpers.trim_fastq truncates the file in
        # place to its first N reads -- confirm against helpers.
        expected_total += read
        helpers.trim_fastq(dst, expected_total)
def test_2_run_simNGS(self):
    """Simulate an Illumina run with simNGS for every reference organism.

    For each FASTA file matching "*/seq/*.fa" under self.reference and each
    read count in self.sim_reads, a FASTQ file named
    "simngs_<organism>_<reads>.fastq" is written to self.synthetic_fastq.
    Datasets that already exist on disk are left untouched.

    Raises:
        ValueError: if a reference FASTA file contains no header lines.
    """
    for org in glob.glob(os.path.join(self.reference, "*/seq/*.fa")):
        # The organism name is the directory two levels above the FASTA file.
        org_name = org.split(os.sep)[-3]
        # Counted lazily, and only once per organism, when a dataset
        # actually has to be generated.
        fa_entries = None

        for reads in self.sim_reads:
            dst = os.path.join(
                self.synthetic_fastq,
                "simngs_{org}_{reads}.fastq".format(org=org_name,
                                                    reads=reads))
            # Do not regenerate datasets that are already present.
            if os.path.exists(dst):
                continue

            if fa_entries is None:
                # Determine how many FASTA description lines (headers)
                # there are, since simNGS will generate reads depending
                # on that number.
                with open(org, 'r') as fasta:
                    fa_entries = sum(1 for line in fasta
                                     if line.startswith('>'))
                if not fa_entries:
                    raise ValueError(
                        "No FASTA headers found in {0}".format(org))

            # int() keeps the argument integral ("3000", not "3000.0") on
            # Python 2, where ceil() returns a float.
            n = str(int(ceil(reads / float(fa_entries))))
            cl1 = [self.simlib, "--seed", self.sim_seed, "-n", n, org]
            # XXX: To be parametrized in future benchmarks; paired-end
            # reads would add "-p", "paired" to this command.
            cl2 = [self.simngs, "-s", self.sim_seed, "-o", "fastq",
                   self.runfile]

            with open(dst, 'w') as fh:
                # http://docs.python.org/2/library/subprocess.html#replacing-shell-pipeline
                p1 = subprocess.Popen(cl1, stdout=subprocess.PIPE)
                p2 = subprocess.Popen(cl2, stdin=p1.stdout, stdout=fh)
                # Close our copy of the pipe first so p1 gets SIGPIPE if
                # p2 exits early.
                p1.stdout.close()
                p2.communicate()
                # Reap the first process so it does not linger as a zombie.
                p1.wait()

            # Rounding up per FASTA entry can overshoot, so trim the FASTQ
            # file to the requested number of reads.
            helpers.trim_fastq(dst, reads)