def _writeRunRecords(fastqfile, paired, leftfh, rightfh, singlefh):
    """Route the records of one FASTQ file to the matching output handles.

    Paired runs send consecutive records alternately to ``leftfh`` and
    ``rightfh``; single-end runs send every record to ``singlefh``.
    """
    # The input handle is closed as soon as this run has been consumed, so a
    # long array of runs does not accumulate open file descriptors.
    with open(fastqfile, 'U') as fastqfh:
        records = FastqParser().parse(fastqfh)
        if not paired:
            for rec in records:
                singlefh.write(str(rec) + '\n')
            return
        while True:
            try:
                mate1 = next(records)
                mate2 = next(records)
            except StopIteration:
                # Either the run is exhausted, or it held an odd number of
                # records; in the latter case the trailing record is dropped.
                break
            leftfh.write(str(mate1) + '\n')
            rightfh.write(str(mate2) + '\n')


def separatePairsAndSingles(cf):
    """Split an array of FASTQ runs into left, right and single-end files.

    Every entry of the 'fastqfiles' input array is unpacked as an
    (accession, path) pair. ``isPaired(srafetchxml, accession)`` decides how
    the records of that run are routed:

    * paired run: records are consumed two at a time; the first of each pair
      is written to the 'left' output and the second to the 'right' output.
      A trailing record without a partner is silently dropped.
    * single-end run: every record is written to the 'single' output.

    Each record is written as ``str(record)`` followed by a newline.

    Args:
        cf: component object providing ``get_input`` / ``get_output``; it is
            also handed to ``get_array`` to resolve the 'fastqfiles' array.

    Returns:
        constants.OK once every run has been processed.
    """
    fastqfiles = get_array(cf, 'fastqfiles')
    srafetchxml = cf.get_input('srafetchxml')
    # Context managers guarantee that all three outputs are flushed and
    # closed even when parsing one of the runs raises.
    with open(cf.get_output('left'), 'w') as leftfh:
        with open(cf.get_output('right'), 'w') as rightfh:
            with open(cf.get_output('single'), 'w') as singlefh:
                for accession, fastqfile in fastqfiles:
                    paired = isPaired(srafetchxml, accession)
                    _writeRunRecords(fastqfile, paired,
                                     leftfh, rightfh, singlefh)
    return constants.OK
def getOverRepClusters(cf):
    """Write the seeds of over-represented clusters to the 'resultsfa' output.

    Steps:

    1. Count the records of the 'fastqfile' input (the denominator).
    2. Read the tab-delimited 'resultsuc' input and, for every row whose
       first column is 'H', increment a counter keyed by the row's last
       column (presumably UCLUST hit records naming their seed -- confirm
       against the tool that produces this file).
    3. Walk the 'resultsfa' input with ``fasta_itr``; a record whose header
       has a counter is written to the output when
       ``count / total * 100 >= percRep``.

    Args:
        cf: component object providing ``get_input``, ``get_output`` and
            ``get_parameter`` (the 'percRep' parameter is read as a float).

    Returns:
        constants.OK when the output file has been written.

    Raises:
        ZeroDivisionError: if the FASTQ input holds no records while at
            least one FASTA header has a counter.
    """
    fastqfile = cf.get_input('fastqfile')
    resultsuc = cf.get_input('resultsuc')
    resultsfa = cf.get_input('resultsfa')
    percRep = cf.get_parameter('percRep', 'float')
    output = cf.get_output('resultsfa')

    # Total number of reads; the handle is closed right after counting.
    with open(fastqfile, 'rb') as fastqfh:
        totalSeqs = sum(1 for _ in FastqParser().parse(fastqfh))

    clusterCounts = {}
    with open(resultsuc, 'rb') as ucfh:
        reader = csv.reader(ucfh, quoting=csv.QUOTE_NONE, delimiter='\t')
        for row in reader:
            # csv yields [] for blank lines; skip them instead of failing
            # on row[0].
            if row and row[0] == 'H':
                seed = row[-1]
                clusterCounts[seed] = clusterCounts.get(seed, 0) + 1

    with open(output, 'wb') as outfh:
        for rec in fasta_itr(resultsfa):
            if rec.header not in clusterCounts:
                continue
            clusterRep = (float(clusterCounts[rec.header]) /
                          float(totalSeqs)) * 100
            if clusterRep >= percRep:
                outfh.write(str(rec) + '\n')
    return constants.OK
def getNamesFromFastQ(cf):
    """Write the header of every record in a FASTQ file, one per line.

    Reads the 'fastqfile' input with ``FastqParser`` and writes each
    record's ``header`` attribute followed by a newline to the 'namelist'
    output.

    Args:
        cf: component object providing ``get_input`` and ``get_output``.

    Returns:
        constants.OK when all headers have been written.
    """
    # Both handles are managed by ``with`` so they are closed even if the
    # parser raises part-way through the file.
    with open(cf.get_output("namelist"), "w") as outfh:
        fqp = FastqParser()
        with open(cf.get_input("fastqfile"), "U") as fastqfh:
            for rec in fqp.parse(fastqfh):
                outfh.write("%s\n" % rec.header)
    return constants.OK
def fastq_merge(cf):
    """Concatenate an array of FASTQ files into a single output file.

    The 'in_array' input array is resolved with ``get_array`` and logged via
    ``cf.write_log``. Each entry is unpacked as a (key, path) pair; the key
    is not used. Every record of every file is written to the 'output' file
    as ``str(record)`` followed by a newline, in array order.

    Args:
        cf: component object providing ``get_output`` and ``write_log``; it
            is also handed to ``get_array``.

    Returns:
        constants.OK when all files have been merged.
    """
    with open(cf.get_output('output'), 'w') as outfh:
        fastqfiles = get_array(cf, 'in_array')
        cf.write_log(str(fastqfiles))
        fqp = FastqParser()
        for _key, fastqfile in fastqfiles:
            # Close each input as soon as it has been copied, so merging
            # many files does not leave a handle open per file.
            with open(fastqfile, 'U') as fastqfh:
                for rec in fqp.parse(fastqfh):
                    outfh.write(str(rec) + '\n')
    return constants.OK
def sra2fastq(cf):
    """Convert an SRA file to FASTQ with ``fastq-dump`` and trim the headers.

    The external ``fastq-dump`` program is run on the 'srafile' input with
    its standard output redirected to ``<fastqfile>.tmp``. When
    ``isPaired(srafetchxml, accession)`` is true, ``--split-spot`` is added
    to the command line. The temporary file is then re-read with
    ``FastqParser`` and every header is replaced by its second
    space-separated field before the record is written to the 'fastqfile'
    output.

    Args:
        cf: component object providing ``get_input``, ``get_output``,
            ``get_parameter`` and ``write_log``.

    Returns:
        constants.OK on success, or constants.GENERIC_ERROR when
        ``fastq-dump`` exits with a non-zero status (the failure is logged).

    Raises:
        IndexError: if a header in the dumped file contains no space.
    """
    inputfile = cf.get_input('srafile')
    srafetchxml = cf.get_input('srafetchxml')
    accession = cf.get_parameter('accession', 'string')
    outputfile = cf.get_output('fastqfile')
    tmpfile = outputfile + '.tmp'

    params = []
    if isPaired(srafetchxml, accession):
        cf.write_log('Run is paired end.')
        params.append('--split-spot')
    # -Z: fastq-dump writes to stdout, which is captured into tmpfile below.
    params.append('-Z')
    params.append(inputfile)

    with open(tmpfile, 'wb') as tmpfh:
        try:
            subprocess.check_call(['fastq-dump'] + params, stdout=tmpfh)
        except subprocess.CalledProcessError as e:
            cf.write_log("Error running fastq-dump.")
            cf.write_log("Error: %s" % str(e))
            return constants.GENERIC_ERROR

    # Format the fastq headers.
    # NOTE(review): the '.tmp' file is left on disk afterwards -- confirm
    # whether it should be removed once the final output is written.
    with open(outputfile, 'wb') as outfh:
        fqp = FastqParser()
        with open(tmpfile, 'U') as tmpfh:
            for rec in fqp.parse(tmpfh):
                rec.header = rec.header.split(' ')[1]
                outfh.write(str(rec) + '\n')
    return constants.OK


anduril.main(sra2fastq)