def write_reads(self, out_file_handle, output_format='fasta', filter_expression=None, startidx=0, rowbuffer=100000, overwrite=False): """ Write records returned by the querry to one large fasta or fastq Defaults is to search by GLOBing the individual descriptions with the search_query. If sql_query = True, search_query is passed as a full sql statment. If use_type_column=True, search is done by GLOBing the individual type column instead. out_file_handle -- A file object or string specifying a filename. startidx -- starting base index of DNA sequence that is written, used to miss out cutsite if necessary. """ # Output check out_file_handle = outputfile_check(out_file_handle, mode='a', overwrite=overwrite) query = '''SELECT seqid, seq, phred FROM seqs INNER JOIN samples ON seqs.sampleId=samples.sampleId''' if filter_expression: query += ' WHERE {0}'.format(filter_expression) with self.con as con: toc = time.time() print >> sys.stderr, 'Writing records to {0} format....'.format(output_format), c = con.execute(query) returned_records = c.fetchmany(rowbuffer) rec_count = 0 while returned_records: for rec in returned_records: seq_rec = SeqRecord(Seq(rec['seq'][startidx:]), id=str(rec['seqid']), description='') if output_format == 'fastq': seq_rec.letter_annotations['phred_quality'] = [ord(x) - 33 for x in rec['phred']] SeqIO.write(seq_rec, out_file_handle, format=output_format) rec_count += 1 # Fetch next batch of records from cursor returned_records = c.fetchmany(rowbuffer) print >> sys.stderr, ' Done!' print >> sys.stderr, '\n{0} records written successfully to {1}\nin {2}'.format(rec_count, out_file_handle.name, time.strftime('%H:%M:%S', time.gmtime(time.time() - toc))) if out_file_handle.name not in ['<stdout>', '<stderr>']: out_file_handle.close() return out_file_handle
def run(self, infile_handle):
    ''' Run CD-HIT-EST on a fasta file and write a summary logfile.

    Fixed flags used:
    -d 0   --> No limit on description written to cluster file (goes to
               first space in seq ID).
    -r 0   --> Do only +/+ and not -/+ alignment comparisons, as reads are
               done in both directions but on different strands.
    -s 0.8 --> If the shorter sequence is less than 80% of the representative
               sequence, don't cluster.

    infile_handle -- Takes file object or string of the file path/filename.

    The output location and file name come from self.args.output; CD-HIT's
    stdout is written to the console.  A cluster-size counter dictionary and
    a summary logfile ('clusterfile.log', next to the output) are generated
    after each run.

    Returns (open binary handle to the cluster output file, cluster-size
    counter).
    '''
    # input checks
    infile_handle = inputfile_check(infile_handle)
    logfile_path = os.path.join(os.path.split(self.args.output)[0], 'clusterfile.log')
    infile_path = os.path.abspath(infile_handle.name)
    logfile = outputfile_check(logfile_path)

    # setup internal vars
    start_time = time.time()

    #=======================================================================
    # Run CDHIT
    #=======================================================================
    cmd = ('cd-hit-est -i {0} -o {1} -c {2} -n {3} -d 0 -r 0 -s 0.8 -M {4} '
           '-T {5}').format(infile_path, self.args.output, self.args.similarity,
                            self.args.n_gram, self.args.maxmemory, self.args.threads)
    if self.args.maskN:
        cmd = cmd + ' -mask N'
    if self.args.allvall:
        # -g 1: cluster into the most similar cluster (all-vs-all), not the
        # first one that meets the threshold.
        cmd = cmd + ' -g 1'
    cdhitpath = os.path.expanduser(self.args.cdhitpath)

    # Spawn Process to run CD-HIT; raises CalledProcessError on failure.
    subprocess.check_call(shlex.split(os.path.join(cdhitpath, cmd)))
    finish_time = time.time()

    #=======================================================================
    # Generate a summary log file
    #=======================================================================
    # Get cluster size summary counter
    total_counter, by_seqlen_counter = self.cluster_summary_counter(infile_path=self.args.output,
                                                                   mode='both', report=True)
    # Everything from '-c ' onwards is the parameter portion of the command.
    st_idx = cmd.find('-c ')
    CDHIT_parameters = cmd[st_idx:]

    # Write summary logfile
    with logfile as f:
        # Program path is the command text before the first ' -i ' argument.
        program_name = os.path.join(self.args.cdhitpath, cmd).split(' -i ')[0]
        f.write('=========================================================\n')
        f.write('Program : {0}\n'.format(program_name))
        f.write('Input File : {0}\n'.format(infile_path))
        f.write('Output File : {0}\n'.format(self.args.output))
        f.write('Commands : {0}\n'.format(CDHIT_parameters))
        f.write('\n')
        f.write('Started : {0}\n'.format(time.strftime('%a, %d %b %Y, %H:%M:%S',
                                                       time.gmtime(start_time))))
        f.write('=========================================================\n')
        f.write('\n')
        f.write(' Report Log\n')
        f.write('---------------------------------------------------------\n')
        # Reads represented by each cluster size = size * number of clusters
        # of that size (counter keys are cluster sizes stored as strings).
        reads_per_cluster = {key: int(key)*value for key, value in total_counter.iteritems()}
        total_reads = sum(reads_per_cluster.values())
        total_clusters = sum(total_counter.values())
        f.write('Total number of reads : {0}\n'.format(total_reads))
        f.write('Total number of clusters : {0}\n'.format(total_clusters))
        read_lengths = [int(key) for key in by_seqlen_counter.keys()]
        f.write('Read length Min and Max : {0} and {1}\n'.format(min(read_lengths),
                                                                 max(read_lengths)))
        f.write('Time taken : {0}\n'.format(time.strftime('%H:%M:%S',
                                                          time.gmtime(finish_time - start_time))))
        f.write('\n')
        f.write('Top 20 Percentage Reads per cluster \n')
        f.write('---------------------------------------------------------\n')
        f.write('Cluster Size No. Clusters Total Reads % \n')
        f.write('---------------------------------------------------------\n')
        # Rank cluster sizes by the total number of reads they account for.
        top_reads_per_cluster = sorted(reads_per_cluster.iteritems(),
                                       key=lambda tup: int(tup[1]), reverse=True)[:20]
        for tup in top_reads_per_cluster:
            # Guard against a zero-read run to avoid ZeroDivisionError.
            if total_reads == 0:
                perc = 0.0
            else:
                perc = float(tup[1]) / total_reads
            f.write("{clust_size: <16}{num_clust: <16}{total_reads: <18d}{percentage:.2%}\n".format(
                clust_size=tup[0], num_clust=total_counter[tup[0]],
                total_reads=tup[1], percentage=perc))

    cluster_file_handle = open(self.args.output, 'rb')
    return cluster_file_handle, total_counter