def run(self, infile_handle):
    ''' Run cd-hit-est to cluster the sequences in a fasta file.

    Fixed flags used:
    -d 0   --> No limit on the description written to the cluster file
               (truncated at the first space in the sequence ID).
    -r 0   --> Only perform +/+ (not -/+) alignment comparisons, as reads
               are sequenced in both directions but on different strands.
    -s 0.8 --> If the shorter sequence is less than 80% of the length of
               the representative sequence, don't cluster it.

    infile_handle -- file object or string giving the input file path.
    self.args.output defines the output location and file name.

    Writes stdout to the console. A counter dictionary and a summary
    logfile are generated after each run.
    '''
    # Input checks
    infile_handle = inputfile_check(infile_handle)
    logfile_path = os.path.join(os.path.split(self.args.output)[0],
                                'clusterfile.log')
    infile_path = os.path.abspath(infile_handle.name)
    logfile = outputfile_check(logfile_path)

    # Setup internal vars
    start_time = time.time()

    #=======================================================================
    # Run CD-HIT
    #=======================================================================
    cmd = ('cd-hit-est -i {0} -o {1} -c {2} -n {3} -d 0 -r 0 -s 0.8 -M {4} '
           '-T {5}').format(infile_path, self.args.output,
                            self.args.similarity, self.args.n_gram,
                            self.args.maxmemory, self.args.threads)

    if self.args.maskN:
        cmd += ' -mask N'
    if self.args.allvall:
        cmd += ' -g 1'

    cdhitpath = os.path.expanduser(self.args.cdhitpath)

    # Spawn process to run CD-HIT
    subprocess.check_call(shlex.split(os.path.join(cdhitpath, cmd)))

    finish_time = time.time()

    #=======================================================================
    # Generate a summary log file
    #=======================================================================
    # Get cluster size summary counter
    total_counter, by_seqlen_counter = self.cluster_summary_counter(
        infile_path=self.args.output, mode='both', report=True)

    st_idx = cmd.find('-c ')
    CDHIT_parameters = cmd[st_idx:]

    # Write summary logfile
    with logfile as f:
        program_name = os.path.join(self.args.cdhitpath, cmd).split(' -i ')[0]
        f.write('=========================================================\n')
        f.write('Program     : {0}\n'.format(program_name))
        f.write('Input File  : {0}\n'.format(infile_path))
        f.write('Output File : {0}\n'.format(self.args.output))
        f.write('Commands    : {0}\n'.format(CDHIT_parameters))
        f.write('\n')
        f.write('Started     : {0}\n'.format(
            time.strftime('%a, %d %b %Y, %H:%M:%S', time.gmtime(start_time))))
        f.write('=========================================================\n')
        f.write('\n')
        f.write('                       Report Log\n')
        f.write('---------------------------------------------------------\n')

        reads_per_cluster = {key: int(key) * value
                             for key, value in total_counter.iteritems()}
        total_reads = sum(reads_per_cluster.values())
        total_clusters = sum(total_counter.values())
        f.write('Total number of reads    : {0}\n'.format(total_reads))
        f.write('Total number of clusters : {0}\n'.format(total_clusters))

        read_lengths = [int(key) for key in by_seqlen_counter.keys()]
        f.write('Read length Min and Max  : {0} and {1}\n'.format(
            min(read_lengths), max(read_lengths)))
        f.write('Time taken               : {0}\n'.format(
            time.strftime('%H:%M:%S', time.gmtime(finish_time - start_time))))
        f.write('\n')
        f.write('Top 20 Percentage Reads per cluster\n')
        f.write('---------------------------------------------------------\n')
        f.write('{0: <16}{1: <16}{2: <18}{3}\n'.format(
            'Cluster Size', 'No. Clusters', 'Total Reads', '%'))
        f.write('---------------------------------------------------------\n')

        top_reads_per_cluster = sorted(reads_per_cluster.iteritems(),
                                       key=lambda tup: int(tup[1]),
                                       reverse=True)[:20]
        for tup in top_reads_per_cluster:
            if total_reads == 0:
                perc = 0.0
            else:
                perc = float(tup[1]) / total_reads
            f.write('{clust_size: <16}{num_clust: <16}'
                    '{total_reads: <18d}{percentage:.2%}\n'.format(
                        clust_size=tup[0],
                        num_clust=total_counter[tup[0]],
                        total_reads=tup[1],
                        percentage=perc))

    cluster_file_handle = open(self.args.output, 'rb')

    return cluster_file_handle, total_counter
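# Usage sketch (illustrative only). ``Clusterer`` and the attribute names on
# ``args`` are assumptions inferred from the attributes referenced in run();
# they are not confirmed by this module:
#
#   clusterer = Clusterer(args)                     # hypothetical wrapper class
#   handle, counter = clusterer.run('reads.fasta')  # runs cd-hit-est, writes
#                                                   # clusterfile.log alongside
#                                                   # args.output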
def load_cluster_file(self, cluster_file_handle, table_prefix=None,
                      overwrite=False, fmin=2, fmax=None, skipsort=False,
                      buffer_max=1000000):
    ''' Load a CD-HIT clustering file into the database.

    By default singletons are not loaded (fmin=2). fmin and fmax can also
    be set so that only clusters within a given size range are added.
    '''
    # Define table names
    if table_prefix is None:
        members_table_name = 'members'
        cluster_table_name = 'clusters'
        index_name = 'clustersizeIndex'
    else:
        members_table_name = table_prefix + '_members'
        cluster_table_name = table_prefix + '_clusters'
        index_name = table_prefix + '_clustersizeIndex'

    # Input checks
    if type(cluster_file_handle) == str:
        if not cluster_file_handle.endswith('.clstr'):
            cluster_file_handle = cluster_file_handle + '.clstr'
    cluster_file_handle = inputfile_check(cluster_file_handle)

    # Sort file
    if not skipsort:
        # Filter out clusters below fmin and sort by cluster size in
        # descending order
        print >> sys.stderr, 'Sorting cluster file %s ...' % (
            cluster_file_handle.name)
        sorted_cluster_file = sortby(cluster_file_handle, reverse=True,
                                     mode='cluster_size',
                                     outfile_postfix='-subset', cutoff=fmin)
        cluster_file_handle = inputfile_check(sorted_cluster_file)

    print >> sys.stderr, 'Importing cluster file %s to database...' % (
        cluster_file_handle.name)

    # Overwrite / make tables if necessary
    if overwrite:
        with self.con as con:
            con.execute('DROP TABLE IF EXISTS {0}'.format(cluster_table_name))
            con.execute('DROP TABLE IF EXISTS {0}'.format(members_table_name))
        self.create_cluster_table(cluster_table_name)
        self.create_members_table(members_table_name)

    # Make cluster generator. Yields all cluster info
    cluster_gen = parse(cluster_file_handle)

    # Buffer to hold clusters in memory, then write all at once
    cluster_info_list = []
    cumulative_cluster_size = 0

    # Find starting cluster id
    # (assumes a mapping-style row factory, e.g. sqlite3.Row)
    c = self.con.execute(
        'SELECT COUNT(*) FROM {0}'.format(cluster_table_name))
    clusterid = c.fetchone()['count(*)'] + 1

    # Drop any index on cluster size
    with self.con as con:
        con.execute('DROP INDEX IF EXISTS {0}'.format(index_name))

    if fmax:
        for cluster in cluster_gen:
            if fmin <= cluster.size <= fmax:
                cluster_info_list.append((clusterid, cluster.rep_seq_id,
                                          cluster.size, cluster.members_id))
                clusterid += 1
                cumulative_cluster_size += cluster.size
            if cumulative_cluster_size > buffer_max:
                self.load_batch_clusterdata(cluster_info_list, table_prefix)
                cluster_info_list = []
                cumulative_cluster_size = 0  # reset after flushing buffer
    else:
        for cluster in cluster_gen:
            cluster_info_list.append((clusterid, cluster.rep_seq_id,
                                      cluster.size, cluster.members_id))
            clusterid += 1
            cumulative_cluster_size += cluster.size
            if cumulative_cluster_size > buffer_max:
                self.load_batch_clusterdata(cluster_info_list, table_prefix)
                cluster_info_list = []
                cumulative_cluster_size = 0  # reset after flushing buffer

    # Final flush of data
    if cluster_info_list:
        self.load_batch_clusterdata(cluster_info_list, table_prefix)

    # Rebuild index on cluster size
    with self.con as con:
        con.execute('CREATE INDEX {indexname} ON {tablename}(size)'.format(
            indexname=index_name, tablename=cluster_table_name))
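# Format note and a minimal parsing sketch (illustrative only). CD-HIT writes
# cluster membership to <output>.clstr in blocks such as:
#
#   >Cluster 0
#   0   150nt, >read_1... *
#   1   149nt, >read_2... at +/98.00%
#
# where '*' marks the representative sequence. The parse() generator used in
# load_cluster_file() is assumed to yield objects with rep_seq_id, size and
# members_id attributes derived from such blocks; the hypothetical helper
# below shows the core of that idea without depending on it.
def _iter_cluster_sizes(clstr_path):
    ''' Yield (representative_seq_id, cluster_size) for each cluster block. '''
    rep_id, size = None, 0
    with open(clstr_path) as f:
        for line in f:
            if line.startswith('>Cluster'):
                if size:
                    yield rep_id, size
                rep_id, size = None, 0
            elif line.strip():
                size += 1
                if line.rstrip().endswith('*'):
                    # The sequence ID sits between '>' and the trailing '...'
                    rep_id = line.split('>')[1].split('...')[0]
    if size:
        yield rep_id, size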