def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
    """Cluster scaffolds of a genome with k-means.

    Builds a feature vector for each scaffold (mean coverage and/or genomic
    signature, optionally reduced with PCA), whitens it when it mixes both
    feature types, partitions scaffolds with k-means, and writes one FASTA
    file per cluster.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    num_clusters : int
        Number of clusters to form.
    num_components : int
        Number of PCA components to consider.
    K : int
        K-mer size to use for calculating genomic signature.
    no_coverage : boolean
        Flag indicating if coverage information should be used during clustering.
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Iterations of clustering to perform.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """

    # get mean coverage and genomic signature for each scaffold in genome
    self.logger.info('')
    self.logger.info(' Determining mean coverage and genomic signatures.')

    signatures = GenomicSignature(K)

    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append((np_mean(stats.coverage)))
        else:
            genome_stats.append(())

        if K == 0:
            pass
        elif K == 4:
            # tetranucleotide signatures were precomputed with the scaffold stats
            signature_matrix.append(stats.signature)
        else:
            sig = signatures.seq_signature(seq)
            # normalize raw k-mer counts to relative frequencies
            total_kmers = sum(sig)
            sig = [float(count) / total_kmers for count in sig]
            signature_matrix.append(sig)

    # calculate PCA of genomic signatures
    if K != 0:
        if not no_pca:
            self.logger.info(' Calculating PCA of genomic signatures.')
            pc, variance = self.pca(signature_matrix)
            self.logger.info(' First %d PCs capture %.1f%% of the variance.'
                             % (num_components, sum(variance[0:num_components]) * 100))

            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, pc[i][0:num_components])
        else:
            self.logger.info(' Using complete genomic signature.')
            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, signature_matrix[i])

    # whiten data if feature matrix contains coverage and genomic signature data
    if not no_coverage and K != 0:
        # report through the logger for consistency with the other status
        # messages (was a bare print statement)
        self.logger.info(' Whitening data.')
        genome_stats = whiten(genome_stats)
    else:
        genome_stats = np_array(genome_stats)

    # cluster; kmeans2 with missing='raise' throws ClusterError whenever a
    # cluster comes up empty, so simply retry until a valid partition is found
    self.logger.info(' Partitioning genome into %d clusters.' % num_clusters)

    bError = True
    while bError:
        try:
            bError = False
            _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
        except ClusterError:
            bError = True

    for k in range(num_clusters):
        self.logger.info(' Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

    # write out clusters; hoist the key list so labels index into a stable,
    # subscriptable sequence (keys() itself is not subscriptable on Python 3)
    genome_id = remove_extension(genome_file)
    seq_ids = list(seqs.keys())
    for k in range(num_clusters):
        cluster_file = os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna')
        with open(cluster_file, 'w') as fout:
            for i in np_where(labels == k)[0]:
                seq_id = seq_ids[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger()
        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : (str, str)
            Unique id of sequence and the sequence itself in nucleotide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer in the canonical order.
        """
        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        # normalize raw counts to relative frequencies
        total_kmers = sum(sig)
        sig = [float(count) / total_kmers for count in sig]

        return (seq_id, sig)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : (str, list)
            Unique id of a sequence and its tetranucleotide signature in
            canonical order.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Signatures accumulated so far, or None on the first call.

        Returns
        -------
        consumer_data: dict
            The consumer data structure or None must be returned
        """
        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """
        return ' Finished processing %d of %d (%.2f%%) sequences.' % (processed_items, total_items, float(processed_items) * 100 / total_items)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Relative frequency of each kmer.
        """
        self.logger.info(' Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer, seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """
        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(self.canonical_order()):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(self.canonical_order()))

                # map the file's column order onto the canonical kmer order
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != self.canonical_order():
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    line_split = line.split('\t')
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            # call form of print is identical in Python 2 and valid in Python 3
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit()
        except ParsingError:
            sys.exit()

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        output_file : str
            Name of output file.
        """
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id')
            for kmer in self.canonical_order():
                fout.write('\t' + kmer)
            fout.write('\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger('timestamp')
        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : (str, str)
            Unique id of sequence and the sequence itself in nucleotide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer in the canonical order.
        """
        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        # normalize raw counts to relative frequencies
        total_kmers = sum(sig)
        sig = [float(count) / total_kmers for count in sig]

        return (seq_id, sig)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : (str, list)
            Unique id of a sequence and its tetranucleotide signature in
            canonical order.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Signatures accumulated so far, or None on the first call.

        Returns
        -------
        consumer_data: dict
            The consumer data structure or None must be returned
        """
        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing, or None when the
            logger has been silenced.
        """
        if self.logger.is_silent:
            return None

        return ' Finished processing %d of %d (%.2f%%) sequences.' % (processed_items, total_items, float(processed_items) * 100 / total_items)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Relative frequency of each kmer.
        """
        self.logger.info('Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer, seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """
        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(self.canonical_order()):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(self.canonical_order()))

                # map the file's column order onto the canonical kmer order
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != self.canonical_order():
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    line_split = line.split('\t')
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            # call form of print is identical in Python 2 and valid in Python 3
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit()
        except ParsingError:
            sys.exit()

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        output_file : str
            Name of output file.
        """
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id')
            for kmer in self.canonical_order():
                fout.write('\t' + kmer)
            fout.write('\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
def kmeans(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
    """Cluster genome with k-means.

    Builds a feature vector for each scaffold (mean coverage and/or genomic
    signature, optionally reduced with PCA), whitens it when it mixes both
    feature types, partitions scaffolds with k-means, and writes one FASTA
    file per cluster.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    num_clusters : int
        Number of clusters to form.
    num_components : int
        Number of PCA components to consider.
    K : int
        K-mer size to use for calculating genomic signature.
    no_coverage : boolean
        Flag indicating if coverage information should be used during clustering.
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Iterations to perform during clustering.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """

    # get mean coverage and genomic signature for each scaffold in genome
    self.logger.info('Determining mean coverage and genomic signatures.')

    signatures = GenomicSignature(K)

    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append((np_mean(stats.coverage)))
        else:
            genome_stats.append(())

        if K == 0:
            pass
        elif K == 4:
            # tetranucleotide signatures were precomputed with the scaffold stats
            signature_matrix.append(stats.signature)
        else:
            sig = signatures.seq_signature(seq)
            # normalize raw k-mer counts to relative frequencies
            total_kmers = sum(sig)
            sig = [float(count) / total_kmers for count in sig]
            signature_matrix.append(sig)

    # calculate PCA of signatures
    if K != 0:
        if not no_pca:
            self.logger.info('Calculating PCA of genomic signatures.')
            pc, variance = self.pca(signature_matrix)
            self.logger.info(
                'First {:,} PCs capture {:.1f}% of the variance.'.format(
                    num_components, sum(variance[0:num_components]) * 100))

            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, pc[i][0:num_components])
        else:
            self.logger.info('Using complete genomic signature.')
            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, signature_matrix[i])

    # whiten data if feature matrix contains coverage and genomic signature data
    if not no_coverage and K != 0:
        self.logger.info('Whitening data.')
        genome_stats = whiten(genome_stats)
    else:
        genome_stats = np_array(genome_stats)

    # cluster; kmeans2 with missing='raise' throws ClusterError whenever a
    # cluster comes up empty, so simply retry until a valid partition is found
    self.logger.info(
        'Partitioning genome into {:,} clusters.'.format(num_clusters))

    bError = True
    while bError:
        try:
            bError = False
            _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
        except ClusterError:
            bError = True

    for k in range(num_clusters):
        self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
            sum(labels == k), (k + 1)))

    # write out clusters; materialize the key list since dict_keys is not
    # subscriptable on Python 3 (seqs.keys()[i] would raise TypeError)
    genome_id = remove_extension(genome_file)
    seq_ids = list(seqs.keys())
    for k in range(num_clusters):
        cluster_file = os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna')
        with open(cluster_file, 'w') as fout:
            for i in np_where(labels == k)[0]:
                seq_id = seq_ids[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')