Exemplo n.º 1
0
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences.

    Signatures are obtained from a ``GenomicSignature(4)`` helper and are
    normalized so each value is the kmer count divided by the total of all
    counts for that sequence.
    """

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger()

        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : tuple
            Pair (seq_id, seq): unique id of the sequence and the
            sequence in nucleotide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer in the canonical order. Every
            value is 0.0 when the signature sums to zero.
        """

        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        total_kmers = sum(sig)
        for i, count in enumerate(sig):
            # A signature summing to zero (presumably a sequence with no
            # countable tetranucleotides) must not raise ZeroDivisionError.
            sig[i] = float(count) / total_kmers if total_kmers else 0.0

        return (seq_id, sig)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : tuple
            Pair (seq_id, signature) as returned by the producer.
        consumer_data : dict or None
            d[seq_id] -> tetranucleotide signature collected so far. A new
            dict is created when None is given.

        Returns
        -------
        dict
            The consumer data structure with the new signature added.
        """

        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return '    Finished processing %d of %d (%.2f%%) sequences.' % (
            processed_items, total_items, percent_done)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Result of the parallel run; presumably the dict built by the
            consumer, holding the relative frequency of each kmer.
        """

        self.logger.info('  Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer,
                                                self._consumer,
                                                seq_file,
                                                self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        The header's kmer columns may appear in any order; values are
        reordered to follow the sorted header, which must equal the
        canonical order. Blank lines are skipped. On any failure an error
        message is printed and the process exits with status 1.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Signature values parsed as floats.
        """

        expected_order = self.canonical_order()

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(expected_order):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(expected_order))

                # Indices that sort the header kmers; this presumes the
                # canonical order is itself the sorted order.
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != expected_order:
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    if not line.strip():
                        # Tolerate blank (e.g. trailing) lines.
                        continue

                    line_split = line.split('\t')
                    try:
                        sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]
                    except (IndexError, ValueError):
                        # Short or non-numeric rows are reported as parsing
                        # failures rather than escaping as raw exceptions.
                        raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

            return sig
        except IOError:
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit(1)
        except ParsingError as e:
            # Show the reason instead of exiting silently.
            print(e)
            sys.exit(1)

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Signature values to write, one tab-separated row per sequence.
        output_file : str
            Name of output file.
        """

        # The context manager closes the file even if a write fails.
        with open(output_file, 'w') as fout:
            header = ['Scaffold id'] + list(self.canonical_order())
            fout.write('\t'.join(header) + '\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
Exemplo n.º 2
0
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences.

    Signatures are obtained from a ``GenomicSignature(4)`` helper and are
    normalized so each value is the kmer count divided by the total of all
    counts for that sequence.
    """
    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger('timestamp')

        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : tuple
            Pair (seq_id, seq): unique id of the sequence and the
            sequence in nucleotide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer in the canonical order. Every
            value is 0.0 when the signature sums to zero.
        """

        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        total_kmers = sum(sig)
        for i, count in enumerate(sig):
            # A signature summing to zero (presumably a sequence with no
            # countable tetranucleotides) must not raise ZeroDivisionError.
            sig[i] = float(count) / total_kmers if total_kmers else 0.0

        return (seq_id, sig)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : tuple
            Pair (seq_id, signature) as returned by the producer.
        consumer_data : dict or None
            d[seq_id] -> tetranucleotide signature collected so far. A new
            dict is created when None is given.

        Returns
        -------
        dict
            The consumer data structure with the new signature added.
        """

        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str or None
            String indicating progress of data processing, or None when
            the logger's ``is_silent`` attribute is true.
        """

        # NOTE(review): ``is_silent`` is not a standard logging.Logger
        # attribute; presumably the 'timestamp' logger is a project type.
        if self.logger.is_silent:
            return None

        percent_done = float(processed_items) * 100 / total_items
        return '  Finished processing %d of %d (%.2f%%) sequences.' % (
            processed_items, total_items, percent_done)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Result of the parallel run; presumably the dict built by the
            consumer, holding the relative frequency of each kmer.
        """

        self.logger.info(
            'Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer,
                                                seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        The header's kmer columns may appear in any order; values are
        reordered to follow the sorted header, which must equal the
        canonical order. Blank lines are skipped. On any failure an error
        message is printed and the process exits with status 1.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Signature values parsed as floats.
        """

        expected_order = self.canonical_order()

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(expected_order):
                    raise ParsingError(
                        "[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns."
                        % len(expected_order))

                # Indices that sort the header kmers; this presumes the
                # canonical order is itself the sorted order.
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [
                    kmer_order[i] for i in canonical_order_index
                ]

                if canonical_order != expected_order:
                    raise ParsingError(
                        "[Error] Failed to process tetranucleotide signature file: "
                        + signature_file)

                for line in f:
                    if not line.strip():
                        # Tolerate blank (e.g. trailing) lines.
                        continue

                    line_split = line.split('\t')
                    try:
                        sig[line_split[0]] = [
                            float(line_split[i + 1])
                            for i in canonical_order_index
                        ]
                    except (IndexError, ValueError):
                        # Short or non-numeric rows are reported as parsing
                        # failures rather than escaping as raw exceptions.
                        raise ParsingError(
                            "[Error] Failed to process tetranucleotide signature file: "
                            + signature_file)

            return sig
        except IOError:
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit(1)
        except ParsingError as e:
            # Show the reason instead of exiting silently.
            print(e)
            sys.exit(1)

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Signature values to write, one tab-separated row per sequence.
        output_file : str
            Name of output file.
        """

        # The context manager closes the file even if a write fails.
        with open(output_file, 'w') as fout:
            header = ['Scaffold id'] + list(self.canonical_order())
            fout.write('\t'.join(header) + '\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
Exemplo n.º 3
0
class KmerUsage(object):
    """Calculate kmer usage over a set of genomes.

    The implementation for calculating genomic signatures
    is not optimized for speed. As such, this class is
    useful for k <= 8.
    """

    def __init__(self, k, cpus=1):
        """Initialization.

        Parameters
        ----------
        k : int
            Size of the kmers to count.
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger('timestamp')

        self.k = k
        self.cpus = cpus

        self.logger.info('Calculating unique kmers of size k = %d.' % self.k)
        self.signatures = GenomicSignature(self.k)

    def _producer(self, genome_file):
        """Calculates kmer usage of a genome.

        Parameters
        ----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
           Unique identifier of genome: the file's base name without
           its extension.
        kmer usage
            Occurrence of each kmer as returned by the signature helper's
            ``counts``; the consumer indexes it by position in the
            canonical kmer order.
        """

        # ntpath.basename splits on both '/' and '\\' separators.
        genome_id = ntpath.basename(genome_file)
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.counts(seqs)

        return (genome_id, kmer_usage)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : list -> [genome_id, kmer_usage]
            Unique id of a genome followed by its kmer usage, indexable
            by position in the canonical kmer order.
        consumer_data : dict of dict or None
            d[genome_id][kmer] -> count collected so far. A new
            defaultdict(dict) is created when None is given.

        Returns
        -------
        consumer_data
            dictionary indicating the frequency of kmers in each genome
        """

        if consumer_data is None:
            consumer_data = defaultdict(dict)

        genome_id, kmer_usage = produced_data

        for idx, kmer in enumerate(self.signatures.canonical_order()):
            consumer_data[genome_id][kmer] = kmer_usage[idx]

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of genomes processed.
        total_items : int
            Total number of genomes to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return '  Finished processing %d of %d (%.2f%%) genomes.' % (
            processed_items, total_items, percent_done)

    def run(self, genome_files):
        """Calculate kmer usage over a set of genomes.

        Parameters
        ----------
        genome_files : list
            Fasta files containing genomic sequences in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][kmer] -> count
           Result of the parallel run; presumably the kmer usage of each
           genome as accumulated by the consumer.
        canonical order
           Canonical order of kmers as reported by the signature helper.
        """

        self.logger.info('Calculating kmer usage for each genome.')

        # No progress callback is passed when the logger is silent.
        # NOTE(review): ``is_silent`` is not a standard logging.Logger
        # attribute; presumably the 'timestamp' logger is a project type.
        progress_func = None if self.logger.is_silent else self._progress

        parallel = Parallel(self.cpus)
        kmer_counts = parallel.run(self._producer, self._consumer, genome_files, progress_func)

        return kmer_counts, self.signatures.canonical_order()