예제 #1
0
    def __init__(self,
                 infasta=None,
                 outfile=None,
                 k=6,
                 binary=True,
                 mean=True,
                 std=True,
                 leave=True,
                 silent=False,
                 label=False):
        """Store the counting options and load sequences when a fasta is given.

        Parameters
        ----------
        infasta : str (default=None)
            Path handed to ``Reader``; when None no sequences are loaded.
        outfile : str (default=None)
            Output location, stored unchanged.
        k : int (default=6)
            Kmer size; ``kmers`` and ``map`` get 4**k entries.
        binary : bool (default=True)
            Stored unchanged.
        mean : bool, np.array, str (default=True)
            Stored as given, except that a str is treated as a path and
            replaced by the result of ``np.load``.
        std : bool, np.array, str (default=True)
            Same handling as ``mean``.
        leave : bool (default=True)
            Stored unchanged.
        silent : bool (default=False)
            Stored unchanged.
        label : bool (default=False)
            Stored unchanged.
        """
        self.infasta = infasta
        # Always define `reader` so code testing `self.reader is None` works
        # on instances created without a fasta file.
        self.reader = None
        self.seqs = None
        if infasta is not None:
            self.reader = Reader(infasta)
            self.seqs = self.reader.get_seqs()
        self.outfile = outfile
        self.k = k
        self.binary = binary
        # A str is interpreted as the path of a previously saved array.
        self.mean = np.load(mean) if isinstance(mean, str) else mean
        self.std = np.load(std) if isinstance(std, str) else std
        self.leave = leave
        self.silent = silent
        self.label = label

        self.counts = None
        # All kmers of length k over the alphabet 'AGTC', in product order.
        self.kmers = [''.join(letters) for letters in product('AGTC', repeat=k)]
        # Column index of each kmer.
        self.map = {kmer: column for column, kmer in enumerate(self.kmers)}
예제 #2
0
    def save(self, names=None):
        """Saves the counts appropriately based on current settings.

        There are four output methods for the counts:
        1. Binary. This saves just the counts as a binary numpy array.
        2. No labels. Saves in plain text, but without any labels.
        3. Default names. If no names are provided, fasta headers will be used as labels.
        4. Custom names. Provide a list of names if you want to label lncRNAs with your own names.

        Parameters
        ----------
        names : [str] (default=None)
            Unique names for rows of the Dataframe.

        Raises
        ------
        AssertionError
            If both ``binary`` and ``label`` are set, or ``outfile`` is None.
        ValueError
            If labels are requested but no names were passed and there is
            neither a reader nor a fasta path to take headers from.
        """
        assert not (self.binary and self.label
                    ), 'You cannot label a binary file. Set only one as True.'
        assert self.outfile is not None, 'Please provide an outfile location.'
        if self.binary:
            np.save(self.outfile, self.counts)
            return
        if not self.label:
            np.savetxt(self.outfile, self.counts, delimiter=',', fmt='%1.6f')
            return

        if names is None:
            # `reader` may never have been assigned (no fasta at construction),
            # so look it up defensively instead of raising AttributeError.
            reader = getattr(self, 'reader', None)
            if reader is None:
                if self.infasta is None:
                    raise ValueError('Cannot label counts: pass `names` or '
                                     'provide a fasta file to read headers from.')
                reader = Reader(self.infasta)
                self.reader = reader
            names = reader.get_headers()
        df = DataFrame(data=self.counts, index=names, columns=self.kmers)
        df.to_csv(self.outfile)
예제 #3
0
    def __init__(self, infasta=None, outfile=None, k=6,
                 binary=True, mean=True, std=True,
                 leave=True, silent=False, label=False, percentage=True):
        """Store the counting options and load sequences when a fasta is given.

        Parameters
        ----------
        infasta : str (default=None)
            Path handed to ``Reader``; when None no sequences are loaded.
        outfile : str (default=None)
            Output location, stored unchanged.
        k : int (default=6)
            Kmer size; ``kmers`` and ``map`` get 4**k entries.
        binary, leave, silent, label : bool
            Stored unchanged on the instance.
        mean, std : bool, np.array, str (default=True)
            Stored as given, except that a str is treated as a path and
            replaced by the result of ``np.load``.
        percentage : bool (default=True)
            Stored as ``self.perc``.

        Raises
        ------
        ValueError
            If exactly one sequence was loaded and ``std`` is True.
        """
        self.infasta = infasta
        self.seqs = None
        if infasta is not None:
            self.seqs = Reader(infasta).get_seqs()
        self.outfile = outfile
        self.k = k
        self.binary = binary
        # A str is interpreted as the path of a previously saved array.
        self.mean = np.load(mean) if isinstance(mean, str) else mean
        self.std = np.load(std) if isinstance(std, str) else std
        self.leave = leave
        self.silent = silent
        self.label = label
        self.perc = percentage

        self.counts = None
        # All kmers of length k over the alphabet 'AGTC', in product order.
        self.kmers = [''.join(letters) for letters in product('AGTC', repeat=k)]
        # Column index of each kmer.
        self.map = {kmer: column for column, kmer in enumerate(self.kmers)}

        # `seqs` stays None when no fasta was given; only validate loaded data
        # so that default construction does not fail on len(None).
        if self.seqs is not None and len(self.seqs) == 1 and self.std is True:
            err = ('You cannot standardize a single sequence. '
                   'Please pass the path to an std. dev. array, '
                   'or use raw counts by setting std=False.')
            raise ValueError(err)
예제 #4
0
 def __init__(self, infasta=None, outfile=None, k=6):
     """Store the options and load fasta data when a path is given.

     Parameters
     ----------
     infasta : str (default=None)
         Path handed to ``Reader``; when None nothing is loaded.
     outfile : str (default=None)
         Output location, stored unchanged.
     k : int (default=6)
         Kmer size; ``kmers`` gets 4**k entries.
     """
     # Define every attribute up front so they exist even without a fasta.
     self.data = None
     self.names = None
     self.seqs = None
     if infasta is not None:
         self.data, self.names, self.seqs = Reader(infasta).get_data()
     self.outfile = outfile
     self.k = k
     self.counts = None
     # All kmers of length k over the alphabet 'AGTC', in product order.
     self.kmers = [''.join(letters) for letters in product('AGTC', repeat=k)]
예제 #5
0
class BasicCounter:
    """Generates overlapping kmer counts for a fasta file

    Parameters
    ----------
    infasta : str (default=None)
        Full path to fasta file to be counted
    outfile : str (default=None)
        Full path to the counts file to be saved
    k : int (default=6)
        Size of kmer to be counted
    binary : bool (default=True)
        Saves as numpy array if True, else saves as csv
    mean : bool, np.array, str (default=True)
        Set the mean to 0 for each kmer/column of the count matrix.
        If str, provide path to a previously calculated mean array.
    std : bool or str (default=True)
        Set the std. dev. to 1 for each kmer/column of the count matrix
        If str, provide path to a previously calculated std array.
    leave : bool (default=True)
        Set to False if get_counts is used within another tqdm loop
    silent : bool (default=False)
        Set to True to turn off tqdm progress bar
    label : bool (default=False)
        Saves a csv with row names and kmer column names if True.
        Cannot be combined with binary=True.

    Attributes
    ----------
    counts : None
        Stores the ndarray of kmer counts
    kmers : list
        str elements of all kmers of size k
    map : dict
        Mapping of kmers to column values
    reader : Reader or None
        Reader for `infasta`; None when no fasta file was given
    """
    def __init__(self,
                 infasta=None,
                 outfile=None,
                 k=6,
                 binary=True,
                 mean=True,
                 std=True,
                 leave=True,
                 silent=False,
                 label=False):
        self.infasta = infasta
        # Always define `reader` so `save` can test it without AttributeError.
        self.reader = None
        self.seqs = None
        if infasta is not None:
            self.reader = Reader(infasta)
            self.seqs = self.reader.get_seqs()
        self.outfile = outfile
        self.k = k
        self.binary = binary
        # A str is interpreted as the path of a previously saved array.
        self.mean = np.load(mean) if isinstance(mean, str) else mean
        self.std = np.load(std) if isinstance(std, str) else std
        self.leave = leave
        self.silent = silent
        self.label = label

        self.counts = None
        self.kmers = [''.join(letters) for letters in product('AGTC', repeat=k)]
        self.map = {kmer: column for column, kmer in enumerate(self.kmers)}

    def occurrences(self, row, seq):
        """Counts kmers on a per kilobase scale

        Parameters
        ----------
        row : indexable
            Row of the count matrix; entries for the kmers found are overwritten.
        seq : str
            Sequence to count overlapping kmers in.

        Returns
        -------
        row
            The same object that was passed in.
        """
        length = len(seq)
        # Nothing to count for empty or too-short sequences; returning early
        # also avoids dividing by zero for an empty sequence.
        if length == 0 or length < self.k:
            return row
        counts = defaultdict(int)
        # Every occurrence contributes 1000/length.
        increment = 1000 / length
        for c in range(length - self.k + 1):
            kmer = seq[c:c + self.k]
            counts[kmer] += increment
        for kmer, n in counts.items():
            # Substrings that are not in `self.map` are ignored.
            if kmer in self.map:
                row[self.map[kmer]] = n
        return row

    def _progress(self):
        """Determine which iterator to loop over for counting."""
        if self.silent:
            return self.seqs

        if not self.leave:
            tqdm_seqs = my_tqdm()(self.seqs, desc='Kmers', leave=False)
        else:
            tqdm_seqs = my_tqdm()(self.seqs)

        return tqdm_seqs

    def center(self):
        """mean center counts by column

        When `self.mean` is True it is first replaced by the column means of
        the current counts; the subtraction is done in place.
        """
        if self.mean is True:
            self.mean = np.mean(self.counts, axis=0)
        self.counts -= self.mean

    def standardize(self):
        """divide out the standard deviations from columns of the count matrix

        When `self.std` is True it is first replaced by the column standard
        deviations of the current counts; the division is done in place.
        Columns with a zero standard deviation are not special-cased here.
        """
        if self.std is True:
            self.std = np.std(self.counts, axis=0)
        self.counts /= self.std

    def get_counts(self):
        """Generates kmer counts for a fasta file

        Fills `self.counts` with one float32 row of 4**k columns per sequence,
        then centers and/or standardizes unless `mean`/`std` are False.
        """
        self.counts = np.zeros([len(self.seqs), 4**self.k], dtype=np.float32)
        seqs = self._progress()
        for i, seq in enumerate(seqs):
            self.counts[i] = self.occurrences(self.counts[i], seq)
        if self.mean is not False:
            self.center()
        if self.std is not False:
            self.standardize()

    def save(self, names=None):
        """Saves the counts appropriately based on current settings.

        There are four output methods for the counts:
        1. Binary. This saves just the counts as a binary numpy array.
        2. No labels. Saves in plain text, but without any labels.
        3. Default names. If no names are provided, fasta headers will be used as labels.
        4. Custom names. Provide a list of names if you want to label lncRNAs with your own names.

        Parameters
        ----------
        names : [str] (default=None)
            Unique names for rows of the Dataframe.

        Raises
        ------
        AssertionError
            If both `binary` and `label` are set, or `outfile` is None.
        ValueError
            If labels are requested but no names were passed and there is
            neither a reader nor a fasta path to take headers from.
        """
        assert not (self.binary and self.label
                    ), 'You cannot label a binary file. Set only one as True.'
        assert self.outfile is not None, 'Please provide an outfile location.'
        if self.binary:
            np.save(self.outfile, self.counts)
        elif self.label:
            if names is None:
                # Look up `reader` defensively so instances that never set it
                # do not fail with AttributeError.
                reader = getattr(self, 'reader', None)
                if reader is None:
                    if self.infasta is None:
                        raise ValueError('Cannot label counts: pass `names` or '
                                         'provide a fasta file to read headers from.')
                    reader = Reader(self.infasta)
                    self.reader = reader
                names = reader.get_headers()
            df = DataFrame(data=self.counts, index=names, columns=self.kmers)
            df.to_csv(self.outfile)
        else:
            np.savetxt(self.outfile, self.counts, delimiter=',', fmt='%1.6f')

    def make_count_file(self, names=None):
        """Wrapper function for the most common way to generate count files.
        Given a numpy file name, it will save a numpy file where counts have been:
        cast as a dense array, centered, and standardized.

        Parameters
        ----------
        names : [str] (default=None)
            lncRNA names to pass to self.save

        Returns
        -------
        The count matrix stored in `self.counts`; nothing is saved when
        `outfile` is None.
        """
        self.get_counts()
        if self.outfile is not None:
            self.save(names)
        return self.counts