Exemplo n.º 1
0
    def estimate_tree_frequencies(self, region='global', pivots=24):
        '''
        Estimate clade frequencies on the tree, optionally for one region.

        region -- 'global' runs the estimate without any node filter; any
                  other value restricts it to nodes whose attr['region']
                  equals that value. It is also the key under which the
                  results are stored.
        pivots -- handed to make_pivots, and only when self.pivots has not
                  been set yet.

        Results are written to self.tree_frequencies[region],
        self.tree_frequency_confidence[region] and
        self.tree_frequency_counts[region]; nothing is returned.
        '''
        # No filter for the global estimate, a region-match filter otherwise.
        node_filter_func = (None if region == 'global' else
                            (lambda node: node.attr['region'] == region))

        # Pivots are built once from the sequence dates and then reused.
        if hasattr(self, 'pivots'):
            print('estimate_tree_frequencies: using self.pivots')
        else:
            sample_dates = np.array([
                seq.attributes['num_date']
                for seq in self.seqs.seqs.values()
            ])
            self.pivots = make_pivots(pivots, sample_dates)

        # Lazily create the per-region result stores.
        if not hasattr(self, 'tree_frequencies'):
            self.tree_frequencies = {}
            self.tree_frequency_confidence = {}
            self.tree_frequency_counts = {}

        # ws is a tenth of the number of tips, but never less than 2.
        estimator = tree_frequencies(
            self.tree.tree,
            self.pivots,
            node_filter=node_filter_func,
            ws=max(2, self.tree.tree.count_terminals() // 10),
            **self.kwargs)
        estimator.estimate_clade_frequencies()
        confidence = estimator.calc_confidence()

        self.tree_frequencies[region] = estimator.frequencies
        self.tree_frequency_confidence[region] = confidence
        self.tree_frequency_counts[region] = estimator.counts
Exemplo n.º 2
0
    def estimate_tree_frequencies(self, region='global', pivots=24, stiffness=20.0):
        '''
        Estimate clade frequencies on the tree, optionally for one region.

        region    -- key under which results are stored. 'global' keeps every
                     node dated at or after the first pivot; any other value
                     additionally requires attr['region'] to equal it.
        pivots    -- handed to make_pivots, and only when self.pivots has not
                     been set yet.
        stiffness -- forwarded unchanged to tree_frequencies.

        If the region already has an entry in self.tree_frequencies the call
        only logs a message and returns. Otherwise the frequencies,
        confidence and counts are stored per region and
        self.save_tree_frequencies() is called. Nothing is returned.
        '''
        if not hasattr(self, 'tree_frequencies'):
            self.restore_tree_frequencies()

        already_done = region in self.tree_frequencies
        if already_done:
            self.log.notify("Skipping tree frequency estimation for region: %s" % region)
            return

        if not hasattr(self, 'pivots'):
            sample_dates = np.array([
                np.mean(seq.attributes['num_date'])
                for seq in self.seqs.seqs.values()
            ])
            self.pivots = make_pivots(pivots, sample_dates)

        self.log.notify('Estimate tree frequencies for %s: using self.pivots' % (region))

        def node_filter_func(node):
            # Region mismatch rules a node out (never checked for 'global').
            if region != 'global' and not (node.attr['region'] == region):
                return False
            # Omit strains sampled prior to the first pivot.
            return node.attr["num_date"] >= self.pivots[0]

        # ws is a tenth of the number of tips, but never less than 2.
        estimator = tree_frequencies(
            self.tree.tree,
            self.pivots,
            method='SLSQP',
            node_filter=node_filter_func,
            ws=max(2, self.tree.tree.count_terminals() // 10),
            stiffness=stiffness)

        estimator.estimate_clade_frequencies()
        confidence = estimator.calc_confidence()
        self.tree_frequencies[region] = estimator.frequencies
        self.tree_frequency_confidence[region] = confidence
        self.tree_frequency_counts[region] = estimator.counts

        self.save_tree_frequencies()
Exemplo n.º 3
0
    def estimate_tree_frequencies(self, region='global', pivots=24):
        '''
        Estimate clade frequencies on the tree, optionally for one region.

        region -- key under which results are stored. 'global' runs without a
                  node filter; any other value restricts the estimate to
                  nodes whose attr['region'] equals it.
        pivots -- handed to make_pivots, and only when self.pivots has not
                  been set yet.

        If the region already has an entry in self.tree_frequencies the call
        only logs a message and returns. Otherwise the frequencies,
        confidence and counts are stored per region and
        self.save_tree_frequencies() is called. Nothing is returned.
        '''
        if not hasattr(self, 'tree_frequencies'):
            self.restore_tree_frequencies()

        if region in self.tree_frequencies:
            skip_message = (
                "Skipping tree frequency estimation for region: %s" % region)
            self.log.notify(skip_message)
            return

        if not hasattr(self, 'pivots'):
            sample_dates = [
                np.mean(seq.attributes['num_date'])
                for seq in self.seqs.seqs.values()
            ]
            self.pivots = make_pivots(pivots, np.array(sample_dates))

        self.log.notify(
            'Estimate tree frequencies for %s: using self.pivots' % (region))

        if region == 'global':
            node_filter_func = None
        else:

            def node_filter_func(node):
                return node.attr['region'] == region

        # ws is a tenth of the number of tips, but never less than 2.
        # NOTE: self.kwargs is deliberately not forwarded here; it is unclear
        # which keyword arguments tree_frequencies would need from it.
        estimator = tree_frequencies(
            self.tree.tree,
            self.pivots,
            method='SLSQP',
            node_filter=node_filter_func,
            ws=max(2, self.tree.tree.count_terminals() // 10))

        estimator.estimate_clade_frequencies()
        confidence = estimator.calc_confidence()
        self.tree_frequencies[region] = estimator.frequencies
        self.tree_frequency_confidence[region] = confidence
        self.tree_frequency_counts[region] = estimator.counts

        self.save_tree_frequencies()
Exemplo n.º 4
0
    def estimate_mutation_frequencies(self, region="global", pivots=24):
        '''
        Calculate the frequencies of mutations in a particular region.

        Currently the global frequencies should be estimated first because
        this defines the set of positions at which frequencies in other
        regions are estimated: for any non-global region this method looks up
        self.mutation_frequencies[('global', prot)] and raises KeyError when
        that entry is missing.

        region -- either a string, used both as the storage key and as the
                  value matched against each sequence's attributes['region']
                  ('global' disables region filtering), or a tuple
                  (name, match) where name is the storage key and match is a
                  string or a list of region names.
        pivots -- handed to make_pivots, and only when self.pivots has not
                  been set yet.

        Results are stored on self: mutation_frequencies and
        mutation_frequency_confidence are keyed by (region_name, prot),
        mutation_frequency_counts by region_name alone. Returns None; it also
        returns early (after printing a message) when the sequences are not
        aligned or region has an unsupported type.
        '''
        if not hasattr(self.seqs, 'aln'):
            print("Align sequences first")
            return

        def filter_alignment(aln, region=None, lower_tp=None, upper_tp=None):
            # Keep the sequences matching the region and lying in the
            # half-open date window [lower_tp, upper_tp).
            from Bio.Align import MultipleSeqAlignment
            tmp = aln
            if region is not None:
                if isinstance(region, str):
                    tmp = [s for s in tmp if s.attributes['region'] == region]
                elif isinstance(region, list):
                    tmp = [s for s in tmp if s.attributes['region'] in region]
                else:
                    print("region must be string or list")
                    return
            if lower_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) >= lower_tp
                ]
            if upper_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) < upper_tp
                ]
            return MultipleSeqAlignment(tmp)

        if not hasattr(self, 'pivots'):
            tps = np.array([
                np.mean(x.attributes['num_date'])
                for x in self.seqs.seqs.values()
            ])
            self.pivots = make_pivots(pivots, tps)
        else:
            print('estimate_mutation_frequencies: using self.pivots')

        if not hasattr(self, 'mutation_frequencies'):
            self.mutation_frequencies = {}
            self.mutation_frequency_confidence = {}
            self.mutation_frequency_counts = {}

        # Split region into the key results are stored under and the value(s)
        # used to select sequences.
        if isinstance(region, str):
            region_name = region
            region_match = region
        elif isinstance(region, tuple):
            region_name = region[0]
            region_match = region[1]
        else:
            print("region must be string or tuple")
            return

        # Loop over the nucleotide alignment and all translations and
        # calculate region specific frequencies of mutations above a certain
        # threshold. list() is required because dict.items() is a view on
        # Python 3 and cannot be concatenated to a list directly.
        alignments = [('nuc', self.seqs.aln)
                      ] + list(self.seqs.translations.items())
        for prot, aln in alignments:
            is_global = region_match == "global"
            tmp_aln = filter_alignment(
                aln,
                region=None if is_global else region_match,
                lower_tp=self.pivots[0],
                upper_tp=self.pivots[-1])
            if is_global:
                include_set = []
            else:
                # NOTE(review): include_set is computed but never passed to
                # mutation_frequencies() below. The lookup is kept because it
                # raises KeyError when the global estimate is missing --
                # confirm whether it was meant to be forwarded.
                include_set = {
                    pos
                    for (pos, mut) in self.mutation_frequencies[('global',
                                                                 prot)]
                }

            time_points = [np.mean(x.attributes['num_date']) for x in tmp_aln]
            if not time_points:
                print('no samples in region', region_name, prot)
                self.mutation_frequency_counts[region_name] = np.zeros_like(
                    self.pivots)
                continue

            # ws is a tenth of the number of samples, but never less than 2.
            aln_frequencies = alignment_frequencies(
                tmp_aln,
                time_points,
                self.pivots,
                ws=max(2,
                       len(time_points) // 10),
                **self.kwargs)
            aln_frequencies.mutation_frequencies(min_freq=0.01)
            self.mutation_frequencies[(region_name,
                                       prot)] = aln_frequencies.frequencies
            self.mutation_frequency_confidence[(
                region_name, prot)] = aln_frequencies.calc_confidence()
            # Counts are keyed by region only, so each alignment type
            # overwrites the previous one's counts.
            self.mutation_frequency_counts[
                region_name] = aln_frequencies.counts
Exemplo n.º 5
0
    def estimate_mutation_frequencies(self,
                                      inertia=0.0,
                                      min_freq=0.01,
                                      stiffness=20.0,
                                      pivots=24,
                                      region="global",
                                      include_set=None):
        '''
        Calculate the frequencies of mutations in a particular region.

        Currently the global frequencies should be estimated first because
        this defines the set of positions at which frequencies in other
        regions are estimated: when self.mutation_frequencies already holds
        ('global', prot), the positions found there are added to the set of
        positions passed to the estimator.

        inertia, stiffness -- forwarded unchanged to alignment_frequencies.
        min_freq    -- forwarded to aln_frequencies.mutation_frequencies.
        pivots      -- handed to make_pivots, and only when self.pivots has
                       not been set yet.
        region      -- either a string, used both as the storage key and as
                       the value matched against each sequence's
                       attributes['region'] ('global' disables region
                       filtering), or a tuple (name, match) where name is the
                       storage key and match is a string or a list of region
                       names.
        include_set -- optional mapping from alignment name ('nuc' or a
                       translation key) to an iterable of entries that are
                       always passed on as include_set. None (the default)
                       behaves like an empty mapping.

        Results are stored on self: mutation_frequencies and
        mutation_frequency_confidence are keyed by (region_name, prot),
        mutation_frequency_counts by region_name alone. (region_name, prot)
        pairs that already have an entry are skipped. After the loop the
        three dictionaries are pickled to self.output_path +
        "_mut_freqs.pickle". Returns None; it also returns early (after a
        warning, without pickling) when the sequences are not aligned or
        region has an unsupported type.
        '''
        if not hasattr(self.seqs, 'aln'):
            self.log.warn("Align sequences first")
            return

        # A None sentinel avoids sharing one dict object across calls.
        if include_set is None:
            include_set = {}

        def filter_alignment(aln, region=None, lower_tp=None, upper_tp=None):
            # Keep the sequences matching the region and lying in the
            # half-open date window [lower_tp, upper_tp).
            from Bio.Align import MultipleSeqAlignment
            tmp = aln
            if region is not None:
                if isinstance(region, str):
                    tmp = [s for s in tmp if s.attributes['region'] == region]
                elif isinstance(region, list):
                    tmp = [s for s in tmp if s.attributes['region'] in region]
                else:
                    self.log.warn("region must be string or list")
                    return
            if lower_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) >= lower_tp
                ]
            if upper_tp is not None:
                tmp = [
                    s for s in tmp
                    if np.mean(s.attributes['num_date']) < upper_tp
                ]
            return MultipleSeqAlignment(tmp)

        if not hasattr(self, 'pivots'):
            tps = np.array([
                np.mean(x.attributes['num_date'])
                for x in self.seqs.seqs.values()
            ])
            self.pivots = make_pivots(pivots, tps)

        if not hasattr(self, 'mutation_frequencies'):
            self.restore_mutation_frequencies()

        # Split region into the key results are stored under and the value(s)
        # used to select sequences.
        if isinstance(region, str):
            region_name = region
            region_match = region
        elif isinstance(region, tuple):
            region_name = region[0]
            region_match = region[1]
        else:
            self.log.warn("region must be string or tuple")
            return

        # Loop over the nucleotide alignment and all translations and
        # calculate region specific frequencies of mutations above a certain
        # threshold. list() is required because dict.items() is a view on
        # Python 3 and cannot be concatenated to a list directly.
        alignments = [('nuc', self.seqs.aln)
                      ] + list(self.seqs.translations.items())
        for prot, aln in alignments:
            if (region_name, prot) in self.mutation_frequencies:
                self.log.notify(
                    "Skipping Frequency Estimation for region \"{}\", protein \"{}\""
                    .format(region_name, prot))
                continue
            self.log.notify(
                "Starting Frequency Estimation for region \"{}\", protein \"{}\""
                .format(region_name, prot))

            # determine set of positions that have to have a frequency
            # calculated; copied so the caller's container is never modified
            if prot in include_set:
                tmp_include_set = list(include_set[prot])
            else:
                tmp_include_set = []

            # The match value (not the raw argument) decides whether region
            # filtering applies, so a (name, 'global') tuple is treated as
            # global as well.
            tmp_aln = filter_alignment(
                aln,
                region=None if region_match == 'global' else region_match,
                lower_tp=self.pivots[0],
                upper_tp=self.pivots[-1])

            # Add every position already tracked in the global estimate
            # (de-duplicated through a set).
            if ('global', prot) in self.mutation_frequencies:
                tmp_include_set.extend({
                    pos
                    for (pos, mut) in self.mutation_frequencies[('global',
                                                                 prot)]
                })

            time_points = [np.mean(x.attributes['num_date']) for x in tmp_aln]
            if not time_points:
                self.log.notify('no samples in region {} (protein: {})'.format(
                    region_name, prot))
                self.mutation_frequency_counts[region_name] = np.zeros_like(
                    self.pivots)
                continue

            # instantiate alignment frequency; ws is a tenth of the number of
            # samples, but never less than 2
            aln_frequencies = alignment_frequencies(
                tmp_aln,
                time_points,
                self.pivots,
                ws=max(2,
                       len(time_points) // 10),
                inertia=inertia,
                stiffness=stiffness,
                method='SLSQP')
            if prot == 'nuc':
                # For a nucleotide alignment keep A, C, G, T and the gap
                # character '-' and set every other state to 'N', which is
                # passed as ignore_char below.
                # NOTE(review): assumes aln_frequencies.aln supports
                # elementwise == and boolean-mask assignment (e.g. a numpy
                # character array) -- confirm.
                A = aln_frequencies.aln
                canonical = ((A == 'A') | (A == 'C') | (A == 'G') |
                             (A == 'T') | (A == '-'))
                A[~canonical] = 'N'

            aln_frequencies.mutation_frequencies(
                min_freq=min_freq,
                include_set=tmp_include_set,
                ignore_char='N' if prot == 'nuc' else 'X')
            self.mutation_frequencies[(region_name,
                                       prot)] = aln_frequencies.frequencies
            self.mutation_frequency_confidence[(
                region_name, prot)] = aln_frequencies.calc_confidence()
            # Counts are keyed by region only, so each alignment type
            # overwrites the previous one's counts.
            self.mutation_frequency_counts[
                region_name] = aln_frequencies.counts

        # Two objects are dumped back to back into one file: first the set of
        # sequence names, then the three result dictionaries. A reader has to
        # call pickle.load twice, in that order.
        self.log.notify("Saving mutation frequencies (pickle)")
        with open(self.output_path + "_mut_freqs.pickle", 'wb') as fh:
            pickle.dump(set(self.seqs.seqs.keys()),
                        fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(
                (self.mutation_frequencies, self.mutation_frequency_confidence,
                 self.mutation_frequency_counts),
                fh,
                protocol=pickle.HIGHEST_PROTOCOL)