Example #1
    def makeBinDist(self, transformedCP, averageCoverages, kmerNormPC1,
                    kmerPCs, contigGCs, contigLengths):
        """Determine the distribution of the points in this bin

        The distribution is largely normal, except at the boundaries.
        """
        #print("MBD", self.id, self.binSize)
        self.binSize = self.rowIndices.shape[0]
        if (0 == np.size(self.rowIndices)):
            return

        # get the centroids
        (self.covMedians,
         self.covStdevs) = self.getCentroidStats(transformedCP)
        (self.lengthMean,
         self.lengthStd) = self.getCentroidStats(contigLengths)

        self.kValMeanNormPC1 = np_median(kmerPCs[self.rowIndices])
        self.kValStdevNormPC1 = np_std(kmerPCs[self.rowIndices])

        self.kMedian = np_median(kmerPCs[self.rowIndices], axis=0)
        self.kStdevs = np_std(kmerPCs[self.rowIndices], axis=0)

        cvals = self.getAverageCoverageDist(averageCoverages)
        self.cValMedian = np_around(np_median(cvals), decimals=3)
        self.cValStdev = np_around(np_std(cvals), decimals=3)

        self.gcMedian = np_median(contigGCs[self.rowIndices])
        self.gcStdev = np_std(contigGCs[self.rowIndices])

        # work out the total size
        self.totalBP = sum([contigLengths[i] for i in self.rowIndices])

        # set the acceptance ranges
        self.makeLimits()
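These snippets call numpy functions through aliased names (np_std, np_median, np_around, and so on) whose import lines the excerpts omit. A minimal import header consistent with the calls in the examples might look like the following; this is an assumption for readability, not part of any original module:

import numpy as np
from numpy import (mean as np_mean, median as np_median, std as np_std,
                   around as np_around, array as np_array, abs as np_abs,
                   argsort as np_argsort, argmin as np_argmin, zeros as np_zeros,
                   sqrt as np_sqrt, copy as np_copy, percentile as np_percentile,
                   arange as np_arange, histogram as np_histogram,
                   ones_like as np_ones_like, linspace as np_linspace)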
Example #2
    def makeBinDist(self, transformedCP, averageCoverages, kmerNormPC1, kmerPCs, contigGCs, contigLengths):
        """Determine the distribution of the points in this bin

        The distribution is largely normal, except at the boundaries.
        """
        #print "MBD", self.id, self.binSize
        self.binSize = self.rowIndices.shape[0]
        if(0 == np.size(self.rowIndices)):
            return

        # get the centroids
        (self.covMedians, self.covStdevs) = self.getCentroidStats(transformedCP)
        (self.lengthMean, self.lengthStd) = self.getCentroidStats(contigLengths)

        self.kValMeanNormPC1 = np_median(kmerPCs[self.rowIndices])
        self.kValStdevNormPC1 = np_std(kmerPCs[self.rowIndices])

        self.kMedian = np_median(kmerPCs[self.rowIndices], axis=0)
        self.kStdevs = np_std(kmerPCs[self.rowIndices], axis=0)

        cvals = self.getAverageCoverageDist(averageCoverages)
        self.cValMedian = np_around(np_median(cvals), decimals=3)
        self.cValStdev = np_around(np_std(cvals), decimals=3)

        self.gcMedian = np_median(contigGCs[self.rowIndices])
        self.gcStdev = np_std(contigGCs[self.rowIndices])

        # work out the total size
        self.totalBP = sum([contigLengths[i] for i in self.rowIndices])

        # set the acceptance ranges
        self.makeLimits()
Example #3
    def expandSelection(self, startIndex, vals, stdevCutoff=0.05, maxSpread=0.1):
        """Expand a selection left and right from a staring index in a list of values
        
        Keep expanding unless the stdev of the values goes above the cutoff
        Return a list of indices into the original list
        """
        ret_list = [startIndex]  # this is what we will give back
        start_val = vals[startIndex]
        value_store = [start_val]

        sorted_indices = np_argsort(vals)
        max_index = len(vals)

        # set the upper and lower to point to the position
        # where the start resides
        lower_index = 0
        upper_index = 0
        for i in range(max_index):
            if sorted_indices[i] == startIndex:
                break
            lower_index += 1
            upper_index += 1
        do_lower = True
        do_upper = True
        max_index -= 1

        while do_lower or do_upper:
            if do_lower:
                do_lower = False
                if lower_index > 0:
                    try_val = vals[sorted_indices[lower_index - 1]]
                    if np_abs(try_val - start_val) < maxSpread:
                        try_array = value_store + [try_val]
                        if np_std(try_array) < stdevCutoff:
                            value_store = try_array
                            lower_index -= 1
                            ret_list.append(sorted_indices[lower_index])
                            do_lower = True
            if do_upper:
                do_upper = False
                if upper_index < max_index:
                    try_val = vals[sorted_indices[upper_index + 1]]
                    if np_abs(try_val - start_val) < maxSpread:
                        try_array = value_store + [try_val]
                        if np_std(try_array) < stdevCutoff:
                            value_store = try_array
                            upper_index += 1
                            ret_list.append(sorted_indices[upper_index])
                            do_upper = True
        return sorted(ret_list)
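The method walks outward through the value-sorted order, admitting a neighbour only while it stays within maxSpread of the start value and the running stdev stays under stdevCutoff. A simplified standalone sketch of the same idea (not the original class method) for experimenting with toy data:

import numpy as np

def expand_selection(vals, start_index, stdev_cutoff=0.05, max_spread=0.1):
    # grow a window of indices around start_index in value-sorted order
    order = np.argsort(vals)
    pos = int(np.where(order == start_index)[0][0])
    chosen = [start_index]
    lo = hi = pos
    grew = True
    while grew:
        grew = False
        for step in (-1, 1):
            nxt = (lo if step < 0 else hi) + step
            if 0 <= nxt < len(vals):
                cand = vals[order[nxt]]
                trial = [vals[i] for i in chosen] + [cand]
                if abs(cand - vals[start_index]) < max_spread and np.std(trial) < stdev_cutoff:
                    chosen.append(int(order[nxt]))
                    lo, hi = (nxt, hi) if step < 0 else (lo, nxt)
                    grew = True
    return sorted(chosen)

# expand_selection([0.10, 0.11, 0.12, 0.30, 0.13, 0.50], 0) should give [0, 1, 2, 4]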
Example #4
    def noise_dwt(cls, coeff, w):
        """Return the estimation of the DWT components noise level

        coeff: DWT coefficients
        w: pywt wavelet object
        """
        n_boot = 1000
        k_th = 10
        k_std = 1. / np_sqrt(2)
        std_l = []
        std_a = np_zeros(n_boot)
        wcomp = cls.wavecomp(coeff, w, len(coeff) - 1)

        for ii in xrange(n_boot):
            std_a[ii] = np_std(bootstrap_resample(wcomp, 10))

        stdv = np_median(std_a)
        std_l.append(stdv)
        for ll in xrange(len(coeff) - 2, 0, -1):
            stdv = stdv * k_std
            std_l.append(stdv)
        std_l.append(0)

        std_l.reverse()
        return np_array(std_l) * k_th
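noise_dwt relies on a bootstrap_resample helper that is not shown; judging from the call bootstrap_resample(wcomp, 10), it presumably draws a resample of a given size with replacement. A hypothetical stand-in consistent with that usage (an assumption, not the original helper):

import numpy as np

def bootstrap_resample(x, n=None):
    # hypothetical helper: draw n samples from x with replacement
    x = np.asarray(x)
    if n is None:
        n = len(x)
    idx = np.random.randint(0, len(x), size=n)
    return x[idx]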
Example #5
    def write_rank_count(self, ranks_below_taxon, results_table):
        """Write table indicating number of ranks below each taxa.

        Parameters
        ----------
        ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts
            Number of ranks below named taxon.
        results_table : str
            Desired output file.
        """
        
        # determine if count is a scalar or vectors
        taxon = ranks_below_taxon.keys()[0]
        rank_prefix = ranks_below_taxon[taxon].keys()[0]
        count = ranks_below_taxon[taxon][rank_prefix]
        
        count_is_scalar = True
        if isinstance(count, (list, tuple)):
            count_is_scalar = False
        
        # write out results sorted by taxonomic rank        
        sorted_taxon = []
        for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']):
            taxa_at_rank = []
            for taxon in ranks_below_taxon:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)
                    
            sorted_taxon += sorted(taxa_at_rank)
            
        fout = open(results_table, 'w')
        fout.write('Taxon')
        for rank_prefix in Taxonomy.rank_prefixes:
            if count_is_scalar:
                fout.write('\t%s' % rank_prefix.capitalize())
            else:
                fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(), 
                                                    'Std: ' + rank_prefix.capitalize(),
                                                    'Min: ' + rank_prefix.capitalize(),
                                                    'Max: ' + rank_prefix.capitalize()))
        fout.write('\n')
            
        for taxon in sorted_taxon:
            fout.write(taxon)
            
            for rank_prefix in Taxonomy.rank_prefixes:
                count = ranks_below_taxon[taxon][rank_prefix.capitalize()]
                if count_is_scalar:
                    fout.write('\t%d' % count)
                else:
                    if len(count) > 0:
                        fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count), np_std(count), min(count), max(count)))
                    else:
                        fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0))
                    
            fout.write('\n')
                
        fout.close()
Example #6
    def write_rank_count(self, ranks_below_taxon, results_table):
        """Write table indicating number of ranks below each taxa.

        Parameters
        ----------
        ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts
            Number of ranks below named taxon.
        results_table : str
            Desired output file.
        """
        
        # determine if count is a scalar or vectors
        taxon = list(ranks_below_taxon.keys())[0]
        rank_prefix = list(ranks_below_taxon[taxon].keys())[0]
        count = ranks_below_taxon[taxon][rank_prefix]
        
        count_is_scalar = True
        if isinstance(count, (list, tuple)):
            count_is_scalar = False
        
        # write out results sorted by taxonomic rank        
        sorted_taxon = []
        for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']):
            taxa_at_rank = []
            for taxon in ranks_below_taxon:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)
                    
            sorted_taxon += sorted(taxa_at_rank)
            
        fout = open(results_table, 'w')
        fout.write('Taxon')
        for rank_prefix in Taxonomy.rank_prefixes:
            if count_is_scalar:
                fout.write('\t%s' % rank_prefix.capitalize())
            else:
                fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(), 
                                                    'Std: ' + rank_prefix.capitalize(),
                                                    'Min: ' + rank_prefix.capitalize(),
                                                    'Max: ' + rank_prefix.capitalize()))
        fout.write('\n')
            
        for taxon in sorted_taxon:
            fout.write(taxon)
            
            for rank_prefix in Taxonomy.rank_prefixes:
                count = ranks_below_taxon[taxon][rank_prefix.capitalize()]
                if count_is_scalar:
                    fout.write('\t%d' % count)
                else:
                    if len(count) > 0:
                        fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count), np_std(count), min(count), max(count)))
                    else:
                        fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0))
                    
            fout.write('\n')
                
        fout.close()
Example #7
    def subsample_msa(self, seqs, markers):
        # type: (dict, list) -> (list, dict)
        """Sample columns from each marker in multiple sequence alignment."""

        alignment_length = len(seqs.values()[0])
        sampled_cols = []
        start = 0
        lack_sufficient_cols = 0
        lack_cols_marker_ids = []
        avg_perc_cols = []
        for marker_id, marker_name, marker_len in markers:
            end = start + marker_len

            valid_cols = self.identify_valid_columns(start, end, seqs)
            assert (len(valid_cols) <= marker_len)  # sanity check

            self.logger.info(
                '%s: S:%d, E:%d, LEN:%d, COLS:%d, PERC:%.1f' %
                (marker_name, start, end, marker_len, len(valid_cols),
                 len(valid_cols) * 100.0 / marker_len))

            avg_perc_cols.append(len(valid_cols) * 100.0 / marker_len)

            if len(valid_cols) < self.subset:
                self.logger.warning('Marker has <%d columns after filtering.' %
                                    self.subset)
                lack_sufficient_cols += 1
                lack_cols_marker_ids.append(marker_id)

            offset_valid_cols = [i + start for i in valid_cols]
            sel_cols = random.sample(offset_valid_cols,
                                     min(self.subset, len(offset_valid_cols)))
            sampled_cols.extend(sel_cols)

            start = end

        mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)]

        self.logger.info(
            'Identified %d of %d marker genes with <%d columns for sampling:' %
            (lack_sufficient_cols, len(markers), self.subset))
        self.logger.info('%s' % ', '.join(lack_cols_marker_ids))
        self.logger.info(
            'Marker genes had %.1f+/-%.1f%% of columns available for selection on average.'
            % (np_mean(avg_perc_cols), np_std(avg_perc_cols)))
        self.logger.info('Final MSA contains %d columns.' % len(sampled_cols))

        # trim columns
        output_seqs = {}
        for seq_id, seq in seqs.iteritems():
            masked_seq = ''.join(
                [seq[i] for i in range(0, len(mask)) if mask[i]])
            output_seqs[seq_id] = masked_seq

        return mask, output_seqs
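The trimming step at the end builds a 0/1 mask over alignment columns and keeps only the sampled positions. A small standalone illustration of that masking pattern, using a toy sequence and a random column choice rather than the real alignment:

import random

seq = "MKVLAAGHTREQWPLS"
sampled_cols = sorted(random.sample(range(len(seq)), 5))  # toy stand-in for the sampled columns
mask = [1 if i in sampled_cols else 0 for i in range(len(seq))]
masked_seq = ''.join(seq[i] for i in range(len(mask)) if mask[i])
print(sampled_cols, masked_seq)  # 5 retained columns, in original order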
Example #8
    def getCentroidStats(self, profile):
        """Calculate the centroids of the profile"""
        working_list = profile[self.rowIndices]

        # return the mean and stdev
        # we divide by std so we need to make sure it's never 0
        tmp_stds = np_std(working_list, axis=0)
        mean_std = np_mean(tmp_stds)
        try:
            std = np_array([x if x != 0 else mean_std for x in tmp_stds])
        except:
            std = mean_std
        return (np_median(working_list, axis=0), std)
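Because the caller later divides by these standard deviations (as the comment notes), any column with zero spread is replaced by the mean stdev. A small standalone illustration of that guard on toy profile values:

import numpy as np

profile = np.array([[1.0, 5.0],
                    [1.0, 7.0],
                    [1.0, 9.0]])
tmp_stds = np.std(profile, axis=0)            # column 0 has zero spread
mean_std = np.mean(tmp_stds)
safe_stds = np.array([x if x != 0 else mean_std for x in tmp_stds])
# column 0 now carries the mean stdev, so later divisions cannot hit zero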
Example #9
    def getCentroidStats(self, profile):
        """Calculate the centroids of the profile"""
        working_list = profile[self.rowIndices]

        # return the mean and stdev
        # we divide by std so we need to make sure it's never 0
        tmp_stds = np_std(working_list, axis=0)
        mean_std = np_mean(tmp_stds)
        try:
            std = np_array([x if x != 0 else mean_std for x in tmp_stds])
        except:
            std = mean_std
        return (np_median(working_list, axis=0), std)
Example #10
	def generate_d3_JSON_ParallelCoords(self):
		# Generate 'History'
		n = 120 # 10 years
		n_fd = 12
		series = util_Tst.create_test_data_correlated_returns(n=n, numDims=1, includeResponse=False)
		dt = util_Tst.create_monthly_date_range(n=n)
		vals = series['data']
		json_history = [DATA_SERIES_TO_JSON(d,v) for (d,v) in zip(dt, vals)]
		# Generate Predictions
		std = np_std(transform_FOD_BackwardLooking(vals,{utl_Trns.FIRST_ORDER_DIFF_TIME:1}))
		end_val = vals[-1,0]
		def get_random_prediction_values(per_fd):
			numPreds = 40
			preds = []
			for i in xrange(numPreds):
				preds.append(end_val + normal()*std*sqrt(per_fd))
			return (range(numPreds), preds)
		def get_model_metadata(model_idx):
			return {
				JSON_MODEL_ID :			model_idx, 
				JSON_MODEL_CONFIDENCE :	random(), 
				JSON_MODEL_DESC :		'junkdesc ' + str(model_idx)	
			}
		end_dt = dt[-1]
		prd_dt = util_Tst.create_monthly_date_range(n=n_fd+1, startEpoch=end_dt+10000) #hacky, but end of next month
		models = {}
		preds = []
		for (i, dt) in enumerate(prd_dt):
			(model_idxs, pred_values) = get_random_prediction_values(i)
			models.update(dict.fromkeys(model_idxs))
			for (md, vl) in zip(model_idxs, pred_values):
				preds.append({
					JSON_MODEL_ID: md,
					JSON_DATE_KEY: dt_epoch_to_str_Y_M_D(dt),
					JSON_VALUE_KEY: vl
				})
		for md in models.keys():
			models[md] = get_model_metadata(md)
		# Save data
		dataName = 'test1'
		filePath = get_json_history_path(dataName)
		save_to_JSON(filePath, json_history)
		filePath = get_json_predictions_path(dataName)
		save_to_JSON(filePath, preds)
		filePath = get_json_model_path(dataName)
		save_to_JSON(filePath, models)


# if __name__ == '__main__':
# 	generator = EMF_TestDataGenerator()
# 	generator.generate_d3_JSON_ParallelCoords()
Example #11
    def rep_genome_stats(self, clusters, genome_files):
        """Calculate statistics relative to representative genome."""

        self.logger.info('Calculating statistics to cluster representatives:')
        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            if len(cids) == 0:
                stats[rid] = self.RepStats(min_ani=-1,
                                           mean_ani=-1,
                                           std_ani=-1,
                                           median_ani=-1)
            else:
                # calculate ANI to representative genome
                gid_pairs = []
                for cid in cids:
                    gid_pairs.append((cid, rid))
                    gid_pairs.append((rid, cid))

                if True:  # *** DEBUGGING
                    ani_af = self.fastani.pairs(gid_pairs,
                                                genome_files,
                                                report_progress=False)
                else:
                    ani_af = self.fastani.ani_cache

                # calculate statistics
                anis = [FastANI.symmetric_ani(ani_af, cid, rid)[
                    0] for cid in cids]

                stats[rid] = self.RepStats(min_ani=min(anis),
                                           mean_ani=np_mean(anis),
                                           std_ani=np_std(anis),
                                           median_ani=np_median(anis))

            statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
                idx+1,
                len(clusters),
                float((idx+1)*100)/len(clusters))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

        return stats
Example #12
    def __init__(self, tree, parent, sax, cardinality, sequences):
        """
        Initialization function of the InternalNode class

        :returns: a root node
        :rtype: RootNode
        """

        """ inherits the init function of the rootnode class """
        RootNode.__init__(self, tree=tree, parent=parent,
                          sax=sax, cardinality=cardinality)

        """ transforms the list sequences from PAA"""
        list_ts_paa = self.tree.isax.transform_paa(sequences)
        tmp_mean = np_mean(list_ts_paa, axis=0)
        tmp_stdev = np_std(list_ts_paa, axis=0)

        """ as it is an internal node, it necessarily has at least one downhill node so : """
        """ we calculate the future candidate cardinalities """
        cardinality_next_tmp = np_copy(self.cardinality)
        # if max_card
        if self.tree.boolean_card_max:
            # we multiply by 2 only the cardinalities not exceeding the authorized threshold
            cardinality_next_tmp[cardinality_next_tmp <= self.tree.max_card_alphabet] *= 2
        else:
            # We multiply by 2 all the cardinalities  (they are all candidates)
            cardinality_next_tmp *= 2
        # The self.split function chooses the cardinality index to multiply by 2
        position_min = self.split(cardinality_next_tmp, tmp_mean, tmp_stdev)

        """ We write the next cardinality (for its leaf nodes) """
        self.cardinality_next = np_copy(self.cardinality)
        self.cardinality_next[position_min] *= 2
        if self.tree.bigger_current_cardinality < self.cardinality_next[position_min]:
            self.tree.bigger_current_cardinality = self.cardinality_next[position_min]

        self.level = parent.level + 1
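The candidate-cardinality step only doubles entries that are still at or below the allowed alphabet size when a maximum is enforced. A tiny numpy illustration of that masked doubling, with toy cardinalities and an assumed threshold:

import numpy as np

cardinality = np.array([2, 4, 2, 8])
max_card_alphabet = 4                          # assumed threshold for this toy example
card_next = np.copy(cardinality)
card_next[card_next <= max_card_alphabet] *= 2
# card_next -> array([4, 8, 4, 8]); the entry already above the threshold is left unchanged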
Example #13
    def _rep_genome_stats(self, clusters, genome_files):
        """Calculate statistics relative to representative genome."""

        self.logger.info('Calculating statistics to cluster representatives:')
        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            if len(cids) == 0:
                stats[rid] = self.RepStats(min_ani=-1,
                                           mean_ani=-1,
                                           std_ani=-1,
                                           median_ani=-1)
            else:
                # calculate ANI to representative genome
                gid_pairs = []
                for cid in cids:
                    gid_pairs.append((cid, rid))
                ani_af = self.ani_cache.fastani_pairs(gid_pairs,
                                                      genome_files,
                                                      report_progress=False)

                # calculate statistics
                anis = [ani_af[cid][rid][0] for cid in cids]
                stats[rid] = self.RepStats(min_ani=min(anis),
                                           mean_ani=np_mean(anis),
                                           std_ani=np_std(anis),
                                           median_ani=np_median(anis))

            statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
                idx+1,
                len(clusters),
                float((idx+1)*100)/len(clusters))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

        return stats
Example #14
    def subsample_msa(self, seqs, markers, cols_per_gene, max_gaps,
                      min_identical_aa, max_identical_aa, rnd_seed):
        """Sample columns from each marker in multiple sequence alignment."""

        alignment_length = len(seqs.values()[0])
        sampled_cols = []
        start = 0
        lack_sufficient_cols = 0
        lack_cols_marker_ids = []
        avg_perc_cols = []

        count_wrong_pa = 0
        count_wrong_cons = 0

        random.seed(rnd_seed)

        for marker_id, marker_name, marker_len in markers:
            end = start + marker_len

            valid_cols, count_wrong_pa, count_wrong_cons = self.identify_valid_columns(
                marker_name, count_wrong_pa, count_wrong_cons, start, end,
                seqs, max_gaps, min_identical_aa, max_identical_aa)

            assert (len(valid_cols) <= marker_len)  # sanity check

            self.logger.info(
                '%s: S:%d, E:%d, LEN:%d, COLS:%d, PERC:%.1f' %
                (marker_name, start, end, marker_len, len(valid_cols),
                 len(valid_cols) * 100.0 / marker_len))

            avg_perc_cols.append(len(valid_cols) * 100.0 / marker_len)

            if len(valid_cols) < cols_per_gene:
                self.logger.warning('Marker has <%d columns after filtering.' %
                                    cols_per_gene)
                lack_sufficient_cols += 1
                lack_cols_marker_ids.append(marker_id)

            offset_valid_cols = [i + start for i in valid_cols]
            sampled_cols.extend(
                random.sample(offset_valid_cols,
                              min(cols_per_gene, len(offset_valid_cols))))

            start = end
        mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)]

        self.logger.info(
            'Identified %d of %d marker genes with <%d columns for sampling:' %
            (lack_sufficient_cols, len(markers), cols_per_gene))
        self.logger.info('%s' % ', '.join(lack_cols_marker_ids))
        self.logger.info(
            'Marker genes had %.1f+/-%.1f%% of columns available for selection on average.'
            % (np_mean(avg_perc_cols), np_std(avg_perc_cols)))

        # trim columns
        output_seqs = {}
        for seq_id, seq in seqs.iteritems():
            masked_seq = ''.join(
                [seq[i] for i in xrange(0, len(mask)) if mask[i]])
            output_seqs[seq_id] = masked_seq

        self.logger.info(
            'Trimmed alignment from %d to %d AA (%d by minimum taxa percent, %d by consensus, maximum of %d columns per genes).'
            % (len(
                seqs[seqs.keys()[0]]), len(output_seqs[output_seqs.keys()[0]]),
               count_wrong_pa, count_wrong_cons, cols_per_gene))
        self.logger.info('Final MSA contains %d columns.' % len(sampled_cols))

        return mask, output_seqs
Example #15
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].iteritems():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
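The percentile and boundary lines above come from a single np_percentile call per rank; a minimal sketch of that computation on toy relative divergences (illustrative values only):

import numpy as np

v = [0.42, 0.48, 0.50, 0.55, 0.61, 0.70]      # toy relative divergences for one rank
p10, p50, p90 = np.percentile(v, [10, 50, 90])
boundaries = [p50 + b for b in (-0.2, -0.1, 0.1, 0.2) if 0.0 < p50 + b < 1.0]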
Example #16
    def get_value_for_data_only(self, values):
        """
        Return the standard deviation of the values
        """
        return np_std(values, ddof=1)
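This variant passes ddof=1, i.e. the sample standard deviation (dividing by N-1) rather than numpy's default population form; a quick comparison on a small list:

import numpy as np

vals = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
np.std(vals)           # population stdev (ddof=0): 2.0
np.std(vals, ddof=1)   # sample stdev (ddof=1): ~2.14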
Example #17
    def get_value_for_data_only(self, values):
        """
        Returns the mean, standard deviation and number of values
        """

        return np_mean(values), np_std(values, ddof=1), np.size(values)
Example #18
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        
        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            
        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)
        
        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)
        
        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue
                
            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue
                
            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]
            
            most_specific_rank = taxon[0:3]
            taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)
                
            for n in node.leaf_iter():
                dist_to_node = 0
                while n != node:
                    dist_to_node += n.edge_length
                    n = n.parent_node
                
                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))
                            
        # report number of taxa at each rank
        print ''
        print 'Rank\tTaxa\tTaxa for Inference'
        for rank, taxa in taxa_at_rank.iteritems():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print '%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))
        print ''
                    
        # report results sorted by rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            taxa_at_rank = []
            for taxon in taxa_bl_dist:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)
                    
            sorted_taxon += sorted(taxa_at_rank)
                
        # report results for each named group
        taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
        fout = open(taxa_file, 'w')
        fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for taxon in sorted_taxon:
            dist = taxa_bl_dist[taxon]

            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                str(taxon in taxa_for_dist_inference),
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))
        fout.close()
        
        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
        fout = open(rank_file, 'w')
        fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for rank in Taxonomy.rank_labels:
            dist = rank_bl_dist[rank]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))
        fout.close()
        
Example #19
    def pairwise_stats(self, clusters, genome_files):
        """Calculate statistics for all pairwise comparisons in a species cluster."""

        self.logger.info(
            f'Restricting pairwise comparisons to {self.max_genomes_for_stats:,} randomly selected genomes.')
        self.logger.info(
            'Calculating statistics for all pairwise comparisons in a species cluster:')

        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            statusStr = '-> Processing {:,} of {:,} ({:.2f}%) clusters (size = {:,}).'.ljust(86).format(
                idx+1,
                len(clusters),
                float((idx+1)*100)/len(clusters),
                len(cids))
            sys.stdout.write('{}\r'.format(statusStr))
            sys.stdout.flush()

            if len(cids) == 0:
                stats[rid] = self.PairwiseStats(min_ani=-1,
                                                mean_ani=-1,
                                                std_ani=-1,
                                                median_ani=-1,
                                                ani_to_medoid=-1,
                                                mean_ani_to_medoid=-1,
                                                mean_ani_to_rep=-1,
                                                ani_below_95=-1)
            else:
                if len(cids) > self.max_genomes_for_stats:
                    cids = set(random.sample(cids, self.max_genomes_for_stats))

                # calculate ANI to representative genome
                gid_pairs = []
                gids = list(cids.union([rid]))
                for gid1, gid2 in combinations(gids, 2):
                    gid_pairs.append((gid1, gid2))
                    gid_pairs.append((gid2, gid1))

                if True:  # ***DEBUGGING
                    ani_af = self.fastani.pairs(gid_pairs,
                                                genome_files,
                                                report_progress=False)
                else:
                    ani_af = self.fastani.ani_cache

                # calculate medoid point
                if len(gids) > 2:
                    dist_mat = np_zeros((len(gids), len(gids)))
                    for i, gid1 in enumerate(gids):
                        for j, gid2 in enumerate(gids):
                            if i < j:
                                ani, _af = FastANI.symmetric_ani(
                                    ani_af, gid1, gid2)
                                dist_mat[i, j] = 100 - ani
                                dist_mat[j, i] = 100 - ani

                    medoid_idx = np_argmin(dist_mat.sum(axis=0))
                    medoid_gid = gids[medoid_idx]
                else:
                    # with only 2 genomes in a cluster, the representative is the
                    # natural medoid at least for reporting statistics for the
                    # individual species cluster
                    medoid_gid = rid

                mean_ani_to_medoid = np_mean([FastANI.symmetric_ani(ani_af, gid, medoid_gid)[0]
                                              for gid in gids if gid != medoid_gid])

                mean_ani_to_rep = np_mean([FastANI.symmetric_ani(ani_af, gid, rid)[0]
                                           for gid in gids if gid != rid])

                if mean_ani_to_medoid < mean_ani_to_rep:
                    self.logger.error('mean_ani_to_medoid < mean_ani_to_rep')
                    sys.exit(-1)

                # calculate statistics
                anis = []
                for gid1, gid2 in combinations(gids, 2):
                    ani, _af = FastANI.symmetric_ani(ani_af, gid1, gid2)
                    anis.append(ani)

                stats[rid] = self.PairwiseStats(
                    min_ani=min(anis),
                    mean_ani=np_mean(anis),
                    std_ani=np_std(anis),
                    median_ani=np_median(anis),
                    ani_to_medoid=FastANI.symmetric_ani(
                        ani_af, rid, medoid_gid)[0],
                    mean_ani_to_medoid=mean_ani_to_medoid,
                    mean_ani_to_rep=mean_ani_to_rep,
                    ani_below_95=sum([1 for ani in anis if ani < 95]))

        sys.stdout.write('\n')

        return stats
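The medoid above is picked as the genome whose summed ANI distance to all others is smallest, via argmin over the column sums of the distance matrix. A minimal illustration with a toy 3x3 distance matrix:

import numpy as np

dist_mat = np.array([[0.0, 1.0, 4.0],
                     [1.0, 0.0, 2.0],
                     [4.0, 2.0, 0.0]])
medoid_idx = int(np.argmin(dist_mat.sum(axis=0)))   # genome 1 has the smallest total distance
# gids[medoid_idx] would then serve as medoid_gid in the snippet above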
Example #20
    def action_naming_priority(self, prev_genomes, cur_genomes,
                               new_updated_sp_clusters):
        """Check if representative should be replace with genome with higher nomenclatural priority."""

        self.logger.info(
            'Identifying genomes with naming priority in GTDB species clusters.'
        )

        out_file = os.path.join(self.output_dir, 'update_priority.tsv')
        fout = open(out_file, 'w')
        fout.write(
            'NCBI species\tGTDB species\tRepresentative\tStrain IDs\tRepresentative type sources\tPriority year\tGTDB type species\tGTDB type strain\tNCBI assembly type'
        )
        fout.write(
            '\tNCBI synonym\tGTDB synonym\tSynonym genome\tSynonym strain IDs\tSynonym type sources\tPriority year\tGTDB type species\tGTDB type strain\tSynonym NCBI assembly type'
        )
        fout.write('\tANI\tAF\tPriority note\n')

        num_higher_priority = 0
        assembly_score_change = []
        anis = []
        afs = []
        for idx, prev_rid in enumerate(prev_genomes.sp_clusters):
            # get type strain genomes in GTDB species cluster, including genomes new to this release
            type_strain_gids = [
                gid for gid in prev_genomes.sp_clusters[prev_rid]
                if gid in cur_genomes
                and cur_genomes[gid].is_effective_type_strain()
            ]
            if prev_rid in new_updated_sp_clusters:
                new_type_strain_gids = [
                    gid for gid in new_updated_sp_clusters[prev_rid]
                    if cur_genomes[gid].is_effective_type_strain()
                ]
                type_strain_gids.extend(new_type_strain_gids)

            if len(type_strain_gids) == 0:
                continue

            # check if representative has already been updated
            updated_rid = self.get_updated_rid(prev_rid)

            type_strain_sp = set([
                cur_genomes[gid].ncbi_taxa.species for gid in type_strain_gids
            ])
            if len(type_strain_sp) == 1 and updated_rid in type_strain_gids:
                continue

            updated_sp = cur_genomes[updated_rid].ncbi_taxa.species
            highest_priority_gid = updated_rid

            if updated_rid not in type_strain_gids:
                highest_priority_gid = None
                if updated_sp in type_strain_sp:
                    sp_gids = [
                        gid for gid in type_strain_gids
                        if cur_genomes[gid].ncbi_taxa.species == updated_sp
                    ]
                    hq_gid = select_highest_quality(sp_gids, cur_genomes)
                    highest_priority_gid = hq_gid

                #self.logger.warning('Representative is a non-type strain genome even though type strain genomes exist in species cluster: {}: {}, {}: {}'.format(
                #                    prev_rid, cur_genomes[prev_rid].is_effective_type_strain(), updated_rid, cur_genomes[updated_rid].is_effective_type_strain()))
                #self.logger.warning('Type strain genomes: {}'.format(','.join(type_strain_gids)))

            # find highest priority genome
            for sp in type_strain_sp:
                if sp == updated_sp:
                    continue

                # get highest quality genome from species
                sp_gids = [
                    gid for gid in type_strain_gids
                    if cur_genomes[gid].ncbi_taxa.species == sp
                ]
                hq_gid = select_highest_quality(sp_gids, cur_genomes)

                if highest_priority_gid is None:
                    highest_priority_gid = hq_gid
                else:
                    highest_priority_gid, note = self.sp_priority_mngr.priority(
                        cur_genomes, highest_priority_gid, hq_gid)

            # check if representative should be updated
            if highest_priority_gid != updated_rid:
                num_higher_priority += 1

                ani, af = self.fastani.symmetric_ani_cached(
                    updated_rid, highest_priority_gid,
                    cur_genomes[updated_rid].genomic_file,
                    cur_genomes[highest_priority_gid].genomic_file)

                anis.append(ani)
                afs.append(af)

                d = cur_genomes[highest_priority_gid].score_assembly(
                ) - cur_genomes[updated_rid].score_assembly()
                assembly_score_change.append(d)

                action = 'NOMENCLATURE_PRIORITY:REPLACED'
                params = {}
                params['prev_ncbi_species'] = cur_genomes[
                    updated_rid].ncbi_taxa.species
                params['prev_year_of_priority'] = cur_genomes[
                    updated_rid].year_of_priority()
                params['new_ncbi_species'] = cur_genomes[
                    highest_priority_gid].ncbi_taxa.species
                params['new_year_of_priority'] = cur_genomes[
                    highest_priority_gid].year_of_priority()
                params['new_rid'] = highest_priority_gid
                params['ani'] = ani
                params['af'] = af
                params['priority_note'] = note

                self.update_rep(prev_rid, highest_priority_gid, action)
                self.action_log.write('{}\t{}\t{}\t{}\n'.format(
                    prev_rid, cur_genomes[updated_rid].gtdb_taxa.species,
                    action, params))

                fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                    cur_genomes[highest_priority_gid].ncbi_taxa.species,
                    cur_genomes[highest_priority_gid].gtdb_taxa.species,
                    highest_priority_gid, ','.join(
                        sorted(
                            cur_genomes[highest_priority_gid].strain_ids())),
                    ','.join(
                        sorted(cur_genomes[highest_priority_gid].
                               gtdb_type_sources())).upper().replace(
                                   'STRAININFO', 'StrainInfo'),
                    cur_genomes[highest_priority_gid].year_of_priority(),
                    cur_genomes[highest_priority_gid].is_gtdb_type_species(),
                    cur_genomes[highest_priority_gid].is_gtdb_type_strain(),
                    cur_genomes[highest_priority_gid].ncbi_type_material))
                fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                    cur_genomes[updated_rid].ncbi_taxa.species,
                    cur_genomes[updated_rid].gtdb_taxa.species, updated_rid,
                    ','.join(sorted(cur_genomes[updated_rid].strain_ids())),
                    ','.join(
                        sorted(cur_genomes[updated_rid].gtdb_type_sources())
                    ).upper().replace('STRAININFO', 'StrainInfo'),
                    cur_genomes[updated_rid].year_of_priority(),
                    cur_genomes[updated_rid].is_gtdb_type_species(),
                    cur_genomes[updated_rid].is_gtdb_type_strain(),
                    cur_genomes[updated_rid].ncbi_type_material))
                fout.write('\t{:.3f}\t{:.4f}\t{}\n'.format(ani, af, note))

        fout.close()

        self.logger.info(
            f' ... identified {num_higher_priority:,} species with representative changed to genome with higher nomenclatural priority.'
        )
        self.logger.info(
            ' ... change in assembly score for new representatives: {:.2f} +/- {:.2f}'
            .format(np_mean(assembly_score_change),
                    np_std(assembly_score_change)))
        self.logger.info(' ... ANI: {:.2f} +/- {:.2f}'.format(
            np_mean(anis), np_std(anis)))
        self.logger.info(' ... AF: {:.2f} +/- {:.2f}'.format(
            np_mean(afs), np_std(afs)))
Example #21
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file,
            output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir,
                                         '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(
            tree, taxonomy, set(), min_children, -1)

        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue

            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue

            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]

            most_specific_rank = taxon[0:3]
            taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

            for n in node.leaf_iter():
                dist_to_node = self._dist_to_ancestor(n, node)

                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[
                Taxonomy.rank_index[most_specific_rank]]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa\tTaxa for Inference')
        for rank, taxa in taxa_at_rank.items():
            taxa_for_inference = [
                x for x in taxa if x in taxa_for_dist_inference
            ]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa),
                                  len(taxa_for_inference)))
        print('')

        # report results sorted by rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            taxa_at_rank = []
            for taxon in taxa_bl_dist:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)

            sorted_taxon += sorted(taxa_at_rank)

        # report results for each named group
        taxa_file = os.path.join(output_dir,
                                 '%s.taxa_bl_dist.tsv' % input_tree_name)
        fout = open(taxa_file, 'w')
        fout.write(
            'Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n'
        )
        for taxon in sorted_taxon:
            dist = taxa_bl_dist[taxon]

            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write(
                '%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' %
                (taxon, str(taxon in taxa_for_dist_inference), np_mean(dist),
                 np_std(dist), p[0], p[1], p[2], p[3], p[4]))
        fout.close()

        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir,
                                 '%s.rank_bl_dist.tsv' % input_tree_name)
        fout = open(rank_file, 'w')
        fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for rank in Taxonomy.rank_labels:
            dist = rank_bl_dist[rank]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' %
                       (rank, np_mean(dist), np_std(dist), p[0], p[1], p[2],
                        p[3], p[4]))
        fout.close()

        # report results for each node
        output_bl_file = os.path.join(output_dir,
                                      '%s.node_bl_dist.tsv' % input_tree_name)
        self._write_bl_dist(tree, output_bl_file)
Example #22
def stats_for_qlp_well(well, compute_clusters=False, override_thresholds=None):
    """
    Return statistics about a QLWell object read from a QLP file.
    The QLWell object should have a populated `peaks` attribute (reading from QLBs won't work)

    For parameter explanations and return values, see :func:`stats_for_qlp_well`.
    """
    from pyqlb.nstats.peaks import cluster_1d, channel_amplitudes
    from pyqlb.nstats.well import accepted_peaks, above_min_amplitude_peaks, well_channel_sp_values, well_cluster_peaks
    from pyqlb.nstats.well import well_observed_positives_negatives, well_s2d_values, getClusters
    from pyqlb.nstats.well import high_flier_droplets, low_flier_droplets, singleRain_droplets, doubleRain_droplets, diagonal_scatter
    from numpy import mean as np_mean, std as np_std

    if not override_thresholds:
        override_thresholds = (None, None)

    statistics = well_statistics(well, override_thresholds=override_thresholds)
    accepted = len(accepted_peaks(well))
    num_above_min = len(above_min_amplitude_peaks(well))

    if num_above_min > 0 and accepted > 0:
        if well.sum_amplitude_bins:
            peaksets, boundaries, amps = revb_polydisperse_peaks(well, 0, threshold=override_thresholds[0])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
        else:
            peaksets, boundaries, width_gates = polydisperse_peaks(well, 0, threshold=override_thresholds[0])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
    else:
        statistics[0].revb_polydispersity_pct = 0

    s, p_plus, p, p_minus = well_channel_sp_values(well, 0, override_threshold=override_thresholds[0])
    statistics[0].s_value = s
    statistics[0].p_plus = p_plus
    statistics[0].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None
    statistics[0].p = p
    statistics[0].p_drops = int(p*accepted) if p is not None else None
    statistics[0].p_minus = p_minus
    statistics[0].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None

    if num_above_min > 0 and accepted > 0:
        if well.sum_amplitude_bins:
            peaksets, boundaries, amps = revb_polydisperse_peaks(well, 1, threshold=override_thresholds[1])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
        else:
            peaksets, boundaries, width_gates = polydisperse_peaks(well, 1, threshold=override_thresholds[1])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
    else:
        statistics[1].revb_polydispersity_pct = 0

    s, p_plus, p, p_minus = well_channel_sp_values(well, 1, override_threshold=override_thresholds[1])
    statistics[1].s_value = s
    statistics[1].p_plus = p_plus
    statistics[1].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None
    statistics[1].p = p
    statistics[1].p_drops = int(p*accepted) if p is not None else None
    statistics[1].p_minus = p_minus
    statistics[1].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None

    ## compute s2d plots
    s2d_vals = well_s2d_values(well, thresholds=override_thresholds)
    statistics[0].s2d_value = s2d_vals[0] if s2d_vals is not None else None
    statistics[1].s2d_value = s2d_vals[1] if s2d_vals is not None else None

    ## compute extra cluster metrics
    clusters = getClusters(well, override_thresholds)
    dscatter = diagonal_scatter(clusters)
    statistics.diagonal_scatter = dscatter[1] if dscatter is not None else None
    statistics.diagonal_scatter_pct = dscatter[2] * 100 if dscatter is not None else None
    for channel in [0, 1]:
        high_fliers = high_flier_droplets(clusters, channel)
        statistics[channel].high_flier_value = high_fliers[1] if high_fliers is not None else None
        statistics[channel].high_flier_pct = high_fliers[2] * 100 if high_fliers is not None else None

        low_fliers = low_flier_droplets(clusters, channel)
        statistics[channel].low_flier_value = low_fliers[1] if low_fliers is not None else None
        statistics[channel].low_flier_pct = low_fliers[2] * 100 if low_fliers is not None else None

        singleRain = singleRain_droplets(clusters, channel)
        statistics[channel].single_rain_value = singleRain[1] if singleRain is not None else None
        statistics[channel].single_rain_pct = singleRain[2] * 100 if singleRain is not None else None

        doubleRain = doubleRain_droplets(clusters, channel)
        statistics[channel].double_rain_value = doubleRain[1] if doubleRain is not None else None
        statistics[channel].double_rain_pct = doubleRain[2] * 100 if doubleRain is not None else None


    if compute_clusters:
        clusters = well_cluster_peaks(well, override_thresholds)
    else:
        clusters = {'positive_peaks': {'positive_peaks': [], 'negative_peaks': []},
                    'negative_peaks': {'positive_peaks': [], 'negative_peaks': []}}
 
    # cheap hack
    statistics.alg_version = "%s.%s/%s.%s" % (well.statistics.peak_alg_major_version,
                                              well.statistics.peak_alg_minor_version,
                                              well.statistics.quant_alg_major_version,
                                              well.statistics.quant_alg_minor_version)
    statistics.ref_copy_num = well.ref_copy_num
    statistics[0].decision_tree = well.channels[0].decision_tree_verbose
    statistics[1].decision_tree = well.channels[1].decision_tree_verbose
    # end cheap hack

    # SNR
    for chan in (0,1):
        if override_thresholds[chan]:
            # TODO add this to pyqlb.nstats.well instead
            pos, neg = cluster_1d(accepted_peaks(well), chan, override_thresholds[chan])
        else:
            pos, neg, unknown = well_observed_positives_negatives(well, chan)

        for attr, coll in (('positive_snr', pos), ('negative_snr', neg)):
            # guard on the cluster actually being evaluated (coll), not just the positives
            if len(coll) > 0:
                amps = channel_amplitudes(coll, chan)
                amp_mean = np_mean(amps)
                amp_std = np_std(amps)
                if amp_std > 0:
                    # SNR: mean cluster amplitude over its standard deviation
                    setattr(statistics[chan], attr, amp_mean/amp_std)
                else:
                    setattr(statistics[chan], attr, 10000)
            else:
                setattr(statistics[chan], attr, 0)

    for channel in [0, 1]:
        means, stds = total_events_amplitude_vals(well, channel)
        statistics[channel].total_events_amplitude_mean = means if means is not None else None
        statistics[channel].total_events_amplitude_stdev = stds if stds is not None else None

    return statistics, clusters
Exemplo n.º 23
0
    def eventstudy(self,
                   data=None,
                   model='m',
                   estwin=100,
                   gap=50,
                   evtwins=-10,
                   evtwine=10,
                   minval=70,
                   output='df'):
        """
            Parameters passed to the event study method.

            data        =   event data (event date & permno combinations)
            model       =   madj (market-adjusted model)
                            m (market model)
                            ff (fama french)
                            ffm (fama french with momentum factor)
            estwin      =   estimation window
            gap         =   gap between estimation window and event window
            evtwins     =   days preceding event date to begin event window
            evtwine     =   days after event date to close the event window
            minval      =   minimum number of non-missing return observations (per event) to be regressed on
            output      =   output format of the event study results
                            xls (output an excel file to output path)
                            csv (output a csv file to output path)
                            json (output a json file to output path)
                            df (returns a dictionary of pandas dataframes)
                            print (outputs results to the console - not available via qsub)
        """

        ####################################################################################
        #  STEP 1 - SET ESTIMATION, EVENT, AND GAP WINDOWS AND GRAB DATA FROM EVENTS FILE  #
        ####################################################################################

        estwins = (estwin + gap + np_abs(evtwins))  # estimation window start (days before event)
        estwine = (gap + np_abs(evtwins) + 1)  # estimation window end (days before event)
        evtwinx = (estwins + 1)  # evttime offset (0 = event date, -10 = window start, 10 = window end)
        evtwins = np_abs(evtwins)  # convert the negative start offset to a positive value for the lag function
        evtrang = (evtwins + evtwine + 1)  # total event window days (lag + lead + the event day itself)
        """
            With the event date as a fixed point, calculate the number of days needed to pass
            to sql lag and lead functions to identify estimation window, gap, and event window.

            evtwins:    event date minus number of preceding days
                        ("event date" - "number of days before event to start [evtwins parameter]")

            evtwine:    event date plus number of following days
                        ("event date" + "number of days after event to end [evtwine parameter]")

            gap:    number of days between the end of the "estimation window"
                    and the beginning of the "event window"

            estwins:     start date of the estimation window
                        ("event date" - "number of days before event to start [evtwins parameter]"
                                      - "number of days in gap [gap parameter]"
                                      - "number of days in estimation window [estwin parameter]")

            evtrang:    entire time range of the event study even from estimate start, through gap,
                        until event window end
                        (evtwins + evtwine + 1)
        """

        # default the event data in case it was not passed, otherwise use what was passed
        if data is None:
            data = [{"edate": "05/29/2012", "permno": "10002"}]
        evtdata = json_dumps(data)

        # init values wrapped up to be passed to sql statement
        params = {
            'estwins': estwins,
            'estwine': estwine,
            'evtwins': evtwins,
            'evtwine': evtwine,
            'evtwinx': evtwinx,
            'evtdata': evtdata
        }

        #############################################
        #  STEP 2 - GET RETURNS DATA FROM POSTGRES  #
        #############################################

        # Create a database connection
        wconn = self.connect()

        ##############################################################################
        #  Get the initial data from the database and put it in a pandas dataframe   #
        ##############################################################################

        # create a pandas dataframe that will hold data
        df = wconn.raw_sql("""
        SELECT
                a.*,
                x.*,
                c.date as rdate,
                c.ret as ret1,
                (f.mktrf+f.rf) as mkt,
                f.mktrf,
                f.rf,
                f.smb,
                f.hml,
                f.umd,
                (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1-(f.mktrf+f.rf) as exret,
                (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1 as ret,
                case when c.date between a.estwin1 and a.estwin2 then 1 else 0 end as isest,
                case when c.date between a.evtwin1 and a.evtwin2 then 1 else 0 end as isevt,
                case
                  when c.date between a.evtwin1 and a.evtwin2 then (rank() OVER (PARTITION BY x.evtid ORDER BY c.date)-%(evtwinx)s)
                  else (rank() OVER (PARTITION BY x.evtid ORDER BY c.date))
                end as evttime
        FROM
          (
            SELECT
              date,
              lag(date, %(estwins)s ) over (order by date) as estwin1,
              lag(date, %(estwine)s )  over (order by date) as estwin2,
              lag(date, %(evtwins)s )  over (order by date) as evtwin1,
              lead(date, %(evtwine)s )  over (order by date) as evtwin2
            FROM crsp_a_stock.dsi
          ) as a
        JOIN
        (select
                to_char(x.edate, 'ddMONYYYY') || trim(to_char(x.permno,'999999999')) as evtid,
                x.permno,
                x.edate
        from
        json_to_recordset('%(evtdata)s') as x(edate date, permno int)
        ) as x
          ON a.date=x.edate
        JOIN crsp_a_stock.dsf c
            ON x.permno=c.permno
            AND c.date BETWEEN a.estwin1 and a.evtwin2
        JOIN ff_all.factors_daily f
            ON c.date=f.date
        LEFT JOIN crsp_a_stock.dsedelist d
            ON x.permno=d.permno
            AND c.date=d.dlstdt
        WHERE f.mktrf is not null
        AND c.ret is not null
        ORDER BY x.evtid, x.permno, a.date, c.date
        """ % params)

        # Columns coming from the database query
        df.columns = [
            'date', 'estwin1', 'estwin2', 'evtwin1', 'evtwin2', 'evtid',
            'permno', 'edate', 'rdate', 'ret1', 'mkt', 'mktrf', 'rf', 'smb',
            'hml', 'umd', 'exret', 'ret', 'isest', 'isevt', 'evttime'
        ]

        # Additional columns that will hold computed values (post-query)
        addcols = [
            'RMSE', 'INTERCEPT', 'var_estp', 'expret', 'abret', 'alpha',
            '_nobs', '_p_', '_edf_', 'rsq', 'cret', 'cexpret', 'car', 'scar',
            'sar', 'pat_scale', 'bhar', 'lastevtwin', 'cret_edate',
            'scar_edate', 'car_edate', 'bhar_edate', 'pat_scale_edate', 'xyz'
        ]

        # Add them to the dataframe
        for c in addcols:
            if c == 'lastevtwin':
                df[c] = 0
            else:
                df[c] = np_nan

        ###################################################################################
        #  STEP 3 - FOR EACH EVENT, CALCULATE ABNORMAL RETURN BASED ON CHOSEN RISK MODEL  #
        ###################################################################################

        # Loop over every event (permno / event-date combination)
        for evt in data:

            permno = evt['permno']
            xdate = evt['edate']
            edate = datetime.strptime(xdate, "%m/%d/%Y").date()

            est_mask = (df['permno'] == permno) & (df['edate'] == edate) & (
                df['isest'] == 1)
            evt_mask = (df['permno'] == permno) & (df['edate'] == edate) & (
                df['isevt'] == 1)

            #######################################################
            #  Check to see it meets the min obs for est window   #
            #######################################################
            _nobs = df["ret"][est_mask].count()

            # Only carry out the analysis if the number of observations meets the minimum threshold
            if _nobs >= minval:

                #######################################################
                #  Regression based on the chosen risk model          #
                #######################################################

                # Market-Adjusted Model
                if model == 'madj':
                    # Set y to the estimation window records
                    y = df["exret"][est_mask]

                    # Calculate mean and standard deviation of returns for the estimation period
                    mean = np_mean(y)
                    stdv = np_std(y, ddof=1)

                    # Update the columns in the original dataframe (reusing the names from SAS code to help with continuity)
                    df.loc[evt_mask, 'INTERCEPT'] = mean
                    df.loc[evt_mask, 'RMSE'] = stdv
                    df.loc[evt_mask, '_nobs'] = len(y)
                    df.loc[evt_mask, 'var_estp'] = stdv**2
                    df.loc[evt_mask, 'alpha'] = mean
                    df.loc[evt_mask, 'rsq'] = 0
                    df.loc[evt_mask, '_p_'] = 1
                    df.loc[evt_mask, '_edf_'] = (len(y) - 1)
                    df.loc[evt_mask, 'expret'] = df.loc[evt_mask, 'mkt']
                    df.loc[evt_mask, 'abret'] = df.loc[evt_mask, 'exret']
                    df_est = df[est_mask]
                    _nobs = len(df_est[df_est.ret.notnull()])

                    # Cumulative (compounded) return: cret_new = cret_old + ret + cret_old*ret,
                    # i.e. (1 + cret_old)*(1 + ret) - 1, carried across rows via nloc['const'].
                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)

                    nloc = {'const': 0}

                    def f_car(row):
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

                # Market Model
                elif model == 'm':
                    # Set y to the estimation window records
                    X = df["mktrf"][est_mask]
                    y = df["ret"][est_mask]

                    # Fit an OLS model with intercept on mktrf
                    X = sm_add_constant(X)
                    est = sm_OLS(y, X).fit()

                    # Set the variables from the output
                    df_est = df[(df['permno'] == permno)
                                & (df['edate'] == edate) & (df['isest'] == 1)]
                    _nobs = len(
                        df_est[df_est.ret.notnull()])  # not null observations

                    # aggregate variables
                    # cret_edate = np_nan
                    # scar_edate = np_nan
                    # car_edate = np_nan
                    # bhar_edate = np_nan
                    # pat_scale_edate = np_nan
                    alpha = est.params['const']
                    beta1 = est.params['mktrf']

                    df.loc[evt_mask, 'INTERCEPT'] = alpha
                    df.loc[evt_mask, 'alpha'] = alpha
                    df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                    df.loc[evt_mask, '_nobs'] = _nobs
                    df.loc[evt_mask, 'var_estp'] = est.mse_resid
                    df.loc[evt_mask, 'rsq'] = est.rsquared
                    df.loc[evt_mask, '_p_'] = 2
                    df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                    nloc = {'alpha': alpha, 'beta1': beta1, 'const': 0}

                    def f_expret(row):
                        return (nloc['alpha'] + (nloc['beta1'] * row['mktrf']))

                    df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret,
                                                                    axis=1)

                    nloc = {'alpha': alpha, 'beta1': beta1, 'const': 0}

                    def f_abret(row):
                        return (row['ret'] - (nloc['alpha'] +
                                              (nloc['beta1'] * row['mktrf'])))

                    df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret,
                                                                   axis=1)

                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)

                    nloc = {'const': 0}

                    def f_car(row):
                        # nonlocal const
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

                # Fama-French Three Factor Model
                elif model == 'ff':
                    # Set y to the estimation window records
                    df_est = df[(df['permno'] == permno)
                                & (df['edate'] == edate) & (df['isest'] == 1)]
                    X = df_est[['smb', 'hml', 'mktrf']]
                    y = df_est['ret']

                    # Fit an OLS model with intercept on mktrf, smb, hml
                    X = sm_add_constant(X)
                    est = sm_OLS(y, X).fit()
                    # est = smf.ols(formula='ret ~ smb + hml + mktrf', data=df_est).fit()

                    alpha = est.params['const']
                    beta1 = est.params['mktrf']
                    beta2 = est.params['smb']
                    beta3 = est.params['hml']

                    df.loc[evt_mask, 'INTERCEPT'] = alpha
                    df.loc[evt_mask, 'alpha'] = alpha
                    df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                    df.loc[evt_mask, '_nobs'] = _nobs
                    df.loc[evt_mask, 'var_estp'] = est.mse_resid
                    df.loc[evt_mask, 'rsq'] = est.rsquared
                    df.loc[evt_mask, '_p_'] = 2
                    df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'const': 0
                    }

                    def f_expret(row):
                        return ((nloc['alpha'] +
                                 (nloc['beta1'] * row['mktrf']) +
                                 (nloc['beta2'] * row['smb']) +
                                 (nloc['beta3'] * row['hml'])))

                    df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret,
                                                                    axis=1)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'const': 0
                    }

                    def f_abret(row):
                        return (row['ret'] - ((nloc['alpha'] +
                                               (nloc['beta1'] * row['mktrf']) +
                                               (nloc['beta2'] * row['smb']) +
                                               (nloc['beta3'] * row['hml']))))

                    df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret,
                                                                   axis=1)

                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)
                    nloc = {'const': 0}

                    def f_car(row):
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)

                # Fama-French Plus Momentum
                elif model == 'ffm':
                    # Set y to the estimation window records
                    df_est = df[(df['permno'] == permno)
                                & (df['edate'] == edate) & (df['isest'] == 1)]

                    X = df_est[['mktrf', 'smb', 'hml',
                                'umd']]  # indicator variables
                    y = df_est['ret']  # response variables

                    # Fit an OLS (ordinary least squares) model with intercept on mktrf, smb, hml, and umd
                    X = sm_add_constant(X)
                    est = sm_OLS(y, X).fit()

                    alpha = est.params['const']
                    beta1 = est.params['mktrf']
                    beta2 = est.params['smb']
                    beta3 = est.params['hml']
                    beta4 = est.params['umd']

                    df.loc[evt_mask, 'INTERCEPT'] = alpha
                    df.loc[evt_mask, 'alpha'] = alpha
                    df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid)
                    df.loc[evt_mask, '_nobs'] = _nobs
                    df.loc[evt_mask, 'var_estp'] = est.mse_resid
                    df.loc[evt_mask, 'rsq'] = est.rsquared
                    df.loc[evt_mask, '_p_'] = 2
                    df.loc[evt_mask, '_edf_'] = (len(y) - 2)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'beta4': beta4,
                        'const': 0
                    }

                    def f_expret(row):
                        return ((nloc['alpha'] +
                                 (nloc['beta1'] * row['mktrf']) +
                                 (nloc['beta2'] * row['smb']) +
                                 (nloc['beta3'] * row['hml']) +
                                 (nloc['beta4'] * row['umd'])))

                    df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret,
                                                                    axis=1)

                    nloc = {
                        'alpha': alpha,
                        'beta1': beta1,
                        'beta2': beta2,
                        'beta3': beta3,
                        'beta4': beta4,
                        'const': 0
                    }

                    def f_abret(row):
                        return (row['ret'] - ((nloc['alpha'] +
                                               (nloc['beta1'] * row['mktrf']) +
                                               (nloc['beta2'] * row['smb']) +
                                               (nloc['beta3'] * row['hml']) +
                                               (nloc['beta4'] * row['umd']))))

                    df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret,
                                                                   axis=1)

                    nloc = {'const': 0}

                    def f_cret(row):
                        tmp = ((row['ret'] * nloc['const']) +
                               (row['ret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret,
                                                                  axis=1)
                    df.loc[evt_mask, 'cret_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_cexpret(row):
                        tmp = ((row['expret'] * nloc['const']) +
                               (row['expret'] + nloc['const']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret,
                                                                     axis=1)
                    nloc = {'const': 0}

                    def f_car(row):
                        tmp = (row['abret'] + nloc['const'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1)
                    df.loc[evt_mask, 'car_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_sar(row):
                        tmp = (row['abret'] / np_sqrt(row['var_estp']))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1)
                    df.loc[evt_mask, 'sar_edate'] = nloc['const']

                    nloc = {'const': 0, 'evtrang': evtrang}

                    def f_scar(row):
                        tmp = (row['car'] / np_sqrt(
                            (evtrang * row['var_estp'])))
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar,
                                                                  axis=1)
                    df.loc[evt_mask, 'scar_edate'] = nloc['const']

                    nloc = {'const': 0}

                    def f_bhar(row):
                        tmp = (row['cret'] - row['cexpret'])
                        nloc['const'] = tmp
                        return tmp

                    df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar,
                                                                  axis=1)
                    df.loc[evt_mask, 'bhar_edate'] = nloc['const']

                    df.loc[evt_mask,
                           'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00)
                    df.loc[evt_mask,
                           'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00)
                # Something erroneous was passed
                else:
                    df.loc[evt_mask, 'isest'] = -2

        #################################
        #  STEP 4 - OUTPUT THE RESULTS  #
        #################################
        df_sta = df[df['isevt'] == 1]
        levt = df_sta['evttime'].unique()

        columns = [
            'evttime', 'car_m', 'ret_m', 'abret_m', 'abret_t', 'sar_t',
            'pat_ar', 'cret_edate_m', 'car_edate_m', 'pat_car_edate_m',
            'car_edate_t', 'scar_edate_t', 'bhar_edate_m'
        ]

        idxlist = list(levt)
        df_stats = pd_DataFrame(index=idxlist, columns=columns)
        df_stats = df_stats.fillna(0.00000000)  # with 0s rather than NaNs

        # Event
        df_stats['evttime'] = df_sta.groupby(['evttime'])['evttime'].unique()
        # Means
        df_stats['abret_m'] = df_sta.groupby(['evttime'])['abret'].mean()
        df_stats['bhar_edate_m'] = df_sta.groupby(['evttime'
                                                   ])['bhar_edate'].mean()
        df_stats['car_edate_m'] = df_sta.groupby(['evttime'
                                                  ])['car_edate'].mean()
        df_stats['car_m'] = df_sta.groupby(['evttime'])['car'].mean()
        df_stats['cret_edate_m'] = df_sta.groupby(['evttime'
                                                   ])['cret_edate'].mean()
        df_stats['pat_scale_m'] = df_sta.groupby(['evttime'
                                                  ])['pat_scale'].mean()
        df_stats['pat_car_edate_mean'] = 0
        df_stats['ret_m'] = df_sta.groupby(['evttime'])['ret'].mean()
        df_stats['sar_m'] = df_sta.groupby(['evttime'])['sar'].mean()
        df_stats['scar_edate_m'] = df_sta.groupby(['evttime'
                                                   ])['scar_edate'].mean()
        df_stats['scar_m'] = df_sta.groupby(['evttime'])['scar'].mean()
        # Standard deviations
        df_stats['car_v'] = df_sta.groupby(['evttime'])['car'].std()
        df_stats['abret_v'] = df_sta.groupby(['evttime'])['abret'].std()
        df_stats['sar_v'] = df_sta.groupby(['evttime'])['sar'].std()
        df_stats['pat_scale_v'] = df_sta.groupby(['evttime'
                                                  ])['pat_scale'].std()
        df_stats['car_edate_v'] = df_sta.groupby(['evttime'
                                                  ])['car_edate'].std()
        df_stats['scar_edate_v'] = df_sta.groupby(['evttime'
                                                   ])['scar_edate'].std()
        df_stats['scar_v'] = df_sta.groupby(['evttime'])['scar'].std()
        # Counts
        df_stats['scar_n'] = df_sta.groupby(['evttime'])['scar'].count()
        df_stats['scar_edate_n'] = df_sta.groupby(['evttime'
                                                   ])['scar_edate'].count()
        df_stats['sar_n'] = df_sta.groupby(['evttime'])['sar'].count()
        df_stats['car_n'] = df_sta.groupby(['evttime'])['car'].count()
        df_stats['n'] = df_sta.groupby(['evttime'])['evttime'].count()
        # Sums
        df_stats['pat_scale_edate_s'] = df_sta.groupby(
            ['evttime'])['pat_scale_edate'].sum()
        df_stats['pat_scale_s'] = df_sta.groupby(['evttime'
                                                  ])['pat_scale'].sum()

        # T statistics 1: cross-sectional t, i.e. the mean divided by its standard error (std / sqrt(n))
        def tstat(row, m, v, n):
            return row[m] / (row[v] / np_sqrt(row[n]))

        df_stats['abret_t'] = df_stats.apply(tstat,
                                             axis=1,
                                             args=('abret_m', 'abret_v', 'n'))
        df_stats['sar_t'] = df_stats.apply(tstat,
                                           axis=1,
                                           args=('sar_m', 'sar_v', 'n'))
        df_stats['car_edate_t'] = df_stats.apply(tstat,
                                                 axis=1,
                                                 args=('car_edate_m',
                                                       'car_edate_v', 'n'))
        df_stats['scar_edate_t'] = df_stats.apply(tstat,
                                                  axis=1,
                                                  args=('scar_edate_m',
                                                        'scar_edate_v',
                                                        'scar_edate_n'))

        # T statistics 2: scaled t used for the pat_* statistics, i.e. the mean divided by (sqrt(sum of scales) / n)
        def tstat2(row, m, s, n):
            return row[m] / (np_sqrt(row[s]) / row[n])

        df_stats['pat_car'] = df_stats.apply(tstat2,
                                             axis=1,
                                             args=('scar_m', 'pat_scale_s',
                                                   'scar_n'))
        df_stats['pat_car_edate_m'] = df_stats.apply(tstat2,
                                                     axis=1,
                                                     args=('scar_edate_m',
                                                           'pat_scale_edate_s',
                                                           'scar_edate_n'))
        df_stats['pat_ar'] = df_stats.apply(tstat2,
                                            axis=1,
                                            args=('sar_m', 'pat_scale_s',
                                                  'sar_n'))

        # FILE 2
        # EVENT WINDOW
        df_evtw = df.loc[
            (df['isevt'] == 1),
            ['permno', 'edate', 'rdate', 'evttime', 'ret', 'abret']]
        df_evtw = df_evtw.sort_values(['permno', 'evttime'],
                                      ascending=[True, True])

        # FILE 1
        # EVENT DATE
        maxv = max(levt)
        df_evtd = df.loc[(df['isevt'] == 1) & (df['evttime'] == maxv),
                         ['permno', 'edate', 'cret', 'car', 'bhar']]
        df_evtd = df_evtd.sort_values(['permno', 'edate'],
                                      ascending=[True, True])

        if output == 'df':
            retval = {}
            retval['event_stats'] = df_stats
            retval['event_window'] = df_evtw
            retval['event_date'] = df_evtd
            return retval
        elif output == 'print':
            retval = {}
            print(
                tabulate(df_evtd.sort_values(['permno', 'edate'],
                                             ascending=[True, True]),
                         headers='keys',
                         tablefmt='psql'))
            print(tabulate(df_evtw, headers='keys', tablefmt='psql'))
            print(tabulate(df_stats, headers='keys', tablefmt='psql'))
            return retval
        elif output == 'json':
            retval = {}
            retval['event_stats'] = df_stats.to_dict(orient='split')
            retval['event_window'] = df_evtw.to_dict(orient='split')
            retval['event_date'] = df_evtd.to_dict(orient='split')
            # Write this to a file
            with open(os.path.join(self.output_path, 'EventStudy.json'),
                      'w') as outfile:
                json_dump(retval, outfile, cls=EncoderJson)
            # Return the output in case they are doing something programmatically
            return json_dumps(retval, cls=EncoderJson)
        elif output == 'csv':
            retval = ''
            es = StringIO_StringIO()
            df_stats.to_csv(es)
            retval += es.getvalue()
            ew = StringIO_StringIO()
            df_evtw.to_csv(ew)
            retval += "\r"
            retval += ew.getvalue()
            ed = StringIO_StringIO()
            df_evtd.to_csv(ed)
            retval += ed.getvalue()

            # write this to a file
            with open(os.path.join(self.output_path, 'EventStudy.csv'),
                      'w') as outfile:
                outfile.write(retval)

            # return the output in case they are doing something programmatically
            return retval
        elif output == 'xls':
            retval = {}
            xlswriter = pd_ExcelWriter(
                os.path.join(self.output_path, 'EventStudy.xls'))
            df_stats.to_excel(xlswriter, 'Stats')
            df_evtw.to_excel(xlswriter, 'Event Window')
            df_evtd.to_excel(xlswriter, 'Event Date')
            xlswriter.save()
            return retval
        else:
            pass
Exemplo n.º 24
0
    def _distribution_plot(self, rel_dists, rel_dist_thresholds,
                           taxa_for_dist_inference, distribution_table,
                           plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        rel_dist_thresholds: list
            Relative distance cutoffs for defining ranks.
        taxa_for_dist_inference : iterable
            Taxa to be considered when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].iteritems()
                if taxa in taxa_for_dist_inference
            ]
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].iteritems()
                if taxa in taxa_for_dist_inference
            ]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write(
            'Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n'
        )
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        rel_dist_thresholds += [1.0]  # append boundary for species
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

            for clade_label, dist in rel_dists[rank].iteritems():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if clade_label in taxa_for_dist_inference:
                    c.append((0.0, 0.0, 0.5))
                else:
                    c.append((0.5, 0.5, 0.5))

                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)

                if i == 0:
                    rank_cutoff = rel_dist_thresholds[i]
                    rank_outlier = dist > rank_cutoff
                else:
                    rank_cutoff = rel_dist_thresholds[i]
                    upper_rank_cutoff = rel_dist_thresholds[i - 1]
                    rank_outlier = not (dist >= upper_rank_cutoff
                                        and dist <= rank_cutoff)

                v = [clade_label, dist, rank_cutoff, str(rank_outlier)]
                v += percentiles[i] + [str(percentile_outlier)]
                fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' %
                           tuple(v))
        fout.close()

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # plot relative divergence threshold lines
        y_min, y_max = ax.get_ylim()
        for threshold in rel_dist_thresholds[
                0:-1]:  # don't draw species boundary
            ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
            ax.text(threshold + 0.001,
                    y_max,
                    '%.3f' % threshold,
                    horizontalalignment='center')

        # make plot interactive
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=96)
    def run(self, 
                cur_gtdb_metadata_file,
                cur_genomic_path_file,
                qc_passed_file,
                ncbi_genbank_assembly_file,
                ltp_taxonomy_file,
                gtdb_type_strains_ledger,
                untrustworthy_type_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain."""
        
        # get species in LTP reference database
        self.logger.info('Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=None,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # update current genomes with GTDB-Tk classifications
        self.logger.info('Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes)
        self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
        
        # parsing genomes manually established to be untrustworthy as type
        self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = {}
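        # The ledger is assumed to be a tab-separated file whose first column holds the
        # genome accession and whose header includes the 'NCBI species' and
        # 'Reason for declaring untrustworthy' columns parsed below.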
        with open(untrustworthy_type_ledger) as f:
            header = f.readline().strip().split('\t')
            
            ncbi_sp_index = header.index('NCBI species')
            reason_index = header.index('Reason for declaring untrustworthy')
            
            for line in f:
                tokens = line.strip().split('\t')
                
                gid = canonical_gid(tokens[0])
                manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index])
        self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info('Determining number of type strain genomes in each NCBI species.')
        sp_type_strain_genomes = defaultdict(set)
        for gid in cur_genomes:
            if cur_genomes[gid].is_effective_type_strain():
                ncbi_sp = cur_genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    sp_type_strain_genomes[ncbi_sp].add(gid)

        multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1]
        self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')
        
        # sort by number of genome assemblies
        self.logger.info('Calculating ANI between type strain genomes in each species.')
        
        fout = open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')
        
        fout_genomes = open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_unresolved = open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_high_divergence = open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_untrustworthy = open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                                        gid,
                                        ncbi_sp,
                                        cur_genomes[gid].gtdb_taxa.species,
                                        '<not tested>',
                                        'Manual curation: ' + reason))
        
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0
        
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0
        
        use_pickled_results = False #***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))
        
        prev_gtdb_sp_conflicts = 0
        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue
                
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                                ncbi_sp, 
                                len(type_gids),
                                processed+1, 
                                len(multi_type_strains_sp),
                                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            if not use_pickled_results: #***
                ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                pickle.dump(ani_af, open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'wb'))
            else:
                ani_af = pickle.load(open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'rb'))
            
            anis = []
            afs = []
            gid_anis = defaultdict(lambda: {})
            gid_afs = defaultdict(lambda: {})
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False
                    
                anis.append(ani)
                afs.append(af)
                
                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani
                
                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af
                
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False
            
            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True
                
                # write out highly divergent cases for manual inspection; 
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                                        gid,
                                                        ncbi_sp,
                                                        cur_genomes[gid].gtdb_taxa.genus,
                                                        cur_genomes[gid].gtdb_taxa.species,
                                                        ' / '.join(ltp_species),
                                                        np_mean(list(gid_anis[gid].values())),
                                                        np_std(list(gid_anis[gid].values())),
                                                        np_mean(list(gid_afs[gid].values())),
                                                        np_std(list(gid_afs[gid].values())),
                                                        cur_genomes[gid].excluded_from_refseq_note,
                                                        cur_genomes[gid].ncbi_taxa,
                                                        cur_genomes[gid].gtdb_taxa))
                
                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggests the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis, 
                                                                                                    ncbi_sp, 
                                                                                                    type_gids, 
                                                                                                    ltp_metadata, 
                                                                                                    ltp_defined_species,
                                                                                                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or missing LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1
                
                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1
                           
                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1
                        
                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False
                    
                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid,
                                                                                ncbi_sp,
                                                                                cur_genomes[gid].gtdb_taxa.species,
                                                                                ' / '.join(ltp_species),
                                                                                reason))
                                                                                
                        # Sanity check that if the untrustworthy genome has an LTP hit to only the
                        # expected species, that all other genomes also have a hit to the 
                        # expected species (or potentially no hit). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                if ltp_species and ncbi_sp not in ltp_species:
                                    other_sp.update(ltp_species)
                                
                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')
                                
                    num_ncbi_untrustworthy = sum([1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note])
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids 
                                and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                                        gid, 
                                                        ncbi_sp,
                                                        num_ncbi_untrustworthy,
                                                        len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1
                    
                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                    gid,
                                    ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                    
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            gid in untrustworthy_gids,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                        ncbi_sp,
                        len(type_gids),
                        all_similar,
                        np_mean(anis),
                        np_std(anis),
                        np_mean(afs),
                        np_std(afs),
                        note,
                        ', '.join(type_gids)))

        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()
        
        self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
        self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
        self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
        self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
        self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
        self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

        if unresolved_sp_count > 0:
            self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
            self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
            self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")
        
        self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where the resolved type strain conflicts with the prior GTDB assignment.')
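A note on a helper used throughout these examples: symmetric_ani(ani_af, gid1, gid2) collapses the two directional FastANI results for a genome pair into a single (ANI, AF) tuple. Its definition is not shown here; a minimal sketch, assuming ani_af is a nested dict of the form ani_af[gid1][gid2] = (ani, af) and that the larger directional value is kept (the real implementation may differ):

def symmetric_ani(ani_af, gid1, gid2):
    """Sketch: symmetric ANI/AF between two genomes from directional FastANI results."""
    ani12, af12 = ani_af.get(gid1, {}).get(gid2, (0, 0))
    ani21, af21 = ani_af.get(gid2, {}).get(gid1, (0, 0))
    return max(ani12, ani21), max(af12, af21)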
Exemplo n.º 26
0
    def _pairwise_stats(self, clusters, genome_files):
        """Calculate statistics for all pairwise comparisons in a species cluster."""
        
        self.logger.info('Calculating statistics for all pairwise comparisons in a species cluster:')
        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            statusStr = '-> Processing %d of %d (%.2f%%) clusters (size = %d).' % (
                                idx+1, 
                                len(clusters), 
                                float((idx+1)*100)/len(clusters),
                                len(cids))
            sys.stdout.write('%s\r' % statusStr.ljust(86))
            sys.stdout.flush()
                                
            if len(cids) == 0:
                stats[rid] = self.PairwiseStats(min_ani = -1,
                                                mean_ani = -1,
                                                std_ani = -1,
                                                median_ani = -1,
                                                ani_to_medoid = -1,
                                                mean_ani_to_medoid = -1,
                                                ani_below_95 = -1)
            else:
                if len(cids) > self.max_genomes_for_stats:
                    # random.sample requires a sequence (not a set) in Python 3.11+
                    cids = set(random.sample(list(cids), self.max_genomes_for_stats))
                
                # calculate ANI between all pairs of genomes in the cluster, including the representative
                gid_pairs = []
                gids = list(cids.union([rid]))
                for gid1, gid2 in combinations(gids, 2):
                    gid_pairs.append((gid1, gid2))
                    gid_pairs.append((gid2, gid1))
                    
                ani_af = self.ani_cache.fastani_pairs(gid_pairs, 
                                                        genome_files, 
                                                        report_progress=False)
                                                        
                # calculate medoid point
                if len(gids) > 2:
                    dist_mat = np_zeros((len(gids), len(gids)))
                    for i, gid1 in enumerate(gids):
                        for j, gid2 in enumerate(gids):
                            if i < j:
                                ani, af = symmetric_ani(ani_af, gid1, gid2)
                                # store distances (100 - ANI) so the medoid is the genome
                                # minimizing the total distance to all other genomes
                                dist_mat[i, j] = 100.0 - ani
                                dist_mat[j, i] = 100.0 - ani

                    medoid_idx = np_argmin(dist_mat.sum(axis=0))
                    medoid_gid = gids[medoid_idx]
                else:
                    # with only 2 genomes in a cluster, the representative is the
                    # natural medoid at least for reporting statistics for the
                    # individual species cluster
                    medoid_gid = rid
                    
                mean_ani_to_medoid = np_mean([symmetric_ani(ani_af, gid, medoid_gid)[0] 
                                                for gid in gids if gid != medoid_gid])

                # calculate statistics
                anis = []
                for gid1, gid2 in combinations(gids, 2):
                    ani, af = symmetric_ani(ani_af, gid1, gid2)
                    anis.append(ani)
                    
                stats[rid] = self.PairwiseStats(min_ani = min(anis),
                                                mean_ani = np_mean(anis),
                                                std_ani = np_std(anis),
                                                median_ani = np_median(anis),
                                                ani_to_medoid = symmetric_ani(ani_af, rid, medoid_gid)[0],
                                                mean_ani_to_medoid = mean_ani_to_medoid,
                                                ani_below_95 = sum([1 for ani in anis if ani < 95]))

        sys.stdout.write('\n')
            
        return stats
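The method above fills a self.PairwiseStats record whose definition sits outside this snippet. A plausible sketch, inferred purely from the keyword arguments used above (the actual class in the source project may differ):

from collections import namedtuple

# Hypothetical definition inferred from the fields assigned in _pairwise_stats.
PairwiseStats = namedtuple('PairwiseStats',
                           ['min_ani', 'mean_ani', 'std_ani', 'median_ani',
                            'ani_to_medoid', 'mean_ani_to_medoid', 'ani_below_95'])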
Exemplo n.º 27
0
    def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        rel_dist_thresholds: list
            Relative distances cutoffs for defining ranks.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        rel_dist_thresholds += [1.0]  # append boundary for species
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if clade_label in taxa_for_dist_inference:
                    c.append((0.0, 0.0, 0.5))
                else:
                    c.append((0.5, 0.5, 0.5))

                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)

                if i == 0:
                    rank_cutoff = rel_dist_thresholds[i]
                    rank_outlier = dist > rank_cutoff
                else:
                    rank_cutoff = rel_dist_thresholds[i]
                    upper_rank_cutoff = rel_dist_thresholds[i - 1]
                    rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff)

                v = [clade_label, dist, rank_cutoff, str(rank_outlier)]
                v += percentiles[i] + [str(percentile_outlier)]
                fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        fout.close()

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # plot relative divergence threshold lines
        y_min, y_max = ax.get_ylim()
        for threshold in rel_dist_thresholds[0:-1]:  # don't draw species boundary
            ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
            ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center')

        # make plot interactive
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=96)
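The heart of the plot above is fitting a normal distribution to the relative divergences of each rank and marking the 10th, 50th and 90th percentiles. A stripped-down, self-contained sketch of that step using hypothetical values:

import numpy as np
from scipy.stats import norm

rel_divs = [0.42, 0.45, 0.47, 0.50, 0.53, 0.55]  # hypothetical relative divergences for one rank
rv = norm(loc=np.mean(rel_divs), scale=np.std(rel_divs))
x = np.linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
density = rv.pdf(x)  # normal curve drawn across the rank's row
p10, p50, p90 = np.percentile(rel_divs, [10, 50, 90])  # percentile tick lines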
Exemplo n.º 28
0
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            d = len(mono) + len(poly) + len(no_inference)
            if d == 0:
                break
                
            w = float(len(mono)) / d
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
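The snippet above relies on an is_integer helper (imported elsewhere) to detect the numerical suffixes that tax2tree appends to polyphyletic groups. A minimal sketch of what such a helper presumably does; the real implementation may differ:

def is_integer(s):
    """Sketch: True if the string s parses as an integer."""
    try:
        int(s)
        return True
    except ValueError:
        return False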
Exemplo n.º 29
0
    def get_value_for_data_only(self, values):
        """
        Return the sample standard deviation of the values
        """
        return np_std(values, ddof=1)
    def action_genomic_update(self, rep_change_summary_file, prev_genomes,
                              cur_genomes, new_updated_sp_clusters):
        """Handle representatives with updated genomes."""

        # get genomes with specific changes
        self.logger.info(
            'Identifying representatives with updated genomic files.')
        genomic_update_gids = self.rep_change_gids(rep_change_summary_file,
                                                   'GENOMIC_CHANGE', 'UPDATED')
        self.logger.info(
            f' ... identified {len(genomic_update_gids):,} genomes.')

        # calculate ANI between previous and current genomes
        assembly_score_change = []
        for prev_rid, prev_gtdb_sp in genomic_update_gids.items():
            # check that genome hasn't been lost which should
            # be handled differently
            assert prev_rid in cur_genomes

            ani, af = self.fastani.symmetric_ani_cached(
                f'{prev_rid}-P', f'{prev_rid}-C',
                prev_genomes[prev_rid].genomic_file,
                cur_genomes[prev_rid].genomic_file)

            params = {}
            params['ani'] = ani
            params['af'] = af
            params['prev_ncbi_accession'] = prev_genomes[prev_rid].ncbi_accn
            params['cur_ncbi_accession'] = cur_genomes[prev_rid].ncbi_accn
            assert prev_genomes[prev_rid].ncbi_accn != cur_genomes[
                prev_rid].ncbi_accn

            if ani >= self.genomic_update_ani and af >= self.genomic_update_af:
                params['prev_assembly_quality'] = prev_genomes[
                    prev_rid].score_assembly()
                params['new_assembly_quality'] = cur_genomes[
                    prev_rid].score_assembly()
                action = 'GENOMIC_CHANGE:UPDATED:MINOR_CHANGE'

                d = cur_genomes[prev_rid].score_assembly(
                ) - prev_genomes[prev_rid].score_assembly()
                assembly_score_change.append(d)
            else:
                sp_cids = self.genomes_in_current_sp_cluster(
                    prev_rid, prev_genomes, new_updated_sp_clusters,
                    cur_genomes)

                if sp_cids:
                    new_rid, top_score, ani, af = self.top_ani_score_prev_rep(
                        prev_rid, sp_cids, prev_genomes, cur_genomes)

                    if new_rid == prev_rid:
                        params['prev_assembly_quality'] = prev_genomes[
                            prev_rid].score_assembly()
                        params['new_assembly_quality'] = cur_genomes[
                            prev_rid].score_assembly()
                        action = 'GENOMIC_CHANGE:UPDATED:RETAINED'
                    else:
                        action = 'GENOMIC_CHANGE:UPDATED:REPLACED'
                        params['new_rid'] = new_rid
                        params['ani'] = ani
                        params['af'] = af
                        params['new_assembly_quality'] = cur_genomes[
                            new_rid].score_assembly()
                        params['prev_assembly_quality'] = prev_genomes[
                            prev_rid].score_assembly()

                        self.update_rep(prev_rid, new_rid, action)
                else:
                    action = 'GENOMIC_CHANGE:UPDATED:SPECIES_RETIRED'
                    self.update_rep(prev_rid, None, action)

            self.action_log.write('{}\t{}\t{}\t{}\n'.format(
                prev_rid, prev_gtdb_sp, action, params))

        self.logger.info(
            ' ... change in assembly score for updated genomes: {:.2f} +/- {:.2f}'
            .format(np_mean(assembly_score_change),
                    np_std(assembly_score_change)))
Exemplo n.º 31
0
    def run(self, cur_gtdb_metadata_file, cur_genomic_path_file,
            qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file,
            gtdb_type_strains_ledger, untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain."""

        # get species in LTP reference database
        self.logger.info(
            'Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(
            f' - identified {len(ltp_defined_species):,} species.')

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # parsing genomes manually established to be untrustworthy as type
        self.logger.info(
            'Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = self.parse_untrustworthy_type_ledger(
            untrustworthy_type_ledger)
        self.logger.info(
            f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.'
        )

        # Identify NCBI species with multiple genomes assembled from type strain of species. This
        # is done using a series of heuristics that aim to ensure that the selected type strain
        # genome is reliable. More formal evaluation and a manuscript descirbing this selection
        # process is ultimately required. Ideally, the community will eventually adopt a
        # database that indicates a single `type genome assembly` for each species instead
        # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist.
        self.logger.info(
            'Determining number of type strain genomes in each NCBI species.')
        multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes)
        self.logger.info(
            f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.'
        )

        # resolve species with multiple type strain genomes
        fout = open(
            os.path.join(self.output_dir, 'multi_type_strain_species.tsv'),
            'w')
        fout.write(
            'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n'
        )

        fout_genomes = open(
            os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write(
            'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment'
        )
        fout_genomes.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n'
        )

        fout_unresolved = open(
            os.path.join(self.output_dir,
                         'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_high_divergence = open(
            os.path.join(self.output_dir,
                         'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_untrustworthy = open(
            os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'),
            'w')
        fout_untrustworthy.write(
            'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n'
        )

        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                '<not tested>', 'Manual curation: ' + reason))

        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0

        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        ncbi_rep_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0

        # *** Perhaps should be an external flag, but used right now to speed up debugging
        use_pickled_results = False
        if use_pickled_results:
            self.logger.warning(
                'Using previously calculated ANI results in: {}'.format(
                    self.ani_pickle_dir))

        prev_gtdb_sp_conflicts = 0

        self.logger.info(
            'Resolving species with multiple type strain genomes:')
        for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(),
                                         key=lambda kv: len(kv[1])):
            assert len(type_gids) > 1

            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp, len(type_gids), processed + 1,
                len(multi_type_strains_sp), (processed + 1) * 100.0 /
                len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani(
                ncbi_sp, type_gids, cur_genomes, use_pickled_results)

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            unresolved_species = False
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            if not all_similar:
                note = ''

                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_high_divergence.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis, ncbi_sp, type_gids, ltp_metadata,
                    ltp_defined_species, cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or missing LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(
                        gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                # try to resolve by considering genomes annotated as representative genomes at NCBI
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_reps(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by considering NCBI representative genomes'
                        ncbi_rep_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(
                                cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_unresolved.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remaining
            ncbi_untrustworthy_gids = set([
                gid for gid in type_gids if 'untrustworthy as type' in
                cur_genomes[gid].excluded_from_refseq_note
            ])
            if len(type_gids - set(untrustworthy_gids) -
                   ncbi_untrustworthy_gids) >= 1:
                for gid in ncbi_untrustworthy_gids:
                    untrustworthy_gids[
                        gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available"

            # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes
            num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids)
            for gid in type_gids:
                if (gid not in untrustworthy_gids and 'untrustworthy as type'
                        in cur_genomes[gid].excluded_from_refseq_note):
                    self.logger.warning(
                        "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]."
                        .format(gid, ncbi_sp, num_ncbi_untrustworthy,
                                len(type_gids)))

            # write out genomes identified as being untrustworthy
            for gid, reason in untrustworthy_gids.items():
                ltp_species = self.ltp_species(gid, ltp_metadata)

                if 'untrustworthy as type' in cur_genomes[
                        gid].excluded_from_refseq_note:
                    reason += "; considered `untrustworthy as type` at NCBI"
                fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species), reason))

                # Sanity check that if the untrustworthy genome has an LTP hit to only the
                # expected species, that all other genomes also have a hit to the
                # expected species (or potentially no hit). Otherwise, more consideration
                # should be given to the genome with the conflicting LTP hit.
                if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                    other_sp = set()
                    for test_gid in type_gids:
                        ltp_species = self.ltp_species(test_gid, ltp_metadata)
                        if ltp_species and ncbi_sp not in ltp_species:
                            other_sp.update(ltp_species)

                    if other_sp:
                        self.logger.warning(
                            f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.'
                        )

            # write out information about all type genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)

                fout_genomes.write(
                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n'
                    .format(gid, gid in untrustworthy_gids, ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species), gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa,
                            untrustworthy_gids.get(gid, '')))

            fout.write(
                '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                    ncbi_sp, len(type_gids), all_similar, np_mean(anis),
                    np_std(anis), np_mean(afs), np_std(afs), note,
                    ', '.join(type_gids)))

        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()

        self.logger.info(
            f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.'
        )
        self.logger.info(
            f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.'
        )
        self.logger.info(
            f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.'
        )
        self.logger.info(
            f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.'
        )
        self.logger.info(
            f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.'
        )

        if unresolved_sp_count > 0:
            self.logger.warning(
                f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.'
            )
            self.logger.warning(
                'These should be handled before proceeding with the next step of GTDB species updating.'
            )
            self.logger.warning(
                "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'."
            )

        self.logger.info(
            f'Identified {prev_gtdb_sp_conflicts:,} cases where the resolved type strain conflicts with the prior GTDB assignment.'
        )
Exemplo n.º 32
0
    def get_value_for_data_only(self, values):
        """
        Returns the mean, standard deviation and number of values
        """

        return np_mean(values), np_std(values, ddof=1), np.size(values)
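A minimal usage sketch of the method above, assuming values is any sequence NumPy accepts; ddof=1 gives the sample (Bessel-corrected) standard deviation rather than the population value:

import numpy as np

values = [99.2, 99.5, 98.9, 99.7]
mean, std, n = np.mean(values), np.std(values, ddof=1), np.size(values)
print(mean, std, n)  # -> ~99.325, ~0.35, 4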