def makeBinDist(self, transformedCP, averageCoverages, kmerNormPC1, kmerPCs, contigGCs, contigLengths):
    """Determine the distribution of the points in this bin

    The distribution is largely normal, except at the boundaries.
    """
    #print("MBD", self.id, self.binSize)
    self.binSize = self.rowIndices.shape[0]
    if 0 == np.size(self.rowIndices):
        return

    # get the centroids
    (self.covMedians, self.covStdevs) = self.getCentroidStats(transformedCP)
    (self.lengthMean, self.lengthStd) = self.getCentroidStats(contigLengths)

    self.kValMeanNormPC1 = np_median(kmerPCs[self.rowIndices])
    self.kValStdevNormPC1 = np_std(kmerPCs[self.rowIndices])

    self.kMedian = np_median(kmerPCs[self.rowIndices], axis=0)
    self.kStdevs = np_std(kmerPCs[self.rowIndices], axis=0)

    cvals = self.getAverageCoverageDist(averageCoverages)
    self.cValMedian = np_around(np_median(cvals), decimals=3)
    self.cValStdev = np_around(np_std(cvals), decimals=3)

    self.gcMedian = np_median(contigGCs[self.rowIndices])
    self.gcStdev = np_std(contigGCs[self.rowIndices])

    # work out the total size
    self.totalBP = sum([contigLengths[i] for i in self.rowIndices])

    # set the acceptance ranges
    self.makeLimits()
def expandSelection(self, startIndex, vals, stdevCutoff=0.05, maxSpread=0.1):
    """Expand a selection left and right from a starting index in a list of values

    Keep expanding unless the stdev of the values goes above the cutoff.
    Return a list of indices into the original list.
    """
    ret_list = [startIndex]  # this is what we will give back
    start_val = vals[startIndex]
    value_store = [start_val]

    sorted_indices = np_argsort(vals)
    max_index = len(vals)

    # set the upper and lower to point to the position
    # where the start resides
    lower_index = 0
    upper_index = 0
    for i in range(max_index):
        if sorted_indices[i] == startIndex:
            break
        lower_index += 1
        upper_index += 1

    do_lower = True
    do_upper = True
    max_index -= 1
    while do_lower or do_upper:
        if do_lower:
            do_lower = False
            if lower_index > 0:
                try_val = vals[sorted_indices[lower_index - 1]]
                if np_abs(try_val - start_val) < maxSpread:
                    try_array = value_store + [try_val]
                    if np_std(try_array) < stdevCutoff:
                        value_store = try_array
                        lower_index -= 1
                        ret_list.append(sorted_indices[lower_index])
                        do_lower = True
        if do_upper:
            do_upper = False
            if upper_index < max_index:
                try_val = vals[sorted_indices[upper_index + 1]]
                if np_abs(try_val - start_val) < maxSpread:
                    try_array = value_store + [try_val]
                    if np_std(try_array) < stdevCutoff:
                        value_store = try_array
                        upper_index += 1
                        ret_list.append(sorted_indices[upper_index])
                        do_upper = True
    return sorted(ret_list)
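The expansion above walks over the value-sorted order of vals, not the original index order. A minimal, self-contained sketch of that idea (toy data, plain numpy; the surrounding class is omitted and the values are hypothetical):

import numpy as np

# Hypothetical values; original index 2 (value 0.52) is the starting point.
vals = np.array([0.10, 0.48, 0.52, 0.55, 0.90])

order = np.argsort(vals)                      # original indices sorted by value
start_pos = int(np.where(order == 2)[0][0])   # position of the start index in sorted order

# One step left and one step right in value space yields original indices 1 and 3,
# the kind of neighbours expandSelection admits while the stdev of the accepted
# values stays below stdevCutoff and the spread stays below maxSpread.
neighbours = [int(order[start_pos - 1]), int(order[start_pos + 1])]
print(neighbours, np.std([0.48, 0.52, 0.55]))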
def noise_dwt(cls, coeff, w):
    """Return the estimation of the DWT components noise level

    coeff: DWT coefficients
    w: pywt wavelet object
    """
    n_boot = 1000
    k_th = 10
    k_std = 1. / np_sqrt(2)
    std_l = []
    std_a = np_zeros(n_boot)

    wcomp = cls.wavecomp(coeff, w, len(coeff) - 1)
    for ii in xrange(n_boot):
        std_a[ii] = np_std(bootstrap_resample(wcomp, 10))
    stdv = np_median(std_a)
    std_l.append(stdv)
    for ll in xrange(len(coeff) - 2, 0, -1):
        stdv = stdv * k_std
        std_l.append(stdv)
    std_l.append(0)
    std_l.reverse()
    return np_array(std_l) * k_th
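bootstrap_resample is referenced above but not shown. A plausible minimal stand-in consistent with how it is called (draw a fixed number of values with replacement) could look like the following; this is an assumption for illustration, not the snippet's actual helper:

import numpy as np

def bootstrap_resample(x, n):
    # Illustrative stand-in: sample n values from x with replacement.
    x = np.asarray(x)
    return x[np.random.randint(0, len(x), size=n)]

# e.g. the per-iteration statistic used in noise_dwt:
# np.std(bootstrap_resample(wavelet_component, 10))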
def write_rank_count(self, ranks_below_taxon, results_table):
    """Write table indicating number of ranks below each taxon.

    Parameters
    ----------
    ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts
        Number of ranks below named taxon.
    results_table : str
        Desired output file.
    """

    # determine if count is a scalar or vectors
    taxon = list(ranks_below_taxon.keys())[0]
    rank_prefix = list(ranks_below_taxon[taxon].keys())[0]
    count = ranks_below_taxon[taxon][rank_prefix]

    count_is_scalar = True
    if isinstance(count, (list, tuple)):
        count_is_scalar = False

    # write out results sorted by taxonomic rank
    sorted_taxon = []
    for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']):
        taxa_at_rank = []
        for taxon in ranks_below_taxon:
            if taxon.startswith(rank_prefix):
                taxa_at_rank.append(taxon)
        sorted_taxon += sorted(taxa_at_rank)

    fout = open(results_table, 'w')
    fout.write('Taxon')
    for rank_prefix in Taxonomy.rank_prefixes:
        if count_is_scalar:
            fout.write('\t%s' % rank_prefix.capitalize())
        else:
            fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(),
                                             'Std: ' + rank_prefix.capitalize(),
                                             'Min: ' + rank_prefix.capitalize(),
                                             'Max: ' + rank_prefix.capitalize()))
    fout.write('\n')

    for taxon in sorted_taxon:
        fout.write(taxon)

        for rank_prefix in Taxonomy.rank_prefixes:
            count = ranks_below_taxon[taxon][rank_prefix.capitalize()]

            if count_is_scalar:
                fout.write('\t%d' % count)
            else:
                if len(count) > 0:
                    fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count),
                                                         np_std(count),
                                                         min(count),
                                                         max(count)))
                else:
                    fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0))

        fout.write('\n')

    fout.close()
def subsample_msa(self, seqs, markers):
    # type: (dict, list) -> (list, dict)
    """Sample columns from each marker in multiple sequence alignment."""

    alignment_length = len(seqs.values()[0])
    sampled_cols = []
    start = 0
    lack_sufficient_cols = 0
    lack_cols_marker_ids = []
    avg_perc_cols = []
    for marker_id, marker_name, marker_len in markers:
        end = start + marker_len

        valid_cols = self.identify_valid_columns(start, end, seqs)
        assert (len(valid_cols) <= marker_len)  # sanity check

        self.logger.info('%s: S:%d, E:%d, LEN:%d, COLS:%d, PERC:%.1f' % (
            marker_name, start, end, marker_len, len(valid_cols),
            len(valid_cols) * 100.0 / marker_len))
        avg_perc_cols.append(len(valid_cols) * 100.0 / marker_len)

        if len(valid_cols) < self.subset:
            self.logger.warning('Marker has <%d columns after filtering.' % self.subset)
            lack_sufficient_cols += 1
            lack_cols_marker_ids.append(marker_id)

        offset_valid_cols = [i + start for i in valid_cols]
        sel_cols = random.sample(offset_valid_cols, min(self.subset, len(offset_valid_cols)))
        sampled_cols.extend(sel_cols)

        start = end

    mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)]

    self.logger.info('Identified %d of %d marker genes with <%d columns for sampling:' % (
        lack_sufficient_cols, len(markers), self.subset))
    self.logger.info('%s' % ', '.join(lack_cols_marker_ids))
    self.logger.info('Marker genes had %.1f+/-%.1f%% of columns available for selection on average.' % (
        np_mean(avg_perc_cols), np_std(avg_perc_cols)))
    self.logger.info('Final MSA contains %d columns.' % len(sampled_cols))

    # trim columns
    output_seqs = {}
    for seq_id, seq in seqs.iteritems():
        masked_seq = ''.join([seq[i] for i in range(0, len(mask)) if mask[i]])
        output_seqs[seq_id] = masked_seq

    return mask, output_seqs
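The final mask/trim step is simply a 0/1 column mask applied to every aligned sequence. A toy illustration of that step only (hypothetical sequences, not the class's real inputs):

# Illustrative only: applying a 0/1 column mask to aligned sequences,
# mirroring the mask/trim step above.
seqs = {'genomeA': 'MKL-VD', 'genomeB': 'MRLA-D'}
sampled_cols = {0, 2, 5}                      # columns retained after subsampling
alignment_length = len(next(iter(seqs.values())))

mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)]
trimmed = {sid: ''.join(ch for ch, keep in zip(seq, mask) if keep)
           for sid, seq in seqs.items()}
print(trimmed)   # {'genomeA': 'MLD', 'genomeB': 'MLD'}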
def getCentroidStats(self, profile):
    """Calculate the centroids of the profile"""
    working_list = profile[self.rowIndices]

    # return the mean and stdev
    # we divide by std so we need to make sure it's never 0
    tmp_stds = np_std(working_list, axis=0)
    mean_std = np_mean(tmp_stds)
    try:
        std = np_array([x if x != 0 else mean_std for x in tmp_stds])
    except:
        std = mean_std

    return (np_median(working_list, axis=0), std)
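The zero-stdev guard above replaces any per-column standard deviation of exactly 0 with the mean of all column stdevs, so later divisions by std are safe. A small numpy-only illustration with toy data:

import numpy as np

# Toy profile: the first column is constant, so its stdev is exactly 0
# and gets replaced by the mean of the per-column stdevs.
profile = np.array([[1.0, 5.0, 2.0],
                    [1.0, 7.0, 4.0],
                    [1.0, 6.0, 3.0]])

tmp_stds = np.std(profile, axis=0)
mean_std = np.mean(tmp_stds)
std = np.array([x if x != 0 else mean_std for x in tmp_stds])
print(np.median(profile, axis=0), std)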
def generate_d3_JSON_ParallelCoords(self):
    # Generate 'History'
    n = 120  # 10 years
    n_fd = 12
    series = util_Tst.create_test_data_correlated_returns(n=n, numDims=1, includeResponse=False)
    dt = util_Tst.create_monthly_date_range(n=n)
    vals = series['data']
    json_history = [DATA_SERIES_TO_JSON(d, v) for (d, v) in zip(dt, vals)]

    # Generate Predictions
    std = np_std(transform_FOD_BackwardLooking(vals, {utl_Trns.FIRST_ORDER_DIFF_TIME: 1}))
    end_val = vals[-1, 0]

    def get_random_prediction_values(per_fd):
        numPreds = 40
        preds = []
        for i in xrange(numPreds):
            preds.append(end_val + normal() * std * sqrt(per_fd))
        return (range(numPreds), preds)

    def get_model_metadata(model_idx):
        return {
            JSON_MODEL_ID: model_idx,
            JSON_MODEL_CONFIDENCE: random(),
            JSON_MODEL_DESC: 'junkdesc ' + str(model_idx)
        }

    end_dt = dt[-1]
    prd_dt = util_Tst.create_monthly_date_range(n=n_fd + 1, startEpoch=end_dt + 10000)  # hacky, but end of next month

    models = {}
    preds = []
    for (i, dt) in enumerate(prd_dt):
        (model_idxs, pred_values) = get_random_prediction_values(i)
        models.update(dict.fromkeys(model_idxs))
        for (md, vl) in zip(model_idxs, pred_values):
            preds.append({
                JSON_MODEL_ID: md,
                JSON_DATE_KEY: dt_epoch_to_str_Y_M_D(dt),
                JSON_VALUE_KEY: vl
            })

    for md in models.keys():
        models[md] = get_model_metadata(md)

    # Save data
    dataName = 'test1'
    filePath = get_json_history_path(dataName)
    save_to_JSON(filePath, json_history)

    filePath = get_json_predictions_path(dataName)
    save_to_JSON(filePath, preds)

    filePath = get_json_model_path(dataName)
    save_to_JSON(filePath, models)

# if __name__ == '__main__':
#     generator = EMF_TestDataGenerator()
#     generator.generate_d3_JSON_ParallelCoords()
def rep_genome_stats(self, clusters, genome_files):
    """Calculate statistics relative to representative genome."""

    self.logger.info('Calculating statistics to cluster representatives:')

    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        if len(cids) == 0:
            stats[rid] = self.RepStats(min_ani=-1,
                                       mean_ani=-1,
                                       std_ani=-1,
                                       median_ani=-1)
        else:
            # calculate ANI to representative genome
            gid_pairs = []
            for cid in cids:
                gid_pairs.append((cid, rid))
                gid_pairs.append((rid, cid))

            if True:  # *** DEBUGGING
                ani_af = self.fastani.pairs(gid_pairs,
                                            genome_files,
                                            report_progress=False)
            else:
                ani_af = self.fastani.ani_cache

            # calculate statistics
            anis = [FastANI.symmetric_ani(ani_af, cid, rid)[0] for cid in cids]

            stats[rid] = self.RepStats(min_ani=min(anis),
                                       mean_ani=np_mean(anis),
                                       std_ani=np_std(anis),
                                       median_ani=np_median(anis))

        statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()
    sys.stdout.write('\n')

    return stats
def __init__(self, tree, parent, sax, cardinality, sequences):
    """
    Initialization function of the InternalNode class

    :returns: a root node
    :rtype: RootNode
    """

    """ inherits the init function of the rootnode class """
    RootNode.__init__(self, tree=tree, parent=parent, sax=sax,
                      cardinality=cardinality)

    """ transforms the list sequences from PAA """
    list_ts_paa = self.tree.isax.transform_paa(sequences)
    tmp_mean = np_mean(list_ts_paa, axis=0)
    tmp_stdev = np_std(list_ts_paa, axis=0)

    """ as it is an internal node, it necessarily has at least one downhill node, so: """
    """ we calculate the future candidate cardinalities """
    cardinality_next_tmp = np_copy(self.cardinality)
    # if max_card
    if self.tree.boolean_card_max:
        # we multiply by 2 only the cardinalities not exceeding the authorized threshold
        cardinality_next_tmp[cardinality_next_tmp <= self.tree.max_card_alphabet] *= 2
    else:
        # we multiply by 2 all the cardinalities (they are all candidates)
        cardinality_next_tmp *= 2

    # the self.split function chooses the cardinality index to multiply by 2
    position_min = self.split(cardinality_next_tmp, tmp_mean, tmp_stdev)

    """ We write the next cardinality (for its leaf nodes) """
    self.cardinality_next = np_copy(self.cardinality)
    self.cardinality_next[position_min] *= 2
    if self.tree.bigger_current_cardinality < self.cardinality_next[position_min]:
        self.tree.bigger_current_cardinality = self.cardinality_next[position_min]

    self.level = parent.level + 1
def _rep_genome_stats(self, clusters, genome_files):
    """Calculate statistics relative to representative genome."""

    self.logger.info('Calculating statistics to cluster representatives:')

    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        if len(cids) == 0:
            stats[rid] = self.RepStats(min_ani=-1,
                                       mean_ani=-1,
                                       std_ani=-1,
                                       median_ani=-1)
        else:
            # calculate ANI to representative genome
            gid_pairs = []
            for cid in cids:
                gid_pairs.append((cid, rid))

            ani_af = self.ani_cache.fastani_pairs(gid_pairs,
                                                  genome_files,
                                                  report_progress=False)

            # calculate statistics
            anis = [ani_af[cid][rid][0] for cid in cids]

            stats[rid] = self.RepStats(min_ani=min(anis),
                                       mean_ani=np_mean(anis),
                                       std_ani=np_std(anis),
                                       median_ani=np_median(anis))

        statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()
    sys.stdout.write('\n')

    return stats
def subsample_msa(self, seqs, markers, cols_per_gene, max_gaps,
                  min_identical_aa, max_identical_aa, rnd_seed):
    """Sample columns from each marker in multiple sequence alignment."""

    alignment_length = len(seqs.values()[0])
    sampled_cols = []
    start = 0
    lack_sufficient_cols = 0
    lack_cols_marker_ids = []
    avg_perc_cols = []
    count_wrong_pa = 0
    count_wrong_cons = 0
    random.seed(rnd_seed)
    for marker_id, marker_name, marker_len in markers:
        end = start + marker_len

        valid_cols, count_wrong_pa, count_wrong_cons = self.identify_valid_columns(
            marker_name, count_wrong_pa, count_wrong_cons,
            start, end, seqs, max_gaps,
            min_identical_aa, max_identical_aa)
        assert (len(valid_cols) <= marker_len)  # sanity check

        self.logger.info('%s: S:%d, E:%d, LEN:%d, COLS:%d, PERC:%.1f' % (
            marker_name, start, end, marker_len, len(valid_cols),
            len(valid_cols) * 100.0 / marker_len))
        avg_perc_cols.append(len(valid_cols) * 100.0 / marker_len)

        if len(valid_cols) < cols_per_gene:
            self.logger.warning('Marker has <%d columns after filtering.' % cols_per_gene)
            lack_sufficient_cols += 1
            lack_cols_marker_ids.append(marker_id)

        offset_valid_cols = [i + start for i in valid_cols]
        sampled_cols.extend(random.sample(offset_valid_cols,
                                          min(cols_per_gene, len(offset_valid_cols))))

        start = end

    mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)]

    self.logger.info('Identified %d of %d marker genes with <%d columns for sampling:' % (
        lack_sufficient_cols, len(markers), cols_per_gene))
    self.logger.info('%s' % ', '.join(lack_cols_marker_ids))
    self.logger.info('Marker genes had %.1f+/-%.1f%% of columns available for selection on average.' % (
        np_mean(avg_perc_cols), np_std(avg_perc_cols)))

    # trim columns
    output_seqs = {}
    for seq_id, seq in seqs.iteritems():
        masked_seq = ''.join([seq[i] for i in xrange(0, len(mask)) if mask[i]])
        output_seqs[seq_id] = masked_seq

    self.logger.info(
        'Trimmed alignment from %d to %d AA (%d by minimum taxa percent, %d by consensus, maximum of %d columns per genes).' % (
            len(seqs[seqs.keys()[0]]),
            len(output_seqs[output_seqs.keys()[0]]),
            count_wrong_pa, count_wrong_cons, cols_per_gene))
    self.logger.info('Final MSA contains %d columns.' % len(sampled_cols))

    return mask, output_seqs
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file): """Create plot showing the distribution of taxa at each taxonomic rank. Parameters ---------- rel_dists: d[rank_index][taxon] -> relative divergence Relative divergence of taxa at each rank. taxa_for_dist_inference : iterable Taxa to considered when inferring distributions. distribution_table : str Desired name of output table with distribution information. plot_file : str Desired name of output plot. """ self.fig.clear() self.fig.set_size_inches(12, 6) ax = self.fig.add_subplot(111) # create normal distributions for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference] if len(v) < 2: continue u = np_mean(v) rv = norm(loc=u, scale=np_std(v)) x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000) nd = rv.pdf(x) # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2) # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2) # create percentile and classifciation boundary lines percentiles = {} for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference] if len(v) == 0: continue p10, p50, p90 = np_percentile(v, [10, 50, 90]) ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2) ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2) ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2) for b in [-0.2, -0.1, 0.1, 0.2]: boundary = p50 + b if boundary < 1.0 and boundary > 0.0: if abs(b) == 0.1: c = (1.0, 0.65, 0.0) # orange else: c = (1.0, 0.0, 0.0) ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2) percentiles[i] = [p10, p50, p90] # create scatter plot and results table fout = open(distribution_table, 'w') fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n') x = [] y = [] c = [] labels = [] rank_labels = [] for i, rank in enumerate(sorted(rel_dists.keys())): rank_label = Taxonomy.rank_labels[rank] rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank])) mono = [] poly = [] no_inference = [] for clade_label, dist in rel_dists[rank].iteritems(): x.append(dist) y.append(i) labels.append(clade_label) if is_integer(clade_label.split('^')[-1]): # taxa with a numerical suffix after a caret indicate # polyphyletic groups when decorated with tax2tree c.append((1.0, 0.0, 0.0)) poly.append(dist) elif clade_label not in taxa_for_dist_inference: c.append((0.3, 0.3, 0.3)) no_inference.append(dist) else: c.append((0.0, 0.0, 1.0)) mono.append(dist) # report results v = [clade_label, dist] if i in percentiles: p10, p50, p90 = percentiles[i] percentile_outlier = not (dist >= p10 and dist <= p90) v += percentiles[i] + [str(percentile_outlier)] else: percentile_outlier = 'Insufficent data to calculate percentiles' v += [-1,-1,-1] + [str(percentile_outlier)] fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v)) # histogram for each rank mono = np_array(mono) no_inference = np_array(no_inference) poly = np_array(poly) binwidth = 0.025 bins = np_arange(0, 1.0 + binwidth, binwidth) w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference)) n = 0 if len(mono) > 0: mono_max_count = max(np_histogram(mono, bins=bins)[0]) mono_weights = np_ones_like(mono) * (1.0 / mono_max_count) n, b, p = ax.hist(mono, bins=bins, color=(0.0, 0.0, 1.0), alpha=0.25, weights=0.9 * w * mono_weights, bottom=i, lw=0, zorder=0) if len(no_inference) > 0: no_inference_max_count = 
max(np_histogram(no_inference, bins=bins)[0]) no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count) ax.hist(no_inference, bins=bins, color=(0.3, 0.3, 0.3), alpha=0.25, weights=0.9 * (1.0 - w) * no_inference_weights, bottom=i + n, lw=0, zorder=0) if len(poly) > 0: poly_max_count = max(np_histogram(poly, bins=bins)[0]) poly_weights = np_ones_like(poly) * (1.0 / poly_max_count) ax.hist(poly, bins=bins, color=(1.0, 0.0, 0.0), alpha=0.25, weights=0.9 * (1.0 - w) * poly_weights, bottom=i + n, lw=0, zorder=0) fout.close() # overlay scatter plot elements scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1) # set plot elements ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed') ax.set_xlabel('relative distance') ax.set_xticks(np_arange(0, 1.05, 0.1)) ax.set_xlim([-0.05, 1.05]) ax.set_ylabel('rank (no. taxa)') ax.set_yticks(xrange(0, len(rel_dists))) ax.set_ylim([-0.2, len(rel_dists) - 0.01]) ax.set_yticklabels(rank_labels) self.prettify(ax) # make plot interactive mpld3.plugins.clear(self.fig) mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels)) mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10)) mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html') self.fig.tight_layout(pad=1) self.fig.savefig(plot_file, dpi=self.dpi)
def get_value_for_data_only(self, values):
    """
    Return the standard deviation points
    """
    return np_std(values, ddof=1)
def get_value_for_data_only(self, values):
    """
    Returns the mean, standard deviation and number of values
    """
    return np_mean(values), np_std(values, ddof=1), np.size(values)
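Both variants above pass ddof=1, i.e. the Bessel-corrected sample standard deviation rather than numpy's default population estimate. A quick numpy-only comparison with made-up values:

import numpy as np

values = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
print(np.std(values))          # 2.0   population std (ddof=0, numpy default)
print(np.std(values, ddof=1))  # ~2.14 sample std, as used by get_value_for_data_only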
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]

        most_specific_rank = taxon[0:3]
        taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

        for n in node.leaf_iter():
            dist_to_node = 0
            while n != node:
                dist_to_node += n.edge_length
                n = n.parent_node

            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
        if rank != 'species' or Taxonomy().validate_species_name(taxon):
            if taxon in taxa_for_dist_inference:
                rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print ''
    print 'Rank\tTaxa\tTaxa for Inference'
    for rank, taxa in taxa_at_rank.iteritems():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print '%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))
    print ''

    # report results sorted by rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        taxa_at_rank = []
        for taxon in taxa_bl_dist:
            if taxon.startswith(rank_prefix):
                taxa_at_rank.append(taxon)

        sorted_taxon += sorted(taxa_at_rank)

    # report results for each named group
    taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
    fout = open(taxa_file, 'w')
    fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for taxon in sorted_taxon:
        dist = taxa_bl_dist[taxon]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                             str(taxon in taxa_for_dist_inference),
                                                             np_mean(dist),
                                                             np_std(dist),
                                                             p[0], p[1], p[2], p[3], p[4]))
    fout.close()

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
    fout = open(rank_file, 'w')
    fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for rank in Taxonomy.rank_labels:
        dist = rank_bl_dist[rank]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                         np_mean(dist),
                                                         np_std(dist),
                                                         p[0], p[1], p[2], p[3], p[4]))
    fout.close()
def pairwise_stats(self, clusters, genome_files):
    """Calculate statistics for all pairwise comparisons in a species cluster."""

    self.logger.info(
        f'Restricting pairwise comparisons to {self.max_genomes_for_stats:,} randomly selected genomes.')
    self.logger.info(
        'Calculating statistics for all pairwise comparisons in a species cluster:')

    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        statusStr = '-> Processing {:,} of {:,} ({:2f}%) clusters (size = {:,}).'.ljust(86).format(
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters), len(cids))
        sys.stdout.write('{}\r'.format(statusStr))
        sys.stdout.flush()

        if len(cids) == 0:
            stats[rid] = self.PairwiseStats(min_ani=-1,
                                            mean_ani=-1,
                                            std_ani=-1,
                                            median_ani=-1,
                                            ani_to_medoid=-1,
                                            mean_ani_to_medoid=-1,
                                            mean_ani_to_rep=-1,
                                            ani_below_95=-1)
        else:
            if len(cids) > self.max_genomes_for_stats:
                cids = set(random.sample(cids, self.max_genomes_for_stats))

            # calculate ANI to representative genome
            gid_pairs = []
            gids = list(cids.union([rid]))
            for gid1, gid2 in combinations(gids, 2):
                gid_pairs.append((gid1, gid2))
                gid_pairs.append((gid2, gid1))

            if True:  # ***DEBUGGING
                ani_af = self.fastani.pairs(gid_pairs,
                                            genome_files,
                                            report_progress=False)
            else:
                ani_af = self.fastani.ani_cache

            # calculate medoid point
            if len(gids) > 2:
                dist_mat = np_zeros((len(gids), len(gids)))
                for i, gid1 in enumerate(gids):
                    for j, gid2 in enumerate(gids):
                        if i < j:
                            ani, _af = FastANI.symmetric_ani(ani_af, gid1, gid2)
                            dist_mat[i, j] = 100 - ani
                            dist_mat[j, i] = 100 - ani

                medoid_idx = np_argmin(dist_mat.sum(axis=0))
                medoid_gid = gids[medoid_idx]
            else:
                # with only 2 genomes in a cluster, the representative is the
                # natural medoid at least for reporting statistics for the
                # individual species cluster
                medoid_gid = rid

            mean_ani_to_medoid = np_mean([FastANI.symmetric_ani(ani_af, gid, medoid_gid)[0]
                                          for gid in gids if gid != medoid_gid])

            mean_ani_to_rep = np_mean([FastANI.symmetric_ani(ani_af, gid, rid)[0]
                                       for gid in gids if gid != rid])

            if mean_ani_to_medoid < mean_ani_to_rep:
                self.logger.error('mean_ani_to_medoid < mean_ani_to_rep')
                sys.exit(-1)

            # calculate statistics
            anis = []
            for gid1, gid2 in combinations(gids, 2):
                ani, _af = FastANI.symmetric_ani(ani_af, gid1, gid2)
                anis.append(ani)

            stats[rid] = self.PairwiseStats(
                min_ani=min(anis),
                mean_ani=np_mean(anis),
                std_ani=np_std(anis),
                median_ani=np_median(anis),
                ani_to_medoid=FastANI.symmetric_ani(ani_af, rid, medoid_gid)[0],
                mean_ani_to_medoid=mean_ani_to_medoid,
                mean_ani_to_rep=mean_ani_to_rep,
                ani_below_95=sum([1 for ani in anis if ani < 95]))

    sys.stdout.write('\n')

    return stats
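The medoid above is the genome whose summed pairwise distance (100 - ANI) to all other genomes is smallest, found via np_argmin(dist_mat.sum(axis=0)). A toy example with a hypothetical 3-genome distance matrix:

import numpy as np

# Hypothetical symmetric distance matrix (100 - ANI) for three genomes.
gids = ['g1', 'g2', 'g3']
dist_mat = np.array([[0.0, 1.5, 2.0],
                     [1.5, 0.0, 0.5],
                     [2.0, 0.5, 0.0]])

medoid_idx = np.argmin(dist_mat.sum(axis=0))
print(gids[medoid_idx])   # 'g2' has the smallest total distance to the others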
def action_naming_priority(self, prev_genomes, cur_genomes, new_updated_sp_clusters): """Check if representative should be replace with genome with higher nomenclatural priority.""" self.logger.info( 'Identifying genomes with naming priority in GTDB species clusters.' ) out_file = os.path.join(self.output_dir, 'update_priority.tsv') fout = open(out_file, 'w') fout.write( 'NCBI species\tGTDB species\tRepresentative\tStrain IDs\tRepresentative type sources\tPriority year\tGTDB type species\tGTDB type strain\tNCBI assembly type' ) fout.write( '\tNCBI synonym\tGTDB synonym\tSynonym genome\tSynonym strain IDs\tSynonym type sources\tPriority year\tGTDB type species\tGTDB type strain\tSynonym NCBI assembly type' ) fout.write('\tANI\tAF\tPriority note\n') num_higher_priority = 0 assembly_score_change = [] anis = [] afs = [] for idx, prev_rid in enumerate(prev_genomes.sp_clusters): # get type strain genomes in GTDB species cluster, including genomes new to this release type_strain_gids = [ gid for gid in prev_genomes.sp_clusters[prev_rid] if gid in cur_genomes and cur_genomes[gid].is_effective_type_strain() ] if prev_rid in new_updated_sp_clusters: new_type_strain_gids = [ gid for gid in new_updated_sp_clusters[prev_rid] if cur_genomes[gid].is_effective_type_strain() ] type_strain_gids.extend(new_type_strain_gids) if len(type_strain_gids) == 0: continue # check if representative has already been updated updated_rid = self.get_updated_rid(prev_rid) type_strain_sp = set([ cur_genomes[gid].ncbi_taxa.species for gid in type_strain_gids ]) if len(type_strain_sp) == 1 and updated_rid in type_strain_gids: continue updated_sp = cur_genomes[updated_rid].ncbi_taxa.species highest_priority_gid = updated_rid if updated_rid not in type_strain_gids: highest_priority_gid = None if updated_sp in type_strain_sp: sp_gids = [ gid for gid in type_strain_gids if cur_genomes[gid].ncbi_taxa.species == updated_sp ] hq_gid = select_highest_quality(sp_gids, cur_genomes) highest_priority_gid = hq_gid #self.logger.warning('Representative is a non-type strain genome even though type strain genomes exist in species cluster: {}: {}, {}: {}'.format( # prev_rid, cur_genomes[prev_rid].is_effective_type_strain(), updated_rid, cur_genomes[updated_rid].is_effective_type_strain())) #self.logger.warning('Type strain genomes: {}'.format(','.join(type_strain_gids))) # find highest priority genome for sp in type_strain_sp: if sp == updated_sp: continue # get highest quality genome from species sp_gids = [ gid for gid in type_strain_gids if cur_genomes[gid].ncbi_taxa.species == sp ] hq_gid = select_highest_quality(sp_gids, cur_genomes) if highest_priority_gid is None: highest_priority_gid = hq_gid else: highest_priority_gid, note = self.sp_priority_mngr.priority( cur_genomes, highest_priority_gid, hq_gid) # check if representative should be updated if highest_priority_gid != updated_rid: num_higher_priority += 1 ani, af = self.fastani.symmetric_ani_cached( updated_rid, highest_priority_gid, cur_genomes[updated_rid].genomic_file, cur_genomes[highest_priority_gid].genomic_file) anis.append(ani) afs.append(af) d = cur_genomes[highest_priority_gid].score_assembly( ) - cur_genomes[updated_rid].score_assembly() assembly_score_change.append(d) action = 'NOMENCLATURE_PRIORITY:REPLACED' params = {} params['prev_ncbi_species'] = cur_genomes[ updated_rid].ncbi_taxa.species params['prev_year_of_priority'] = cur_genomes[ updated_rid].year_of_priority() params['new_ncbi_species'] = cur_genomes[ highest_priority_gid].ncbi_taxa.species 
params['new_year_of_priority'] = cur_genomes[ highest_priority_gid].year_of_priority() params['new_rid'] = highest_priority_gid params['ani'] = ani params['af'] = af params['priority_note'] = note self.update_rep(prev_rid, highest_priority_gid, action) self.action_log.write('{}\t{}\t{}\t{}\n'.format( prev_rid, cur_genomes[updated_rid].gtdb_taxa.species, action, params)) fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format( cur_genomes[highest_priority_gid].ncbi_taxa.species, cur_genomes[highest_priority_gid].gtdb_taxa.species, highest_priority_gid, ','.join( sorted( cur_genomes[highest_priority_gid].strain_ids())), ','.join( sorted(cur_genomes[highest_priority_gid]. gtdb_type_sources())).upper().replace( 'STRAININFO', 'StrainInfo'), cur_genomes[highest_priority_gid].year_of_priority(), cur_genomes[highest_priority_gid].is_gtdb_type_species(), cur_genomes[highest_priority_gid].is_gtdb_type_strain(), cur_genomes[highest_priority_gid].ncbi_type_material)) fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format( cur_genomes[updated_rid].ncbi_taxa.species, cur_genomes[updated_rid].gtdb_taxa.species, updated_rid, ','.join(sorted(cur_genomes[updated_rid].strain_ids())), ','.join( sorted(cur_genomes[updated_rid].gtdb_type_sources()) ).upper().replace('STRAININFO', 'StrainInfo'), cur_genomes[updated_rid].year_of_priority(), cur_genomes[updated_rid].is_gtdb_type_species(), cur_genomes[updated_rid].is_gtdb_type_strain(), cur_genomes[updated_rid].ncbi_type_material)) fout.write('\t{:.3f}\t{:.4f}\t{}\n'.format(ani, af, note)) fout.close() self.logger.info( f' ... identified {num_higher_priority:,} species with representative changed to genome with higher nomenclatural priority.' ) self.logger.info( ' ... change in assembly score for new representatives: {:.2f} +/- {:.2f}' .format(np_mean(assembly_score_change), np_std(assembly_score_change))) self.logger.info(' ... ANI: {:.2f} +/- {:.2f}'.format( np_mean(anis), np_std(anis))) self.logger.info(' ... AF: {:.2f} +/- {:.2f}'.format( np_mean(afs), np_std(afs)))
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir): """Calculate distribution of branch lengths at each taxonomic rank. Parameters ---------- input_tree : str Name of input tree. trusted_taxa_file : str File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa. min_children : int Only consider taxa with at least the specified number of children taxa when inferring distribution. taxonomy_file : str File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree). output_dir : str Desired output directory. """ tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True) input_tree_name = os.path.splitext(os.path.basename(input_tree))[0] # pull taxonomy from tree if not taxonomy_file: self.logger.info('Reading taxonomy from tree.') taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name) taxonomy = Taxonomy().read_from_tree(input_tree) Taxonomy().write(taxonomy, taxonomy_file) else: self.logger.info('Reading taxonomy from file.') taxonomy = Taxonomy().read(taxonomy_file) # read trusted taxa trusted_taxa = None if trusted_taxa_file: trusted_taxa = read_taxa_file(trusted_taxa_file) # determine taxa to be used for inferring distribution taxa_for_dist_inference = filter_taxa_for_dist_inference( tree, taxonomy, set(), min_children, -1) # determine branch lengths to leaves for named lineages rank_bl_dist = defaultdict(list) taxa_bl_dist = defaultdict(list) taxa_at_rank = defaultdict(list) for node in tree.postorder_node_iter(): if node.is_leaf() or not node.label: continue _support, taxon, _auxiliary_info = parse_label(node.label) if not taxon: continue # get most specific rank in multi-rank taxa string taxa = [t.strip() for t in taxon.split(';')] taxon = taxa[-1] most_specific_rank = taxon[0:3] taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon) for n in node.leaf_iter(): dist_to_node = self._dist_to_ancestor(n, node) for t in taxa: taxa_bl_dist[t].append(dist_to_node) rank = Taxonomy.rank_labels[ Taxonomy.rank_index[most_specific_rank]] if rank != 'species' or Taxonomy().validate_species_name(taxon): if taxon in taxa_for_dist_inference: rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon])) # report number of taxa at each rank print('') print('Rank\tTaxa\tTaxa for Inference') for rank, taxa in taxa_at_rank.items(): taxa_for_inference = [ x for x in taxa if x in taxa_for_dist_inference ] print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))) print('') # report results sorted by rank sorted_taxon = [] for rank_prefix in Taxonomy.rank_prefixes: taxa_at_rank = [] for taxon in taxa_bl_dist: if taxon.startswith(rank_prefix): taxa_at_rank.append(taxon) sorted_taxon += sorted(taxa_at_rank) # report results for each named group taxa_file = os.path.join(output_dir, '%s.taxa_bl_dist.tsv' % input_tree_name) fout = open(taxa_file, 'w') fout.write( 'Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n' ) for taxon in sorted_taxon: dist = taxa_bl_dist[taxon] p = np_percentile(dist, [5, 10, 50, 90, 95]) fout.write( '%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon, str(taxon in taxa_for_dist_inference), np_mean(dist), np_std(dist), p[0], p[1], p[2], p[3], p[4])) fout.close() # report results for each taxonomic rank rank_file = os.path.join(output_dir, '%s.rank_bl_dist.tsv' % input_tree_name) fout = open(rank_file, 'w') 
fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n') for rank in Taxonomy.rank_labels: dist = rank_bl_dist[rank] p = np_percentile(dist, [5, 10, 50, 90, 95]) fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank, np_mean(dist), np_std(dist), p[0], p[1], p[2], p[3], p[4])) fout.close() # report results for each node output_bl_file = os.path.join(output_dir, '%s.node_bl_dist.tsv' % input_tree_name) self._write_bl_dist(tree, output_bl_file)
def stats_for_qlp_well(well, compute_clusters=False, override_thresholds=None): """ Return statistics about a QLWell object read from a QLP file. The QLWell object should have a populated `peaks` attribute (reading from QLBs won't work) For parameter explanations and return values, see :func:`stats_for_qlp_well`. """ from pyqlb.nstats.peaks import cluster_1d, channel_amplitudes from pyqlb.nstats.well import accepted_peaks, above_min_amplitude_peaks, well_channel_sp_values, well_cluster_peaks from pyqlb.nstats.well import well_observed_positives_negatives, well_s2d_values, getClusters from pyqlb.nstats.well import high_flier_droplets, low_flier_droplets, singleRain_droplets, doubleRain_droplets, diagonal_scatter from numpy import mean as np_mean, std as np_std if not override_thresholds: override_thresholds = (None, None) statistics = well_statistics(well, override_thresholds=override_thresholds) accepted = len(accepted_peaks(well)) num_above_min = len(above_min_amplitude_peaks(well)) if num_above_min > 0 and accepted > 0: if well.sum_amplitude_bins: peaksets, boundaries, amps = revb_polydisperse_peaks(well, 0, threshold=override_thresholds[0]) poly_peaks = sum([len(p) for p in peaksets]) statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: peaksets, boundaries, width_gates = polydisperse_peaks(well, 0, threshold=override_thresholds[0]) poly_peaks = sum([len(p) for p in peaksets]) statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: statistics[0].revb_polydispersity_pct = 0 s, p_plus, p, p_minus = well_channel_sp_values(well, 0, override_threshold=override_thresholds[0]) statistics[0].s_value = s statistics[0].p_plus = p_plus statistics[0].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None statistics[0].p = p statistics[0].p_drops = int(p*accepted) if p is not None else None statistics[0].p_minus = p_minus statistics[0].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None if num_above_min > 0 and accepted > 0: if well.sum_amplitude_bins: peaksets, boundaries, amps = revb_polydisperse_peaks(well, 1, threshold=override_thresholds[1]) poly_peaks = sum([len(p) for p in peaksets]) statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: peaksets, boundaries, width_gates = polydisperse_peaks(well, 1, threshold=override_thresholds[1]) poly_peaks = sum([len(p) for p in peaksets]) statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: statistics[1].revb_polydispersity_pct = 0 s, p_plus, p, p_minus = well_channel_sp_values(well, 1, override_threshold=override_thresholds[1]) statistics[1].s_value = s statistics[1].p_plus = p_plus statistics[1].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None statistics[1].p = p statistics[1].p_drops = int(p*accepted) if p is not None else None statistics[1].p_minus = p_minus statistics[1].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None ## compute s2d plots s2d_vals = well_s2d_values( well, thresholds=override_thresholds) statistics[0].s2d_value = s2d_vals[0] if s2d_vals is not None else None statistics[1].s2d_value = s2d_vals[1] if s2d_vals is not None else None ## compute extra cluster metrics clusters = getClusters( well, override_thresholds ) dscatter = diagonal_scatter( clusters ) statistics.diagonal_scatter = dscatter[1] if dscatter is not None else None statistics.diagonal_scatter_pct = dscatter[2] *100 if dscatter is not None else None for channel in [0,1]: high_fliers = 
high_flier_droplets( clusters, channel ) statistics[channel].high_flier_value = high_fliers[1] if high_fliers is not None else None statistics[channel].high_flier_pct = high_fliers[2] * 100 if high_fliers is not None else None low_fliers = low_flier_droplets( clusters, channel ) statistics[channel].low_flier_value = low_fliers[1] if low_fliers is not None else None statistics[channel].low_flier_pct = low_fliers[2] * 100 if low_fliers is not None else None singleRain = singleRain_droplets( clusters, channel ) statistics[channel].single_rain_value = singleRain[1] if singleRain is not None else None statistics[channel].single_rain_pct = singleRain[2] * 100 if singleRain is not None else None doubleRain = doubleRain_droplets( clusters, channel ) statistics[channel].double_rain_value = doubleRain[1] if doubleRain is not None else None statistics[channel].double_rain_pct = doubleRain[2] * 100 if doubleRain is not None else None if compute_clusters: clusters = well_cluster_peaks(well, override_thresholds) else: clusters = {'positive_peaks': {'positive_peaks': [], 'negative_peaks': []}, 'negative_peaks': {'positive_peaks': [], 'negative_peaks': []}} # cheap hack statistics.alg_version = "%s.%s/%s.%s" % (well.statistics.peak_alg_major_version, well.statistics.peak_alg_minor_version, well.statistics.quant_alg_major_version, well.statistics.quant_alg_minor_version) statistics.ref_copy_num = well.ref_copy_num statistics[0].decision_tree = well.channels[0].decision_tree_verbose statistics[1].decision_tree = well.channels[1].decision_tree_verbose # end cheap hack # SNR for chan in (0,1): if override_thresholds[chan]: # TODO add this to pyqlb.nstats.well instead pos, neg = cluster_1d(accepted_peaks(well), chan, override_thresholds[chan]) else: pos, neg, unknown = well_observed_positives_negatives(well, chan) for attr, coll in (('positive_snr', pos),('negative_snr',neg)): if len(pos) > 0: amps = channel_amplitudes(coll, chan) amp_mean = np_mean(amps) amp_std = np_std(amps) if amp_std > 0: setattr(statistics[chan], attr, amp_mean/amp_std) else: setattr(statistics[chan], attr, 10000) else: setattr(statistics[chan], attr, 0) for channel in [0,1]: means,stds = total_events_amplitude_vals(well,channel) statistics[channel].total_events_amplitude_mean = means if means is not None else None statistics[channel].total_events_amplitude_stdev = stds if stds is not None else None return statistics, clusters
def eventstudy(self, data=None, model='m', estwin=100, gap=50, evtwins=-10, evtwine=10, minval=70, output='df'): """ Paramaters passed to the event study method. data = event data (event date & permno combinations) model = madj (market-adjusted model) m (market model) ff (fama french) ffm (fama french with momentum factor) estwin = estimation window gap = gap between estimation window and event window evtwins = days preceding event date to begin event window evtwine = days after event date to close the event window minval = minimum number of non-missing return observations (per event) to be regressed on output = output format of the event study results xls (output an excel file to output path) csv (output a csv file to output path) json (output a json file to output path) df (returns a dictionary of pandas dataframes) print (outputs results to the console - not available via qsub) """ #################################################################################### # STEP 1 - SET ESTIMATION, EVENT, AND GAP WINDOWS AND GRAB DATA FROM EVENTS FILE # #################################################################################### estwins = (estwin + gap + np_abs(evtwins)) # Estimation window start estwine = (gap + np_abs(evtwins) + 1) # Estimation window end evtwinx = ( estwins + 1 ) # evt time value (0=event date, -10=window start, 10=window end) evtwins = np_abs( evtwins ) # convert the negative to positive as we will use lag function) evtrang = (evtwins + evtwine + 1 ) # total event window days (lag + lead + the day itself) """ With the event date as a fixed point, calculate the number of days needed to pass to sql lag and lead functions to identify estimation window, gap, and event window. evtwins: event date minus number of preceding days ("event date" - "number of days before event to start [evtwins parameter]") evtwine: event date plus number of following days ("event date" + "number of days after event to end [evtwine parameter]") gap: number of days between the end of the "estimation window" and the beginning of the "event window" estwins: start date of the estimation window ("event date" - "number of days before event to start [evtwins parameter]" - "number of days in gap [gap parameter]" - "number of days in estimation window [estwin parameter]") evtrang: entire time range of the event study even from estimate start, through gap, until event window end (evtwins + evtwine + 1) """ # default the event data in case it was not passed, otherwise read what was passed evtdata = [{"edate": "05/29/2012", "permno": "10002"}] if data is not None: evtdata = json_dumps(data) # init values wrapped up to be passed to sql statement params = { 'estwins': estwins, 'estwine': estwine, 'evtwins': evtwins, 'evtwine': evtwine, 'evtwinx': evtwinx, 'evtdata': evtdata } ############################################# # STEP 2 - GET RETURNS DATA FROM POSTGRES # ############################################# # Create a database connection wconn = self.connect() ############################################################################## # Get the initial data from the database and put it in a pandas dataframe # ############################################################################## # create a pandas dataframe that will hold data df = wconn.raw_sql(""" SELECT a.*, x.*, c.date as rdate, c.ret as ret1, (f.mktrf+f.rf) as mkt, f.mktrf, f.rf, f.smb, f.hml, f.umd, (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1-(f.mktrf+f.rf) as exret, (1+c.ret)*(coalesce(d.dlret,0.00)+1)-1 as ret, case when c.date between 
a.estwin1 and a.estwin2 then 1 else 0 end as isest, case when c.date between a.evtwin1 and a.evtwin2 then 1 else 0 end as isevt, case when c.date between a.evtwin1 and a.evtwin2 then (rank() OVER (PARTITION BY x.evtid ORDER BY c.date)-%(evtwinx)s) else (rank() OVER (PARTITION BY x.evtid ORDER BY c.date)) end as evttime FROM ( SELECT date, lag(date, %(estwins)s ) over (order by date) as estwin1, lag(date, %(estwine)s ) over (order by date) as estwin2, lag(date, %(evtwins)s ) over (order by date) as evtwin1, lead(date, %(evtwine)s ) over (order by date) as evtwin2 FROM crsp_a_stock.dsi ) as a JOIN (select to_char(x.edate, 'ddMONYYYY') || trim(to_char(x.permno,'999999999')) as evtid, x.permno, x.edate from json_to_recordset('%(evtdata)s') as x(edate date, permno int) ) as x ON a.date=x.edate JOIN crsp_a_stock.dsf c ON x.permno=c.permno AND c.date BETWEEN a.estwin1 and a.evtwin2 JOIN ff_all.factors_daily f ON c.date=f.date LEFT JOIN crsp_a_stock.dsedelist d ON x.permno=d.permno AND c.date=d.dlstdt WHERE f.mktrf is not null AND c.ret is not null ORDER BY x.evtid, x.permno, a.date, c.date """ % params) # Columns coming from the database query df.columns = [ 'date', 'estwin1', 'estwin2', 'evtwin1', 'evtwin2', 'evtid', 'permno', 'edate', 'rdate', 'ret1', 'mkt', 'mktrf', 'rf', 'smb', 'hml', 'umd', 'exret', 'ret', 'isest', 'isevt', 'evttime' ] # Additional columns that will hold computed values (post-query) addcols = [ 'RMSE', 'INTERCEPT', 'var_estp', 'expret', 'abret', 'alpha', '_nobs', '_p_', '_edf_', 'rsq', 'cret', 'cexpret', 'car', 'scar', 'sar', 'pat_scale', 'bhar', 'lastevtwin', 'cret_edate', 'scar_edate', 'car_edate', 'bhar_edate', 'pat_scale_edate', 'xyz' ] # Add them to the dataframe for c in addcols: if c == 'lastevtwin': df[c] = 0 else: df[c] = np_nan ################################################################################### # STEP 3 - FOR EACH EVENT, CALCULATE ABNORMAL RETURN BASED ON CHOSEN RISK MODEL # ################################################################################### # Loop on every category for evt in data: permno = evt['permno'] xdate = evt['edate'] edate = datetime.strptime(xdate, "%m/%d/%Y").date() est_mask = (df['permno'] == permno) & (df['edate'] == edate) & ( df['isest'] == 1) evt_mask = (df['permno'] == permno) & (df['edate'] == edate) & ( df['isevt'] == 1) ####################################################### # Check to see it meets the min obs for est window # ####################################################### _nobs = df["ret"][est_mask].count() # Only carry out the analysis if the number of obsevations meets the minimum threshold if _nobs >= minval: ####################################################### # Regression based on model choices='' # ####################################################### # Market-Adjusted Model if model == 'madj': # Set y to the estimation window records y = df["exret"][est_mask] # Calculate mean and standard deviation of returns for the estimation period mean = np_mean(y) stdv = np_std(y, ddof=1) # Update the columns in the original dataframe (reusing the names from SAS code to help with continuity) df.loc[evt_mask, 'INTERCEPT'] = mean df.loc[evt_mask, 'RMSE'] = stdv df.loc[evt_mask, '_nobs'] = len(y) df.loc[evt_mask, 'var_estp'] = stdv**2 df.loc[evt_mask, 'alpha'] = mean df.loc[evt_mask, 'rsq'] = 0 df.loc[evt_mask, '_p_'] = 1 df.loc[evt_mask, '_edf_'] = (len(y) - 1) df.loc[evt_mask, 'expret'] = df.loc[evt_mask, 'mkt'] df.loc[evt_mask, 'abret'] = df.loc[evt_mask, 'exret'] df_est = df[est_mask] _nobs = 
len(df_est[df_est.ret.notnull()]) nloc = {'const': 0} def f_cret(row): tmp = ((row['ret'] * nloc['const']) + (row['ret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret, axis=1) df.loc[evt_mask, 'cret_edate'] = nloc['const'] nloc = {'const': 0} def f_cexpret(row): tmp = ((row['expret'] * nloc['const']) + (row['expret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret, axis=1) nloc = {'const': 0} def f_car(row): tmp = (row['abret'] + nloc['const']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1) df.loc[evt_mask, 'car_edate'] = nloc['const'] nloc = {'const': 0} def f_sar(row): tmp = (row['abret'] / np_sqrt(row['var_estp'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1) df.loc[evt_mask, 'sar_edate'] = nloc['const'] nloc = {'const': 0, 'evtrang': evtrang} def f_scar(row): tmp = (row['car'] / np_sqrt( (evtrang * row['var_estp']))) nloc['const'] = tmp return tmp df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar, axis=1) df.loc[evt_mask, 'scar_edate'] = nloc['const'] nloc = {'const': 0} def f_bhar(row): tmp = (row['cret'] - row['cexpret']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar, axis=1) df.loc[evt_mask, 'bhar_edate'] = nloc['const'] df.loc[evt_mask, 'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00) df.loc[evt_mask, 'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00) # Market Model elif model == 'm': # Set y to the estimation window records X = df["mktrf"][est_mask] y = df["ret"][est_mask] # Fit an OLS model with intercept on mktrf X = sm_add_constant(X) est = sm_OLS(y, X).fit() # Set the variables from the output df_est = df[(df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)] _nobs = len( df_est[df_est.ret.notnull()]) # not null observations # aggregate variables # cret_edate = np_nan # scar_edate = np_nan # car_edate = np_nan # bhar_edate = np_nan # pat_scale_edate = np_nan alpha = est.params.__getitem__('const') beta1 = est.params.__getitem__('mktrf') df.loc[evt_mask, 'INTERCEPT'] = alpha df.loc[evt_mask, 'alpha'] = alpha df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid) df.loc[evt_mask, '_nobs'] = _nobs df.loc[evt_mask, 'var_estp'] = est.mse_resid df.loc[evt_mask, 'rsq'] = est.rsquared df.loc[evt_mask, '_p_'] = 2 df.loc[evt_mask, '_edf_'] = (len(y) - 2) nloc = {'alpha': alpha, 'beta1': beta1, 'const': 0} def f_expret(row): return (nloc['alpha'] + (nloc['beta1'] * row['mktrf'])) df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret, axis=1) nloc = {'alpha': alpha, 'beta1': beta1, 'const': 0} def f_abret(row): return (row['ret'] - (nloc['alpha'] + (nloc['beta1'] * row['mktrf']))) df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret, axis=1) nloc = {'const': 0} def f_cret(row): tmp = ((row['ret'] * nloc['const']) + (row['ret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret, axis=1) df.loc[evt_mask, 'cret_edate'] = nloc['const'] nloc = {'const': 0} def f_cexpret(row): tmp = ((row['expret'] * nloc['const']) + (row['expret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret, axis=1) nloc = {'const': 0} def f_car(row): # nonlocal const tmp = (row['abret'] + nloc['const']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1) df.loc[evt_mask, 'car_edate'] = nloc['const'] nloc = {'const': 
0} def f_sar(row): tmp = (row['abret'] / np_sqrt(row['var_estp'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1) df.loc[evt_mask, 'sar_edate'] = nloc['const'] nloc = {'const': 0, 'evtrang': evtrang} def f_scar(row): tmp = (row['car'] / np_sqrt( (evtrang * row['var_estp']))) nloc['const'] = tmp return tmp df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar, axis=1) df.loc[evt_mask, 'scar_edate'] = nloc['const'] nloc = {'const': 0} def f_bhar(row): tmp = (row['cret'] - row['cexpret']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar, axis=1) df.loc[evt_mask, 'bhar_edate'] = nloc['const'] df.loc[evt_mask, 'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00) df.loc[evt_mask, 'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00) # Fama-French Three Factor Model elif model == 'ff': # Set y to the estimation window records df_est = df[(df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)] X = df_est[['smb', 'hml', 'mktrf']] y = df_est['ret'] # Fit an OLS model with intercept on mktrf, smb, hml X = sm_add_constant(X) est = sm_OLS(y, X).fit() # est = smf.ols(formula='ret ~ smb + hml + mktrf', data=df_est).fit() alpha = est.params.__getitem__('const') beta1 = est.params.__getitem__('mktrf') beta2 = est.params.__getitem__('smb') beta3 = est.params.__getitem__('hml') df.loc[evt_mask, 'INTERCEPT'] = alpha df.loc[evt_mask, 'alpha'] = alpha df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid) df.loc[evt_mask, '_nobs'] = _nobs df.loc[evt_mask, 'var_estp'] = est.mse_resid df.loc[evt_mask, 'rsq'] = est.rsquared df.loc[evt_mask, '_p_'] = 2 df.loc[evt_mask, '_edf_'] = (len(y) - 2) nloc = { 'alpha': alpha, 'beta1': beta1, 'beta2': beta2, 'beta3': beta3, 'const': 0 } def f_expret(row): return ((nloc['alpha'] + (nloc['beta1'] * row['mktrf']) + (nloc['beta2'] * row['smb']) + (nloc['beta3'] * row['hml']))) df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret, axis=1) nloc = { 'alpha': alpha, 'beta1': beta1, 'beta2': beta2, 'beta3': beta3, 'const': 0 } def f_abret(row): return (row['ret'] - ((nloc['alpha'] + (nloc['beta1'] * row['mktrf']) + (nloc['beta2'] * row['smb']) + (nloc['beta3'] * row['hml'])))) df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret, axis=1) nloc = {'const': 0} def f_cret(row): tmp = ((row['ret'] * nloc['const']) + (row['ret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret, axis=1) df.loc[evt_mask, 'cret_edate'] = nloc['const'] nloc = {'const': 0} def f_cexpret(row): tmp = ((row['expret'] * nloc['const']) + (row['expret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret, axis=1) nloc = {'const': 0} def f_car(row): tmp = (row['abret'] + nloc['const']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1) df.loc[evt_mask, 'car_edate'] = nloc['const'] nloc = {'const': 0} def f_sar(row): tmp = (row['abret'] / np_sqrt(row['var_estp'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1) df.loc[evt_mask, 'sar_edate'] = nloc['const'] nloc = {'const': 0, 'evtrang': evtrang} def f_scar(row): tmp = (row['car'] / np_sqrt( (evtrang * row['var_estp']))) nloc['const'] = tmp return tmp df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar, axis=1) df.loc[evt_mask, 'scar_edate'] = nloc['const'] nloc = {'const': 0} def f_bhar(row): tmp = (row['cret'] - row['cexpret']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'bhar'] = 
df[evt_mask].apply(f_bhar, axis=1) df.loc[evt_mask, 'bhar_edate'] = nloc['const'] df.loc[evt_mask, 'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00) df.loc[evt_mask, 'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00) # Fama-French Plus Momentum elif model == 'ffm': # Set y to the estimation window records df_est = df[(df['permno'] == permno) & (df['edate'] == edate) & (df['isest'] == 1)] X = df_est[['mktrf', 'smb', 'hml', 'umd']] # indicator variables y = df_est['ret'] # response variables # Fit an OLS (ordinary least squares) model with intercept on mktrf, smb, hml, and umd X = sm_add_constant(X) est = sm_OLS(y, X).fit() alpha = est.params.__getitem__('const') beta1 = est.params.__getitem__('mktrf') beta2 = est.params.__getitem__('smb') beta3 = est.params.__getitem__('hml') beta4 = est.params.__getitem__('umd') df.loc[evt_mask, 'INTERCEPT'] = alpha df.loc[evt_mask, 'alpha'] = alpha df.loc[evt_mask, 'RMSE'] = np_sqrt(est.mse_resid) df.loc[evt_mask, '_nobs'] = _nobs df.loc[evt_mask, 'var_estp'] = est.mse_resid df.loc[evt_mask, 'rsq'] = est.rsquared df.loc[evt_mask, '_p_'] = 2 df.loc[evt_mask, '_edf_'] = (len(y) - 2) nloc = { 'alpha': alpha, 'beta1': beta1, 'beta2': beta2, 'beta3': beta3, 'beta4': beta4, 'const': 0 } def f_expret(row): return ((nloc['alpha'] + (nloc['beta1'] * row['mktrf']) + (nloc['beta2'] * row['smb']) + (nloc['beta3'] * row['hml']) + (nloc['beta4'] * row['umd']))) df.loc[evt_mask, 'expret'] = df[evt_mask].apply(f_expret, axis=1) nloc = { 'alpha': alpha, 'beta1': beta1, 'beta2': beta2, 'beta3': beta3, 'beta4': beta4, 'const': 0 } def f_abret(row): return (row['ret'] - ((nloc['alpha'] + (nloc['beta1'] * row['mktrf']) + (nloc['beta2'] * row['smb']) + (nloc['beta3'] * row['hml']) + (nloc['beta4'] * row['umd'])))) df.loc[evt_mask, 'abret'] = df[evt_mask].apply(f_abret, axis=1) nloc = {'const': 0} def f_cret(row): tmp = ((row['ret'] * nloc['const']) + (row['ret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cret'] = df[evt_mask].apply(f_cret, axis=1) df.loc[evt_mask, 'cret_edate'] = nloc['const'] nloc = {'const': 0} def f_cexpret(row): tmp = ((row['expret'] * nloc['const']) + (row['expret'] + nloc['const'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'cexpret'] = df[evt_mask].apply(f_cexpret, axis=1) nloc = {'const': 0} def f_car(row): tmp = (row['abret'] + nloc['const']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'car'] = df[evt_mask].apply(f_car, axis=1) df.loc[evt_mask, 'car_edate'] = nloc['const'] nloc = {'const': 0} def f_sar(row): tmp = (row['abret'] / np_sqrt(row['var_estp'])) nloc['const'] = tmp return tmp df.loc[evt_mask, 'sar'] = df[evt_mask].apply(f_sar, axis=1) df.loc[evt_mask, 'sar_edate'] = nloc['const'] nloc = {'const': 0, 'evtrang': evtrang} def f_scar(row): tmp = (row['car'] / np_sqrt( (evtrang * row['var_estp']))) nloc['const'] = tmp return tmp df.loc[evt_mask, 'scar'] = df[evt_mask].apply(f_scar, axis=1) df.loc[evt_mask, 'scar_edate'] = nloc['const'] nloc = {'const': 0} def f_bhar(row): tmp = (row['cret'] - row['cexpret']) nloc['const'] = tmp return tmp df.loc[evt_mask, 'bhar'] = df[evt_mask].apply(f_bhar, axis=1) df.loc[evt_mask, 'bhar_edate'] = nloc['const'] df.loc[evt_mask, 'pat_scale'] = (_nobs - 2.00) / (_nobs - 4.00) df.loc[evt_mask, 'pat_scale_edate'] = (_nobs - 2.00) / (_nobs - 4.00) # Something erroneous was passed else: df['isest'][evt_mask] = -2 ################################# # STEP 4 - OUTPUT THE RESULTS # ################################# df_sta = df[df['isevt'] == 1] levt = df_sta['evttime'].unique() columns = [ 
'evttime', 'car_m', 'ret_m', 'abret_m', 'abret_t', 'sar_t', 'pat_ar', 'cret_edate_m', 'car_edate_m', 'pat_car_edate_m', 'car_edate_t', 'scar_edate_t', 'bhar_edate_m' ] idxlist = list(levt) df_stats = pd_DataFrame(index=idxlist, columns=columns) df_stats = df_stats.fillna(0.00000000) # with 0s rather than NaNs # Event df_stats['evttime'] = df_sta.groupby(['evttime'])['evttime'].unique() # Means df_stats['abret_m'] = df_sta.groupby(['evttime'])['abret'].mean() df_stats['bhar_edate_m'] = df_sta.groupby(['evttime' ])['bhar_edate'].mean() df_stats['car_edate_m'] = df_sta.groupby(['evttime' ])['car_edate'].mean() df_stats['car_m'] = df_sta.groupby(['evttime'])['car'].mean() df_stats['cret_edate_m'] = df_sta.groupby(['evttime' ])['cret_edate'].mean() df_stats['pat_scale_m'] = df_sta.groupby(['evttime' ])['pat_scale'].mean() df_stats['pat_car_edate_mean'] = 0 df_stats['ret_m'] = df_sta.groupby(['evttime'])['ret'].mean() df_stats['sar_m'] = df_sta.groupby(['evttime'])['sar'].mean() df_stats['scar_edate_m'] = df_sta.groupby(['evttime' ])['scar_edate'].mean() df_stats['scar_m'] = df_sta.groupby(['evttime'])['scar'].mean() # Standard deviations df_stats['car_v'] = df_sta.groupby(['evttime'])['car'].std() df_stats['abret_v'] = df_sta.groupby(['evttime'])['abret'].std() df_stats['sar_v'] = df_sta.groupby(['evttime'])['sar'].std() df_stats['pat_scale_v'] = df_sta.groupby(['evttime' ])['pat_scale'].std() df_stats['car_edate_v'] = df_sta.groupby(['evttime' ])['car_edate'].std() df_stats['scar_edate_v'] = df_sta.groupby(['evttime' ])['scar_edate'].std() df_stats['scar_v'] = df_sta.groupby(['evttime'])['scar'].std() # Counts df_stats['scar_n'] = df_sta.groupby(['evttime'])['scar'].count() df_stats['scar_edate_n'] = df_sta.groupby(['evttime' ])['scar_edate'].count() df_stats['sar_n'] = df_sta.groupby(['evttime'])['sar'].count() df_stats['car_n'] = df_sta.groupby(['evttime'])['car'].count() df_stats['n'] = df_sta.groupby(['evttime'])['evttime'].count() # Sums df_stats['pat_scale_edate_s'] = df_sta.groupby( ['evttime'])['pat_scale_edate'].sum() df_stats['pat_scale_s'] = df_sta.groupby(['evttime' ])['pat_scale'].sum() # T statistics 1 def tstat(row, m, v, n): return row[m] / (row[v] / np_sqrt(row[n])) df_stats['abret_t'] = df_stats.apply(tstat, axis=1, args=('abret_m', 'abret_v', 'n')) df_stats['sar_t'] = df_stats.apply(tstat, axis=1, args=('sar_m', 'sar_v', 'n')) df_stats['car_edate_t'] = df_stats.apply(tstat, axis=1, args=('car_edate_m', 'car_edate_v', 'n')) df_stats['scar_edate_t'] = df_stats.apply(tstat, axis=1, args=('scar_edate_m', 'scar_edate_v', 'scar_edate_n')) # T statistics 2 def tstat2(row, m, s, n): return row[m] / (np_sqrt(row[s]) / row[n]) df_stats['pat_car'] = df_stats.apply(tstat2, axis=1, args=('scar_m', 'pat_scale_s', 'scar_n')) df_stats['pat_car_edate_m'] = df_stats.apply(tstat2, axis=1, args=('scar_edate_m', 'pat_scale_edate_s', 'scar_edate_n')) df_stats['pat_ar'] = df_stats.apply(tstat2, axis=1, args=('sar_m', 'pat_scale_s', 'sar_n')) # FILE 2 # EVENT WINDOW df_evtw = df.loc[(df['isevt'] == 1), ['permno', 'edate', 'rdate', 'evttime', 'ret', 'abret']] df_evtw = df_evtw.sort_values(['permno', 'evttime'], ascending=[True, True]) # FILE 1 # EVENT DATE maxv = max(levt) df_evtd = df.loc[(df['isevt'] == 1) & (df['evttime'] == maxv), ['permno', 'edate', 'cret', 'car', 'bhar']] df_evtd = df_evtd.sort_values(['permno', 'edate'], ascending=[True, True]) if output == 'df': retval = {} retval['event_stats'] = df_stats retval['event_window'] = df_evtw retval['event_date'] = df_evtd return retval elif output == 'print':
retval = {} print( tabulate(df_evtd.sort_values(['permno', 'edate'], ascending=[True, True]), headers='keys', tablefmt='psql')) print(tabulate(df_evtw, headers='keys', tablefmt='psql')) print(tabulate(df_stats, headers='keys', tablefmt='psql')) return retval elif output == 'json': retval = {} retval['event_stats'] = df_stats.to_dict(orient='split') retval['event_window'] = df_evtw.to_dict(orient='split') retval['event_date'] = df_evtd.to_dict(orient='split') # Write this to a file with open(os.path.join(self.output_path, 'EventStudy.json'), 'w') as outfile: json_dump(retval, outfile, cls=EncoderJson) # Return the output in case they are doing something programmatically return json_dumps(retval, cls=EncoderJson) elif output == 'csv': retval = '' es = StringIO_StringIO() df_stats.to_csv(es) retval += es.getvalue() ew = StringIO_StringIO() df_evtw.to_csv(ew) retval += "\r" retval += ew.getvalue() ed = StringIO_StringIO() df_evtd.to_csv(ed) retval += ed.getvalue() # write this to a file with open(os.path.join(self.output_path, 'EventStudy.csv'), 'w') as outfile: outfile.write(retval) # return the output in case they are doing something programmatically return retval elif output == 'xls': retval = {} xlswriter = pd_ExcelWriter( os.path.join(self.output_path, 'EventStudy.xls')) df_stats.to_excel(xlswriter, 'Stats') df_evtw.to_excel(xlswriter, 'Event Window') df_evtd.to_excel(xlswriter, 'Event Date') xlswriter.save() return retval else: pass
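# Illustrative sketch (not part of the routine above): the running 'nloc' closures
# accumulate standard event-study quantities row by row. A minimal, pandas-free
# version of the same arithmetic, using hypothetical names, is:
import math

def event_window_stats(rets, exprets, var_estp, evtrang):
    """Compute compounded return, CAR, BHAR, SAR and SCAR for one event window.

    rets/exprets : realised and expected returns over the event window
    var_estp     : residual variance from the estimation-window regression
    evtrang      : number of days in the event window
    """
    cret = 0.0     # compounded realised return, prod(1 + r) - 1
    cexpret = 0.0  # compounded expected return
    car = 0.0      # cumulative abnormal return, sum(r - E[r])
    for r, er in zip(rets, exprets):
        cret = cret * r + cret + r            # same update as f_cret: (1 + cret) * (1 + r) - 1
        cexpret = cexpret * er + cexpret + er
        car += r - er
    bhar = cret - cexpret                     # buy-and-hold abnormal return
    sars = [(r - er) / math.sqrt(var_estp) for r, er in zip(rets, exprets)]   # standardised abnormal returns
    scar = car / math.sqrt(evtrang * var_estp)                                # standardised CAR
    return cret, cexpret, car, bhar, sars, scar

# e.g. event_window_stats([0.01, -0.02, 0.03], [0.004, 0.005, 0.004], 0.0004, 3)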
def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file): """Create plot showing the distribution of taxa at each taxonomic rank. Parameters ---------- rel_dists: d[rank_index][taxon] -> relative divergence Relative divergence of taxa at each rank. rel_dist_thresholds: list Relative distances cutoffs for defining ranks. taxa_for_dist_inference : iterable Taxa to considered when inferring distributions. distribution_table : str Desired name of output table with distribution information. plot_file : str Desired name of output plot. """ self.fig.clear() self.fig.set_size_inches(12, 6) ax = self.fig.add_subplot(111) # create normal distributions for i, rank in enumerate(sorted(rel_dists.keys())): v = [ dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference ] u = np_mean(v) rv = norm(loc=u, scale=np_std(v)) x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000) nd = rv.pdf(x) ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2) ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2) # create percentile lines percentiles = {} for i, rank in enumerate(sorted(rel_dists.keys())): v = [ dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference ] p10, p50, p90 = np_percentile(v, [10, 50, 90]) ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2) ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2) ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2) percentiles[i] = [p10, p50, p90] # create scatter plot and results table fout = open(distribution_table, 'w') fout.write( 'Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n' ) x = [] y = [] c = [] labels = [] rank_labels = [] rel_dist_thresholds += [1.0] # append boundry for species for i, rank in enumerate(sorted(rel_dists.keys())): rank_label = Taxonomy.rank_labels[rank] rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank])) for clade_label, dist in rel_dists[rank].iteritems(): x.append(dist) y.append(i) labels.append(clade_label) if clade_label in taxa_for_dist_inference: c.append((0.0, 0.0, 0.5)) else: c.append((0.5, 0.5, 0.5)) p10, p50, p90 = percentiles[i] percentile_outlier = not (dist >= p10 and dist <= p90) if i == 0: rank_cutoff = rel_dist_thresholds[i] rank_outlier = dist > rank_cutoff else: rank_cutoff = rel_dist_thresholds[i] upper_rank_cutoff = rel_dist_thresholds[i - 1] rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff) v = [clade_label, dist, rank_cutoff, str(rank_outlier)] v += percentiles[i] + [str(percentile_outlier)] fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v)) fout.close() scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1) # set plot elements ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed') ax.set_xlabel('relative distance') ax.set_xticks(np_arange(0, 1.05, 0.1)) ax.set_xlim([-0.05, 1.05]) ax.set_ylabel('rank (no. 
taxa)') ax.set_yticks(xrange(0, len(rel_dists))) ax.set_ylim([-0.2, len(rel_dists) - 0.01]) ax.set_yticklabels(rank_labels) self.prettify(ax) # plot relative divergence threshold lines y_min, y_max = ax.get_ylim() for threshold in rel_dist_thresholds[ 0:-1]: # don't draw species boundary ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--') ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center') # make plot interactive mpld3.plugins.connect( self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels)) mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10)) mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html') self.fig.tight_layout(pad=1) self.fig.savefig(plot_file, dpi=96)
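# Minimal sketch of the per-rank overlay computed above: fit a normal distribution to
# the relative divergences of the taxa used for inference and locate the 10th/50th/90th
# percentiles. Function name and inputs are illustrative, not part of the class.
import numpy as np
from scipy.stats import norm

def rank_overlay(rel_divergences):
    v = np.asarray(rel_divergences, dtype=float)
    rv = norm(loc=v.mean(), scale=v.std())        # assumes at least two values per rank
    x = np.linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
    density = rv.pdf(x)                           # curve drawn at 0.75 * height within each rank's row
    p10, p50, p90 = np.percentile(v, [10, 50, 90])
    return x, density / density.max(), (p10, p50, p90)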
def run(self, cur_gtdb_metadata_file, cur_genomic_path_file, qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file, gtdb_type_strains_ledger, untrustworthy_type_ledger): """Resolve cases where a species has multiple genomes assembled from the type strain.""" # get species in LTP reference database self.logger.info('Determining species defined in LTP reference database.') ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file) self.logger.info(f' ... identified {len(ltp_defined_species):,} species.') # create current GTDB genome sets self.logger.info('Creating current GTDB genome set.') cur_genomes = Genomes() cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file, gtdb_type_strains_ledger=gtdb_type_strains_ledger, create_sp_clusters=False, uba_genome_file=None, qc_passed_file=qc_passed_file, ncbi_genbank_assembly_file=ncbi_genbank_assembly_file, untrustworthy_type_ledger=untrustworthy_type_ledger) cur_genomes.load_genomic_file_paths(cur_genomic_path_file) self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.') # update current genomes with GTDB-Tk classifications self.logger.info('Updating current genomes with GTDB-Tk classifications.') num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes) self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.') # parsing genomes manually established to be untrustworthy as type self.logger.info('Determining genomes manually annotated as untrustworthy as type.') manual_untrustworthy_types = {} with open(untrustworthy_type_ledger) as f: header = f.readline().strip().split('\t') ncbi_sp_index = header.index('NCBI species') reason_index = header.index('Reason for declaring untrustworthy') for line in f: tokens = line.strip().split('\t') gid = canonical_gid(tokens[0]) manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index]) self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.') # identify NCBI species with multiple genomes assembled from type strain of species self.logger.info('Determining number of type strain genomes in each NCBI species.') sp_type_strain_genomes = defaultdict(set) for gid in cur_genomes: if cur_genomes[gid].is_effective_type_strain(): ncbi_sp = cur_genomes[gid].ncbi_taxa.species if ncbi_sp != 's__': # yes, NCBI has genomes marked as assembled from type material # that do not actually have a binomial species name sp_type_strain_genomes[ncbi_sp].add(gid) multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1] self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.') # sort by number of genome assemblies self.logger.info('Calculating ANI between type strain genomes in each species.') fout = open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w') fout.write('NCBI species\tNo. 
type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n') fout_genomes = open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w') fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment') fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n') fout_unresolved = open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w') fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species') fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n') fout_high_divergence = open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w') fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n') fout_untrustworthy = open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w') fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n') for gid in manual_untrustworthy_types: ncbi_sp, reason = manual_untrustworthy_types[gid] fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species, '<not tested>', 'n/a', 'Manual curation: ' + reason)) processed = 0 num_divergent = 0 unresolved_sp_count = 0 ncbi_ltp_resolved = 0 intra_ani_resolved = 0 ncbi_type_resolved = 0 gtdb_family_resolved = 0 gtdb_genus_resolved = 0 gtdb_sp_resolved = 0 ltp_resolved = 0 use_pickled_results = False #*** if use_pickled_results: self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir)) prev_gtdb_sp_conflicts = 0 for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])): if len(type_gids) == 1: continue status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format( ncbi_sp, len(type_gids), processed+1, len(multi_type_strains_sp), (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128) sys.stdout.write('{}\r'.format(status_str)) sys.stdout.flush() processed += 1 # calculate ANI between type strain genomes ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_') if not use_pickled_results: #*** ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files) pickle.dump(ani_af, open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'wb')) else: ani_af = pickle.load(open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'rb')) anis = [] afs = [] gid_anis = defaultdict(lambda: {}) gid_afs = defaultdict(lambda: {}) all_similar = True for gid1, gid2 in combinations(type_gids, 2): ani, af = symmetric_ani(ani_af, gid1, gid2) if ani < 99 or af < 0.65: all_similar = False anis.append(ani) afs.append(af) gid_anis[gid1][gid2] = ani gid_anis[gid2][gid1] = ani gid_afs[gid1][gid2] = af gid_afs[gid2][gid1] = af note = 'All type strain genomes have ANI >99% and AF >65%.' 
unresolved_species = False # read LTP metadata for genomes ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes) untrustworthy_gids = {} gtdb_resolved_sp_conflict = False if not all_similar: # need to establish which genomes are untrustworthy as type num_divergent += 1 unresolved_species = True # write out highly divergent cases for manual inspection; # these should be compared to the automated selection if np_mean(anis) < 95: for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP # assignment also suggest the asserted type material is incorrect resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis, ncbi_sp, type_gids, ltp_metadata, ltp_defined_species, cur_genomes) if resolved: note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy" ncbi_ltp_resolved += 1 # try to resolve by LTP 16S BLAST results if not resolved: resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0) if resolved: note = 'Species resolved by identifying conflicting or lack of LTP BLAST results' ltp_resolved += 1 # try to resolve species using intra-specific ANI test if not resolved: resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis) if resolved: note = 'Species resolved by intra-specific ANI test' intra_ani_resolved += 1 # try to resolve by GTDB family assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB family classifications' gtdb_family_resolved += 1 # try to resolve by GTDB genus assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB genus classifications' gtdb_genus_resolved += 1 # try to resolve by GTDB species assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB species classifications' gtdb_sp_resolved += 1 # try to resolve by considering genomes annotated as type material at NCBI, # which includes considering if genomes are marked as untrustworthy as type if not resolved: resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting NCBI assembled from type metadata' ncbi_type_resolved += 1 if resolved: unresolved_species = False # check if type strain genomes marked as trusted or untrusted conflict # with current GTDB species assignment untrustworthy_gtdb_sp_match = False trusted_gtdb_sp_match = False for gid in type_gids: gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species)) if gtdb_canonical_epithet == specific_epithet(ncbi_sp): if gid in 
untrustworthy_gids: untrustworthy_gtdb_sp_match = True else: trusted_gtdb_sp_match = True if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match: prev_gtdb_sp_conflicts += 1 gtdb_resolved_sp_conflict = True # write results to file for gid, reason in untrustworthy_gids.items(): ltp_species = self.ltp_species(gid, ltp_metadata) if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note: reason += "; considered `untrustworthy as type` at NCBI" fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), reason)) # Sanity check that if the untrustworthy genome has an LTP to only the # expected species, that all other genomes also have a hit to the # expected species (or potentially no hit). Otherwise, more consideration # should be given to the genome with the conflicting LTP hit. if len(ltp_species) == 1 and ncbi_sp in ltp_species: other_sp = set() for test_gid in type_gids: ltp_species = self.ltp_species(test_gid, ltp_metadata) if ltp_species and ncbi_sp not in ltp_species: other_sp.update(ltp_species) if other_sp: self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.') num_ncbi_untrustworthy = sum([1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note]) if num_ncbi_untrustworthy != len(type_gids): for gid in type_gids: if (gid not in untrustworthy_gids and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note): self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format( gid, ncbi_sp, num_ncbi_untrustworthy, len(type_gids))) else: note = 'Species is unresolved; manual curation is required!' unresolved_sp_count += 1 if unresolved_species: for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format( gid, gid in untrustworthy_gids, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), gtdb_resolved_sp_conflict, np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format( ncbi_sp, len(type_gids), all_similar, np_mean(anis), np_std(anis), np_mean(afs), np_std(afs), note, ', '.join(type_gids))) sys.stdout.write('\n') fout.close() fout_unresolved.close() fout_high_divergence.close() fout_genomes.close() fout_untrustworthy.close() self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.') self.logger.info(f' ... 
resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with conflicting LTP 16S rRNA classifications.') self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.') self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.') self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.') self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.') self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.') self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.') if unresolved_sp_count > 0: self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.') self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.') self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.") self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where the resolved type strain conflicts with the prior GTDB assignment.')
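# Sketch of the "all type strain genomes are effectively identical" test the loop above
# applies before trying any resolution heuristics: every genome pair must have ANI >= 99%
# and AF >= 0.65. This stand-alone version assumes a flat {(gid1, gid2): (ani, af)} mapping
# and takes the larger of the two directions, which only approximates the project's
# symmetric_ani helper.
from itertools import combinations

def all_pairs_similar(gids, ani_af, min_ani=99.0, min_af=0.65):
    for g1, g2 in combinations(gids, 2):
        ani12, af12 = ani_af.get((g1, g2), (0.0, 0.0))
        ani21, af21 = ani_af.get((g2, g1), (0.0, 0.0))
        if max(ani12, ani21) < min_ani or max(af12, af21) < min_af:
            return False
    return True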
def _pairwise_stats(self, clusters, genome_files): """Calculate statistics for all pairwise comparisons in a species cluster.""" self.logger.info('Calculating statistics for all pairwise comparisons in a species cluster:') stats = {} for idx, (rid, cids) in enumerate(clusters.items()): statusStr = '-> Processing %d of %d (%.2f%%) clusters (size = %d).'.ljust(86) % ( idx+1, len(clusters), float((idx+1)*100)/len(clusters), len(cids)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() if len(cids) == 0: stats[rid] = self.PairwiseStats(min_ani = -1, mean_ani = -1, std_ani = -1, median_ani = -1, ani_to_medoid = -1, mean_ani_to_medoid = -1, ani_below_95 = -1) else: if len(cids) > self.max_genomes_for_stats: cids = set(random.sample(cids, self.max_genomes_for_stats)) # calculate ANI to representative genome gid_pairs = [] gids = list(cids.union([rid])) for gid1, gid2 in combinations(gids, 2): gid_pairs.append((gid1, gid2)) gid_pairs.append((gid2, gid1)) ani_af = self.ani_cache.fastani_pairs(gid_pairs, genome_files, report_progress=False) # calculate medoid point if len(gids) > 2: dist_mat = np_zeros((len(gids), len(gids))) for i, gid1 in enumerate(gids): for j, gid2 in enumerate(gids): if i < j: ani, af = symmetric_ani(ani_af, gid1, gid2) dist_mat[i, j] = ani dist_mat[j, i] = ani medoid_idx = np_argmin(dist_mat.sum(axis=0)) medoid_gid = gids[medoid_idx] else: # with only 2 genomes in a cluster, the representative is the # natural medoid at least for reporting statistics for the # individual species cluster medoid_gid = rid mean_ani_to_medoid = np_mean([symmetric_ani(ani_af, gid, medoid_gid)[0] for gid in gids if gid != medoid_gid]) # calculate statistics anis = [] for gid1, gid2 in combinations(gids, 2): ani, af = symmetric_ani(ani_af, gid1, gid2) anis.append(ani) stats[rid] = self.PairwiseStats(min_ani = min(anis), mean_ani = np_mean(anis), std_ani = np_std(anis), median_ani = np_median(anis), ani_to_medoid = symmetric_ani(ani_af, rid, medoid_gid)[0], mean_ani_to_medoid = mean_ani_to_medoid, ani_below_95 = sum([1 for ani in anis if ani < 95])) sys.stdout.write('\n') return stats
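# Illustrative medoid selection over pairwise ANI values (hypothetical names). Here ANI,
# a similarity, is first converted to a distance (100 - ANI) and the medoid is the genome
# with the smallest total distance to all others; the method above sums the raw ANI
# matrix directly, so check which convention is intended before reusing this sketch.
import numpy as np
from itertools import combinations

def medoid_from_ani(gids, pairwise_ani):
    """gids: list of genome IDs; pairwise_ani: {frozenset({g1, g2}): ANI (0-100)}."""
    dist = np.zeros((len(gids), len(gids)))
    for (i, g1), (j, g2) in combinations(enumerate(gids), 2):
        d = 100.0 - pairwise_ani[frozenset((g1, g2))]
        dist[i, j] = dist[j, i] = d
    return gids[int(np.argmin(dist.sum(axis=0)))]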
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file): """Create plot showing the distribution of taxa at each taxonomic rank. Parameters ---------- rel_dists: d[rank_index][taxon] -> relative divergence Relative divergence of taxa at each rank. taxa_for_dist_inference : iterable Taxa to considered when inferring distributions. distribution_table : str Desired name of output table with distribution information. plot_file : str Desired name of output plot. """ self.fig.clear() self.fig.set_size_inches(12, 6) ax = self.fig.add_subplot(111) # create normal distributions for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference] if len(v) < 2: continue u = np_mean(v) rv = norm(loc=u, scale=np_std(v)) x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000) nd = rv.pdf(x) # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2) # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2) # create percentile and classifciation boundary lines percentiles = {} for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference] if len(v) == 0: continue p10, p50, p90 = np_percentile(v, [10, 50, 90]) ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2) ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2) ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2) for b in [-0.2, -0.1, 0.1, 0.2]: boundary = p50 + b if boundary < 1.0 and boundary > 0.0: if abs(b) == 0.1: c = (1.0, 0.65, 0.0) # orange else: c = (1.0, 0.0, 0.0) ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2) percentiles[i] = [p10, p50, p90] # create scatter plot and results table fout = open(distribution_table, 'w') fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n') x = [] y = [] c = [] labels = [] rank_labels = [] for i, rank in enumerate(sorted(rel_dists.keys())): rank_label = Taxonomy.rank_labels[rank] rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank])) mono = [] poly = [] no_inference = [] for clade_label, dist in rel_dists[rank].items(): x.append(dist) y.append(i) labels.append(clade_label) if is_integer(clade_label.split('^')[-1]): # taxa with a numerical suffix after a caret indicate # polyphyletic groups when decorated with tax2tree c.append((1.0, 0.0, 0.0)) poly.append(dist) elif clade_label not in taxa_for_dist_inference: c.append((0.3, 0.3, 0.3)) no_inference.append(dist) else: c.append((0.0, 0.0, 1.0)) mono.append(dist) # report results v = [clade_label, dist] if i in percentiles: p10, p50, p90 = percentiles[i] percentile_outlier = not (dist >= p10 and dist <= p90) v += percentiles[i] + [str(percentile_outlier)] else: percentile_outlier = 'Insufficent data to calculate percentiles' v += [-1,-1,-1] + [str(percentile_outlier)] fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v)) # histogram for each rank mono = np_array(mono) no_inference = np_array(no_inference) poly = np_array(poly) binwidth = 0.025 bins = np_arange(0, 1.0 + binwidth, binwidth) d = len(mono) + len(poly) + len(no_inference) if d == 0: break w = float(len(mono)) / d n = 0 if len(mono) > 0: mono_max_count = max(np_histogram(mono, bins=bins)[0]) mono_weights = np_ones_like(mono) * (1.0 / mono_max_count) n, b, p = ax.hist(mono, bins=bins, color=(0.0, 0.0, 1.0), alpha=0.25, weights=0.9 * w * mono_weights, bottom=i, lw=0, zorder=0) if len(no_inference) > 0: no_inference_max_count 
= max(np_histogram(no_inference, bins=bins)[0]) no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count) ax.hist(no_inference, bins=bins, color=(0.3, 0.3, 0.3), alpha=0.25, weights=0.9 * (1.0 - w) * no_inference_weights, bottom=i + n, lw=0, zorder=0) if len(poly) > 0: poly_max_count = max(np_histogram(poly, bins=bins)[0]) poly_weights = np_ones_like(poly) * (1.0 / poly_max_count) ax.hist(poly, bins=bins, color=(1.0, 0.0, 0.0), alpha=0.25, weights=0.9 * (1.0 - w) * poly_weights, bottom=i + n, lw=0, zorder=0) fout.close() # overlay scatter plot elements scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1) # set plot elements ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed') ax.set_xlabel('relative distance') ax.set_xticks(np_arange(0, 1.05, 0.1)) ax.set_xlim([-0.05, 1.05]) ax.set_ylabel('rank (no. taxa)') ax.set_yticks(range(0, len(rel_dists))) ax.set_ylim([-0.2, len(rel_dists) - 0.01]) ax.set_yticklabels(rank_labels) self.prettify(ax) # make plot interactive mpld3.plugins.clear(self.fig) mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels)) mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10)) mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html') self.fig.tight_layout(pad=1) self.fig.savefig(plot_file, dpi=self.dpi)
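# Sketch of the classification-boundary lines drawn per rank above: orange lines at the
# rank median +/- 0.1 and red lines at +/- 0.2, kept inside (0, 1). The RGB tuples match
# the values used in the plot; the helper itself is illustrative only.
def boundary_lines(p50):
    lines = []
    for offset in (-0.2, -0.1, 0.1, 0.2):
        boundary = p50 + offset
        if 0.0 < boundary < 1.0:
            colour = (1.0, 0.65, 0.0) if abs(offset) == 0.1 else (1.0, 0.0, 0.0)
            lines.append((boundary, colour))
    return lines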
def action_genomic_update(self, rep_change_summary_file, prev_genomes, cur_genomes, new_updated_sp_clusters): """Handle representatives with updated genomes.""" # get genomes with specific changes self.logger.info( 'Identifying representatives with updated genomic files.') genomic_update_gids = self.rep_change_gids(rep_change_summary_file, 'GENOMIC_CHANGE', 'UPDATED') self.logger.info( f' ... identified {len(genomic_update_gids):,} genomes.') # calculate ANI between previous and current genomes assembly_score_change = [] for prev_rid, prev_gtdb_sp in genomic_update_gids.items(): # check that genome hasn't been lost which should # be handled differently assert prev_rid in cur_genomes ani, af = self.fastani.symmetric_ani_cached( f'{prev_rid}-P', f'{prev_rid}-C', prev_genomes[prev_rid].genomic_file, cur_genomes[prev_rid].genomic_file) params = {} params['ani'] = ani params['af'] = af params['prev_ncbi_accession'] = prev_genomes[prev_rid].ncbi_accn params['cur_ncbi_accession'] = cur_genomes[prev_rid].ncbi_accn assert prev_genomes[prev_rid].ncbi_accn != cur_genomes[ prev_rid].ncbi_accn if ani >= self.genomic_update_ani and af >= self.genomic_update_af: params['prev_assembly_quality'] = prev_genomes[ prev_rid].score_assembly() params['new_assembly_quality'] = cur_genomes[ prev_rid].score_assembly() action = 'GENOMIC_CHANGE:UPDATED:MINOR_CHANGE' d = cur_genomes[prev_rid].score_assembly( ) - prev_genomes[prev_rid].score_assembly() assembly_score_change.append(d) else: sp_cids = self.genomes_in_current_sp_cluster( prev_rid, prev_genomes, new_updated_sp_clusters, cur_genomes) if sp_cids: new_rid, top_score, ani, af = self.top_ani_score_prev_rep( prev_rid, sp_cids, prev_genomes, cur_genomes) if new_rid == prev_rid: params['prev_assembly_quality'] = prev_genomes[ prev_rid].score_assembly() params['new_assembly_quality'] = cur_genomes[ prev_rid].score_assembly() action = 'GENOMIC_CHANGE:UPDATED:RETAINED' else: action = 'GENOMIC_CHANGE:UPDATED:REPLACED' params['new_rid'] = new_rid params['ani'] = ani params['af'] = af params['new_assembly_quality'] = cur_genomes[ new_rid].score_assembly() params['prev_assembly_quality'] = prev_genomes[ prev_rid].score_assembly() self.update_rep(prev_rid, new_rid, action) else: action = 'GENOMIC_CHANGE:UPDATED:SPECIES_RETIRED' self.update_rep(prev_rid, None, action) self.action_log.write('{}\t{}\t{}\t{}\n'.format( prev_rid, prev_gtdb_sp, action, params)) self.logger.info( ' ... change in assembly score for updated genomes: {:.2f} +/- {:.2f}' .format(np_mean(assembly_score_change), np_std(assembly_score_change)))
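# Condensed sketch of the decision flow applied above to a representative whose assembly
# was updated. Thresholds and helper results are passed in here for illustration; the
# real method pulls them from self and the previous/current genome sets.
def classify_genomic_update(ani, af, min_ani, min_af, sp_cids, best_rid, prev_rid):
    if ani >= min_ani and af >= min_af:
        return 'GENOMIC_CHANGE:UPDATED:MINOR_CHANGE'    # new assembly is essentially the same genome
    if not sp_cids:                                     # no other genomes left in the species cluster
        return 'GENOMIC_CHANGE:UPDATED:SPECIES_RETIRED'
    if best_rid == prev_rid:                            # updated assembly still scores best in the cluster
        return 'GENOMIC_CHANGE:UPDATED:RETAINED'
    return 'GENOMIC_CHANGE:UPDATED:REPLACED'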
def run(self, cur_gtdb_metadata_file, cur_genomic_path_file, qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file, gtdb_type_strains_ledger, untrustworthy_type_ledger, ncbi_env_bioproject_ledger): """Resolve cases where a species has multiple genomes assembled from the type strain.""" # get species in LTP reference database self.logger.info( 'Determining species defined in LTP reference database.') ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file) self.logger.info( f' - identified {len(ltp_defined_species):,} species.') # create current GTDB genome sets self.logger.info('Creating current GTDB genome set.') cur_genomes = Genomes() cur_genomes.load_from_metadata_file( cur_gtdb_metadata_file, gtdb_type_strains_ledger=gtdb_type_strains_ledger, create_sp_clusters=False, qc_passed_file=qc_passed_file, ncbi_genbank_assembly_file=ncbi_genbank_assembly_file, untrustworthy_type_ledger=untrustworthy_type_ledger, ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger) cur_genomes.load_genomic_file_paths(cur_genomic_path_file) # parsing genomes manually established to be untrustworthy as type self.logger.info( 'Determining genomes manually annotated as untrustworthy as type.') manual_untrustworthy_types = self.parse_untrustworthy_type_ledger( untrustworthy_type_ledger) self.logger.info( f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.' ) # Identify NCBI species with multiple genomes assembled from type strain of species. This # is done using a series of heuristics that aim to ensure that the selected type strain # genome is reliable. More formal evaluation and a manuscript descirbing this selection # process is ultimately required. Ideally, the community will eventually adopt a # database that indicates a single `type genome assembly` for each species instead # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist. self.logger.info( 'Determining number of type strain genomes in each NCBI species.') multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes) self.logger.info( f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.' ) # resolve species with multiple type strain genomes fout = open( os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w') fout.write( 'NCBI species\tNo. 
type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n' ) fout_genomes = open( os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w') fout_genomes.write( 'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment' ) fout_genomes.write( '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n' ) fout_unresolved = open( os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w') fout_unresolved.write( 'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species') fout_unresolved.write( '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n' ) fout_high_divergence = open( os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w') fout_high_divergence.write( 'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n' ) fout_untrustworthy = open( os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w') fout_untrustworthy.write( 'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n' ) for gid in manual_untrustworthy_types: ncbi_sp, reason = manual_untrustworthy_types[gid] fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species, '<not tested>', 'n/a', 'Manual curation: ' + reason)) processed = 0 num_divergent = 0 unresolved_sp_count = 0 ncbi_ltp_resolved = 0 intra_ani_resolved = 0 ncbi_type_resolved = 0 ncbi_rep_resolved = 0 gtdb_family_resolved = 0 gtdb_genus_resolved = 0 gtdb_sp_resolved = 0 ltp_resolved = 0 # *** Perhaps should be an external flag, but used right now to speed up debugging use_pickled_results = False if use_pickled_results: self.logger.warning( 'Using previously calculated ANI results in: {}'.format( self.ani_pickle_dir)) prev_gtdb_sp_conflicts = 0 self.logger.info( 'Resolving species with multiple type strain genomes:') for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(), key=lambda kv: len(kv[1])): assert len(type_gids) > 1 status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format( ncbi_sp, len(type_gids), processed + 1, len(multi_type_strains_sp), (processed + 1) * 100.0 / len(multi_type_strains_sp)).ljust(128) sys.stdout.write('{}\r'.format(status_str)) sys.stdout.flush() processed += 1 # calculate ANI between type strain genomes all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani( ncbi_sp, type_gids, cur_genomes, use_pickled_results) # read LTP metadata for genomes ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes) untrustworthy_gids = {} gtdb_resolved_sp_conflict = False unresolved_species = False note = 'All type strain genomes have ANI >99% and AF >65%.' 
if not all_similar: note = '' # need to establish which genomes are untrustworthy as type num_divergent += 1 unresolved_species = True # write out highly divergent cases for manual inspection; # these should be compared to the automated selection if np_mean(anis) < 95: for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_high_divergence.write( '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n' .format(gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP # assignment also suggest the asserted type material is incorrect resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes( gid_anis, ncbi_sp, type_gids, ltp_metadata, ltp_defined_species, cur_genomes) if resolved: note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy" ncbi_ltp_resolved += 1 # try to resolve by LTP 16S BLAST results if not resolved: resolved, untrustworthy_gids = self.resolve_ltp_conflict( gid_anis, ncbi_sp, type_gids, ltp_metadata, 0) if resolved: note = 'Species resolved by identifying conflicting or lack of LTP BLAST results' ltp_resolved += 1 # try to resolve species using intra-specific ANI test if not resolved: resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani( gid_anis) if resolved: note = 'Species resolved by intra-specific ANI test' intra_ani_resolved += 1 # try to resolve by GTDB family assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_family( gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB family classifications' gtdb_family_resolved += 1 # try to resolve by GTDB genus assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_genus( gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB genus classifications' gtdb_genus_resolved += 1 # try to resolve by GTDB species assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_species( gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB species classifications' gtdb_sp_resolved += 1 # try to resolve by considering genomes annotated as type material at NCBI, # which includes considering if genomes are marked as untrustworthy as type if not resolved: resolved, untrustworthy_gids = self.resolve_by_ncbi_types( gid_anis, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting NCBI assembled from type metadata' ncbi_type_resolved += 1 # try to resovle by considering genomes annotated as representative genomes at NCBI if not resolved: resolved, untrustworthy_gids = self.resolve_by_ncbi_reps( gid_anis, type_gids, cur_genomes) if resolved: note = 'Species resolved by considering NCBI representative genomes' ncbi_rep_resolved += 1 if resolved: unresolved_species = False # check if type strain genomes marked as trusted or untrusted conflict # with current GTDB species assignment untrustworthy_gtdb_sp_match = False trusted_gtdb_sp_match = False for gid in type_gids: gtdb_canonical_epithet = 
canonical_taxon( specific_epithet( cur_genomes[gid].gtdb_taxa.species)) if gtdb_canonical_epithet == specific_epithet(ncbi_sp): if gid in untrustworthy_gids: untrustworthy_gtdb_sp_match = True else: trusted_gtdb_sp_match = True if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match: prev_gtdb_sp_conflicts += 1 gtdb_resolved_sp_conflict = True else: note = 'Species is unresolved; manual curation is required!' unresolved_sp_count += 1 if unresolved_species: for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_unresolved.write( '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n' .format(gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remaining ncbi_untrustworthy_gids = set([ gid for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note ]) if len(type_gids - set(untrustworthy_gids) - ncbi_untrustworthy_gids) >= 1: for gid in ncbi_untrustworthy_gids: untrustworthy_gids[ gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available" # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids) for gid in type_gids: if (gid not in untrustworthy_gids and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note): self.logger.warning( "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]." .format(gid, ncbi_sp, num_ncbi_untrustworthy, len(type_gids))) # write out genomes identified as being untrustworthy for gid, reason in untrustworthy_gids.items(): ltp_species = self.ltp_species(gid, ltp_metadata) if 'untrustworthy as type' in cur_genomes[ gid].excluded_from_refseq_note: reason += "; considered `untrustworthy as type` at NCBI" fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), reason)) # Sanity check that if the untrustworthy genome has an LTP to only the # expected species, that all other genomes also have a hit to the # expected species (or potentially no hit). Otherwise, more consideration # should be given to the genome with the conflicting LTP hit. if len(ltp_species) == 1 and ncbi_sp in ltp_species: other_sp = set() for test_gid in type_gids: ltp_species = self.ltp_species(test_gid, ltp_metadata) if ltp_species and ncbi_sp not in ltp_species: other_sp.update(ltp_species) if other_sp: self.logger.warning( f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.' 
) # write out information about all type genomes for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_genomes.write( '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n' .format(gid, gid in untrustworthy_gids, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), gtdb_resolved_sp_conflict, np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa, untrustworthy_gids.get(gid, ''))) fout.write( '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format( ncbi_sp, len(type_gids), all_similar, np_mean(anis), np_std(anis), np_mean(afs), np_std(afs), note, ', '.join(type_gids))) sys.stdout.write('\n') fout.close() fout_unresolved.close() fout_high_divergence.close() fout_genomes.close() fout_untrustworthy.close() self.logger.info( f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.' ) self.logger.info( f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.' ) self.logger.info( f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.' ) self.logger.info( f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.' ) self.logger.info( f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.' ) self.logger.info( f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.' ) self.logger.info( f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.' ) self.logger.info( f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.' ) self.logger.info( f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.' ) if unresolved_sp_count > 0: self.logger.warning( f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.' ) self.logger.warning( 'These should be handled before proceeding with the next step of GTDB species updating.' ) self.logger.warning( "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'." ) self.logger.info( f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.' )
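# Sketch of the resolution cascade the loop above applies to each species with multiple
# type strain genomes: resolvers run in a fixed order and the first one that succeeds
# supplies the note and the set of untrustworthy genomes. The callables here stand in
# for the class's resolve_* methods and are purely illustrative.
def resolve_species(resolvers):
    """resolvers: ordered list of (callable, note) pairs, each callable returning
    (resolved, untrustworthy_gids)."""
    for resolver, note in resolvers:
        resolved, untrustworthy_gids = resolver()
        if resolved:
            return True, untrustworthy_gids, note
    return False, {}, 'Species is unresolved; manual curation is required!'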