def create_feat_mat_1(graph):
    CCs = list(nx_clustering(graph).values())
    DCs = list(nx_average_neighbor_degree(graph).values())
    degrees = [tup[1] for tup in graph.degree()]
    edge_wts = [tup[2] for tup in graph.edges.data('weight')]

    A_mat = nx_to_numpy_matrix(graph)
    svs = np_linalg_svd(A_mat, full_matrices=False, compute_uv=False)

    if len(svs) >= 3:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = svs[2]
    elif len(svs) >= 2:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = 0
    else:
        sv1 = svs[0]
        sv2 = sv3 = 0

    feat_mat = np_vstack((nx_density(graph), nx_number_of_nodes(graph),
                          max(degrees), np_mean(degrees), np_median(degrees), np_var(degrees),
                          max(CCs), np_mean(CCs), np_var(CCs),
                          np_mean(edge_wts), max(edge_wts), np_var(edge_wts),
                          np_mean(DCs), np_var(DCs), max(DCs),
                          sv1, sv2, sv3)).T

    return feat_mat
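# Hedged usage sketch (not from the original repository): it assumes the
# module's aliased imports map onto networkx/numpy as shown below, e.g. that
# nx_to_numpy_matrix is networkx's dense-adjacency helper.
import networkx as nx
from networkx import (clustering as nx_clustering,
                      average_neighbor_degree as nx_average_neighbor_degree,
                      density as nx_density,
                      number_of_nodes as nx_number_of_nodes,
                      to_numpy_array as nx_to_numpy_matrix)  # assumed alias
from numpy import (mean as np_mean, median as np_median,
                   var as np_var, vstack as np_vstack)
from numpy.linalg import svd as np_linalg_svd

g = nx.Graph()
g.add_weighted_edges_from([('a', 'b', 0.9), ('b', 'c', 0.4), ('a', 'c', 0.7)])
feat_row = create_feat_mat_1(g)  # shape (1, 18): density, node count, degree/clustering/weight stats, top-3 singular values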
def resolve_by_intra_specific_ani(self, gid_anis):
    """Resolve by removing intra-specific genomes with divergent ANI values."""

    if len(gid_anis) <= 2:
        return False, {}

    # consider most divergent genome as untrustworthy
    untrustworthy_gids = {}
    while True:
        # find most divergent genome
        min_ani = 100
        untrustworthy_gid = None
        for gid in gid_anis:
            if gid in untrustworthy_gids:
                continue

            anis = [ani for cur_gid, ani in gid_anis[gid].items()
                    if cur_gid not in untrustworthy_gids]
            if np_mean(anis) < min_ani:
                min_ani = np_mean(anis)
                untrustworthy_gid = gid

        untrustworthy_gids[untrustworthy_gid] = f'{min_ani:.2f}% ANI to other type strain genomes'

        all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids)
        if all_similar:
            return True, untrustworthy_gids

        remaining_genomes = len(gid_anis) - len(untrustworthy_gids)
        if remaining_genomes <= 2 or len(untrustworthy_gids) >= len(gid_anis):
            return False, {}
def get_field(self, axes_list): """Returns the values of the field (with symmetries and sums). Parameters ---------- self: Data a Data object axes_list: list a list of RequestedAxis objects Returns ------- values: ndarray values of the field """ values = self.values for axis_requested in axes_list: # Rebuild symmetries when needed axis_symmetries = self.axes[axis_requested.index].symmetries if ( axis_requested.transform == "fft" and axis_requested.is_pattern or axis_requested.extension in ["sum", "rss", "mean", "rms", "integrate"] and axis_requested.is_pattern ): values = take(values, axis_requested.rebuild_indices, axis_requested.index) elif axis_requested.transform == "fft" and "antiperiod" in axis_symmetries: nper = axis_symmetries["antiperiod"] axis_symmetries["antiperiod"] = 2 values = rebuild_symmetries(values, axis_requested.index, axis_symmetries) axis_symmetries["antiperiod"] = nper elif axis_requested.indices is not None: if ( axis_requested.extension in ["sum", "rss", "mean", "rms", "integrate"] or max(axis_requested.indices) > values.shape[axis_requested.index] ): values = rebuild_symmetries( values, axis_requested.index, axis_symmetries ) self.axes[axis_requested.index].symmetries = dict() # sum over sum axes if axis_requested.extension == "sum": values = np_sum(values, axis=axis_requested.index, keepdims=True) # root sum square over rss axes elif axis_requested.extension == "rss": values = sqrt(np_sum(values ** 2, axis=axis_requested.index, keepdims=True)) # mean value over mean axes elif axis_requested.extension == "mean": values = np_mean(values, axis=axis_requested.index, keepdims=True) # RMS over rms axes elif axis_requested.extension == "rms": values = sqrt( np_mean(values ** 2, axis=axis_requested.index, keepdims=True) ) # integration over integration axes elif axis_requested.extension == "integrate": values = trapz( values, x=axis_requested.values, axis=axis_requested.index ) / (np_max(axis_requested.values) - np_min(axis_requested.values)) return values
def predict_image(path_of_image, groupStage): path_of_model = os_path.join("./CUSTOMIZE_4_USER/MODEL_TRAINING", groupStage, groupStage + ".pth") path_of_feature = os_path.join("./CUSTOMIZE_4_USER/MODEL_TRAINING", groupStage, groupStage + ".npz") start_time = time() model = NeuralNet(input_size, hidden_size, num_classes).to(device) model.load_state_dict(load(path_of_model)) data = np_load(path_of_feature) [h_max, s_max, v_max] = data['data_max'] [h_min, s_min, v_min] = data['data_min'] img = imread(path_of_image) img = resize(img, (6000, 4000)) img = img[500:-500, 750:-750, :] img = cvtColor(img, COLOR_BGR2HSV) hchan, schan, vchan = split(img) h_hist = calcHist([img], [0], None, [256], [0, 256]).reshape(256, ) s_hist = calcHist([img], [1], None, [256], [0, 256]).reshape(256, ) v_hist = calcHist([img], [2], None, [256], [0, 256]).reshape(256, ) hMean = np_mean(hchan) / 255 DPV_h_max = np_sum(np_absolute(h_hist - h_max)) / (HEIGHT * WIDTH) DPV_h_min = np_sum(np_absolute(h_hist - h_min)) / (HEIGHT * WIDTH) sMean = np_mean(schan) / 255 DPV_s_max = np_sum(np_absolute(s_hist - s_max)) / (HEIGHT * WIDTH) DPV_s_min = np_sum(np_absolute(s_hist - s_min)) / (HEIGHT * WIDTH) vMean = np_mean(vchan) / 255 DPV_v_max = np_sum(np_absolute(v_hist - v_max)) / (HEIGHT * WIDTH) DPV_v_min = np_sum(np_absolute(v_hist - v_min)) / (HEIGHT * WIDTH) correlation = np_corrcoef(h_hist, s_hist)[0][1] #image_feature = np_array((hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, vMean, DPV_v_max, DPV_v_min)) image_feature = np_array((hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, correlation)) image_feature = from_numpy(image_feature).to(device).float().view( 1, input_size) with no_grad(): out_predict = model(image_feature) _, predicted_result = torch_max(out_predict.data, 1) original = Tensor([[1, 33, 66, 99]]) # Round xx.xx % percentage_result = np_round( mm(out_predict.view(1, num_classes), original.view(num_classes, 1)).item(), 2) # Processed time processedTime = np_round(time() - start_time, 2) #print("Time ",processedTime) return percentage_result, processedTime
def compute(self):
    """ """
    self.s.sort(axis=2)
    a_sig = np_mean(self.s[:, :, -25:], axis=2)
    a_noise = np_mean(self.s[:, :, 1:52], axis=2)
    self.snr = 20 * np_log10(a_sig / a_noise)
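# Rough, self-contained illustration of the SNR estimate above, assuming
# np_mean/np_log10 are the usual numpy aliases and self.s holds per-bin
# power values in its last axis (the shapes here are made up).
import numpy as np
from numpy import mean as np_mean, log10 as np_log10

s = np.abs(np.random.randn(4, 8, 128))      # e.g. (antennas, channels, bins)
s.sort(axis=2)                              # ascending per-bin sort, as in compute()
a_sig = np_mean(s[:, :, -25:], axis=2)      # 25 strongest bins -> signal estimate
a_noise = np_mean(s[:, :, 1:52], axis=2)    # low-order bins -> noise estimate
snr = 20 * np_log10(a_sig / a_noise)        # dB, shape (4, 8)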
def set_plots(self, data_dict): # self.controller = ThreeDController() # self.controller.show() self.data_dict = data_dict utils.iface.mapCanvas().saveAsImage( os.path.join(self.prefs.CHACHE_BASE_DIR, 'canvas.png')) self.canvas = np.asarray( im.open(os.path.join(self.prefs.CHACHE_BASE_DIR, 'canvas.png'))) self.set_k(data_dict) xMax = utils.iface.mapCanvas().extent().xMaximum() xMin = utils.iface.mapCanvas().extent().xMinimum() yMax = utils.iface.mapCanvas().extent().yMaximum() yMin = utils.iface.mapCanvas().extent().yMinimum() ref_gl_obj = self.plot.add_reference_surface(xMin, xMax, yMin, yMax, self.canvas) self.reference_surf = ThreeDDataSurf(ref_gl_obj, self.plot, 0) self.xoff, self.yoff = self.get_xy_offs(data_dict) for orbit in data_dict.keys(): self.orbit_surf_dict[orbit] = {} self.orbit_surf_dict[orbit]['data'] = [] for band in data_dict[orbit].data: data = np_mean( band[:, data_dict[orbit].get_range( )[0]:data_dict[orbit].get_range()[1] + 1, :], 0) y = np.array(data_dict[orbit].get_proj_y_list()) x = np.array(data_dict[orbit].get_proj_x_list()) z = np.linspace(0, data_dict[orbit].get_v_scale(), data.shape[1]) gl_obj = self.plot.add_surface(x, y, z / 10., data) self.orbit_surf_dict[orbit]['data'].append( ThreeDDataSurf(gl_obj, self.plot, 0)) # if data_dict[orbit].has_key('sim'): if data_dict[orbit].sim: self.orbit_surf_dict[orbit]['sim'] = [] for band in data_dict[orbit].sim: data = np_mean( band[:, data_dict[orbit].get_range( )[0]:data_dict[orbit].get_range()[1] + 1, :], 0) y = np.array(data_dict[orbit].get_proj_y_list()) x = np.array(data_dict[orbit].get_proj_x_list()) z = np.linspace(0, data_dict[orbit].get_v_scale(), data.shape[1]) gl_obj = self.plot.add_surface(x, y, z / 10., data) self.orbit_surf_dict[orbit]['sim'].append( ThreeDDataSurf(gl_obj, self.plot, 0))
def _write_rep_info(self, clusters, cluster_sp_names, quality_metadata, genome_quality, excluded_from_refseq_note, ani_af, output_file): """Write out information about selected representative genomes.""" fout = open(output_file, 'w') fout.write('Species\tType genome\tNCBI assembly level\tNCBI genome category') fout.write('\tGenome size (bp)\tQuality score\tCompleteness (%)\tContamination (%)\tNo. scaffolds\tNo. contigs\tN50 contigs\tAmbiguous bases\tSSU count\tSSU length (bp)') fout.write('\tNo. genomes in cluster\tMean ANI\tMean AF\tMin ANI\tMin AF\tNCBI exclude from RefSeq\n') for gid in clusters: fout.write('%s\t%s\t%s\t%s' % ( cluster_sp_names[gid], gid, quality_metadata[gid].ncbi_assembly_level, quality_metadata[gid].ncbi_genome_category)) fout.write('\t%d\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.1f\t%d\t%d\t%d' % ( quality_metadata[gid].genome_size, genome_quality[gid], quality_metadata[gid].checkm_completeness, quality_metadata[gid].checkm_contamination, quality_metadata[gid].scaffold_count, quality_metadata[gid].contig_count, quality_metadata[gid].n50_contigs, quality_metadata[gid].ambiguous_bases, quality_metadata[gid].ssu_count, quality_metadata[gid].ssu_length if quality_metadata[gid].ssu_length else 0)) anis = [] afs = [] for cluster_id in clusters[gid]: ani, af = symmetric_ani(ani_af, gid, cluster_id) anis.append(ani) afs.append(af) if anis: fout.write('\t%d\t%.1f\t%.2f\t%.1f\t%.2f\t%s\n' % (len(clusters[gid]), np_mean(anis), np_mean(afs), min(anis), min(afs), excluded_from_refseq_note.get(gid, ''))) else: fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % (len(clusters[gid]), 'n/a', 'n/a', 'n/a', 'n/a', excluded_from_refseq_note.get(gid, ''))) fout.close()
def _nonrep_radius(self, unclustered_gids, rep_gids, ani_af_rep_vs_nonrep):
    """Calculate circumscription radius for unclustered, nontype genomes."""

    # set radius for genomes to default values
    nonrep_radius = {}
    for gid in unclustered_gids:
        nonrep_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                          af=None,
                                          neighbour_gid=None)

    # determine closest type ANI neighbour and restrict ANI radius as necessary
    ani_af = pickle.load(open(ani_af_rep_vs_nonrep, 'rb'))
    for nonrep_gid in unclustered_gids:
        if nonrep_gid not in ani_af:
            continue

        for rep_gid in rep_gids:
            if rep_gid not in ani_af[nonrep_gid]:
                continue

            ani, af = symmetric_ani(ani_af, nonrep_gid, rep_gid)

            if ani > nonrep_radius[nonrep_gid].ani and af >= self.af_sp:
                nonrep_radius[nonrep_gid] = GenomeRadius(ani=ani,
                                                         af=af,
                                                         neighbour_gid=rep_gid)

    self.logger.info('ANI circumscription radius: min={:.2f}, mean={:.2f}, max={:.2f}'.format(
        min([d.ani for d in nonrep_radius.values()]),
        np_mean([d.ani for d in nonrep_radius.values()]),
        max([d.ani for d in nonrep_radius.values()])))

    return nonrep_radius
def entropy_batch_mixing_(latents, batches, n_neighs=50, n_pools=50, n_samples=100):
    def cross_entropy(data):
        n_batches = len(unique(data))
        assert n_batches == 2, ValueError(
            "Entropy can be calculated with only 2 batches")
        freq = np_mean(data == unique(data)[0])
        if freq == 0 or freq == 1:
            return 0
        return -freq * log(freq) - (1 - freq) * log(1 - freq)

    n_neighs = min(n_neighs, latents.shape[0] - 1)
    knn = NearestNeighbors(n_neighbors=n_neighs + 1, n_jobs=8)
    knn.fit(latents)
    kmatrix = knn.kneighbors_graph(latents) - scipy.sparse.identity(
        latents.shape[0])

    score = 0
    #pdb.set_trace()
    for t in range(n_pools):
        indices = choice(arange(latents.shape[0]), size=n_samples)
        while unique(batches[kmatrix[indices].nonzero()[1][
                kmatrix[indices].nonzero()[0] == 1]]).shape[0] < 2:
            indices = choice(arange(latents.shape[0]), size=n_samples)
        score += np_mean([
            cross_entropy(batches[kmatrix[indices].nonzero()[1][
                kmatrix[indices].nonzero()[0] == 1]])
            for i in range(n_samples)
        ])
    return score / n_pools
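# Hedged usage sketch: entropy_batch_mixing_ relies on module-level aliases
# (unique/arange from numpy, choice = numpy.random.choice, log = numpy.log,
# np_mean = numpy.mean) plus sklearn's NearestNeighbors and scipy.sparse.
# With the natural log, a perfectly mixed pair of batches scores ~ln(2) = 0.69.
import numpy as np

latents = np.random.randn(200, 10)            # e.g. 200 cells in a 10-D latent space
batches = np.random.randint(0, 2, size=200)   # exactly two batch labels, as the assert requires
score = entropy_batch_mixing_(latents, batches, n_neighs=15, n_pools=5, n_samples=20)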
def set_data_vol(self, rates_mat, rates_mat_dv_indx=1):
    """Calculates the total data volume that can be sent over this link.

    Uses average data rate to determine data volume. Depending on how much the input
    data rates matrix is decimated, this could lead to over- or underestimates of data volume.

    :return:
    """
    # Note: rates_mat is a matrix of data rates at each time during the pass. The first
    # column is time in MJD, the second is the data rate from sat to xsat in Mbps, and
    # the third is the rate from xsat to sat.

    start_mjd = tt.datetime2mjd(self.start) - 5 / 86400.0  # add 5 secs padding to evade any precision problems
    end_mjd = tt.datetime2mjd(self.end) + 5 / 86400.0  # add 5 secs padding to evade any precision problems

    # this is fixed in the structure of the data rates output file
    rates_mat_tp_indx = 0

    data_rates = []
    for i in range(len(rates_mat)):
        # if point i is within window - this should take care of any indexing issues
        if rates_mat[i][rates_mat_tp_indx] >= start_mjd and rates_mat[i][rates_mat_tp_indx] <= end_mjd:
            data_rates.append(rates_mat[i][rates_mat_dv_indx])

    try:
        # take the average of all the data rates we saw and multiply by the duration
        # of the window to get data volume
        self.data_vol = np_mean(data_rates) * (self.end - self.start).total_seconds()
        if self.original_data_vol is None:
            self.original_data_vol = self.data_vol
    except RuntimeWarning as e:
        raise RuntimeWarning('Trouble determining average data rate. Probably no time points were '
                             'found within start and end of window. Ensure that you are not overly '
                             'decimating data rate calculations in data rates input file '
                             '(window: %s, exception seen: %s)' % (self, str(e)))
def statsliste(data, weights=None, masks=None):
    """Return the per-element (weighted) total weight, means, and standard deviations of the data

    Input:
      data: a numpy.array
      weights: an array of weights for the instances [optional]
      masks: an array of masks for the instances [optional]

    Output:
      n: total weight
      mean: mean tensor
      thestd: standard deviation tensor
    """
    if masks is not None:
        data = data[masks]
        if weights is not None:
            weights = weights[masks]
    n = moment0(data, weights)
    if weights is None:
        mu = np_mean(data, 0)
        thestd = std(data, 0)
    else:
        mu = mean(n, moment1(data, weights))
        thestd = stde(n, moment2e(data, weights), mu)
    return (n, mu, thestd)
def statslist(data, weights=None, masks=None):
    """Return the (weighted) total weight, mean, and covariance of the data

    Input:
      data: a numpy.array
      weights: an array of weights for the instances [optional]
      masks: an array of masks for the instances [optional]

    Output:
      n: total weight
      mean: mean tensor
      thecov: covariance tensor
    """
    if masks is not None:
        data = data[masks]
        if weights is not None:
            weights = weights[masks]
    n = moment0(data, weights)
    if weights is None:
        mu = np_mean(data, 0)
        theshape = data[0].shape
        thesize = data[0].size
        thecov = cov(data.reshape(n, thesize), None, 0).reshape(theshape * 2)
    else:
        mu = mean(n, moment1(data, weights))
        thecov = covariance(n, moment2(data, weights), mu)
    return (n, mu, thecov)
def aggregateResources(self, nbins=20): """ returns a json object which contains max, min, mean, median, and the histogram itself for all memories/cpu WARNING: this method is not particularly efficient and shouldn't be used lightly! """ allData = {"memory": {"data": []}, "cpu": {"data": []}} query = JobInstance.objects.filter(job=self).only("cpu").only("memory") if query.count(): for inst in query: agg = inst.aggregateResources() for key in ['cpu', 'memory']: if len(agg[key]): allData[key]['data'].append(max(agg[key])) del query # finished aggregation, now we can do calculations for key in allData: d = allData[key]["data"] allData[key]["max"] = max(d) allData[key]["min"] = min(d) arr = np_array(d, dtype=float) allData[key]["mean"] = float(np_mean(arr, axis=0)) allData[key]["median"] = float(np_median(arr, axis=0)) hist, bins = np_hist(arr, nbins) center = (bins[:-1] + bins[1:]) / 2 w = (bins[1] - bins[0]) histo = np_array([center, hist]) allData[key]['histogram'] = { "histo": histo.tolist(), "histoT": histo.T.tolist(), "binWidth": float(w) } del allData[key]['data'] return dumps(allData)
def transfer_same_dist(test_list, train_list, com_comp, test_rem):
    if len(test_rem) == 0:
        return test_list, train_list, com_comp

    sizes = [len(line) for line in test_rem]
    mean_test_size = np_mean(sizes)
    sd = sqrt(np_var(sizes))

    if sd != 0:
        test_rem_dist = norm_dist(mean_test_size, sd)
        p_dist = [test_rem_dist.pdf(len(line)) for line in train_list]
        norm_ct = sum(p_dist)
        if norm_ct != 0:
            p_dist = [val / norm_ct for val in p_dist]
        train_rem = rand_choice(train_list, size=com_comp, replace=False, p=p_dist)
    else:
        train_rem = [line for line in train_list
                     if len(line) == mean_test_size][:com_comp]

    test_list = test_list + train_rem
    for line in train_rem:
        train_list.remove(line)

    return test_list, train_list
def _nontype_radius(self, unclustered_gids, type_gids, ani_af_nontype_vs_type):
    """Calculate circumscription radius for unclustered, nontype genomes."""

    # set type radius for all type genomes to default values
    nontype_radius = {}
    for gid in unclustered_gids:
        nontype_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                           af=None,
                                           neighbour_gid=None)

    # determine closest type ANI neighbour and restrict ANI radius as necessary
    ani_af = pickle.load(open(ani_af_nontype_vs_type, 'rb'))
    for nontype_gid in unclustered_gids:
        if nontype_gid not in ani_af:
            continue

        for type_gid in type_gids:
            if type_gid not in ani_af[nontype_gid]:
                continue

            ani, af = symmetric_ani(ani_af, nontype_gid, type_gid)

            if ani > nontype_radius[nontype_gid].ani and af >= self.af_sp:
                nontype_radius[nontype_gid] = GenomeRadius(ani=ani,
                                                           af=af,
                                                           neighbour_gid=type_gid)

    self.logger.info('ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' % (
        min([d.ani for d in nontype_radius.values()]),
        np_mean([d.ani for d in nontype_radius.values()]),
        max([d.ani for d in nontype_radius.values()])))

    return nontype_radius
def within_cluster_similarity_statistics(cluster):
    """
    Calculate the sequence similarities within a cluster.

    Return the similarity matrix.
    """
    representations = cluster.seqs
    _representations = cluster.seqs_as_list()
    lenrep = len(_representations)
    similarities = np.ones((lenrep, lenrep, 3))

    for j in range(lenrep):
        for k in range(j + 1, lenrep):
            # calculate once
            sim = diff_sequences(_representations[j], _representations[k])
            # but fill both triangles of the matrix
            similarities[j, k, :] = [representations[j].id, representations[k].id, sim]
            similarities[k, j, :] = [representations[k].id, representations[j].id, sim]

    average_rep_sim = np_mean(similarities[:, :, 2])
    var_rep_sim = np_var(similarities[:, :, 2])

    return similarities, average_rep_sim, var_rep_sim
def get_knn_purity(latents, labels, n_neighs=30):
    latents = latents.cpu().detach().numpy() if isinstance(latents, Tensor) else latents
    labels = labels.cpu().detach().numpy() if isinstance(labels, Tensor) else labels

    nbrs = NearestNeighbors(n_neighbors=n_neighs + 1).fit(latents)
    indices = nbrs.kneighbors(latents, return_distance=False)[:, 1:]
    neigh_labels = vectorize(lambda x: labels[x])(indices)

    scores = ((neigh_labels - labels.reshape(-1, 1)) == 0).mean(axis=1)
    res = [
        np_mean(scores[labels.reshape(scores.shape) == i])
        for i in unique(labels)
    ]
    return np_mean(res)
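# Hedged example (assumes the aliases in the function body resolve to numpy's
# mean/unique/vectorize, torch's Tensor, and sklearn's NearestNeighbors):
# purity near 1.0 means latent-space neighbours share their label.
import numpy as np

latents = np.vstack([np.random.randn(50, 5) + 3, np.random.randn(50, 5) - 3])
labels = np.array([0] * 50 + [1] * 50)
purity = get_knn_purity(latents, labels, n_neighs=10)   # ~1.0 for well-separated groups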
def transformCP(self, timer, silent=False, nolog=False):
    """Do the main transformation on the coverage profile data"""
    if(not silent):
        print " Reticulating splines"
    self.transformedCP = self.dataManager.getTransformedCoverageProfiles(self.dbFileName, indices=self.indices)
    self.corners = self.dataManager.getTransformedCoverageCorners(self.dbFileName)
    self.TCentre = np_mean(self.corners, axis=0)
    self.transRadius = np_norm(self.corners[0] - self.TCentre)
def process_feature(list_path, labelFeature): list_dir = sorted(listdir(list_path)) if list_dir == []: return -1 for image_path in list_dir: name_image = os_path.join(list_path, image_path) if name_image == imgSample1 or name_image == imgSample2: continue img = imread(name_image) img = resize(img, (6000, 4000)) img = img[500:-500, 750:-750, :] img = cvtColor(img, COLOR_BGR2HSV) hchan, schan, vchan = split(img) h_hist = calcHist([img], [0], None, [256], [0, 256]).reshape(256, ) s_hist = calcHist([img], [1], None, [256], [0, 256]).reshape(256, ) v_hist = calcHist([img], [2], None, [256], [0, 256]).reshape(256, ) hMean = np_mean(hchan) / 255 DPV_h_max = np_sum( np_absolute(h_hist - h_max)) / (HEIGHT * WIDTH) DPV_h_min = np_sum( np_absolute(h_hist - h_min)) / (HEIGHT * WIDTH) sMean = np_mean(schan) / 255 DPV_s_max = np_sum( np_absolute(s_hist - s_max)) / (HEIGHT * WIDTH) DPV_s_min = np_sum( np_absolute(s_hist - s_min)) / (HEIGHT * WIDTH) vMean = np_mean(vchan) / 255 DPV_v_max = np_sum( np_absolute(v_hist - v_max)) / (HEIGHT * WIDTH) DPV_v_min = np_sum( np_absolute(v_hist - v_min)) / (HEIGHT * WIDTH) correlation = np_corrcoef(h_hist, s_hist)[0][1] # variable = [hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, vMean, DPV_v_max, DPV_v_min] variable = [ hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, correlation ] feature.append(variable) labels.append([labelFeature])
def write_rank_count(self, ranks_below_taxon, results_table): """Write table indicating number of ranks below each taxa. Parameters ---------- ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts Number of ranks below named taxon. results_table : str Desired output file. """ # determine if count is a scalar or vectors taxon = list(ranks_below_taxon.keys())[0] rank_prefix = list(ranks_below_taxon[taxon].keys())[0] count = ranks_below_taxon[taxon][rank_prefix] count_is_scalar = True if isinstance(count, (list, tuple)): count_is_scalar = False # write out results sorted by taxonomic rank sorted_taxon = [] for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']): taxa_at_rank = [] for taxon in ranks_below_taxon: if taxon.startswith(rank_prefix): taxa_at_rank.append(taxon) sorted_taxon += sorted(taxa_at_rank) fout = open(results_table, 'w') fout.write('Taxon') for rank_prefix in Taxonomy.rank_prefixes: if count_is_scalar: fout.write('\t%s' % rank_prefix.capitalize()) else: fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(), 'Std: ' + rank_prefix.capitalize(), 'Min: ' + rank_prefix.capitalize(), 'Max: ' + rank_prefix.capitalize())) fout.write('\n') for taxon in sorted_taxon: fout.write(taxon) for rank_prefix in Taxonomy.rank_prefixes: count = ranks_below_taxon[taxon][rank_prefix.capitalize()] if count_is_scalar: fout.write('\t%d' % count) else: if len(count) > 0: fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count), np_std(count), min(count), max(count))) else: fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0)) fout.write('\n') fout.close()
def write_rank_count(self, ranks_below_taxon, results_table): """Write table indicating number of ranks below each taxa. Parameters ---------- ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts Number of ranks below named taxon. results_table : str Desired output file. """ # determine if count is a scalar or vectors taxon = ranks_below_taxon.keys()[0] rank_prefix = ranks_below_taxon[taxon].keys()[0] count = ranks_below_taxon[taxon][rank_prefix] count_is_scalar = True if isinstance(count, (list, tuple)): count_is_scalar = False # write out results sorted by taxonomic rank sorted_taxon = [] for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']): taxa_at_rank = [] for taxon in ranks_below_taxon: if taxon.startswith(rank_prefix): taxa_at_rank.append(taxon) sorted_taxon += sorted(taxa_at_rank) fout = open(results_table, 'w') fout.write('Taxon') for rank_prefix in Taxonomy.rank_prefixes: if count_is_scalar: fout.write('\t%s' % rank_prefix.capitalize()) else: fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(), 'Std: ' + rank_prefix.capitalize(), 'Min: ' + rank_prefix.capitalize(), 'Max: ' + rank_prefix.capitalize())) fout.write('\n') for taxon in sorted_taxon: fout.write(taxon) for rank_prefix in Taxonomy.rank_prefixes: count = ranks_below_taxon[taxon][rank_prefix.capitalize()] if count_is_scalar: fout.write('\t%d' % count) else: if len(count) > 0: fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count), np_std(count), min(count), max(count))) else: fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0)) fout.write('\n') fout.close()
def test(data=None, precision_bp=2000, nb_bp=3, taille_fenetre=10, breakp=None, abscisse=None):
    """Parameters"""
    # data
    if data is None:
        data = [580.38, 581.86, 580.97, 580.8, 579.79, 580.39, 580.42, 580.82, 581.4, 581.32,
                581.44, 581.68, 581.17, 580.53, 580.01, 579.91, 579.14, 579.16, 579.55, 579.67,
                578.44, 578.24, 579.1, 579.09, 579.35, 578.82, 579.32, 579.01, 579, 579.8,
                579.83, 579.72, 579.89, 580.01, 579.37, 578.69, 578.19, 578.67, 579.55, 578.92,
                578.09, 579.37, 580.13, 580.14, 579.51, 579.24, 578.66, 578.86, 578.05, 577.79,
                576.75, 576.75, 577.82, 578.64, 580.58, 579.48, 577.38, 576.9, 576.94, 576.24,
                576.84, 576.85, 576.9, 577.79, 578.18, 577.51, 577.23, 578.42, 579.61, 579.05,
                579.26, 579.22, 579.38, 579.1, 577.95, 578.12, 579.75, 580.85, 580.41, 579.96,
                579.61, 578.76, 578.18, 577.21, 577.13, 579.1, 578.25, 577.91, 576.89, 575.96,
                576.8, 577.68, 578.38, 578.52, 579.74, 579.31, 579.89, 579.96, 579.96, 579.96]

    # granularity of the split used to find the breakpoints
    # number of breakpoints > 0

    # display the variance and mean of the data
    print("variance = ", np_var(data))
    print("standard deviation = ", np_var(data)**0.5)
    print("mean = ", np_mean(data))

    # compute the integral of the fitted Gaussian
    mu = np_mean(data)
    sig = np_var(data)
    ecart = (max(data) - min(data))
    integral_g = quad(gaussian, min(data) - ecart, max(data) + ecart, args=(mu, sig))
    print("gaussian integral", integral_g)
    print(mu, sig, ecart)

    # call the SAX function
    vector_c, vector_c_fit = sax(data, taille_fenetre)
def __writer(self, num_species, output_dir, writer_queue): """Write results for each species.""" # gather results for each genome output_file = os.path.join(output_dir, 'ani_species.tsv') fout = open(output_file, 'w') fout.write('Species\tNo. Sampled Genomes\tMean ANI\tMedian ANI\t5th Percentile\t95th Percentile') fout.write('\tMean AF\tMedian AF\t5th Percentile\t95th Percentile') fout.write('\tSampled Genomes\n') output_file = os.path.join(output_dir, 'ani.tsv') fout_pw = open(output_file, 'w') fout_pw.write('Species\tGenome 1\tGenome 2\tANI(1->2)\tANI(2->1)\tAF(1->2)\tAF(2->1)\n') processed = 0 while True: species, ani, af, genome_ids, results = writer_queue.get(block=True, timeout=None) if species == None: break processed += 1 statusStr = 'Finished processing %d of %d (%.2f%%) species.' % (processed, num_species, float(processed) * 100 / num_species) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() fout_pw.write(results) row = '%s\t%d' % (species, len(genome_ids)) mean_ani = np_mean(ani) p5, median, p95 = np_percentile(ani, [5, 50, 95]) row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_ani, median, p5, p95) mean_af = np_mean(af) p5, median, p95 = np_percentile(af, [5, 50, 95]) row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_af*100, median*100, p5*100, p95*100) fout.write('%s\t%s\n' % (row, ','.join(genome_ids))) sys.stdout.write('\n') fout.close() fout_pw.close()
def computeMeanStDev(self, list_ts_paa):
    mean = full(len(list_ts_paa[0]), inf)
    stdev = full(len(list_ts_paa[0]), inf)
    for i in range(len(list_ts_paa[0])):
        seg_i = [ts[i] for ts in list_ts_paa]
        mean[i] = np_mean(seg_i)
        stdev[i] = std(seg_i)
    return mean, stdev
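# The loop above is a per-segment mean/std over the stacked PAA series; a
# minimal vectorised sketch, assuming full/inf/std above are the numpy
# functions and all series have equal length:
import numpy as np

list_ts_paa = [np.array([1.0, 2.0, 3.0]),
               np.array([3.0, 2.0, 1.0]),
               np.array([2.0, 2.0, 2.0])]
stacked = np.vstack(list_ts_paa)
mean = np.mean(stacked, axis=0)    # -> [2. 2. 2.], matches computeMeanStDev's first output
stdev = np.std(stacked, axis=0)    # per-segment standard deviation, matches the second output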
def mean_len_by_words(self) -> float:
    """Mean word length (characters per word), rounded to two decimals.

    Example:
        m = 'Sin Documentación del indicador'
        mean_len_by_words()  ->  7.0   # word lengths 3, 13, 3, 9
    """
    str_inp_ = self.str_inp
    return round(np_mean(list(map(len, str_inp_.split(" ")))), 2)
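# Equivalent one-off computation (assuming np_mean is numpy.mean); the method
# above simply reads the string from self.str_inp instead of a local variable.
from numpy import mean as np_mean

s = 'Sin Documentación del indicador'
round(np_mean([len(w) for w in s.split(' ')]), 2)   # -> 7.0 (word lengths 3, 13, 3, 9)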
def process_feature(list_path, labelFeature): print("Extracting...") list_dir = sorted(listdir(list_path)) if list_dir == []: return -1 for image_path in list_dir: name_image = os_path.join(list_path, image_path) if name_image == imgSample1 or name_image == imgSample2: continue img = imread(name_image) img = resize(img, (6000,4000)) img = img[500:-500, 750:-750, :] img = cvtColor(img, COLOR_BGR2HSV) hchan, schan, vchan = split(img) h_hist = calcHist([img], [0], None, [256], [0,256]).reshape(256,) s_hist = calcHist([img], [1], None, [256], [0,256]).reshape(256,) v_hist = calcHist([img], [2], None, [256], [0,256]).reshape(256,) # 7 feature consist of : # + Compute mean value pixel of H channel # + Dissilarity with H channel of "max" image # + Dissilarity with H channel of "min" image # + Compute mean value pixel of S channel # + Dissilarity with S channel of "max" image # + Dissilarity with S channel of "min" image # + Correlation between histogram of H and S channel hMean = np_mean(hchan)/255 DPV_h_max = np_sum(np_absolute(h_hist - h_max))/(HEIGHT*WIDTH) DPV_h_min = np_sum(np_absolute(h_hist - h_min))/(HEIGHT*WIDTH) sMean = np_mean(schan)/255 DPV_s_max = np_sum(np_absolute(s_hist - s_max))/(HEIGHT*WIDTH) DPV_s_min = np_sum(np_absolute(s_hist - s_min))/(HEIGHT*WIDTH) vMean = np_mean(vchan)/255 DPV_v_max = np_sum(np_absolute(v_hist - v_max))/(HEIGHT*WIDTH) DPV_v_min = np_sum(np_absolute(v_hist - v_min))/(HEIGHT*WIDTH) correlation = np_corrcoef(h_hist, s_hist)[0][1] # variable = [hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, vMean, DPV_v_max, DPV_v_min] variable = [hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, correlation] feature.append(variable) labels.append([labelFeature])
def cross_entropy(data):
    n_batches = len(unique(data))
    assert n_batches == 2, ValueError(
        "Entropy can be calculated with only 2 batches")
    freq = np_mean(data == unique(data)[0])
    if freq == 0 or freq == 1:
        return 0
    return -freq * log(freq) - (1 - freq) * log(1 - freq)
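# Quick sanity check (assuming unique/np_mean/log are numpy's unique, mean and
# natural log): a 50/50 split of two batch labels gives the maximum entropy ln(2).
import numpy as np

data = np.array([0, 0, 1, 1])
# freq of batch 0 is 0.5, so cross_entropy(data) = -0.5*ln(0.5) - 0.5*ln(0.5) ~ 0.693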
def subsample_msa(self, seqs, markers): # type: (dict, list) -> (list, dict) """Sample columns from each marker in multiple sequence alignment.""" alignment_length = len(seqs.values()[0]) sampled_cols = [] start = 0 lack_sufficient_cols = 0 lack_cols_marker_ids = [] avg_perc_cols = [] for marker_id, marker_name, marker_len in markers: end = start + marker_len valid_cols = self.identify_valid_columns(start, end, seqs) assert (len(valid_cols) <= marker_len) # sanity check self.logger.info( '%s: S:%d, E:%d, LEN:%d, COLS:%d, PERC:%.1f' % (marker_name, start, end, marker_len, len(valid_cols), len(valid_cols) * 100.0 / marker_len)) avg_perc_cols.append(len(valid_cols) * 100.0 / marker_len) if len(valid_cols) < self.subset: self.logger.warning('Marker has <%d columns after filtering.' % self.subset) lack_sufficient_cols += 1 lack_cols_marker_ids.append(marker_id) offset_valid_cols = [i + start for i in valid_cols] sel_cols = random.sample(offset_valid_cols, min(self.subset, len(offset_valid_cols))) sampled_cols.extend(sel_cols) start = end mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)] self.logger.info( 'Identified %d of %d marker genes with <%d columns for sampling:' % (lack_sufficient_cols, len(markers), self.subset)) self.logger.info('%s' % ', '.join(lack_cols_marker_ids)) self.logger.info( 'Marker genes had %.1f+/-%.1f%% of columns available for selection on average.' % (np_mean(avg_perc_cols), np_std(avg_perc_cols))) self.logger.info('Final MSA contains %d columns.' % len(sampled_cols)) # trim columns output_seqs = {} for seq_id, seq in seqs.iteritems(): masked_seq = ''.join( [seq[i] for i in range(0, len(mask)) if mask[i]]) output_seqs[seq_id] = masked_seq return mask, output_seqs
def write_clusters(clusters, species, out_file):
    """Write out clustering information."""
    fout = open(out_file, 'w')
    fout.write('NCBI species\tType genome\tNo. clustered genomes\tMean ANI\tMin ANI\tMean AF\tMin AF\tClustered genomes\n')
    for gid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True):
        if len(clusters[gid]):
            mean_ani = '%.2f' % np_mean([d.ani for d in clusters[gid]])
            min_ani = '%.2f' % min([d.ani for d in clusters[gid]])
            mean_af = '%.2f' % np_mean([d.af for d in clusters[gid]])
            min_af = '%.2f' % min([d.af for d in clusters[gid]])
        else:
            mean_ani = min_ani = mean_af = min_af = 'N/A'

        fout.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' % (
            species.get(gid, 'unclassified'),
            gid,
            len(clusters[gid]),
            mean_ani, min_ani,
            mean_af, min_af,
            ','.join([d.gid for d in clusters[gid]])))
    fout.close()
def _gene_distribution(self, seq_file):
    """Calculate length distribution of sequences."""
    gene_lens = []
    for seq_id, seq in seq_io.read_seq(seq_file):
        gene_lens.append(len(seq))

    p10, p50, p90 = np_percentile(gene_lens, [10, 50, 90])

    return np_mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90
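# Illustrative equivalent of the return tuple (hypothetical lengths; assumes
# np_percentile/np_mean are numpy.percentile/numpy.mean and seq_io.read_seq
# yields (id, sequence) pairs):
import numpy as np

gene_lens = [300, 450, 620, 980, 1500]                     # example gene lengths
p10, p50, p90 = np.percentile(gene_lens, [10, 50, 90])
summary = (np.mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90)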
def split_meth_orig(perm_lines, inputs): fact = inputs['fact'] # 0.99 split_pt = int(round(len(perm_lines) * fact)) train_list = [line for line in perm_lines[0:split_pt]] test_list = [line for line in perm_lines[split_pt:]] # Start with something that has a biased size distribution !! sizes = [len(line) for line in train_list] train_mean = np_mean(sizes) # Transferring some of the smaller complexes to the test list train_list_lower_mean = [ line for line in train_list if len(line) < train_mean ] perc_transfer = inputs[ 'perc_transfer'] # 0.3 # You can optimize these parameters ! to_transfer = train_list_lower_mean[:int( round(len(train_list_lower_mean) * perc_transfer))] test_list = test_list + to_transfer # Now remove from train set for line in to_transfer: train_list.remove(line) # Finding complexes in train that share an edge with a complex in test com_comp = 10 while com_comp != 0: # Do until train and test sets are completely separated # Removing super huge complexes also (nodes >30 ) from test set test_list = [line for line in test_list if len(line) < 30] # REMOVE OVERLAP B/W TRAIN AND TEST DATA # Remove complexes from train set sharing two proteins with test set train_rem = [] train_rem_append = train_rem.append com_comp = 0 for train_line in train_list: pres = 0 for test_line in test_list: common = len( set(train_line.edges()).intersection(set(test_line.edges))) if common >= 1: pres = 1 break if pres == 1: train_rem_append(train_line) com_comp += 1 logging_info("No. of train complexes transferred = %s", str(com_comp)) test_list = test_list + train_rem for t_line in train_rem: train_list.remove(t_line) return train_list, test_list
def _type_genome_radius(self, type_gids, type_genome_ani_file): """Calculate circumscription radius for type genomes.""" # set type radius for all type genomes to default values type_radius = {} for gid in type_gids: type_radius[gid] = GenomeRadius(ani=self.ani_sp, af=None, neighbour_gid=None) # determine closest ANI neighbour and restrict ANI radius as necessary with open(type_genome_ani_file) as f: header = f.readline().strip().split('\t') type_gid1_index = header.index('Type genome 1') type_gid2_index = header.index('Type genome 2') ani_index = header.index('ANI') af_index = header.index('AF') for line in f: line_split = line.strip().split('\t') type_gid1 = line_split[type_gid1_index] type_gid2 = line_split[type_gid2_index] if type_gid1 not in type_gids or type_gid2 not in type_gids: continue ani = float(line_split[ani_index]) af = float(line_split[af_index]) if ani > type_radius[type_gid1].ani: if af < self.af_sp: if ani >= self.ani_sp: self.logger.warning( 'ANI for %s and %s is >%.2f, but AF <%.2f [pair skipped].' % (type_gid1, type_gid2, ani, af)) continue if ani > self.max_ani_neighbour: self.logger.error('ANI neighbour %s is >%.2f for %s.' % (type_gid2, ani, type_gid1)) type_radius[type_gid1] = GenomeRadius( ani=ani, af=af, neighbour_gid=type_gid2) self.logger.info( 'ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' % (min([d.ani for d in type_radius.values() ]), np_mean([d.ani for d in type_radius.values() ]), max([d.ani for d in type_radius.values()]))) return type_radius
def write_clusters(clusters, type_radius, species, out_file): """Write out clustering information.""" fout = open(out_file, 'w') fout.write('NCBI species\tType genome') fout.write('\tClosest species\tClosest type genome\tANI radius\tAF closest') fout.write('\tNo. clustered genomes\tMean ANI\tMin ANI\tMean AF\tMin AF\tClustered genomes\n') for gid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True): if len(clusters[gid]): mean_ani = '%.2f' % np_mean([d.ani for d in clusters[gid]]) min_ani = '%.2f' % min([d.ani for d in clusters[gid]]) mean_af = '%.2f' % np_mean([d.af for d in clusters[gid]]) min_af = '%.2f' % min([d.af for d in clusters[gid]]) else: mean_ani = min_ani = mean_af = min_af = 'N/A' fout.write('%s\t%s' % ( species.get(gid, 'unclassified'), gid)) ani, af, closest_gid = type_radius[gid] if not af: af = 0 if not closest_gid or closest_gid == 'N/A': closest_gid = 'N/A' closest_sp = 'N/A' else: closest_sp = species[closest_gid] fout.write('\t%s\t%s\t%.2f\t%.2f' % (closest_sp, closest_gid, ani, af)) fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % ( len(clusters[gid]), mean_ani, min_ani, mean_af, min_af, ','.join([d.gid for d in clusters[gid]]))) fout.close()
def getCentroidStats(self, profile):
    """Calculate the centroids of the profile"""
    working_list = profile[self.rowIndices]

    # return the mean and stdev
    # we divide by std so we need to make sure it's never 0
    tmp_stds = np_std(working_list, axis=0)
    mean_std = np_mean(tmp_stds)
    try:
        std = np_array([x if x != 0 else mean_std for x in tmp_stds])
    except:
        std = mean_std

    return (np_median(working_list, axis=0), std)
def write_clusters(clusters, rep_radius, genomes, out_file): """Write out clustering information.""" fout = open(out_file, 'w') fout.write('Representative\tGTDB species\tNCBI species') fout.write( '\tClosest GTDB species\tClosest representative\tANI radius\tAF closest' ) fout.write( '\tNo. clustered genomes\tMean ANI\tMin ANI\tMean AF\tMin AF\tClustered genomes\n' ) for gid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True): if clusters[gid]: mean_ani = '%.2f' % np_mean([d.ani for d in clusters[gid]]) min_ani = '%.2f' % min([d.ani for d in clusters[gid]]) mean_af = '%.2f' % np_mean([d.af for d in clusters[gid]]) min_af = '%.2f' % min([d.af for d in clusters[gid]]) else: mean_ani = min_ani = mean_af = min_af = 'N/A' fout.write('%s\t%s\t%s' % (gid, genomes[gid].gtdb_taxa.species, genomes[gid].ncbi_taxa.species)) ani, af, closest_gid = rep_radius[gid] if not af: af = 0 if not closest_gid or closest_gid == 'N/A': closest_gid = 'N/A' closest_sp = 'N/A' else: closest_sp = genomes[closest_gid].gtdb_taxa.species fout.write('\t%s\t%s\t%f\t%f' % (closest_sp, closest_gid, ani, af)) fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % (len(clusters[gid]), mean_ani, min_ani, mean_af, min_af, ','.join([d.gid for d in clusters[gid]]))) fout.close()
def _num_lineages(self, tree, threshold):
    """Count lineages defined by the given mean branch length threshold.

    Parameters
    ----------
    tree : dendropy Tree
        Input tree.
    threshold : float
        Mean distance to terminal taxa used to define lineages.

    Returns
    -------
    int
        Number of lineages with multiple taxa.
    int
        Number of lineages represented by a single leaf node.
    """
    stack = [tree.seed_node]
    num_lineages = 0
    num_terminal_lineages = 0
    while stack:
        node = stack.pop()

        # check if node is a leaf
        if node.is_leaf():
            num_terminal_lineages += 1
            continue

        # check if node meets mean branch length criterion
        dists_to_tips = []
        for t in node.leaf_iter():
            dists_to_tips.append(self._dist_to_ancestor(t, node))

        if np_mean(dists_to_tips) > threshold:
            for c in node.child_node_iter():
                stack.append(c)
            continue

        num_lineages += 1

    return num_lineages, num_terminal_lineages
def _rep_genome_stats(self, clusters, genome_files): """Calculate statistics relative to representative genome.""" self.logger.info('Calculating statistics to cluster representatives:') stats = {} for idx, (rid, cids) in enumerate(clusters.items()): if len(cids) == 0: stats[rid] = self.RepStats(min_ani = -1, mean_ani = -1, std_ani = -1, median_ani = -1) else: # calculate ANI to representative genome gid_pairs = [] for cid in cids: gid_pairs.append((cid, rid)) ani_af = self.ani_cache.fastani_pairs(gid_pairs, genome_files, report_progress=False) # calculate statistics anis = [ani_af[cid][rid][0] for cid in cids] stats[rid] = self.RepStats(min_ani = min(anis), mean_ani = np_mean(anis), std_ani = np_std(anis), median_ani = np_median(anis)) statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % ( idx+1, len(clusters), float((idx+1)*100)/len(clusters)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() sys.stdout.write('\n') return stats
def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir): """Calculate statistics for genomes. Parameters ---------- scaffold_stats : ScaffoldStats Statistics for individual scaffolds. num_clusters : int Number of cluster to form. num_components : int Number of PCA components to consider. K : int K-mer size to use for calculating genomic signature. no_coverage : boolean Flag indicating if coverage information should be used during clustering. no_pca : boolean Flag indicating if PCA of genomic signature should be calculated. iterations : int Iterations of clustering to perform. genome_file : str Sequences being clustered. output_dir : str Directory to write results. """ # get GC and mean coverage for each scaffold in genome self.logger.info('') self.logger.info(' Determining mean coverage and genomic signatures.') signatures = GenomicSignature(K) genome_stats = [] signature_matrix = [] seqs = seq_io.read(genome_file) for seq_id, seq in seqs.iteritems(): stats = scaffold_stats.stats[seq_id] if not no_coverage: genome_stats.append((np_mean(stats.coverage))) else: genome_stats.append(()) if K == 0: pass elif K == 4: signature_matrix.append(stats.signature) else: sig = signatures.seq_signature(seq) total_kmers = sum(sig) for i in xrange(0, len(sig)): sig[i] = float(sig[i]) / total_kmers signature_matrix.append(sig) # calculate PCA of tetranucleotide signatures if K != 0: if not no_pca: self.logger.info(' Calculating PCA of genomic signatures.') pc, variance = self.pca(signature_matrix) self.logger.info(' First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100)) for i, stats in enumerate(genome_stats): genome_stats[i] = np_append(stats, pc[i][0:num_components]) else: self.logger.info(' Using complete genomic signature.') for i, stats in enumerate(genome_stats): genome_stats[i] = np_append(stats, signature_matrix[i]) # whiten data if feature matrix contains coverage and genomic signature data if not no_coverage and K != 0: print ' Whitening data.' genome_stats = whiten(genome_stats) else: genome_stats = np_array(genome_stats) # cluster self.logger.info(' Partitioning genome into %d clusters.' % num_clusters) bError = True while bError: try: bError = False _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise') except ClusterError: bError = True for k in range(num_clusters): self.logger.info(' Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1))) # write out clusters genome_id = remove_extension(genome_file) for k in range(num_clusters): fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w') for i in np_where(labels == k)[0]: seq_id = seqs.keys()[i] fout.write('>' + seq_id + '\n') fout.write(seqs[seq_id] + '\n') fout.close()
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]
        most_specific_rank = taxon[0:3]

        taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

        for n in node.leaf_iter():
            dist_to_node = 0
            while n != node:
                dist_to_node += n.edge_length
                n = n.parent_node

            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
        if rank != 'species' or Taxonomy().validate_species_name(taxon):
            if taxon in taxa_for_dist_inference:
                rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print ''
    print 'Rank\tTaxa\tTaxa for Inference'
    for rank, taxa in taxa_at_rank.iteritems():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print '%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))
    print ''

    # report results sorted by rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        taxa_at_rank = []
        for taxon in taxa_bl_dist:
            if taxon.startswith(rank_prefix):
                taxa_at_rank.append(taxon)

        sorted_taxon += sorted(taxa_at_rank)

    # report results for each named group
    taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
    fout = open(taxa_file, 'w')
    fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for taxon in sorted_taxon:
        dist = taxa_bl_dist[taxon]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                             str(taxon in taxa_for_dist_inference),
                                                             np_mean(dist),
                                                             np_std(dist),
                                                             p[0], p[1], p[2], p[3], p[4]))
    fout.close()

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
    fout = open(rank_file, 'w')
    fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for rank in Taxonomy.rank_labels:
        dist = rank_bl_dist[rank]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                         np_mean(dist),
                                                         np_std(dist),
                                                         p[0], p[1], p[2], p[3], p[4]))
    fout.close()
def __init__(self, orbit, orbit_dict, q_rects = None, roi_movable = False, lock_aspect = True, parent = None, labels = 1, x_label = 'x', y_label = 'y', x_unit = "", y_unit = "", v_offset = (0,0), prefs = None, depth_meas = True, iface = None): super(OrbitViewer, self).__init__(parent) self.plots = [] data_f = [] sim_f = [] self.v_offset = v_offset self.v_offset_data = self.v_offset[0] self.v_offset_sim = self.v_offset[1] self.orbit_label = orbit_dict.get_instrument() + " - Orbit "+str(orbit) self.x_unit = x_unit self.y_unit = y_unit self.orbit_dict=orbit_dict self.prefs = prefs self.iface = iface if orbit_dict.data: for band in orbit_dict.data: data_f.append(np_mean(band,0)) else: for band in orbit_dict.sim: data_f.append(np_zeros(band.shape[1:])) if orbit_dict.sim: for band in orbit_dict.sim: sim_f.append(np_mean(band,0)) else: for band in orbit_dict.data: sim_f.append(np_zeros(band.shape[1:])) ii = 0 for band in orbit_dict.data: depth_cb = CreateDepthLayer(self.orbit_dict, ii, QgsProject.instance().readPath("./"), self.iface) self.plots.append(SinglePlot(images = [data_f[ii], sim_f[ii]], images_label = ["data", "sim"], label_text = self.orbit_label+" Frequency band "+str(ii+1), q_rects = q_rects, roi_movable = roi_movable, lock_aspect = lock_aspect, x_label = x_label, y_label = y_label, x_unit = x_unit, y_unit = y_unit, depth_cb = depth_cb.run, depth_meas = depth_meas)) self.addItem(self.plots[-1], row=0, col=(ii)) ii = ii + 1 self.set_pos_label(0)
def _pairwise_stats(self, clusters, genome_files): """Calculate statistics for all pairwise comparisons in a species cluster.""" self.logger.info('Calculating statistics for all pairwise comparisons in a species cluster:') stats = {} for idx, (rid, cids) in enumerate(clusters.items()): statusStr = '-> Processing %d of %d (%.2f%%) clusters (size = %d).'.ljust(86) % ( idx+1, len(clusters), float((idx+1)*100)/len(clusters), len(cids)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() if len(cids) == 0: stats[rid] = self.PairwiseStats(min_ani = -1, mean_ani = -1, std_ani = -1, median_ani = -1, ani_to_medoid = -1, mean_ani_to_medoid = -1, ani_below_95 = -1) else: if len(cids) > self.max_genomes_for_stats: cids = set(random.sample(cids, self.max_genomes_for_stats)) # calculate ANI to representative genome gid_pairs = [] gids = list(cids.union([rid])) for gid1, gid2 in combinations(gids, 2): gid_pairs.append((gid1, gid2)) gid_pairs.append((gid2, gid1)) ani_af = self.ani_cache.fastani_pairs(gid_pairs, genome_files, report_progress=False) # calculate medoid point if len(gids) > 2: dist_mat = np_zeros((len(gids), len(gids))) for i, gid1 in enumerate(gids): for j, gid2 in enumerate(gids): if i < j: ani, af = symmetric_ani(ani_af, gid1, gid2) dist_mat[i, j] = ani dist_mat[j, i] = ani medoid_idx = np_argmin(dist_mat.sum(axis=0)) medoid_gid = gids[medoid_idx] else: # with only 2 genomes in a cluster, the representative is the # natural medoid at least for reporting statistics for the # individual species cluster medoid_gid = rid mean_ani_to_medoid = np_mean([symmetric_ani(ani_af, gid, medoid_gid)[0] for gid in gids if gid != medoid_gid]) # calculate statistics anis = [] for gid1, gid2 in combinations(gids, 2): ani, af = symmetric_ani(ani_af, gid1, gid2) anis.append(ani) stats[rid] = self.PairwiseStats(min_ani = min(anis), mean_ani = np_mean(anis), std_ani = np_std(anis), median_ani = np_median(anis), ani_to_medoid = symmetric_ani(ani_af, rid, medoid_gid)[0], mean_ani_to_medoid = mean_ani_to_medoid, ani_below_95 = sum([1 for ani in anis if ani < 95])) sys.stdout.write('\n') return stats
def identify(self, scaffold_stats, genome_stats,
             gc_per, td_per,
             cov_corr, cov_perc,
             report_type, output_file):
    """Identify scaffolds with divergent genomic characteristics.

    Outliers are identified independently based on GC content,
    tetranucleotide signatures, coverage profile correlation, and
    mean absolute percent error of coverage profile. The coverage
    correlation check is ignored if the coverage profile consists
    of a single value.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    genome_stats : GenomeStats
        Statistics for individual genomes.
    gc_per : int
        Percentile for identifying GC outliers.
    td_per : int
        Percentile for identifying TD outliers.
    cov_corr : int
        Correlation for identifying divergent coverage profiles.
    cov_perc : int
        Mean absolute percent error for identifying divergent coverage profiles.
    report_type : str
        Report scaffolds that are outliers in 'all' or 'any' distribution.
    output_file : str
        Name of output file.
    """

    # read reference distributions from file
    self.logger.info(' Reading reference distributions.')
    self.gc_dist = self._read_distribution('gc_dist')
    self.td_dist = self._read_distribution('td_dist')

    # identify outliers in each genome
    fout = open(output_file, 'w')
    fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tOutlying distributions')
    fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
    fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
    fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error\n')

    genomic_signature = GenomicSignature(0)

    processed_genomes = 0
    for genome_id, scaffold_ids in scaffold_stats.scaffolds_in_genome.iteritems():
        processed_genomes += 1

        sys.stdout.write(' Finding outliers in %d of %d (%.1f%%) genomes.\r' % (
            processed_genomes,
            scaffold_stats.num_genomes(),
            processed_genomes * 100.0 / scaffold_stats.num_genomes()))
        sys.stdout.flush()

        # find keys into GC and TD distributions
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        gs = genome_stats[genome_id]
        closest_gc = find_nearest(self.gc_dist.keys(), gs.mean_gc / 100.0)
        sample_seq_len = self.gc_dist[closest_gc].keys()[0]
        d = self.gc_dist[closest_gc][sample_seq_len]
        gc_lower_bound_key = find_nearest(d.keys(), (100 - gc_per) / 2.0)
        gc_upper_bound_key = find_nearest(d.keys(), (100 + gc_per) / 2.0)

        td_bound_key = find_nearest(self.td_dist[self.td_dist.keys()[0]].keys(), td_per)

        for scaffold_id in scaffold_ids:
            stats = scaffold_stats.stats[scaffold_id]

            # find GC and TD bounds
            closest_seq_len = find_nearest(self.gc_dist[closest_gc].keys(), stats.length)
            gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
            gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

            closest_seq_len = find_nearest(self.td_dist.keys(), stats.length)
            td_bound = self.td_dist[closest_seq_len][td_bound_key]

            # find changes from mean
            delta_gc = (stats.gc - gs.mean_gc) / 100.0
            delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

            # determine if scaffold is an outlier
            outlying_dists = []
            if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
                outlying_dists.append('GC')

            if delta_td > td_bound:
                outlying_dists.append('TD')

            corr_r = 1.0
            if len(gs.mean_coverage) > 1:
                corr_r, _corr_p = pearsonr(gs.mean_coverage, stats.coverage)
                if corr_r < cov_corr:
                    outlying_dists.append('COV_CORR')

            mean_cp = []
            for cov_genome, cov_scaffold in itertools.izip(gs.mean_coverage, stats.coverage):
                if cov_genome >= self.min_required_coverage:
                    mean_cp.append(abs(cov_scaffold - cov_genome) * 100.0 / cov_genome)

            if len(mean_cp) == 0:
                # genome has zero coverage, which in general
                # will indicate something is wrong
                mean_cp = -1
                outlying_dists.append('COV_PERC')
            else:
                mean_cp = np_mean(mean_cp)
                if mean_cp > cov_perc:
                    outlying_dists.append('COV_PERC')

            # report outliers
            if (report_type == 'any' and len(outlying_dists) >= 1) or (report_type == 'all' and len(outlying_dists) >= 3):
                fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, stats.length, ','.join(outlying_dists)))
                fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (stats.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100))
                fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(stats.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp))
                fout.write('\n')

    sys.stdout.write('\n')

    fout.close()
def _percent_correct_plot(self, rel_dists, taxa_for_dist_inference, output_prefix): """Create plots showing correctly classified taxa for different relative distance values. Parameters ---------- rel_dists : d[rank_index][taxon] -> relative divergence Relative divergence of taxa at each rank. taxa_for_dist_inference : iterable Taxa to consider when inferring relative divergence thresholds. output_prefix : str Prefix for plots. """ print '' print ' Relative divergence thresholds (rank, threshold, parent taxa, child taxa):' ranks = sorted(rel_dists.keys()) rel_dist_thresholds = [] for i in xrange(ranks[0], ranks[-1]): parent_rank = i child_rank = i + 1 # determine classification results for relative divergence # values between the medians of adjacent taxonomic ranks parent_rds = [] for taxa, rd in rel_dists[parent_rank].iteritems(): if taxa in taxa_for_dist_inference: parent_rds.append(rd) parent_p50 = np_percentile(parent_rds, 50) child_rds = [] for taxa, rd in rel_dists[child_rank].iteritems(): if taxa in taxa_for_dist_inference: child_rds.append(rd) child_p50 = np_percentile(child_rds, 50) r = [] y_parent = [] y_child = [] y_mean_corr = [] for test_r in np_linspace(parent_p50, child_p50, 100): parent_cor = float(sum([1 for rd in parent_rds if rd <= test_r])) / len(parent_rds) child_cor = float(sum([1 for rd in child_rds if rd > test_r])) / len(child_rds) r.append(test_r) y_parent.append(parent_cor) y_child.append(child_cor) y_mean_corr.append(0.5 * parent_cor + 0.5 * child_cor) # create plot of correctly classified taxa self.fig.clear() self.fig.set_size_inches(6, 6) ax = self.fig.add_subplot(111) ax.plot(r, y_parent, 'k--', label=Taxonomy.rank_labels[i]) ax.plot(r, y_child, 'k:', label=Taxonomy.rank_labels[i + 1]) ax.plot(r, y_mean_corr, 'r-', label='mean') legend = ax.legend(loc='upper left') legend.draw_frame(False) # find maximum of mean correct classification max_mean = max(y_mean_corr) r_max_values = [r[i] for i, rd in enumerate(y_mean_corr) if rd == max_mean] r_max_value = np_mean(r_max_values) # Note: this will fail if there are multiple local maxima print ' %s\t%.3f\t%d\t%d' % (Taxonomy.rank_labels[parent_rank], r_max_value, len(parent_rds), len(child_rds)) # check that there is a single local maximum rd_indices = [i for i, rd in enumerate(y_mean_corr) if rd == max_mean] for rd_index in xrange(0, len(rd_indices) - 1): if rd_indices[rd_index] != rd_indices[rd_index + 1] - 1: print '[Warning] There are multiple local maxima, so estimated relative divergence threshold will be invalid.' rel_dist_thresholds.append(r_max_value) y_min, _y_max = ax.get_ylim() ax.axvline(x=r_max_value, ymin=0, ymax=1, color='r', ls='--') ax.text(r_max_value + 0.001, y_min + 0.01, '%.3f' % r_max_value, horizontalalignment='left') ax.set_xlabel('relative distance') ax.set_ylabel('% taxa correctly classified') self.prettify(ax) self.fig.tight_layout(pad=1) self.fig.savefig(output_prefix + '.%s_%s.png' % (Taxonomy.rank_labels[parent_rank], Taxonomy.rank_labels[child_rank]), dpi=96) print '' return rel_dist_thresholds
def optimal(self, input_tree, rank, min_dist, max_dist, step_size, output_table): """Determine branch length for best congruency with existing taxonomy. Parameters ---------- input_tree : str Name of input tree. rank : int Taxonomic rank to consider (1=Phylum, ..., 6=Species). output_table : str Name of output table. """ # read tree self.logger.info('Reading tree.') tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True) # get mean distance to terminal taxa for each node along with # other stats needed to determine classification self.logger.info('Determining MDTT for each node.') rank_prefix = Taxonomy.rank_prefixes[rank] child_rank_prefix = Taxonomy.rank_prefixes[rank+1] rank_info = [] rank_dists = set() for node in tree.seed_node.preorder_internal_node_iter(): if node == tree.seed_node: continue # check if node is at the specified rank node_taxon = None if node.label: support, taxon_name, _auxiliary_info = parse_label(node.label) if taxon_name: for taxon in [x.strip() for x in taxon_name.split(';')]: if taxon.startswith(rank_prefix): node_taxon = taxon if not node_taxon: continue # check that node has two descendants at the next rank child_rank_taxa = [] for c in node.levelorder_iter(): if c.label: support, taxon_name, _auxiliary_info = parse_label(c.label) if taxon_name: for taxon in [x.strip() for x in taxon_name.split(';')]: if taxon.startswith(child_rank_prefix): child_rank_taxa.append(taxon) if len(child_rank_taxa) >= 2: break if len(child_rank_taxa) < 2: continue # get mean branch length to terminal taxa dists_to_tips = [] for t in node.leaf_iter(): dists_to_tips.append(self._dist_to_ancestor(t, node)) node_dist = np_mean(dists_to_tips) # get mean branch length to terminal taxa for first ancestor spanning multiple phyla ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix) ancestor_dists_to_tips = [] for t in ancestor.leaf_iter(): ancestor_dists_to_tips.append(self._dist_to_ancestor(t, ancestor)) ancestor_dist = np_mean(ancestor_dists_to_tips) rank_info.append([node_dist, ancestor_dist, node_taxon]) rank_dists.add(node_dist) self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info)) fout = open('bl_optimal_taxa_dists.tsv' , 'w') fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n') for node_dist, ancestor_dist, node_taxon in rank_info: fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist)) fout.close() # report number of correct and incorrect taxa for each threshold fout = open(output_table, 'w') header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. 
Terminal Lineages' fout.write(header + '\n') print header top_correct = 0 top_incorrect = 0 top_precision = 0 top_threshold = None for d in np_arange(min_dist, max_dist+step_size, step_size): rank_dists.add(d) for dist_threshold in sorted(rank_dists, reverse=True): correct = 0 incorrect = 0 for node_dist, ancestor_dist, node_taxon in rank_info: # check if node/edge would be collapsed at the given threshold if node_dist <= dist_threshold and ancestor_dist > dist_threshold: correct += 1 elif node_dist > dist_threshold: incorrect += 1 # node itself would not be collapsed else: incorrect += 1 # above ancestor with multiple taxa denominator = correct + incorrect if denominator: precision = float(correct) / denominator else: precision = 0 num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold) row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold, correct, incorrect, precision, num_lineages + num_terminal_lineages, num_lineages, num_terminal_lineages) fout.write(row + '\n') print row if precision > top_precision: top_correct = correct top_incorrect = incorrect top_precision = precision top_threshold = dist_threshold fout.close() return top_threshold, top_correct, top_incorrect
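# Illustrative sketch of the precision sweep in `optimal`: for each candidate
# threshold a node is "correct" when it would be collapsed (node MDTT <= threshold)
# while its multi-taxa ancestor would not (ancestor MDTT > threshold), and precision
# is correct / total. The helper names below are assumptions for illustration.
def precision_at_threshold(rank_info, threshold):
    """rank_info: list of (node_mdtt, ancestor_mdtt, taxon) tuples."""
    correct = sum(1 for node_dist, ancestor_dist, _taxon in rank_info
                  if node_dist <= threshold and ancestor_dist > threshold)
    total = len(rank_info)
    return float(correct) / total if total else 0.0

def best_threshold(rank_info, thresholds):
    # returns the candidate threshold with the highest precision
    return max(thresholds, key=lambda t: precision_at_threshold(rank_info, t))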
def compatible(self, scaffolds_of_interest, scaffold_stats, genome_stats, gc_per, td_per, cov_corr, cov_perc, report_type, output_file): """Identify scaffolds with compatible genomic characteristics. Compatible scaffolds are identified based on GC content, tetranucleotide signatures, coverage profile correlation, and mean absolute percent error of coverage profile. The coverage correlation check is ignored if the coverage profile consists of a single value. Parameters ---------- scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology] Scaffolds to consider for compatibility. scaffold_stats : ScaffoldStats Statistics for individual scaffolds to check. genome_stats : GenomeStats Statistics for individual genomes. gc_per : int Percentile for identifying GC outliers. td_per : int Percentile for identifying TD outliers. cov_corr : int Correlation for identifying divergent coverage profiles. cov_perc : int Mean absolute percent error for identifying divergent coverage profiles. report_type : str Report scaffolds that are outliers in 'all' or 'any' distribution. output_file : str Name of output file. """ # read reference distributions from file self.logger.info('') self.logger.info(' Reading reference distributions.') self.gc_dist = self._read_distribution('gc_dist') self.td_dist = self._read_distribution('td_dist') # identify compatible scaffolds in each genome fout = open(output_file, 'w') fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions') fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per)) fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per) fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error') fout.write('\t# genes\t% genes with homology\n') genomic_signature = GenomicSignature(0) self.logger.info(' Identifying scaffolds compatible with bins.') processed_scaffolds = 0 for scaffold_id, ss in scaffold_stats.stats.iteritems(): processed_scaffolds += 1 sys.stdout.write(' Processed %d of %d (%.1f%%) scaffolds.\r' % (processed_scaffolds, len(scaffold_stats.stats), processed_scaffolds * 100.0 / len(scaffold_stats.stats))) sys.stdout.flush() if scaffold_id not in scaffolds_of_interest: continue for genome_id, gs in genome_stats.iteritems(): # find keys into GC and TD distributions # gc -> [mean GC][scaffold length][percentile] # td -> [scaffold length][percentile] closest_gc = find_nearest(self.gc_dist.keys(), gs.mean_gc / 100.0) sample_seq_len = self.gc_dist[closest_gc].keys()[0] d = self.gc_dist[closest_gc][sample_seq_len] gc_lower_bound_key = find_nearest(d.keys(), (100 - gc_per) / 2.0) gc_upper_bound_key = find_nearest(d.keys(), (100 + gc_per) / 2.0) td_bound_key = find_nearest(self.td_dist[self.td_dist.keys()[0]].keys(), td_per) # find GC and TD bounds closest_seq_len = find_nearest(self.gc_dist[closest_gc].keys(), ss.length) gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key] gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key] closest_seq_len = find_nearest(self.td_dist.keys(), ss.length) td_bound = self.td_dist[closest_seq_len][td_bound_key] # find changes from mean delta_gc = (ss.gc - gs.mean_gc) / 100.0 delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature) # determine if scaffold compatible compatible_dists = [] if delta_gc >= gc_lower_bound and delta_gc <= gc_upper_bound: compatible_dists.append('GC') if delta_td <= td_bound: 
compatible_dists.append('TD') corr_r = 1.0 if len(gs.mean_coverage) > 1: corr_r, _corr_p = pearsonr(gs.mean_coverage, ss.coverage) if corr_r >= cov_corr: compatible_dists.append('COV_CORR') mean_cp = [] for cov_genome, cov_scaffold in itertools.izip(gs.mean_coverage, ss.coverage): if cov_genome >= self.min_required_coverage: mean_cp.append(abs(cov_genome - cov_scaffold) * 100.0 / cov_genome) if len(mean_cp) > 0: mean_cp = np_mean(mean_cp) if mean_cp <= cov_perc: compatible_dists.append('COV_PERC') else: mean_cp = -1 # no sample with sufficient genome coverage to assess coverage error # report compatible scaffolds if (report_type == 'any' and len(compatible_dists) >= 1) or (report_type == 'all' and len(compatible_dists) >= 3): fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists))) fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100)) fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound)) fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp)) fout.write('\t%d\t%.1f' % (scaffolds_of_interest[scaffold_id][0], scaffolds_of_interest[scaffold_id][1])) fout.write('\n') sys.stdout.write('\n') fout.close()
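# Standalone sketch (illustrative names, not the original API) of the four
# compatibility tests applied in `compatible`: GC deviation within percentile
# bounds, tetranucleotide distance below its bound, Pearson correlation of
# coverage profiles (skipped for single-sample profiles, per the docstring),
# and mean absolute coverage percent error.
import numpy as np
from scipy.stats import pearsonr

def compatible_distributions(delta_gc, gc_bounds, delta_td, td_bound,
                             genome_cov, scaffold_cov, cov_corr, cov_perc,
                             min_required_coverage=1.0):
    checks = []
    if gc_bounds[0] <= delta_gc <= gc_bounds[1]:
        checks.append('GC')
    if delta_td <= td_bound:
        checks.append('TD')
    # correlation test is only meaningful with multi-sample coverage profiles
    if len(genome_cov) > 1 and pearsonr(genome_cov, scaffold_cov)[0] >= cov_corr:
        checks.append('COV_CORR')
    errors = [abs(g - s) * 100.0 / g for g, s in zip(genome_cov, scaffold_cov)
              if g >= min_required_coverage]
    if errors and np.mean(errors) <= cov_perc:
        checks.append('COV_PERC')
    return checks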
def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title): """Assess the relative distance to named groups across the full tree, the dereplicated tree, and each rooted tree in input_tree_dir, and plot and tabulate the results.""" # determine named clades in full tree named_clades = set() tree = dendropy.Tree.get_from_path(full_tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True) for node in tree.preorder_node_iter(): if node.label: taxonomy = node.label.split(';') named_clades.add(taxonomy[-1].strip().split(':')[-1]) print 'Identified %d named clades in full tree.' % len(named_clades) # determine named groups with at least the specified number of children print 'Determining taxa with sufficient named children lineages.' taxon_children = defaultdict(set) groups = defaultdict(list) print taxonomy_file for line in open(taxonomy_file): line_split = line.replace('; ', ';').split() genome_id = line_split[0] taxonomy = [x.strip() for x in line_split[1].split(';')] if len(taxonomy) > rank + 1: taxon_children[taxonomy[rank]].add(taxonomy[rank + 1]) if len(taxonomy) > rank: groups[taxonomy[rank]].append(genome_id) groups_to_consider = set() for taxon, children_taxa in taxon_children.iteritems(): if len(children_taxa) >= min_children and taxon in named_clades: groups_to_consider.add(taxon) print 'Assessing distribution over %d groups.' % len(groups_to_consider) # calculate relative distance for full tree print '' print 'Calculating relative distance over full tree.' tree = dendropy.Tree.get_from_path(full_tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True) full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups) if len(polyphyletic) > 0: print '' print '[Warning] Full tree contains polyphyletic groups.' # calculate relative distance for dereplicated tree print '' print 'Calculating relative distance over dereplicated tree.' tree = dendropy.Tree.get_from_path(derep_tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True) derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups) groups_to_consider = groups_to_consider - polyphyletic print 'Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider) # calculate relative distance to each group in each tree print '' rel_dists = defaultdict(list) dist_components = defaultdict(list) for f in os.listdir(input_tree_dir): if not f.endswith('.rooted.tree'): continue print f tree_file = os.path.join(input_tree_dir, f) tree = dendropy.Tree.get_from_path(tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True) # calculate relative distance to named taxa rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups) for taxon, dist in rel_dist.iteritems(): rel_dists[taxon].append(dist) dist_components[taxon].append(components[taxon]) # create scatter plot x = [] y = [] xDerep = [] yDerep = [] xFull = [] yFull = [] perc10 = [] perc90 = [] labels = [] fout = open(output_prefix + '.tsv', 'w') fout.write('Taxon\tP10\tP90\tP90-P10\tMean rel. dist\tMean dist to parent\tMean dist to leaves\tOriginal rel. 
dist.\tOriginal dist to parent\tOriginal dist to leaves\n') for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)): labels.append(taxon + ' (%d)' % (len(rel_dists[taxon]))) rd = rel_dists[taxon] for d in rd: x.append(d) y.append(i + 0.2) p10, p90 = np_percentile(rd, [10, 90]) perc10.append(p10) perc90.append(p90) print taxon, p90 - p10 mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0) derep_x, derep_a, derep_b = derep_dist_components[taxon] fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b)) xDerep.append(derep_rel_dist[taxon]) yDerep.append(i) xFull.append(full_rel_dist[taxon]) yFull.append(i) fout.close() self.fig.clear() self.fig.set_size_inches(8, len(rel_dists) * 0.4) ax = self.fig.add_subplot(111) ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s') ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s') ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*') for i in xrange(len(labels)): ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-') ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-') # set plot elements ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed') if title: ax.set_title(title, size=12) ax.set_xlabel('relative distance') ax.set_xticks(np_arange(0, 1.05, 0.1)) ax.set_xlim([-0.05, 1.05]) ax.set_ylabel('taxa') ax.set_yticks(xrange(0, len(rel_dists))) ax.set_ylim([-0.2, len(rel_dists) - 0.01]) ax.set_yticklabels(labels) self.prettify(ax) # make plot interactive # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels)) # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12)) # mpld3.save_html(fig, output_prefix + '.html') self.fig.tight_layout(pad=1) self.fig.savefig(output_prefix + '.png', dpi=300)
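# Small sketch of the per-taxon spread statistic reported above: the width of the
# 10th-90th percentile interval of relative distances collected across the trees.
# The function name is illustrative only.
import numpy as np

def percentile_spread(rel_dists):
    p10, p90 = np.percentile(rel_dists, [10, 90])
    return p10, p90, p90 - p10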
def table(self, input_tree, taxon_category_file, bl_step_size, output_table): """Produce table with number of lineages for increasing mean branch length thresholds. Parameters ---------- input_tree : str Name of input tree. taxon_category_file : str File indicating category for each taxon in the tree. bl_step_size : float Step size in table for mean branch length criterion. output_table : str Name of output table. """ # get category for each taxon taxon_category = {} for line in open(taxon_category_file): line_split = line.strip().split('\t') taxon_category[line_split[0]] = line_split[1] # read tree tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True) # determine mean distance to leaves and taxon categories for each node all_categories = set() node_info = {} parent_mean_dist_to_leafs = {} max_bl_threshold = 0.0 for i, node in enumerate(tree.seed_node.preorder_iter()): node.id = i if node.is_leaf(): mean_dist_to_leafs = 0.0 categories = set() for c in taxon_category[node.taxon.label].split('/'): categories.add(c) else: dist_to_leafs = [] categories = set() for t in node.leaf_iter(): dist_to_leafs.append(self._dist_to_ancestor(t, node)) for c in taxon_category[t.taxon.label].split('/'): categories.add(c) mean_dist_to_leafs = np_mean(dist_to_leafs) if node.parent_node: p = parent_mean_dist_to_leafs[node.parent_node.id] else: p = mean_dist_to_leafs + 1e-6 category = '/'.join(sorted(list(categories), reverse=True)) all_categories.add(category) node_info[node.id] = [mean_dist_to_leafs, p, category] parent_mean_dist_to_leafs[node.id] = mean_dist_to_leafs if mean_dist_to_leafs > max_bl_threshold: max_bl_threshold = mean_dist_to_leafs # write table fout = open(output_table, 'w') fout.write('Threshold') for c in all_categories: fout.write('\t%s' % c) fout.write('\n') for bl_threshold in np_arange(0, max_bl_threshold + bl_step_size, bl_step_size): category_count = defaultdict(int) stack = [tree.seed_node] while stack: node = stack.pop() mean_dist_to_leafs, _, category = node_info[node.id] if mean_dist_to_leafs > bl_threshold: for c in node.child_node_iter(): stack.append(c) else: category_count[category] += 1 # node meets mean branch length criterion if sum(category_count.values()) > 0: fout.write('%.3f' % bl_threshold) for c in all_categories: fout.write('\t%d' % category_count[c]) fout.write('\n') fout.close() if False: # disabled alternative implementation retained for reference; note that node_info is a dict above, so this block would need rework before being enabled node_info.sort() for bl_threshold in np_arange(0, node_info[-1][0] + bl_step_size, bl_step_size): category_count = defaultdict(int) for mean_bl_dist, parent_mean_bl_dist, category in node_info: if bl_threshold >= mean_bl_dist and bl_threshold < parent_mean_bl_dist: category_count[category] += 1 if sum(category_count.values()) > 0: fout.write('%.3f' % bl_threshold) for c in all_categories: fout.write('\t%d' % category_count[c]) fout.write('\n')
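# Illustrative sketch of the threshold sweep in `table`: walk the tree from the root
# with a stack, descend through nodes whose mean distance to their leaves exceeds the
# threshold, and count the category of each node at which the walk stops. The simple
# Node class here is an assumption standing in for the dendropy node objects.
from collections import defaultdict

class Node(object):
    def __init__(self, mean_dist_to_leaves, category, children=None):
        self.mean_dist_to_leaves = mean_dist_to_leaves
        self.category = category
        self.children = children or []

def count_lineages_at_threshold(root, bl_threshold):
    counts = defaultdict(int)
    stack = [root]
    while stack:
        node = stack.pop()
        if node.mean_dist_to_leaves > bl_threshold:
            stack.extend(node.children)  # keep descending; node is too deep to collapse
        else:
            counts[node.category] += 1   # node meets the mean branch length criterion
    return counts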
def decorate(self, input_tree, taxonomy_file, threshold, rank, retain_named_lineages, keep_labels, prune, output_tree): """Decorate internal nodes whose mean branch length to their leaf taxa falls below the given threshold. Parameters ---------- input_tree : str Name of input tree. taxonomy_file : str File with taxonomic information for each taxon. threshold : float Branch length threshold. rank : int Rank of labels to retain on tree. retain_named_lineages : bool Retain existing named lineages at the specified rank. keep_labels : bool Keep existing labels on tree. prune : bool Prune tree to preserve only the shallowest and deepest taxa in each lineage. output_tree : str Name of output tree. """ # read taxonomy taxonomy = Taxonomy().read(taxonomy_file) # read tree self.logger.info('Reading tree.') tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True) # decorate tree rank_prefix = Taxonomy.rank_prefixes[rank] new_name_number = defaultdict(int) ncbi_only = 0 sra_only = 0 labeled_nodes = set() stack = [tree.seed_node] while stack: node = stack.pop() # check if node is a leaf if node.is_leaf(): continue # check if ancestor already has a label at this rank p = node parent_taxon = None while p and not parent_taxon: if p.label: support, taxon_name, _auxiliary_info = parse_label(p.label) if taxon_name: for taxon in [x.strip() for x in taxon_name.split(';')]: if taxon.startswith(rank_prefix): parent_taxon = taxon p = p.parent_node if retain_named_lineages and parent_taxon: for c in node.child_node_iter(): stack.append(c) continue # check if descendant node already has a label at this rank children_taxon = [] for c in node.preorder_internal_node_iter(): if c.label: support, taxon_name, _auxiliary_info = parse_label(c.label) if taxon_name: for taxon in [x.strip() for x in taxon_name.split(';')]: if taxon.startswith(rank_prefix): children_taxon.append(taxon) if retain_named_lineages and children_taxon: for c in node.child_node_iter(): stack.append(c) continue # check if node meets mean branch length criterion dists_to_tips = [] for t in node.leaf_iter(): dists_to_tips.append(self._dist_to_ancestor(t, node)) if np_mean(dists_to_tips) > threshold: for c in node.child_node_iter(): stack.append(c) continue # count number of SRA and NCBI taxa below node num_sra_taxa = 0 num_ncbi_taxa = 0 taxa_labels = set() for t in node.leaf_iter(): if t.taxon.label.startswith('U_'): num_sra_taxa += 1 else: num_ncbi_taxa += 1 t = taxonomy[t.taxon.label] taxon = t[rank][3:].replace('Candidatus ', '') if taxon: taxa_labels.add(taxon) if parent_taxon: taxa_labels.add(parent_taxon[3:].replace('Candidatus ', '')) elif children_taxon: for c in children_taxon: taxa_labels.add(c[3:].replace('Candidatus ', '')) # name lineage based on its position relative to existing named lineages if taxa_labels: lineage_name = ', '.join(sorted(taxa_labels)) else: lineage_name = 'Unclassified lineage' support = None taxon_name = None if node.label: # preserve support information support, _taxon_name, _auxiliary_info = parse_label(node.label) new_name_number[lineage_name] += 1 if support: node.label = '%d:%s %d' % (support, lineage_name, new_name_number[lineage_name]) else: node.label = '%s %d' % (lineage_name, new_name_number[lineage_name]) labeled_nodes.add(node) if num_sra_taxa == 0: ncbi_only += 1 if num_ncbi_taxa == 0: sra_only += 1 # strip previous labels if not keep_labels: for node in tree.preorder_internal_node_iter(): if node in labeled_nodes: continue if node.label: # preserve support information support, _taxon_name, 
_auxiliary_info = parse_label(node.label) node.label = support # prune tree to shallowest and deepest taxa in each named lineage if prune: nodes_to_prune = set() for node in labeled_nodes: for c in node.child_node_iter(): dists = [] for t in c.leaf_iter(): d = self._dist_to_ancestor(t, node) dists.append((d, t)) dists.sort() # select taxa at the 10th and 90th percentiles to # give a good sense of the range of depths perc_10th_index = int(0.1 * len(dists) + 0.5) perc_90th_index = int(0.9 * len(dists) + 0.5) for i, (d, t) in enumerate(dists): if i != perc_10th_index and i != perc_90th_index: nodes_to_prune.add(t.taxon) print 'before prune', sum([1 for _ in tree.leaf_node_iter()]) tree.prune_taxa(nodes_to_prune) print 'after prune', sum([1 for _ in tree.leaf_node_iter()]) self.logger.info('Decorated %d internal nodes.' % sum(new_name_number.values())) #self.logger.info('NCBI-only %d; SRA-only %d' % (ncbi_only, sra_only)) tree.write_to_path(output_tree, schema='newick', suppress_rooting=True, unquoted_underscores=True)
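# Sketch of the pruning rule in `decorate`: within each child lineage, keep only the
# taxa whose depth below the labeled node sits at roughly the 10th and 90th
# percentiles, so the pruned tree still conveys the range of depths. The helper name
# and input structure are illustrative assumptions.
def taxa_to_keep(dists_and_taxa):
    """dists_and_taxa: list of (distance_to_labeled_node, taxon) tuples, unsorted."""
    ranked = sorted(dists_and_taxa, key=lambda dt: dt[0])
    perc_10th_index = int(0.1 * len(ranked) + 0.5)
    perc_90th_index = int(0.9 * len(ranked) + 0.5)
    keep = set()
    for i, (_d, taxon) in enumerate(ranked):
        if i in (perc_10th_index, perc_90th_index):
            keep.add(taxon)
    return keep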
def stats_for_qlp_well(well, compute_clusters=False, override_thresholds=None): """ Return statistics about a QLWell object read from a QLP file. The QLWell object should have a populated `peaks` attribute (reading from QLBs won't work) For parameter explanations and return values, see :func:`stats_for_qlp_well`. """ from pyqlb.nstats.peaks import cluster_1d, channel_amplitudes from pyqlb.nstats.well import accepted_peaks, above_min_amplitude_peaks, well_channel_sp_values, well_cluster_peaks from pyqlb.nstats.well import well_observed_positives_negatives, well_s2d_values, getClusters from pyqlb.nstats.well import high_flier_droplets, low_flier_droplets, singleRain_droplets, doubleRain_droplets, diagonal_scatter from numpy import mean as np_mean, std as np_std if not override_thresholds: override_thresholds = (None, None) statistics = well_statistics(well, override_thresholds=override_thresholds) accepted = len(accepted_peaks(well)) num_above_min = len(above_min_amplitude_peaks(well)) if num_above_min > 0 and accepted > 0: if well.sum_amplitude_bins: peaksets, boundaries, amps = revb_polydisperse_peaks(well, 0, threshold=override_thresholds[0]) poly_peaks = sum([len(p) for p in peaksets]) statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: peaksets, boundaries, width_gates = polydisperse_peaks(well, 0, threshold=override_thresholds[0]) poly_peaks = sum([len(p) for p in peaksets]) statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: statistics[0].revb_polydispersity_pct = 0 s, p_plus, p, p_minus = well_channel_sp_values(well, 0, override_threshold=override_thresholds[0]) statistics[0].s_value = s statistics[0].p_plus = p_plus statistics[0].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None statistics[0].p = p statistics[0].p_drops = int(p*accepted) if p is not None else None statistics[0].p_minus = p_minus statistics[0].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None if num_above_min > 0 and accepted > 0: if well.sum_amplitude_bins: peaksets, boundaries, amps = revb_polydisperse_peaks(well, 1, threshold=override_thresholds[1]) poly_peaks = sum([len(p) for p in peaksets]) statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: peaksets, boundaries, width_gates = polydisperse_peaks(well, 1, threshold=override_thresholds[1]) poly_peaks = sum([len(p) for p in peaksets]) statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min else: statistics[1].revb_polydispersity_pct = 0 s, p_plus, p, p_minus = well_channel_sp_values(well, 1, override_threshold=override_thresholds[1]) statistics[1].s_value = s statistics[1].p_plus = p_plus statistics[1].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None statistics[1].p = p statistics[1].p_drops = int(p*accepted) if p is not None else None statistics[1].p_minus = p_minus statistics[1].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None ## compute s2d plots s2d_vals = well_s2d_values( well, thresholds=override_thresholds) statistics[0].s2d_value = s2d_vals[0] if s2d_vals is not None else None statistics[1].s2d_value = s2d_vals[1] if s2d_vals is not None else None ## compute extra cluster metrics clusters = getClusters( well, override_thresholds ) dscatter = diagonal_scatter( clusters ) statistics.diagonal_scatter = dscatter[1] if dscatter is not None else None statistics.diagonal_scatter_pct = dscatter[2] *100 if dscatter is not None else None for channel in [0,1]: high_fliers = 
high_flier_droplets( clusters, channel ) statistics[channel].high_flier_value = high_fliers[1] if high_fliers is not None else None statistics[channel].high_flier_pct = high_fliers[2] * 100 if high_fliers is not None else None low_fliers = low_flier_droplets( clusters, channel ) statistics[channel].low_flier_value = low_fliers[1] if low_fliers is not None else None statistics[channel].low_flier_pct = low_fliers[2] * 100 if low_fliers is not None else None singleRain = singleRain_droplets( clusters, channel ) statistics[channel].single_rain_value = singleRain[1] if singleRain is not None else None statistics[channel].single_rain_pct = singleRain[2] * 100 if singleRain is not None else None doubleRain = doubleRain_droplets( clusters, channel ) statistics[channel].double_rain_value = doubleRain[1] if doubleRain is not None else None statistics[channel].double_rain_pct = doubleRain[2] * 100 if doubleRain is not None else None if compute_clusters: clusters = well_cluster_peaks(well, override_thresholds) else: clusters = {'positive_peaks': {'positive_peaks': [], 'negative_peaks': []}, 'negative_peaks': {'positive_peaks': [], 'negative_peaks': []}} # cheap hack statistics.alg_version = "%s.%s/%s.%s" % (well.statistics.peak_alg_major_version, well.statistics.peak_alg_minor_version, well.statistics.quant_alg_major_version, well.statistics.quant_alg_minor_version) statistics.ref_copy_num = well.ref_copy_num statistics[0].decision_tree = well.channels[0].decision_tree_verbose statistics[1].decision_tree = well.channels[1].decision_tree_verbose # end cheap hack # SNR for chan in (0,1): if override_thresholds[chan]: # TODO add this to pyqlb.nstats.well instead pos, neg = cluster_1d(accepted_peaks(well), chan, override_thresholds[chan]) else: pos, neg, unknown = well_observed_positives_negatives(well, chan) for attr, coll in (('positive_snr', pos),('negative_snr',neg)): if len(coll) > 0: # guard on the collection being scored, not just the positives amps = channel_amplitudes(coll, chan) amp_mean = np_mean(amps) amp_std = np_std(amps) if amp_std > 0: setattr(statistics[chan], attr, amp_mean/amp_std) else: setattr(statistics[chan], attr, 10000) else: setattr(statistics[chan], attr, 0) for channel in [0,1]: means,stds = total_events_amplitude_vals(well,channel) statistics[channel].total_events_amplitude_mean = means if means is not None else None statistics[channel].total_events_amplitude_stdev = stds if stds is not None else None return statistics, clusters
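# Minimal sketch of the per-cluster signal-to-noise figure computed above: mean
# amplitude divided by the standard deviation of amplitudes, with the same guards
# (0 for an empty cluster, a large sentinel when the spread is zero). The function
# name and sentinel default are illustrative.
import numpy as np

def cluster_snr(amplitudes, zero_std_sentinel=10000):
    if len(amplitudes) == 0:
        return 0
    amps = np.asarray(amplitudes, dtype=float)
    std = amps.std()
    return amps.mean() / std if std > 0 else zero_std_sentinel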
def get_value_for_data_only(self, values): """ Returns the mean, standard deviation and number of values """ return np_mean(values), np_std(values, ddof=1), np.size(values)
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file): """Create plot showing the distribution of taxa at each taxonomic rank. Parameters ---------- rel_dists: d[rank_index][taxon] -> relative divergence Relative divergence of taxa at each rank. taxa_for_dist_inference : iterable Taxa to considered when inferring distributions. distribution_table : str Desired name of output table with distribution information. plot_file : str Desired name of output plot. """ self.fig.clear() self.fig.set_size_inches(12, 6) ax = self.fig.add_subplot(111) # create normal distributions for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference] if len(v) < 2: continue u = np_mean(v) rv = norm(loc=u, scale=np_std(v)) x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000) nd = rv.pdf(x) # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2) # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2) # create percentile and classifciation boundary lines percentiles = {} for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference] if len(v) == 0: continue p10, p50, p90 = np_percentile(v, [10, 50, 90]) ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2) ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2) ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2) for b in [-0.2, -0.1, 0.1, 0.2]: boundary = p50 + b if boundary < 1.0 and boundary > 0.0: if abs(b) == 0.1: c = (1.0, 0.65, 0.0) # orange else: c = (1.0, 0.0, 0.0) ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2) percentiles[i] = [p10, p50, p90] # create scatter plot and results table fout = open(distribution_table, 'w') fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n') x = [] y = [] c = [] labels = [] rank_labels = [] for i, rank in enumerate(sorted(rel_dists.keys())): rank_label = Taxonomy.rank_labels[rank] rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank])) mono = [] poly = [] no_inference = [] for clade_label, dist in rel_dists[rank].iteritems(): x.append(dist) y.append(i) labels.append(clade_label) if is_integer(clade_label.split('^')[-1]): # taxa with a numerical suffix after a caret indicate # polyphyletic groups when decorated with tax2tree c.append((1.0, 0.0, 0.0)) poly.append(dist) elif clade_label not in taxa_for_dist_inference: c.append((0.3, 0.3, 0.3)) no_inference.append(dist) else: c.append((0.0, 0.0, 1.0)) mono.append(dist) # report results v = [clade_label, dist] if i in percentiles: p10, p50, p90 = percentiles[i] percentile_outlier = not (dist >= p10 and dist <= p90) v += percentiles[i] + [str(percentile_outlier)] else: percentile_outlier = 'Insufficent data to calculate percentiles' v += [-1,-1,-1] + [str(percentile_outlier)] fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v)) # histogram for each rank mono = np_array(mono) no_inference = np_array(no_inference) poly = np_array(poly) binwidth = 0.025 bins = np_arange(0, 1.0 + binwidth, binwidth) w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference)) n = 0 if len(mono) > 0: mono_max_count = max(np_histogram(mono, bins=bins)[0]) mono_weights = np_ones_like(mono) * (1.0 / mono_max_count) n, b, p = ax.hist(mono, bins=bins, color=(0.0, 0.0, 1.0), alpha=0.25, weights=0.9 * w * mono_weights, bottom=i, lw=0, zorder=0) if len(no_inference) > 0: no_inference_max_count = 
max(np_histogram(no_inference, bins=bins)[0]) no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count) ax.hist(no_inference, bins=bins, color=(0.3, 0.3, 0.3), alpha=0.25, weights=0.9 * (1.0 - w) * no_inference_weights, bottom=i + n, lw=0, zorder=0) if len(poly) > 0: poly_max_count = max(np_histogram(poly, bins=bins)[0]) poly_weights = np_ones_like(poly) * (1.0 / poly_max_count) ax.hist(poly, bins=bins, color=(1.0, 0.0, 0.0), alpha=0.25, weights=0.9 * (1.0 - w) * poly_weights, bottom=i + n, lw=0, zorder=0) fout.close() # overlay scatter plot elements scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1) # set plot elements ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed') ax.set_xlabel('relative distance') ax.set_xticks(np_arange(0, 1.05, 0.1)) ax.set_xlim([-0.05, 1.05]) ax.set_ylabel('rank (no. taxa)') ax.set_yticks(xrange(0, len(rel_dists))) ax.set_ylim([-0.2, len(rel_dists) - 0.01]) ax.set_yticklabels(rank_labels) self.prettify(ax) # make plot interactive mpld3.plugins.clear(self.fig) mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels)) mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10)) mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html') self.fig.tight_layout(pad=1) self.fig.savefig(plot_file, dpi=self.dpi)
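# Sketch of the percentile-outlier test used when writing the distribution table in
# _distribution_plot: a taxon is flagged when its relative divergence falls outside
# the 10th-90th percentile interval of the taxa used for distribution inference at
# that rank. Names are illustrative only.
import numpy as np

def percentile_outlier(dist, inference_dists):
    p10, p50, p90 = np.percentile(inference_dists, [10, 50, 90])
    return not (p10 <= dist <= p90), (p10, p50, p90)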
def get_value_for_data_only(self, values): """ Returns the mean of the values """ return np_mean(values)
def loadData(self, timer, condition, # condition as set by another function bids=[], # if this is set then only load those contigs with these bin ids verbose=True, # many to some output messages silent=False, # some to no output messages loadCovProfiles=True, loadKmerPCs=True, loadKmerVarPC=True, loadRawKmers=False, makeColors=True, loadContigNames=True, loadContigLengths=True, loadContigGCs=True, loadBins=False, loadLinks=False): """Load pre-parsed data""" timer.getTimeStamp() if(silent): verbose=False if verbose: print "Loading data from:", self.dbFileName try: self.numStoits = self.getNumStoits() self.condition = condition self.indices = self.dataManager.getConditionalIndices(self.dbFileName, condition=condition, silent=silent) if(verbose): print " Loaded indices with condition:", condition self.numContigs = len(self.indices) if self.numContigs == 0: print " ERROR: No contigs loaded using condition:", condition return if(not silent): print " Working with: %d contigs" % self.numContigs if(loadCovProfiles): if(verbose): print " Loading coverage profiles" self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices) self.normCoverages = self.dataManager.getNormalisedCoverageProfiles(self.dbFileName, indices=self.indices) # work out average coverages self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles]) if loadRawKmers: if(verbose): print " Loading RAW kmer sigs" self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices) if(loadKmerPCs): self.kmerPCs = self.dataManager.getKmerPCAs(self.dbFileName, indices=self.indices) if(verbose): print " Loading PCA kmer sigs (" + str(len(self.kmerPCs[0])) + " dimensional space)" self.kmerNormPC1 = np_copy(self.kmerPCs[:,0]) self.kmerNormPC1 -= np_min(self.kmerNormPC1) self.kmerNormPC1 /= np_max(self.kmerNormPC1) if(loadKmerVarPC): self.kmerVarPC = self.dataManager.getKmerVarPC(self.dbFileName, indices=self.indices) if(verbose): print " Loading PCA kmer variance (total variance: %.2f" % np_sum(self.kmerVarPC) + ")" if(loadContigNames): if(verbose): print " Loading contig names" self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices) if(loadContigLengths): self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices) if(verbose): print " Loading contig lengths (Total: %d BP)" % ( sum(self.contigLengths) ) if(loadContigGCs): self.contigGCs = self.dataManager.getContigGCs(self.dbFileName, indices=self.indices) if(verbose): print " Loading contig GC ratios (Average GC: %0.3f)" % ( np_mean(self.contigGCs) ) if(makeColors): if(verbose): print " Creating color map" # use HSV to RGB to generate colors S = 1 # SAT and VAL remain fixed at 1. Reduce to make V = 1 # Pastels if that's your preference... 
self.colorMapGC = self.createColorMapHSV() if(loadBins): if(verbose): print " Loading bin assignments" self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices) if len(bids) != 0: # need to make sure we're not restricted in terms of bins bin_stats = self.getBinStats() for bid in bids: try: self.validBinIds[bid] = bin_stats[bid][0] self.isLikelyChimeric[bid] = bin_stats[bid][1] except KeyError: self.validBinIds[bid] = 0 self.isLikelyChimeric[bid] = False else: bin_stats = self.getBinStats() for bid in bin_stats: self.validBinIds[bid] = bin_stats[bid][0] self.isLikelyChimeric[bid] = bin_stats[bid][1] # fix the binned indices self.binnedRowIndices = {} for i in range(len(self.indices)): if(self.binIds[i] != 0): self.binnedRowIndices[i] = True else: # we need zeros as bin indices then... self.binIds = np_zeros(len(self.indices)) if(loadLinks): self.loadLinks() self.stoitColNames = self.getStoitColNames() except: print "Error loading DB:", self.dbFileName, exc_info()[0] raise
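# Small sketch of the min-max normalisation applied to the first kmer PCA component
# in loadData (later used for colouring contigs). Written with plain numpy; the
# function name and the zero-range guard are assumptions, since the original code
# assumes a non-zero range.
import numpy as np

def minmax_normalise(values):
    v = np.array(values, dtype=float)
    v -= v.min()
    rng = v.max()
    return v / rng if rng > 0 else v  # guard added here; original divides unconditionally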
def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file): """Create plot showing the distribution of taxa at each taxonomic rank. Parameters ---------- rel_dists: d[rank_index][taxon] -> relative divergence Relative divergence of taxa at each rank. rel_dist_thresholds: list Relative distances cutoffs for defining ranks. taxa_for_dist_inference : iterable Taxa to considered when inferring distributions. distribution_table : str Desired name of output table with distribution information. plot_file : str Desired name of output plot. """ self.fig.clear() self.fig.set_size_inches(12, 6) ax = self.fig.add_subplot(111) # create normal distributions for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference] u = np_mean(v) rv = norm(loc=u, scale=np_std(v)) x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000) nd = rv.pdf(x) ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2) ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2) # create percentile lines percentiles = {} for i, rank in enumerate(sorted(rel_dists.keys())): v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference] p10, p50, p90 = np_percentile(v, [10, 50, 90]) ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2) ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2) ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2) percentiles[i] = [p10, p50, p90] # create scatter plot and results table fout = open(distribution_table, 'w') fout.write('Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n') x = [] y = [] c = [] labels = [] rank_labels = [] rel_dist_thresholds += [1.0] # append boundry for species for i, rank in enumerate(sorted(rel_dists.keys())): rank_label = Taxonomy.rank_labels[rank] rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank])) for clade_label, dist in rel_dists[rank].iteritems(): x.append(dist) y.append(i) labels.append(clade_label) if clade_label in taxa_for_dist_inference: c.append((0.0, 0.0, 0.5)) else: c.append((0.5, 0.5, 0.5)) p10, p50, p90 = percentiles[i] percentile_outlier = not (dist >= p10 and dist <= p90) if i == 0: rank_cutoff = rel_dist_thresholds[i] rank_outlier = dist > rank_cutoff else: rank_cutoff = rel_dist_thresholds[i] upper_rank_cutoff = rel_dist_thresholds[i - 1] rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff) v = [clade_label, dist, rank_cutoff, str(rank_outlier)] v += percentiles[i] + [str(percentile_outlier)] fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v)) fout.close() scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1) # set plot elements ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed') ax.set_xlabel('relative distance') ax.set_xticks(np_arange(0, 1.05, 0.1)) ax.set_xlim([-0.05, 1.05]) ax.set_ylabel('rank (no. 
taxa)') ax.set_yticks(xrange(0, len(rel_dists))) ax.set_ylim([-0.2, len(rel_dists) - 0.01]) ax.set_yticklabels(rank_labels) self.prettify(ax) # plot relative divergence threshold lines y_min, y_max = ax.get_ylim() for threshold in rel_dist_thresholds[0:-1]: # don't draw species boundary ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--') ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center') # make plot interactive mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels)) mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10)) mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html') self.fig.tight_layout(pad=1) self.fig.savefig(plot_file, dpi=96)
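# Sketch of the rank-outlier rule in this second _distribution_plot: for the highest
# rank a taxon is an outlier if it exceeds that rank's cutoff; for lower ranks it must
# fall between the cutoff of the rank above and its own cutoff. Names are illustrative.
def rank_outlier(dist, rank_index, rel_dist_thresholds):
    cutoff = rel_dist_thresholds[rank_index]
    if rank_index == 0:
        return dist > cutoff
    upper_cutoff = rel_dist_thresholds[rank_index - 1]
    return not (upper_cutoff <= dist <= cutoff)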