def non_iter_ls_inv_stft(stft_object):
    stft_data = stft_object['stft']
    origSigSize = stft_object['origSigSize']
    num_rows, _, _ = origSigSize
    shift_length = stft_object['shift_length']
    len_each_section, num_rows_overlap, _, _ = stft_data.shape
    # TODO: Isn't this just num_rows in the very beginning?
    # total_new_elements = (num_rows_overlap - 1) * shift_length + len_each_section
    win_info = stft_object['win_info']
    wVec = win_info(len_each_section)
    wVecSq = wVec**2

    vecC = np_arange(1, num_rows_overlap * shift_length, step=shift_length)
    # vecC = range(0, num_rows_overlap*shift_length-1, shift_length)

    DlsArr = np_zeros((num_rows, ))
    for j in vecC:
        tmpArr = np_arange(j - 1, len_each_section + j - 1)
        # tmpArr = np_arange(j, len_each_section+j)
        DlsArr[tmpArr] += wVecSq
    # DlsArrInv = 1/DlsArr

    invFT = math_sqrt(len_each_section) * np_ifft(stft_data, axis=0)
    # .real returns a *view* into the complex array, so the in-place
    # window multiply below is reflected in invFT_real as well
    invFT_real = invFT.real
    invFT *= wVec[:, np_newaxis, np_newaxis, np_newaxis]

    yEst = np_zeros(origSigSize)
    for index, j in enumerate(vecC):
        tmpArr = np_arange(j - 1, len_each_section + j - 1)
        yEst[tmpArr, :] += invFT_real[:, index, :]

    # sigOut = yEst * DlsArrInv[:, np_newaxis, np_newaxis]
    sigOut = yEst / DlsArr[:, np_newaxis, np_newaxis]
    return sigOut
def _modeCheck(Ser1):
    c = np_bincount(Ser1)
    i = np_argmax(c)  # the mode of Ser1
    rule = (i - 2 > Ser1) | (i + 2 < Ser1)
    index = np_arange(Ser1.shape[0])[rule]
    return index
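# Usage sketch (hypothetical data; assumes the aliased numpy imports used by
# _modeCheck). The mode of the sample below is 5, so only the value 20 at
# index 5 lies more than two units away and is flagged:
import numpy as np

sample = np.array([5, 5, 5, 6, 5, 20])
print(_modeCheck(sample))   # -> [5]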
def parse_matrix_part(matrix, szSub, ovSub):
    assert matrix.ndim == 3
    assert np_ndim(szSub) == 1
    assert len(szSub) == 3
    assert np_ndim(ovSub) == 1
    assert len(ovSub) == 3

    matrix_shape = np_asarray(matrix.shape, dtype=int)
    len_each_section, _, _ = szSub
    shift_length, _, _ = ovSub
    len_each_section_range = np_arange(len_each_section)

    matrix_shape = np_ceil((matrix_shape - szSub + 1) / ovSub).astype(int)
    num_rows_overlap, num_elements, num_beams = matrix_shape
    result_matrix = np_zeros((np_prod(szSub), np_prod(matrix_shape)))
    cnt = 0
    for i in range(num_beams):
        for j in range(num_elements):
            for k in range(num_rows_overlap):
                index_1 = len_each_section_range + k * shift_length
                index_2 = j
                index_3 = i
                tmp = matrix[index_1, index_2, index_3]
                result_matrix[:, cnt] = tmp
                cnt += 1
    return result_matrix
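# Minimal illustration (hypothetical data; assumes the aliased numpy imports
# above). parse_matrix_part extracts overlapping windows of length szSub[0]
# with stride ovSub[0] along axis 0, one window per column, mirroring
# MATLAB-style im2col behaviour:
import numpy as np

m = np.arange(8, dtype=float).reshape(8, 1, 1)
windows = parse_matrix_part(m, [4, 1, 1], [2, 1, 1])
print(windows.shape)   # (4, 3)
# columns: [0 1 2 3], [2 3 4 5], [4 5 6 7]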
def _two_sigma(Ser1):
    '''
    Ser1: a single column of the input DataFrame.
    '''
    rule = (Ser1.mean() - 2 * Ser1.std() > Ser1) | (Ser1.mean() + 2 * Ser1.std() < Ser1)
    index = np_arange(Ser1.shape[0])[rule]
    return index
def invert_index_list(indexes, length):
    '''
    Inverts an index list.
    indexes: List[int] or flat numpy array
    length: int. Length of the base list
    '''
    mask = np_ones(length, dtype='bool')
    mask[indexes] = False
    inverted_indexes = np_arange(length)[mask]
    return inverted_indexes
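# Usage sketch: given the indexes that were removed, recover the complement.
print(invert_index_list([1, 3], 6))   # -> [0 2 4 5]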
def fan_regular_pols(np_verts, np_pols,
                     np_distances, np_faces_id,
                     custom_normals,
                     index_offset=0,
                     use_custom_normals=False,
                     output_old_v_id=True,
                     output_old_face_id=True,
                     output_pols_groups=True):

    pols_number = np_pols.shape[0]
    pol_sides = np_pols.shape[1]
    v_pols = np_verts[np_pols]  # shape [num_pols, num_corners, 3]

    # np.any covers both the single- and multi-distance cases; the original
    # `(len(...) > 1 and np.any(...)) or np_distances != 0` raised an
    # ambiguous-truth-value error for multi-element all-zero distances
    if np.any(np_distances != 0):
        if use_custom_normals:
            normals = custom_normals
        else:
            normals = np_faces_normals(v_pols)
        average = np.sum(v_pols, axis=1) / pol_sides + normals * np_distances[:, np_newaxis]  # shape [num_pols, 3]
    else:
        average = np.sum(v_pols, axis=1) / pol_sides

    idx_offset = len(np_verts) + index_offset
    new_idx = np_arange(idx_offset, pols_number + idx_offset)
    new_pols = np.zeros([pols_number, pol_sides, 3], dtype=int)
    new_pols[:, :, 0] = np_pols
    new_pols[:, :, 1] = np_roll(np_pols, -1, axis=1)
    new_pols[:, :, 2] = new_idx[:, np_newaxis]

    old_vert_id = np_pols[:, 0].tolist() if output_old_v_id else []
    if output_old_face_id:
        old_face_id = np_repeat(np_faces_id[:, np_newaxis], pol_sides, axis=1).tolist()
    else:
        old_face_id = []
    if output_pols_groups:
        pols_groups = np_repeat(1, len(new_pols) * pol_sides).tolist()
    else:
        pols_groups = []

    return (average.tolist(),
            new_pols.reshape(-1, 3).tolist(),
            old_vert_id,
            old_face_id,
            pols_groups)
def getBreakpointsByCardinality(self, cardinality):
    if cardinality not in self.breakpointsByCardinality:
        frac = 1.0 / cardinality
        list_percent = []
        for i_fl in np_arange(frac, 1.0, frac):
            list_percent.append(i_fl)
        self.breakpointsByCardinality[cardinality] = (
            np_array(norm.ppf(list_percent)) * self.std + self.mean)
    return self.breakpointsByCardinality[cardinality]
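# What the method caches, shown standalone for cardinality 4 with mean 0 and
# std 1 (assumes scipy.stats.norm, as used above): the Gaussian quantiles at
# the 25/50/75% levels that SAX-style discretisation uses as breakpoints.
from scipy.stats import norm as _norm
import numpy as np

print(_norm.ppf(np.arange(0.25, 1.0, 0.25)))   # approx. [-0.6745  0.  0.6745]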
def stft(signal, len_each_section, frac_overlap, padding, win_info=boxcar):
    shift_length = round(len_each_section * (1. - frac_overlap))
    # shift_length = 2
    _, num_elements, num_beams = signal.shape
    zeroCrct = 0
    wVec_rectwin = win_info(len_each_section + zeroCrct)
    wVec = wVec_rectwin[zeroCrct // 2:len(wVec_rectwin) - zeroCrct // 2]

    allOvrlp = parse_matrix_part(signal, [len_each_section, 1, 1],
                                 [shift_length, 1, 1])
    num_rows_overlap = allOvrlp.shape[1] // (num_elements * num_beams)
    newShape = [len_each_section, num_rows_overlap, num_elements, num_beams]
    subOvrlp = allOvrlp.reshape(newShape, order="F")  # Matlab defaults to Fortran order
    startLocs = np_arange(num_rows_overlap * shift_length, step=shift_length)
    winOvrlp = subOvrlp * wVec[:, np_newaxis, np_newaxis, np_newaxis]
    stft_array = np_fft(winOvrlp, padding, axis=0)
    freq = np_arange(padding) / padding

    out = {
        'stft': stft_array,
        'freqs': freq,
        'startOffsets': startLocs,
        'len_each_section': len_each_section,
        'padding': padding,
        'win_info': win_info,
        'frac_overlap': frac_overlap,
        'shift_length': shift_length,
    }
    return out
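# Round-trip sketch (hypothetical sizes; assumes the aliased imports used by
# these functions, e.g. np_fft/np_ifft/math_sqrt, plus scipy's boxcar window).
# non_iter_ls_inv_stft expects an 'origSigSize' key that stft does not record,
# so it is added manually here. Note the inverse applies a
# sqrt(len_each_section) factor that the forward transform does not, so the
# reconstruction matches the input only up to that constant scale.
import numpy as np
from scipy.signal.windows import boxcar

sig = np.random.randn(1024, 4, 2)   # [samples, elements, beams]
obj = stft(sig, len_each_section=64, frac_overlap=0.5, padding=64, win_info=boxcar)
obj['origSigSize'] = sig.shape
rec = non_iter_ls_inv_stft(obj)
print(rec.shape)                               # (1024, 4, 2)
print(np.allclose(rec, np.sqrt(64) * sig))     # expected True (up to fp error)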
def naive_search_with_np(n):
    prime_numbers = np_array([2])
    i = prime_numbers[0]
    while len(prime_numbers) != n:
        i += 1
        i_is_prime = True  # must be reset before the trial-division loop
        for d in np_arange(2, i):
            if i % d == 0:
                i_is_prime = False
                break
        if i_is_prime:
            prime_numbers = np_append(prime_numbers, i)
    print('Bytes:', getsizeof(prime_numbers))
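# Usage note: the function only prints the byte size of the resulting array
# (getsizeof comes from the sys module). Growing with np_append reallocates
# and copies the whole array on every insertion, so this is a baseline for
# comparison rather than a practical sieve.
naive_search_with_np(10)   # e.g. "Bytes: <size of a 10-element int array>"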
def centers(c, side, edges, catan):
    """
    Input: (center, length of the inner side, number of edges, type of CATAN)
    Output: numpy array of "many" points
    """
    small_side = side  # the body referred to `small_side`; assumed to be the `side` argument
    alpha = 2.0 * pi / edges
    if catan == "CATAN_ext":
        i_y = int(1 * small_side / (3 * sqrt(3) - 1.0) + 0.5) * 1.3
        i_x = int(i_y * cos(alpha) + 0.5) * 2.3
        #i_y = int(1*small_side/(3*sqrt(3)-1.0)+0.5)*1.4
        #i_x = int(i_y*cos(alpha)+0.5)*2.4
        many, j, j_x = 30, 7, 3
    else:
        i_y = int(1.5 * small_side / (3 * sqrt(3) - 1.0) + 0.5) * 1.05
        i_x = int(i_y * cos(alpha) + 0.5) * 2.3
        many, j, j_x = 19, 5, 2

    centers = np_zeros((many, 2))
    raw_centers = np_zeros((many, 3))
    k = 0
    for j_y in np_arange(j) - j_x:
        if j == 5:
            j += 1
        for j_j in np_arange(j - abs(j_y) - 1):
            g = j - abs(j_y) - 1
            f = 0.5 * (g % 2 == 0)
            j_k = j_j - g / 2 + f
            #print j-abs(j_y)-1,(j_j,j_y),j_k
            centers[k, :] = [int(c[0] + i_x * j_k + 0.5),
                             int(c[1] + i_y * j_y + 0.5)]
            raw_centers[k, :] = [j - abs(j_y) - 1, j_j, j_y]
            k += 1
    #print centers
    return centers, raw_centers
def process_impropers(atom, dihe_list, func="imp", verbose=False,
                      forzmatrix=False, dihed_count=None):
    alist = [atom,
             atom.get_atom_list()[0],
             atom.get_atom_list()[1],
             atom.get_atom_list()[2]]
    looked_up_param, centeratom_name, dihed_identifier = return_params(func, alist)
    if looked_up_param is None:
        # dihed was not specified
        pass
    else:
        # was specified. Now, check whether it was given double
        params = looked_up_param.split(",")
        if len(params) == 1:
            d = Dihedral(*alist,
                         func=func,
                         param=params[0],
                         was_input=False,
                         is_multiparam=False,
                         has_siblings=False,
                         dihed_idx=dihed_count)
            dihed_count += 1
            dihe_list.append(d)
        elif len(params) > 1:
            siblings = np_arange(start=dihed_count, stop=dihed_count + len(params))
            if verbose:
                print_verbose_dihedral_message(func, alist, params)
            for param in params:
                d = Dihedral(*alist,
                             func=func,
                             param=param,
                             was_input=True,
                             is_multiparam=True,
                             around_center_atom=atom.get_atomtype() == centeratom_name,
                             siblings=siblings,
                             dihed_idx=dihed_count)
                dihed_count += 1
                dihe_list.append(d)
    if not forzmatrix:
        pass
    return dihe_list, dihed_count
def Skew(x, y, dat, noise=3):
    interp = sp_interp2d(x, y, dat)
    dx = x[1] - x[0]
    ySkew = np_arange(np_amin(y) - dx * x.size, np_amax(y), dx)
    DAT = np_empty((ySkew.size, x.size))
    yMax = np_amax(y)
    yMin = np_amin(y)
    for i in range(ySkew.size):
        for j in range(x.size):
            if ySkew[i] + j * dx > yMax or ySkew[i] + j * dx < yMin:
                DAT[i, j] = (np_rand(1) - 0.5) * noise
            else:
                DAT[i, j] = interp(x[j], ySkew[i] + j * dx)
    return ySkew, DAT
def plot(self, unchanged, passive, active, xticklabels, ylabel):
    """Create stacked bar plot."""
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axis = self.fig.add_subplot(111)

    ind = np_arange(len(unchanged))
    width = 0.7

    unchanged = np_array(unchanged)
    passive = np_array(passive)
    active = np_array(active)

    p1 = axis.bar(ind, unchanged, width, color='#80b1d3')
    p2 = axis.bar(ind, passive, width, bottom=unchanged, color='#fdae6b')
    p3 = axis.bar(ind, active, width, bottom=unchanged + passive, color='#b3de69')

    axis.set_ylim([0, 100])
    axis.set_yticks(range(0, 101, 10))
    axis.set_ylabel(ylabel)

    axis.set_xticks(ind)
    axis.set_xticklabels(xticklabels)

    axis.yaxis.grid(True, linestyle='-', which='major',
                    color='lightgrey', alpha=0.7, zorder=1)
    axis.set_axisbelow(True)

    self.prettify(axis)

    axis.legend((p3[0], p2[0], p1[0]),
                ('Active change', 'Passive change', 'Unchanged'),
                fontsize=self.options.tick_font_size,
                loc='upper left',
                bbox_to_anchor=(1, 1),
                frameon=False)

    #self.fig.tight_layout(pad=1.0, w_pad=0.1, h_pad=0.1)
    self.draw()
def __init_matches(self):
    for match_type, var in [['qm', 'qualification_matches'],
                            ['qf', 'quarter_final_matches'],
                            ['sf', 'semi_final_matches'],
                            ['f', 'final_matches']]:
        num_matches = self.__count_matches(self.raw_matches, match_type)
        if num_matches != 0:  # `is not 0` relied on int interning; compare with != instead
            # zero = range(num_matches)
            red_teams = np_zeros((num_matches,), np_object)
            blue_teams = np_zeros((num_matches,), np_object)
            blue_scores = np_zeros((num_matches,), np_object)
            red_scores = np_zeros((num_matches,), np_object)
            match_code = np_zeros((num_matches,), np_object)
            match_numbers = np_arange(1, num_matches + 1, 1)
            for match in self.raw_matches:
                if match['comp_level'] == match_type:
                    match_num = match['match_number'] - 1
                    red_teams[match_num] = [np_int(match['alliances']['red']['teams'][0][3:]),
                                            np_int(match['alliances']['red']['teams'][1][3:]),
                                            np_int(match['alliances']['red']['teams'][2][3:])]
                    red_scores[match_num] = [-1 if match['alliances']['red']['score'] is None
                                             else match['alliances']['red']['score'],
                                             -1 if match['score_breakdown']['red']['auto'] is None
                                             else match['score_breakdown']['red']['auto'],
                                             -1 if match['score_breakdown']['red']['foul'] is None
                                             else match['score_breakdown']['red']['foul']]
                    blue_teams[match_num] = [np_int(match['alliances']['blue']['teams'][0][3:]),
                                             np_int(match['alliances']['blue']['teams'][1][3:]),
                                             np_int(match['alliances']['blue']['teams'][2][3:])]
                    blue_scores[match_num] = [-1 if match['alliances']['blue']['score'] is None
                                              else match['alliances']['blue']['score'],
                                              -1 if match['score_breakdown']['blue']['auto'] is None
                                              else match['score_breakdown']['blue']['auto'],
                                              -1 if match['score_breakdown']['blue']['foul'] is None
                                              else match['score_breakdown']['blue']['foul']]
                    match_code[match_num] = match['key']
            red_win = np_array(red_scores.tolist())[:, 0] > np_array(blue_scores.tolist())[:, 0]
            winner = np_array(['blue'] * len(red_win))
            winner[red_win] = 'red'
            self.__setattr__(var,
                             np_rot90(np_array([[match_type] * num_matches,
                                                match_numbers,
                                                red_teams,
                                                blue_teams,
                                                red_scores,
                                                blue_scores,
                                                winner,
                                                match_code], np_object))[::-1])
def plot(self, both, isolate, env, xticklabels):
    """Create stacked bar plot."""
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axis = self.fig.add_subplot(111)

    ind = np_arange(len(both))
    width = 0.7

    both = np_array(both)
    isolate = np_array(isolate)
    env = np_array(env)

    p1 = axis.bar(ind, both, width, color='#80b1d3')
    p2 = axis.bar(ind, isolate, width, bottom=both, color='#fdae6b')
    p3 = axis.bar(ind, env, width, bottom=both + isolate, color='#b3de69')

    axis.set_ylim([0, 100])
    axis.set_yticks(range(0, 101, 10))
    axis.set_ylabel('Taxa (%)')

    axis.set_xticks(ind)
    axis.set_xticklabels(xticklabels)

    axis.yaxis.grid(True, linestyle='-', which='major',
                    color='lightgrey', alpha=0.7, zorder=1)
    axis.set_axisbelow(True)

    self.prettify(axis)

    axis.legend((p3[0], p2[0], p1[0]),
                ('Exclusively MAGs and/or SAGs',
                 'Exclusively isolates',
                 'Isolate and environmental genomes'),
                fontsize=self.options.tick_font_size,
                loc='upper left',
                bbox_to_anchor=(1, 1),
                frameon=False)

    #self.fig.tight_layout(pad=1.0, w_pad=0.1, h_pad=0.1)
    self.draw()
def plot(self, plot_latinized, plot_placeholder, xticklabels):
    """Create stacked bar plot."""
    self.fig.clear()
    self.fig.set_size_inches(self.options.width, self.options.height)
    axis = self.fig.add_subplot(111)

    ind = np_arange(len(plot_latinized))
    width = 0.7

    plot_latinized = np_array(plot_latinized)
    plot_placeholder = np_array(plot_placeholder)

    p1 = axis.bar(ind, plot_latinized, width, color='#80b1d3')
    p2 = axis.bar(ind, plot_placeholder, width, bottom=plot_latinized, color='#fdae6b')

    axis.set_ylim([0, 100])
    axis.set_yticks(range(0, 101, 10))
    axis.set_ylabel('Taxa (%)')

    axis.set_xticks(ind)
    axis.set_xticklabels(xticklabels)

    axis.yaxis.grid(True, linestyle='-', which='major',
                    color='lightgrey', alpha=0.7, zorder=1)
    axis.set_axisbelow(True)

    self.prettify(axis)

    axis.legend((p2[0], p1[0]),
                ('Placeholder', 'Latinized'),
                fontsize=self.options.tick_font_size,
                loc='upper left',
                bbox_to_anchor=(1, 1),
                frameon=False)

    #self.fig.tight_layout(pad=1.0, w_pad=0.1, h_pad=0.1)
    self.draw()
def split_examples(X, y, percent_valid=PERCENT_VALID, percent_test=PERCENT_TEST):
    '''
    Split by target after selecting a single frequency.
    So X.shape is probably [24, 2377, 65, 2].
    '''
    # TODO: prove that the split indices for X match those for y.
    # ## Train, Valid, Test split
    if X.ndim != 4:
        raise ValueError('X.ndim should be 4, but X.shape is {}'.format(X.shape))
    num_examples_all = len(X)
    assert len(y) == num_examples_all
    nums_splits = [int((1 - percent_valid - percent_test) * num_examples_all),
                   int((1 - percent_test) * num_examples_all)]
    indices_original = np_arange(num_examples_all, dtype=int)
    indices_shuffled = np_random_permutation(indices_original)
    indices_train, indices_valid, indices_test = np_split(indices_shuffled, nums_splits)  # pylint: disable=W0632
    X_train, X_valid, X_test = X[indices_train, :, :], X[indices_valid, :, :], X[indices_test, :, :]
    y_train, y_valid, y_test = y[indices_train, :, :], y[indices_valid, :, :], y[indices_test, :, :]
    return (X_train, X_valid, X_test), (y_train, y_valid, y_test)
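# Shape-level sketch with random data (hypothetical sizes; PERCENT_VALID and
# PERCENT_TEST are module constants assumed elsewhere, and np_random_permutation
# / np_split are the aliased numpy functions used above). y must be at least
# 3-D because of the y[indices, :, :] indexing:
import numpy as np

X_demo = np.random.randn(24, 2377, 65, 2)
y_demo = np.random.randn(24, 65, 2)
(X_tr, X_va, X_te), (y_tr, y_va, y_te) = split_examples(
    X_demo, y_demo, percent_valid=0.2, percent_test=0.1)
print(len(X_tr), len(X_va), len(X_te))   # -> 16 5 3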
def setUpClass(cls):
    super(TestTsManagement, cls).setUpClass()
    # ts_one=
    # [[ 0  1]
    #  [ 2  3]
    #  [ 4  5]
    #  ...
    #  [92 93]
    #  [94 95]
    #  [96 97]
    #  [98 99]]
    cls.ts_one = np_arange(100).reshape(50, 2)

    # ts_two=
    # [[198 199]
    #  [196 197]
    #  [194 195]
    #  [192 193]
    #  ...
    #  [104 105]
    #  [102 103]
    #  [100 101]]
    cls.ts_two = cls.ts_one[50::-1] + 100

    # reverse sort of ts_two:
    #
    # ts_inverse=
    # [[100 101]
    #  [102 103]
    #  [104 105]
    #  ...
    #  [194 195]
    #  [196 197]
    #  [198 199]]
    cls.ts_inverse = cls.ts_two[::-1]
def deConvolve(self, G_w, noise_dT=3, noise_avg=3, fMax=2.4):
    self.reGrid(noise_dT=noise_dT, noise_avg=noise_avg)
    self.tPumpDeconv = np_arange(np_amin(self.tPump), np_amax(self.tPump),
                                 self.tTHz[1] - self.tTHz[0])
    loc = np_amin(np_where(self.f >= fMax))
    for i in range(self.tPumpSkew.size):
        self.dTSkewFFT[i, :loc] = self.dTSkewFFT[i, :loc] / G_w[:loc]
        self.avgSkewFFT[i, :loc] = self.avgSkewFFT[i, :loc] / G_w[:loc]
    self.dTskew = np_irfft(self.dTSkewFFT, axis=1)
    self.avgSkewFFT = np_irfft(self.avgSkewFFT, axis=1)
    self.dTdeconv = unSkew(self.tTHz, self.tPump, self.tPumpSkew, self.dTskew)
    self.avgDeconv = unSkew(self.tTHz, self.tPump, self.tPumpSkew, self.avgSkewFFT)
    self.refDeconv = self.avgDeconv - self.dTdeconv
    self.pumpDeconv = self.avgDeconv + self.dTdeconv
    self.refFFTdeconv = np_rfft(self.refDeconv, axis=1)
    self.pumpFFTdeconv = np_rfft(self.pumpDeconv, axis=1)
    self.transDeconv = self.pumpFFTdeconv / self.refFFTdeconv
    return
def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file,
        taxonomy_file, output_prefix, min_children, title):
    # determine named clades in full tree
    named_clades = set()
    tree = dendropy.Tree.get_from_path(full_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    for node in tree.preorder_node_iter():
        if node.label:
            taxonomy = node.label.split(';')
            named_clades.add(taxonomy[-1].strip().split(':')[-1])
    print('Identified %d named clades in full tree.' % len(named_clades))

    # determine named groups with at least the specified number of children
    print('Determining taxa with sufficient named children lineages.')
    taxon_children = defaultdict(set)
    groups = defaultdict(list)
    print(taxonomy_file)
    for line in open(taxonomy_file):
        line_split = line.replace('; ', ';').split()
        genome_id = line_split[0]
        taxonomy = [x.strip() for x in line_split[1].split(';')]

        if len(taxonomy) > rank + 1:
            taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

        if len(taxonomy) > rank:
            groups[taxonomy[rank]].append(genome_id)

    groups_to_consider = set()
    for taxon, children_taxa in taxon_children.items():
        if len(children_taxa) >= min_children and taxon in named_clades:
            groups_to_consider.add(taxon)
    print('Assessing distribution over %d groups.' % len(groups_to_consider))

    # calculate RED for full tree
    print('')
    print('Calculating RED over full tree.')
    tree = dendropy.Tree.get_from_path(full_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(
        tree, groups_to_consider, groups)
    if len(polyphyletic) > 0:
        print('')
        print('[Warning] Full tree contains polyphyletic groups.')

    # calculate RED for dereplicated tree
    print('')
    print('Calculating RED over dereplicated tree.')
    tree = dendropy.Tree.get_from_path(derep_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(
        tree, groups_to_consider, groups)
    groups_to_consider = groups_to_consider - polyphyletic
    print('Assessing distribution over %d groups after removing polyphyletic groups in original trees.'
          % len(groups_to_consider))

    # calculate RED to each group in each tree
    print('')
    rel_dists = defaultdict(list)
    dist_components = defaultdict(list)
    for f in os.listdir(input_tree_dir):
        if not f.endswith('.rooted.tree'):
            continue
        print(f)

        tree_file = os.path.join(input_tree_dir, f)
        tree = dendropy.Tree.get_from_path(tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance to named taxa
        rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(
            tree, groups_to_consider, groups)
        for taxon, dist in rel_dist.items():
            rel_dists[taxon].append(dist)
            dist_components[taxon].append(components[taxon])

    # create scatter plot
    x = []
    y = []
    xDerep = []
    yDerep = []
    xFull = []
    yFull = []
    perc10 = []
    perc90 = []
    labels = []
    fout = open(output_prefix + '.tsv', 'w')
    fout.write('Taxon\tP10\tP90\tP90-P10\tMean RED\tMean dist to parent\tMean dist to leaves\tOriginal RED\tOriginal dist to parent\tOriginal dist to leaves\n')
    for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
        labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

        rd = rel_dists[taxon]
        for d in rd:
            x.append(d)
            y.append(i + 0.2)

        p10, p90 = np_percentile(rd, [10, 90])
        perc10.append(p10)
        perc90.append(p90)
        print(taxon, p90 - p10)

        mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
        derep_x, derep_a, derep_b = derep_dist_components[taxon]
        fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n'
                   % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

        xDerep.append(derep_rel_dist[taxon])
        yDerep.append(i)

        xFull.append(full_rel_dist[taxon])
        yFull.append(i)
    fout.close()

    self.fig.clear()
    self.fig.set_size_inches(8, len(rel_dists) * 0.4)
    ax = self.fig.add_subplot(111)

    ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
    ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
    ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')
    for i in range(len(labels)):
        ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
        ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    if title:
        ax.set_title(title, size=12)

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('taxa')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(labels)

    self.prettify(ax)

    # make plot interactive
    # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))
    # mpld3.save_html(fig, output_prefix + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(output_prefix + '.png', dpi=300)
def __init_alliances(self):
    alliances = [[team[3:] for team in alliance['picks']]
                 for alliance in self.raw_event['alliances']]
    alliances = np_array(alliances, np_int)
    numbers = np_vstack(np_arange(1, 9, 1))
    self.alliances = np_concatenate((numbers, alliances), 1)
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """
    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]
        if not v:
            # no taxa at this rank suitable for creating classification boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 1.0 > boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if self._is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(md)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(md)

        # histogram for each rank
        n = 0
        if len(mono) > 0:
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)

            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n, b, p = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)
            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)
            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(list(range(0, len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def clean_meshes(vertices, edges, faces,
                 remove_unreferenced_edges=False,
                 remove_unreferenced_faces=False,
                 remove_duplicated_edges=False,
                 remove_duplicated_faces=False,
                 remove_degenerated_edges=False,
                 remove_degenerated_faces=False,
                 remove_loose_verts=False,
                 calc_verts_idx=False,
                 calc_edges_idx=False,
                 calc_faces_idx=False):
    '''
    Cleans a group of meshes using different routines.
    Returns cleaned meshes and the indexes of the removed items.
    '''
    verts_out, edges_out, faces_out = [], [], []
    verts_removed_out, edges_removed_out, faces_removed_out = [], [], []

    for verts_original, edges_original, faces_original in zip(vertices, edges, faces):
        verts_changed, edges_changed, faces_changed = False, False, False
        preserved_edges_idx = []
        preserved_faces_idx = []

        if remove_unreferenced_edges:
            edges, preserved_edges_mask = remove_unreferenced_topology(edges_original, len(verts_original))
            preserved_edges_idx = np_arange(len(edges_original))[preserved_edges_mask]
            edges_changed = True

        if remove_unreferenced_faces:
            faces, preserved_faces_mask = remove_unreferenced_topology(faces_original, len(verts_original))
            preserved_faces_idx = np_arange(len(faces_original))[preserved_faces_mask]
            faces_changed = True

        if remove_duplicated_edges:
            if edges_changed:
                edges, unique_edges_mask = get_unique_topology(edges)
                preserved_edges_idx = preserved_edges_idx[unique_edges_mask]
            else:
                edges, unique_edges_mask = get_unique_topology(edges_original)
                preserved_edges_idx = np_arange(len(edges_original))[unique_edges_mask]
            edges_changed = True

        if remove_duplicated_faces:
            if faces_changed:
                faces, unique_faces_mask = get_unique_topology(faces)
                preserved_faces_idx = preserved_faces_idx[unique_faces_mask]
            else:
                faces, unique_faces_mask = get_unique_topology(faces_original)
                preserved_faces_idx = np_arange(len(faces_original))[unique_faces_mask]
            faces_changed = True

        if remove_degenerated_edges:
            if edges_changed:
                edges, non_coincident_mask = non_coincident_edges(edges)
                preserved_edges_idx = preserved_edges_idx[non_coincident_mask]
            else:
                edges, non_coincident_mask = non_coincident_edges(edges_original)
                preserved_edges_idx = np_arange(len(edges_original))[non_coincident_mask]
            edges_changed = True

        if remove_degenerated_faces:
            if faces_changed:
                faces, non_redundant_mask = non_redundant_faces_indices(faces)
                preserved_faces_idx = preserved_faces_idx[non_redundant_mask]
            else:
                faces, non_redundant_mask = non_redundant_faces_indices(faces_original)
                preserved_faces_idx = np_arange(len(faces_original))[non_redundant_mask]
            faces_changed = True

        if remove_loose_verts:
            verts, edges, faces, removed_verts_idx = remove_unreferenced_verts(
                verts_original,
                edges if edges_changed else edges_original,
                faces if faces_changed else faces_original)
            verts_changed = True
            edges_changed = True
            faces_changed = True

        if verts_changed:
            verts_out.append(verts)
            if calc_verts_idx:
                verts_removed_out.append(removed_verts_idx)
            else:
                verts_removed_out.append([])
        else:
            verts_out.append(verts_original)
            verts_removed_out.append([])

        if edges_changed:
            edges_out.append(edges)
            if calc_edges_idx and len(preserved_edges_idx) > 0:
                edges_removed_out.append(
                    invert_index_list(preserved_edges_idx, len(edges_original)).tolist())
            else:
                edges_removed_out.append([])
        else:
            edges_out.append(edges_original)
            edges_removed_out.append([])

        if faces_changed:
            faces_out.append(faces)
            if calc_faces_idx and len(preserved_faces_idx) > 0:
                faces_removed_out.append(
                    invert_index_list(preserved_faces_idx, len(faces_original)).tolist())
            else:
                faces_removed_out.append([])
        else:
            faces_out.append(faces_original)
            faces_removed_out.append([])

    return verts_out, edges_out, faces_out, verts_removed_out, edges_removed_out, faces_removed_out
def process_torsionals(atom, dihe_list, func="prop", verbose=False,
                       forzmatrix=False, dihed_count=None):
    # vizinho = "neighbour": the torsional is built from the two bonded
    # neighbours of `atom` and their own neighbours
    vizinho1 = atom.get_atom_list()[0]
    vizinho2 = atom.get_atom_list()[1]

    for vizinhoTemp in vizinho1.get_atom_list():
        alist = [vizinhoTemp, vizinho1, atom, vizinho2]
        if not (vizinhoTemp is None) and (vizinhoTemp != atom):
            looked_up_param, centeratom_name, dihed_identifier = return_params(func, alist)
            if looked_up_param is None:
                if forzmatrix:
                    # dihed was not specified
                    d = Dihedral(*alist, func, None, dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
            else:
                # was specified. Now, check whether it was given double
                params = looked_up_param.split(",")
                if len(params) == 1:
                    d = Dihedral(*alist,
                                 func=func,
                                 param=params[0],
                                 was_input=False,
                                 is_multiparam=False,
                                 has_siblings=False,
                                 dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
                    if not forzmatrix:
                        break
                elif len(params) > 1:
                    # dihedrals that belong to the same parameter set
                    siblings = np_arange(start=dihed_count, stop=dihed_count + len(params))
                    if verbose:
                        print_verbose_dihedral_message(func, alist, params)
                    for param in params:
                        if atom.get_was_central_atom():
                            if not (dihed_identifier in atom.get_was_central_atom_for()):
                                d = Dihedral(*alist,
                                             func=func,
                                             param=param,
                                             was_input=True,
                                             isDetermined=True,
                                             is_multiparam=True,
                                             around_center_atom=atom.get_atomtype() == centeratom_name,
                                             has_siblings=True,
                                             siblings=siblings,
                                             dihed_idx=dihed_count)
                                dihed_count += 1
                                dihe_list.append(d)
                        else:
                            d = Dihedral(*alist,
                                         func=func,
                                         param=param,
                                         was_input=True,
                                         isDetermined=True,
                                         is_multiparam=True,
                                         around_center_atom=atom.get_atomtype() == centeratom_name,
                                         has_siblings=True,
                                         siblings=siblings,
                                         dihed_idx=dihed_count)
                            dihed_count += 1
                            dihe_list.append(d)
                    if not forzmatrix:
                        atom.set_was_central_atom(True)  # was used as central atom for multiparam dihed
                        atom.append_was_central_atom_for(dihed_identifier)
                        break

    for vizinhoTemp in vizinho2.get_atom_list():
        alist = [vizinhoTemp, vizinho2, atom, vizinho1]
        if not (vizinhoTemp is None) and vizinhoTemp != atom:
            looked_up_param, centeratom_name, dihed_identifier = return_params(func, alist)
            if looked_up_param is None:
                if forzmatrix:
                    # dihed was not specified
                    d = Dihedral(*alist, func, None, dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
            else:
                # was specified. Now, check whether it was given double
                params = looked_up_param.split(",")
                if len(params) == 1:
                    d = Dihedral(*alist,
                                 func=func,
                                 param=params[0],
                                 was_input=False,
                                 is_multiparam=False,
                                 has_siblings=False,
                                 dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
                    if not forzmatrix:
                        break
                elif len(params) > 1:
                    # dihedrals that belong to the same parameter set
                    siblings = np_arange(start=dihed_count, stop=dihed_count + len(params))
                    if verbose:
                        print_verbose_dihedral_message(func, alist, params)
                    for param in params:
                        if atom.get_was_central_atom():
                            if not (dihed_identifier in atom.get_was_central_atom_for()):
                                d = Dihedral(*alist,
                                             func=func,
                                             param=param,
                                             was_input=True,
                                             isDetermined=True,
                                             is_multiparam=True,
                                             around_center_atom=atom.get_atomtype() == centeratom_name,
                                             has_siblings=True,
                                             siblings=siblings,
                                             dihed_idx=dihed_count)
                                dihed_count += 1
                                dihe_list.append(d)
                        else:
                            d = Dihedral(*alist,
                                         func=func,
                                         param=param,
                                         was_input=True,
                                         isDetermined=False,
                                         is_multiparam=True,
                                         around_center_atom=atom.get_atomtype() == centeratom_name,
                                         has_siblings=True,
                                         siblings=siblings,
                                         dihed_idx=dihed_count)
                            dihed_count += 1
                            dihe_list.append(d)
                    if not forzmatrix:
                        atom.set_was_central_atom(True)  # was used as central atom for multiparam dihed
                        atom.append_was_central_atom_for(dihed_identifier)
                        break

    return dihe_list, dihed_count
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot.
    """
    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # create normal distributions
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
        if len(v) < 2:
            continue

        u = np_mean(v)
        rv = norm(loc=u, scale=np_std(v))
        x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
        nd = rv.pdf(x)
        # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
        # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
        if len(v) == 0:
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    fout = open(distribution_table, 'w')
    fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(rel_dists.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dist in rel_dists[rank].items():
            x.append(dist)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(dist)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(dist)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(dist)

            # report results
            v = [clade_label, dist]
            if i in percentiles:
                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)
                v += percentiles[i] + [str(percentile_outlier)]
            else:
                percentile_outlier = 'Insufficient data to calculate percentiles'
                v += [-1, -1, -1] + [str(percentile_outlier)]

            fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

        # histogram for each rank
        mono = np_array(mono)
        no_inference = np_array(no_inference)
        poly = np_array(poly)
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        d = len(mono) + len(poly) + len(no_inference)
        if d == 0:
            break

        w = float(len(mono)) / d
        n = 0
        if len(mono) > 0:
            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)
            n, b, p = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)
            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)
            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    fout.close()

    # overlay scatter plot elements
    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def table(self, input_tree, taxon_category_file, bl_step_size, output_table):
    """Produce table with number of lineages for increasing mean branch lengths.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxon_category_file : str
        File indicating category for each taxon in the tree.
    bl_step_size : float
        Step size in table for mean branch length criterion.
    output_table : str
        Name of output table.
    """
    # get category for each taxon
    taxon_category = {}
    for line in open(taxon_category_file):
        line_split = line.strip().split('\t')
        taxon_category[line_split[0]] = line_split[1]

    # read tree
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # determine mean distance to leaves and taxon categories for each node
    all_categories = set()
    node_info = {}
    parent_mean_dist_to_leafs = {}
    max_bl_threshold = None
    for i, node in enumerate(tree.seed_node.preorder_iter()):
        node.id = i

        if node.is_leaf():
            mean_dist_to_leafs = 0.0
            categories = set()
            for c in taxon_category[node.taxon.label].split('/'):
                categories.add(c)
        else:
            dist_to_leafs = []
            categories = set()
            for t in node.leaf_iter():
                dist_to_leafs.append(self._dist_to_ancestor(t, node))
                for c in taxon_category[t.taxon.label].split('/'):
                    categories.add(c)
            mean_dist_to_leafs = np_mean(dist_to_leafs)

        if node.parent_node:
            p = parent_mean_dist_to_leafs[node.parent_node.id]
        else:
            p = mean_dist_to_leafs + 1e-6

        category = '/'.join(sorted(list(categories), reverse=True))
        all_categories.add(category)
        node_info[node.id] = [mean_dist_to_leafs, p, category]
        parent_mean_dist_to_leafs[node.id] = mean_dist_to_leafs

        # guard against the initial None; `float > None` raises under Python 3
        if max_bl_threshold is None or mean_dist_to_leafs > max_bl_threshold:
            max_bl_threshold = mean_dist_to_leafs

    # write table
    fout = open(output_table, 'w')
    fout.write('Threshold')
    for c in all_categories:
        fout.write('\t%s' % c)
    fout.write('\n')

    for bl_threshold in np_arange(0, max_bl_threshold + bl_step_size, bl_step_size):
        category_count = defaultdict(int)

        stack = [tree.seed_node]
        while stack:
            node = stack.pop()
            mean_dist_to_leafs, _, category = node_info[node.id]
            if mean_dist_to_leafs > bl_threshold:
                for c in node.child_node_iter():
                    stack.append(c)
            else:
                category_count[category] += 1

        # check if node meets mean branch length criterion
        if sum(category_count.values()) > 0:
            fout.write('%.3f' % bl_threshold)
            for c in all_categories:
                fout.write('\t%d' % category_count[c])
            fout.write('\n')

    fout.close()

    if False:
        # dead code retained from the original; it assumes node_info is a sortable list
        node_info.sort()
        for bl_threshold in np_arange(0, node_info[-1][0] + bl_step_size, bl_step_size):
            category_count = defaultdict(int)
            for mean_bl_dist, parent_mean_bl_dist, category in node_info:
                if bl_threshold >= mean_bl_dist and bl_threshold < parent_mean_bl_dist:
                    category_count[category] += 1

            if sum(category_count.values()) > 0:
                fout.write('%.3f' % bl_threshold)
                for c in all_categories:
                    fout.write('\t%d' % category_count[c])
                fout.write('\n')
def optimal(self, input_tree, rank, min_dist, max_dist, step_size, output_table):
    """Determine branch length for best congruency with existing taxonomy.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rank : int
        Taxonomic rank to consider (1=Phylum, ..., 6=Species).
    output_table : str
        Name of output table.
    """
    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # get mean distance to terminal taxa for each node along with
    # other stats needed to determine classification
    self.logger.info('Determining MDTT for each node.')
    rank_prefix = Taxonomy.rank_prefixes[rank]
    child_rank_prefix = Taxonomy.rank_prefixes[rank + 1]
    rank_info = []
    rank_dists = set()
    for node in tree.seed_node.preorder_internal_node_iter():
        if node == tree.seed_node:
            continue

        # check if node is at the specified rank
        node_taxon = None
        if node.label:
            support, taxon_name, _auxiliary_info = parse_label(node.label)
            if taxon_name:
                for taxon in [x.strip() for x in taxon_name.split(';')]:
                    if taxon.startswith(rank_prefix):
                        node_taxon = taxon

        if not node_taxon:
            continue

        # check that node has two descendants at the next rank
        child_rank_taxa = []
        for c in node.levelorder_iter():
            if c.label:
                support, taxon_name, _auxiliary_info = parse_label(c.label)
                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(child_rank_prefix):
                            child_rank_taxa.append(taxon)
            if len(child_rank_taxa) >= 2:
                break

        if len(child_rank_taxa) < 2:
            continue

        # get mean branch length to terminal taxa
        dists_to_tips = []
        for t in node.leaf_iter():
            dists_to_tips.append(self._dist_to_ancestor(t, node))
        node_dist = np_mean(dists_to_tips)

        # get mean branch length to terminal taxa for first ancestor spanning multiple phyla
        ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
        ancestor_dists_to_tips = []
        for t in ancestor.leaf_iter():
            ancestor_dists_to_tips.append(self._dist_to_ancestor(t, ancestor))
        ancestor_dist = np_mean(ancestor_dists_to_tips)

        rank_info.append([node_dist, ancestor_dist, node_taxon])
        rank_dists.add(node_dist)

    self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info))

    fout = open('bl_optimal_taxa_dists.tsv', 'w')
    fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
    for node_dist, ancestor_dist, node_taxon in rank_info:
        fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist))
    fout.close()

    # report number of correct and incorrect taxa for each threshold
    fout = open(output_table, 'w')
    header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
    fout.write(header + '\n')
    print(header)

    top_correct = 0
    top_incorrect = 0
    top_precision = 0
    top_threshold = None  # initialized so the return value is defined even if precision never improves

    for d in np_arange(min_dist, max_dist + step_size, step_size):
        rank_dists.add(d)

    for dist_threshold in sorted(rank_dists, reverse=True):
        correct = 0
        incorrect = 0
        for node_dist, ancestor_dist, node_taxon in rank_info:
            # check if node/edge would be collapsed at the given threshold
            if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                correct += 1
            elif node_dist > dist_threshold:
                incorrect += 1
            else:
                incorrect += 1  # above ancestor with multiple taxa

        denominator = correct + incorrect
        if denominator:
            precision = float(correct) / denominator
        else:
            precision = 0

        num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold)

        row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold,
                                                correct,
                                                incorrect,
                                                precision,
                                                num_lineages + num_terminal_lineages,
                                                num_lineages,
                                                num_terminal_lineages)
        fout.write(row + '\n')
        print(row)

        if precision > top_precision:
            top_correct = correct
            top_incorrect = incorrect
            top_precision = precision
            top_threshold = dist_threshold

    fout.close()
    return top_threshold, top_correct, top_incorrect
from mpi4py import MPI
from numpy import \
    arange as np_arange, \
    zeros as np_zeros

comm = MPI.COMM_WORLD
myrank = comm.Get_rank()
nproc = comm.Get_size()

if myrank == 0:
    fulldata = np_arange(3 * nproc, dtype='i')
    print("I'm {0} fulldata is: {1}".format(myrank, fulldata))
else:
    fulldata = None

count = 3
mydata = np_zeros(count, dtype='i')
comm.Scatter([fulldata, count, MPI.INT], [mydata, count, MPI.INT], root=0)
print("After Scatter, I'm {0} and mydata is: {1}".format(myrank, mydata))
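# Usage note: to see the scatter in action, launch one process per rank, e.g.
#   mpiexec -n 4 python scatter_example.py   (hypothetical script name)
# Rank 0 builds 3*nproc integers and each rank receives a contiguous block of 3.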
def shuffleBAMs(self):
    """Make the data transformation deterministic by reordering the bams"""
    # first we should make a subset of the total data
    # we'd like to take it down to about 1500 or so RI's
    # but we'd like to do this in a repeatable way
    ideal_contig_num = 1500
    sub_cons = range(len(self.indices))
    while len(sub_cons) > ideal_contig_num:
        # select every second contig when sorted by norm cov
        cov_sorted = np_argsort(self.normCoverages[sub_cons])
        sub_cons = np_array([sub_cons[cov_sorted[i * 2]]
                             for i in np_arange(int(len(sub_cons) / 2))])

        if len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by mer PC1
            mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
            sub_cons = np_array([sub_cons[mer_sorted[i * 2]]
                                 for i in np_arange(int(len(sub_cons) / 2))])

    # now that we have a subset, calculate the distance between each of the untransformed vectors
    num_sc = len(sub_cons)

    # log shift the coverages towards the origin
    sub_covs = np_transpose([self.covProfiles[i] * (np_log10(self.normCoverages[i]) / self.normCoverages[i])
                             for i in sub_cons])
    sq_dists = cdist(sub_covs, sub_covs, 'cityblock')
    dists = squareform(sq_dists)

    # initialise a list of left, right neighbours
    lr_dict = {}
    for i in range(self.numStoits):
        lr_dict[i] = []

    too_big = 10000
    while True:
        closest = np_argmin(dists)
        if dists[closest] == too_big:
            break
        (i, j) = self.small2indices(closest, self.numStoits - 1)
        lr_dict[j].append(i)
        lr_dict[i].append(j)

        # mark these guys as neighbours
        if len(lr_dict[i]) == 2:
            # no more than 2 neighbours
            sq_dists[i, :] = too_big
            sq_dists[:, i] = too_big
            sq_dists[i, i] = 0.0
        if len(lr_dict[j]) == 2:
            # no more than 2 neighbours
            sq_dists[j, :] = too_big
            sq_dists[:, j] = too_big
            sq_dists[j, j] = 0.0

        # fix the dist matrix
        sq_dists[j, i] = too_big
        sq_dists[i, j] = too_big
        dists = squareform(sq_dists)

    # now make the ordering
    ordering = [0, lr_dict[0][0]]
    done = 2
    while done < self.numStoits:
        last = ordering[done - 1]
        if lr_dict[last][0] == ordering[done - 2]:
            ordering.append(lr_dict[last][1])
            last = lr_dict[last][1]
        else:
            ordering.append(lr_dict[last][0])
            last = lr_dict[last][0]
        done += 1

    # reshuffle the contig order!
    # yay for bubble sort!
    working = np_arange(self.numStoits)
    for i in range(1, self.numStoits):
        # where is this guy in the list
        loc = list(working).index(ordering[i])
        if loc != i:
            # swap the columns
            self.covProfiles[:, [i, loc]] = self.covProfiles[:, [loc, i]]
            self.stoitColNames[[i, loc]] = self.stoitColNames[[loc, i]]
            working[[i, loc]] = working[[loc, i]]
def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title):
    # determine named clades in full tree
    named_clades = set()
    tree = dendropy.Tree.get_from_path(full_tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True)
    for node in tree.preorder_node_iter():
        if node.label:
            taxonomy = node.label.split(';')
            named_clades.add(taxonomy[-1].strip().split(':')[-1])

    print 'Identified %d named clades in full tree.' % len(named_clades)

    # determine named groups with at least the specified number of children
    print 'Determining taxa with sufficient named child lineages.'
    taxon_children = defaultdict(set)
    groups = defaultdict(list)
    print taxonomy_file
    for line in open(taxonomy_file):
        line_split = line.replace('; ', ';').split()
        genome_id = line_split[0]
        taxonomy = [x.strip() for x in line_split[1].split(';')]

        if len(taxonomy) > rank + 1:
            taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

        if len(taxonomy) > rank:
            groups[taxonomy[rank]].append(genome_id)

    groups_to_consider = set()
    for taxon, children_taxa in taxon_children.iteritems():
        if len(children_taxa) >= min_children and taxon in named_clades:
            groups_to_consider.add(taxon)

    print 'Assessing distribution over %d groups.' % len(groups_to_consider)

    # calculate relative distance for full tree
    print ''
    print 'Calculating relative distance over full tree.'
    tree = dendropy.Tree.get_from_path(full_tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True)
    full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
    if len(polyphyletic) > 0:
        print ''
        print '[Warning] Full tree contains polyphyletic groups.'

    # calculate relative distance for dereplicated tree
    print ''
    print 'Calculating relative distance over dereplicated tree.'
    tree = dendropy.Tree.get_from_path(derep_tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True)
    derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
    groups_to_consider = groups_to_consider - polyphyletic
    print 'Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider)

    # calculate relative distance to each group in each tree
    print ''
    rel_dists = defaultdict(list)
    dist_components = defaultdict(list)
    for f in os.listdir(input_tree_dir):
        if not f.endswith('.rooted.tree'):
            continue

        print f

        tree_file = os.path.join(input_tree_dir, f)
        tree = dendropy.Tree.get_from_path(tree_file, schema='newick', rooting='force-rooted', preserve_underscores=True)

        # calculate relative distance to named taxa
        rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
        for taxon, dist in rel_dist.iteritems():
            rel_dists[taxon].append(dist)
            dist_components[taxon].append(components[taxon])

    # create scatter plot
    x = []
    y = []
    xDerep = []
    yDerep = []
    xFull = []
    yFull = []
    perc10 = []
    perc90 = []
    labels = []
    fout = open(output_prefix + '.tsv', 'w')
    fout.write('Taxon\tP10\tP90\tP90-P10\tMean rel. dist.\tMean dist to parent\tMean dist to leaves\tOriginal rel. dist.\tOriginal dist to parent\tOriginal dist to leaves\n')
    for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
        labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

        rd = rel_dists[taxon]
        for d in rd:
            x.append(d)
            y.append(i + 0.2)

        p10, p90 = np_percentile(rd, [10, 90])
        perc10.append(p10)
        perc90.append(p90)

        print taxon, p90 - p10

        mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
        derep_x, derep_a, derep_b = derep_dist_components[taxon]
        fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

        xDerep.append(derep_rel_dist[taxon])
        yDerep.append(i)

        xFull.append(full_rel_dist[taxon])
        yFull.append(i)
    fout.close()

    self.fig.clear()
    self.fig.set_size_inches(8, len(rel_dists) * 0.4)
    ax = self.fig.add_subplot(111)

    ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
    ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
    ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')

    for i in xrange(len(labels)):
        ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
        ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    if title:
        ax.set_title(title, size=12)
    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylabel('taxa')
    ax.set_yticks(xrange(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(labels)

    self.prettify(ax)

    # make plot interactive
    # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))
    # mpld3.save_html(fig, output_prefix + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(output_prefix + '.png', dpi=300)
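The value printed per taxon above, p90 - p10, is the 10th-90th interpercentile spread of that taxon's relative distances across the input trees, used as a robust measure of placement stability. A tiny self-contained example of the computation (illustrative numbers only):

import numpy as np

rd = [0.42, 0.45, 0.47, 0.51, 0.62]   # relative distances for one taxon
p10, p90 = np.percentile(rd, [10, 90])
print(p90 - p10)  # ~0.144; a smaller spread means more consistent placement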
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """
    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].iteritems() if taxon in taxa_for_dist_inference]
        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)  # red
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].iteritems():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(md)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(md)

        # histogram for each rank
        mono = np_array(mono)
        no_inference = np_array(no_inference)
        poly = np_array(poly)
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        mono_max_count = max(np_histogram(mono, bins=bins)[0])
        mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

        w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
        n, b, p = ax.hist(mono, bins=bins, color=(0.0, 0.0, 1.0), alpha=0.25, weights=0.9 * w * mono_weights, bottom=i, lw=0, zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)
            ax.hist(no_inference, bins=bins, color=(0.3, 0.3, 0.3), alpha=0.25, weights=0.9 * (1.0 - w) * no_inference_weights, bottom=i + n, lw=0, zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)
            ax.hist(poly, bins=bins, color=(1.0, 0.0, 0.0), alpha=0.25, weights=0.9 * (1.0 - w) * poly_weights, bottom=i + n, lw=0, zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])
    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(xrange(0, len(medians_for_taxa)))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
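The histogram weighting above may look opaque: each group's counts are divided by that group's tallest bin, so the tallest bar has unit height, and then scaled by 0.9 * w (or 0.9 * (1 - w)) so the stacked histograms for one rank stay inside that rank's row of the plot. A minimal standalone sketch of the same trick, using illustrative data rather than the original inputs:

import numpy as np
import matplotlib.pyplot as plt

values = np.random.RandomState(0).rand(200)   # stand-in for the median distances
binwidth = 0.025
bins = np.arange(0, 1.0 + binwidth, binwidth)
counts, _ = np.histogram(values, bins=bins)
weights = np.ones_like(values) / counts.max()  # tallest bar -> height 1.0

fig, ax = plt.subplots()
row = 2  # baseline: the rank index the histogram sits on
ax.hist(values, bins=bins, weights=0.9 * weights, bottom=row, lw=0)
fig.savefig('hist_row_sketch.png')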
def inset_regular_pols(np_verts, np_pols, np_distances, np_inset_rate, np_make_inners, np_faces_id, custom_normals, matrices, offset_mode='CENTER', proportional=False, concave_support=True, index_offset=0, use_custom_normals=False, output_old_face_id=True, output_old_v_id=True, output_pols_groups=True): pols_number = np_pols.shape[0] pol_sides = np_pols.shape[1] v_pols = np_verts[np_pols] #shape [num_pols, num_corners, 3] if offset_mode == 'SIDES': inner_points = sides_mode_inset(v_pols, np_inset_rate, np_distances, concave_support, proportional, use_custom_normals, custom_normals) elif offset_mode == 'MATRIX': inner_points = matrix_mode_inset(v_pols, matrices, use_custom_normals, custom_normals) else: if any(np_distances != 0): if use_custom_normals: normals = custom_normals else: normals = np_faces_normals(v_pols) average = np.sum( v_pols, axis=1 ) / pol_sides #+ normals*np_distances[:, np_newaxis] #shape [num_pols, 3] inner_points = average[:, np_newaxis, :] + ( v_pols - average[:, np_newaxis, :] ) * np_inset_rate[:, np_newaxis, np_newaxis] + normals[:, np_newaxis, :] * np_distances[:, np_newaxis, np_newaxis] else: average = np.sum(v_pols, axis=1) / pol_sides #shape [num_pols, 3] inner_points = average[:, np_newaxis, :] + ( v_pols - average[:, np_newaxis, :] ) * np_inset_rate[:, np_newaxis, np_newaxis] idx_offset = len(np_verts) + index_offset new_v_idx = np_arange(idx_offset, pols_number * pol_sides + idx_offset).reshape( pols_number, pol_sides) side_pols = np.zeros([pols_number, pol_sides, 4], dtype=int) side_pols[:, :, 0] = np_pols side_pols[:, :, 1] = np_roll(np_pols, -1, axis=1) side_pols[:, :, 2] = np_roll(new_v_idx, -1, axis=1) side_pols[:, :, 3] = new_v_idx side_faces = side_pols.reshape(-1, 4) new_insets = new_v_idx[np_make_inners] if pol_sides == 4: new_faces = np_concatenate([side_faces, new_insets]).tolist() else: new_faces = side_faces.tolist() + new_insets.tolist() old_v_id = np_pols.flatten().tolist() if output_old_v_id else [] if output_old_face_id: side_ids = np.repeat(np_faces_id[:, np_newaxis], pol_sides, axis=1) inset_ids = np_faces_id[np_make_inners] old_face_id = np.concatenate((side_ids.flatten(), inset_ids)).tolist() else: old_face_id = [] if output_pols_groups: pols_groups = np_repeat( [1, 2], [len(side_faces), len(new_insets)]).tolist() else: pols_groups = [] return (inner_points.reshape(-1, 3).tolist(), new_faces, new_insets.tolist(), old_v_id, old_face_id, pols_groups)
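For the default CENTER branch above, each corner is pulled towards the polygon centroid by the inset rate and then displaced along the face normal by the distance. A worked example for a single unit square with inset rate 0.5 and zero distance:

import numpy as np

v_pols = np.array([[[0., 0., 0.], [1., 0., 0.], [1., 1., 0.], [0., 1., 0.]]])
np_inset_rate = np.array([0.5])

average = np.sum(v_pols, axis=1) / v_pols.shape[1]   # centroid: (0.5, 0.5, 0)
inner_points = average[:, np.newaxis, :] + \
    (v_pols - average[:, np.newaxis, :]) * np_inset_rate[:, np.newaxis, np.newaxis]

# each corner moves halfway to the centroid:
# (0.25, 0.25, 0), (0.75, 0.25, 0), (0.75, 0.75, 0), (0.25, 0.75, 0)
print(inner_points)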
from mpi4py import MPI
import sys
from numpy import \
    arange as np_arange, \
    zeros as np_zeros, \
    uint32 as np_uint32

comm = MPI.COMM_WORLD
myrank = comm.Get_rank()
nproc = comm.Get_size()
size = int(sys.argv[1])

# use an explicit 32-bit unsigned buffer so the numpy dtype matches the
# MPI datatype declared below (np_arange defaults to a 64-bit int)
partial_sum = np_arange(size, dtype=np_uint32)

if myrank != 0:
    comm.Send([partial_sum, size, MPI.UNSIGNED], dest=0, tag=7)
else:
    total = partial_sum.copy()  # rank 0's own contribution
    tmp_sum = np_zeros(size, dtype=np_uint32)
    for i in range(1, nproc):
        comm.Recv([tmp_sum, size, MPI.UNSIGNED], source=i, tag=7)
        total += tmp_sum  # accumulate instead of discarding each message
        print("received data from rank %d" % i)
    print(total)
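Run with, e.g., mpiexec -n 4 python partial_sum.py 8 (the script name here is arbitrary). This hand-rolled gather-and-add is exactly what MPI's Reduce collective provides in one call; a minimal sketch of the same program using it, under the same uint32 assumption:

from mpi4py import MPI
import sys
import numpy as np

comm = MPI.COMM_WORLD
size = int(sys.argv[1])
partial_sum = np.arange(size, dtype=np.uint32)
total = np.zeros(size, dtype=np.uint32)

# element-wise sum of every rank's partial_sum, delivered to rank 0
comm.Reduce([partial_sum, MPI.UNSIGNED], [total, MPI.UNSIGNED],
            op=MPI.SUM, root=0)
if comm.Get_rank() == 0:
    print(total)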
def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    rel_dist_thresholds: list
        Relative distance cutoffs for defining ranks.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot.
    """
    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # create normal distributions
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
        u = np_mean(v)
        rv = norm(loc=u, scale=np_std(v))
        x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
        nd = rv.pdf(x)
        ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
        ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

    # create percentile lines
    percentiles = {}
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
        ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)
        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    fout = open(distribution_table, 'w')
    fout.write('Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n')
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    rel_dist_thresholds += [1.0]  # append boundary for species
    for i, rank in enumerate(sorted(rel_dists.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

        for clade_label, dist in rel_dists[rank].iteritems():
            x.append(dist)
            y.append(i)
            labels.append(clade_label)

            if clade_label in taxa_for_dist_inference:
                c.append((0.0, 0.0, 0.5))
            else:
                c.append((0.5, 0.5, 0.5))

            p10, p50, p90 = percentiles[i]
            percentile_outlier = not (dist >= p10 and dist <= p90)

            if i == 0:
                rank_cutoff = rel_dist_thresholds[i]
                rank_outlier = dist > rank_cutoff
            else:
                rank_cutoff = rel_dist_thresholds[i]
                upper_rank_cutoff = rel_dist_thresholds[i - 1]
                rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff)

            v = [clade_label, dist, rank_cutoff, str(rank_outlier)]
            v += percentiles[i] + [str(percentile_outlier)]
            fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
    fout.close()

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(xrange(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # plot relative divergence threshold lines
    y_min, y_max = ax.get_ylim()
    for threshold in rel_dist_thresholds[0:-1]:  # don't draw species boundary
        ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
        ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center')

    # make plot interactive
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=96)
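The two outlier columns written to the distribution table reduce to simple interval tests; restated as standalone helpers for clarity (hypothetical names, not part of the original class):

def percentile_outlier(dist, p10, p90):
    # flagged when outside the closed interval [P10, P90]
    return not (p10 <= dist <= p90)

def rank_outlier(dist, thresholds, i):
    # rank 0 only has an upper cutoff; deeper ranks must fall
    # between the previous rank's cutoff and their own
    if i == 0:
        return dist > thresholds[0]
    return not (thresholds[i - 1] <= dist <= thresholds[i])

print(percentile_outlier(0.95, 0.2, 0.8))      # True
print(rank_outlier(0.35, [0.3, 0.5, 0.7], 1))  # False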