def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data,
                                            biom_object, metric, sequences,
                                            iterations, axes,
                                            tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Each iteration rarefies the table, computes a beta diversity distance
    matrix, runs PCoA on it and parses the resulting coordinates.

    Input:
    map_headers: headers of the mapping file
    map_data: data of the mapping file
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for (accepted for interface
    compatibility; not used by this function)
    tree_object: tree to perform the beta diversity calculation

    Output:
    the value returned by make_pcoa_plot (a WebGL string representing the
    PCoA plot)
    """
    all_headers = []
    all_values = []
    all_eigenvalues = []
    all_pcts = []

    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_results = pcoa(beta_dm)

        # parse_coords reads from a file-like object, so wrap the PCoA
        # output in an in-memory buffer; always release it, even when the
        # parser raises
        pcoa_file = StringIO()
        try:
            pcoa_file.write(pcoa_results)
            pcoa_file.seek(0)
            headers, values, eigenvalues, pct = parse_coords(pcoa_file)
        finally:
            pcoa_file.close()

        all_headers.append(headers)
        all_values.append(values)
        all_eigenvalues.append(eigenvalues)
        all_pcts.append(pct)

    if iterations == 1:
        # a single replicate is plotted as is, there are no ranges to draw
        coords_headers = all_headers[0]
        coords_data = all_values[0]
        coords_eigenvalues = all_eigenvalues[0]
        coords_pct = all_pcts[0]
        coords_low, coords_high = None, None
    else:
        # several replicates are summarised into one set of coordinates plus
        # low/high bounds; the last returned element is not needed here
        (coords_headers, coords_data, coords_eigenvalues, coords_pct,
         coords_low, coords_high, _) = preprocess_coords_file(
            all_headers, all_values, all_eigenvalues, all_pcts,
            map_headers, map_data, custom_axes=None,
            jackknifing_method='IQR', is_comparison=False)

    return make_pcoa_plot(coords_headers, coords_data, coords_eigenvalues,
                          coords_pct, map_headers, map_data, coords_low,
                          coords_high, True)
def compare_treatment_dists(chosen_samples, category, mf, bt, m, tr):
    """Calculate avg between, within, and to-all distances for chosen_samples.

    Notes:
    chosen_samples collectively have some amount of different values under
    category in the mapping file. these samples will be grouped by the value
    they have (via treatment_covering) and then these groupings will be
    compared.
    the between distance is all the pairwise distances between the
    groupings.
    the within distance is the distance between the samples in a single
    group.
    the to-all distance is the distance from the group to all other samples
    in the distmat.

    Inputs:
    chosen_samples - list of ids. e.g. [sam1,sam7,sam3,sam6,..]; passed
    unchanged to treatment_covering.
    category - str, field in mf.
    mf - parsed mapping file, dict of sample_id:metadata.
    bt - biom table containing at least all samples contained in the mf.
    m - str, metric to use for the beta diversity calculation.
    tr - tree object, containing at least all nodes in bt.

    Output:
    A list of marginals that are the treatments of the groups, i.e.
    ['HF','LF']
    bt_wi_m - a 2d upper triangular array that has the average distances
    between treatment groups (or in the case of the main diagonal, the
    average within treatment/group distance).
    bt_wi_se - the standard errors for bt_wi_m.
    ta_m_se - 2d array with number of treatments/groups rows, and 2 cols
    where the first col is the average distance between that treatment and
    all others and the second col is the se.
    """
    # make the sample-sample distance matrix and parse it
    dm = single_object_beta(bt, m, tr)
    samples, data = parse_distmat(dm)

    tc = treatment_covering(chosen_samples, category, mf)
    # materialise the keys so the order is fixed and positions can be used as
    # array indices (a dict view cannot be indexed)
    output_marginals = list(tc.keys())
    n_groups = len(output_marginals)

    # make 3 arrays for output, between-within means, between-within ses,
    # to-all means and ses
    bt_wi_m = zeros((n_groups, n_groups))
    bt_wi_se = zeros((n_groups, n_groups))
    ta_m_se = zeros((n_groups, 2))

    # calculate within and to-all
    for i, t in enumerate(output_marginals):
        ta_m_se[i][0], ta_m_se[i][1] = treatment_dist(tc[t], samples, data)
        bt_wi_m[i][i], bt_wi_se[i][i] = within_treatment_dist(tc[t], samples,
                                                              data)

    # calculate between dists; pairing the enumerated treatments yields
    # i < j, so only the upper triangle is filled
    for (i, t1), (j, t2) in combinations(enumerate(output_marginals), 2):
        bt_wi_m[i][j], bt_wi_se[i][j] = between_treatments_dist(
            tc[t1], tc[t2], samples, data)

    return output_marginals, bt_wi_m, bt_wi_se, ta_m_se
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations,
                                            axes, tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping
    file; its first two elements are passed, in order, to
    mapping_file_to_dict
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot

    Raises:
    ValueError: if get_pcoa_ellipsoid_coords returns a string instead of the
    ellipse coordinates
    """
    # get a list of the SampleIds
    full_id_list = list(mapping_file_to_dict(mapping_file_tuple[0],
                                             mapping_file_tuple[1]).keys())

    # one PCoA result per rarefaction
    pcoa_list = []
    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_list.append(pcoa(beta_dm))

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = \
        get_pcoa_ellipsoid_coords(pcoa_list, axes, full_id_list)

    # check the ellipses are created correctly; a string result is treated
    # as a failure
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    return make_pcoa_plot(ellipse_coords_by_sampleId, mapping_file_tuple,
                          sampleId_to_coords['variation explained'])
def single_object_beta(self, otu_table, metric, tree_string,
                       missing_sams=None):
    """ running single_file_beta should give same result using --rows

    For every known (non-phylogenetic and phylogenetic) metric, the full
    distance matrix is computed and compared, sample by sample, against the
    matrices obtained when requesting one row at a time, with and without
    the full tree.

    Input:
    otu_table: table passed to the module-level single_object_beta function
    metric: ignored; every metric returned by
    list_known_nonphylogenetic_metrics and list_known_phylogenetic_metrics is
    exercised instead
    tree_string: tree passed to the module-level single_object_beta function
    missing_sams: sample ids that are skipped in the row by row checks
    """
    if missing_sams is None:
        missing_sams = []

    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())

    # new metrics that don't trivially parallelize must be dealt with
    # carefully
    for ignored_message in (
            'dissimilarity binary_dist_chisq is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_chisq is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_gower is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_hellinger is not parallelized, '
            'calculating the whole matrix...',
            'unifrac had no information for sample M*'):
        warnings.filterwarnings('ignore', ignored_message)

    def check_rows_match_full(r_out, rows, sams, dmtx):
        """assert the row-wise output r_out agrees with the full matrix"""
        col_sams, row_sams, row_dmtx = parse_matrix(r_out)
        n_rows = len(rows.split(','))
        self.assertEqual(row_dmtx.shape, (n_rows, len(sams)))

        # make sure rows same as full
        for j in range(n_rows):
            full_row = sams.index(row_sams[j])
            for k in range(len(sams)):
                npt.assert_almost_equal(
                    row_dmtx[j, k], dmtx[full_row, sams.index(col_sams[k])])

    # a separate loop name is used so the ``metric`` argument is not
    # silently overwritten
    for current_metric in metrics:
        # do it
        beta_out = single_object_beta(otu_table, current_metric, tree_string,
                                      rowids=None, full_tree=False)
        sams, dmtx = parse_distmat(beta_out)

        # do it by rows
        for rows in sams:
            if rows in missing_sams:
                continue
            r_out = single_object_beta(otu_table, current_metric,
                                       tree_string, rowids=rows,
                                       full_tree=False)
            check_rows_match_full(r_out, rows, sams, dmtx)

        # full tree run: metrics that already are full tree variants are not
        # re-run with full_tree=True
        if 'full_tree' in str(current_metric).lower():
            continue

        # do it by rows with full tree; the row id has to be requested here,
        # otherwise the whole matrix comes back and the per-row shape check
        # cannot hold
        for rows in sams:
            if rows in missing_sams:
                continue
            r_out = single_object_beta(otu_table, current_metric,
                                       tree_string, rowids=rows,
                                       full_tree=True)
            check_rows_match_full(r_out, rows, sams, dmtx)

        # do it with full tree
        r_out = single_object_beta(otu_table, current_metric, tree_string,
                                   rowids=None, full_tree=True)
        sams_ft, dmtx_ft = parse_distmat(r_out)
        self.assertEqual(sams_ft, sams)
        npt.assert_almost_equal(dmtx_ft, dmtx)
def single_object_beta(self, otu_table, metric, tree_string,
                       missing_sams=None):
    """ running single_file_beta should give same result using --rows

    For every known (non-phylogenetic and phylogenetic) metric, the full
    distance matrix is computed and compared, sample by sample, against the
    matrices obtained when requesting one row at a time, with and without
    the full tree.

    Input:
    otu_table: table passed to the module-level single_object_beta function
    metric: ignored; every metric returned by
    list_known_nonphylogenetic_metrics and list_known_phylogenetic_metrics is
    exercised instead
    tree_string: tree passed to the module-level single_object_beta function
    missing_sams: sample ids that are skipped in the row by row checks
    """
    if missing_sams is None:
        missing_sams = []

    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())

    # new metrics that don't trivially parallelize must be dealt with
    # carefully
    for ignored_message in (
            'dissimilarity binary_dist_chisq is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_chisq is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_gower is not parallelized, '
            'calculating the whole matrix...',
            'dissimilarity dist_hellinger is not parallelized, '
            'calculating the whole matrix...',
            'unifrac had no information for sample M*'):
        warnings.filterwarnings('ignore', ignored_message)

    def check_rows_match_full(r_out, rows, sams, dmtx):
        """assert the row-wise output r_out agrees with the full matrix"""
        col_sams, row_sams, row_dmtx = parse_matrix(r_out)
        n_rows = len(rows.split(','))
        self.assertEqual(row_dmtx.shape, (n_rows, len(sams)))

        # make sure rows same as full
        for j in range(n_rows):
            full_row = sams.index(row_sams[j])
            for k in range(len(sams)):
                self.assertFloatEqual(
                    row_dmtx[j, k], dmtx[full_row, sams.index(col_sams[k])])

    # a separate loop name is used so the ``metric`` argument is not
    # silently overwritten
    for current_metric in metrics:
        # do it
        beta_out = single_object_beta(otu_table, current_metric, tree_string,
                                      rowids=None, full_tree=False)
        sams, dmtx = parse_distmat(beta_out)

        # do it by rows
        for rows in sams:
            if rows in missing_sams:
                continue
            r_out = single_object_beta(otu_table, current_metric,
                                       tree_string, rowids=rows,
                                       full_tree=False)
            check_rows_match_full(r_out, rows, sams, dmtx)

        # full tree run: metrics that already are full tree variants are not
        # re-run with full_tree=True
        if 'full_tree' in str(current_metric).lower():
            continue

        # do it by rows with full tree; the row id has to be requested here,
        # otherwise the whole matrix comes back and the per-row shape check
        # cannot hold
        for rows in sams:
            if rows in missing_sams:
                continue
            r_out = single_object_beta(otu_table, current_metric,
                                       tree_string, rowids=rows,
                                       full_tree=True)
            check_rows_match_full(r_out, rows, sams, dmtx)

        # do it with full tree
        r_out = single_object_beta(otu_table, current_metric, tree_string,
                                   rowids=None, full_tree=True)
        sams_ft, dmtx_ft = parse_distmat(r_out)
        self.assertEqual(sams_ft, sams)
        self.assertFloatEqual(dmtx_ft, dmtx)