def main():
    """Run PCoA on one distance matrix, or on every matrix in a directory.

    The input and output locations come from the parsed command line
    options (``opts.input_path`` / ``opts.output_path``).

    * If the input path is a directory, every entry in it that is not
      hidden (name starting with '.') and is not itself a directory is
      opened and passed to ``pcoa``; each result is written to
      ``<output_path>/pcoa_<basename>.txt``. The output directory is
      created when it does not exist.
    * Otherwise the input path is opened as a single distance matrix and
      the result is written to the output path.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_path = opts.input_path
    output_path = opts.output_path

    if not isdir(input_path):
        # Single-file mode: run PCoA on the one input distance matrix
        with open(input_path, 'U') as f:
            pcoa_scores = pcoa(f)
        pcoa_scores.write(output_path)
        return

    # Directory mode: create the output directory if it does not exist
    if not exists(output_path):
        makedirs(output_path)

    for fname in listdir(input_path):
        infile = join(input_path, fname)
        # listdir() yields bare names, so the directory test has to be made
        # on the joined path; testing the bare name would resolve it against
        # the current working directory and let sub-directories through.
        if fname.startswith('.') or isdir(infile):
            continue

        with open(infile, 'U') as lines:
            pcoa_scores = pcoa(lines)

        # Store the PCoA results in the output directory
        base_fname = splitext(fname)[0]
        pcoa_scores.write(join(output_path, 'pcoa_%s.txt' % base_fname))
def compute_procrustes(result_tables,
                       expected_pc_lookup,
                       taxonomy_level=6,
                       num_dimensions=3,
                       random_trials=999):
    """ Compute Procrustes M2 and p-values for a set of results

        This is a generator: nothing is computed until it is iterated.

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_pc_lookup: 2d dict of dataset_id, reference_db_id to
         the path of the file holding the expected principal coordinate
         matrix (the value is opened and read as lines)
        taxonomy_level: level passed to get_taxonomy_collapser to collapse
         the actual table before computing distances
        num_dimensions: passed to procrustes_monte_carlo as max_dimensions
        random_trials: passed to procrustes_monte_carlo as trials

        Yields one tuple per entry of result_tables:
         (dataset_id, reference_id, method_id, params,
          actual_m_squared, mc_p_value)

        Raises KeyError if no expected entry exists for a
         (dataset_id, reference_id) pair, and ValueError if the actual
         BIOM table cannot be parsed.
    """
    ### Start code copied ALMOST* directly from compute_prfs - some re-factoring for re-use is
    ### in order here. *ALMOST refers to changes to parser and variable names since expected
    ### is a pc matrix here.
    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## look up the expected pc matrix for this dataset/reference pair
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            # call form of raise: valid on both Python 2 and Python 3
            raise KeyError("Can't find expected table for (%s, %s)." %
                           (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified taxonomic level
        try:
            with open(actual_table_fp, "U") as actual_table_f:
                actual_table = parse_biom_table(actual_table_f)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy)
        ### End code copied directly from compute_prfs.

        # Why the conversions below are needed:
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa returns a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file"
        #     multiple times, so the pcs that get passed in really do need
        #     to be lists of lines
        dm = dist_bray_curtis(asarray(list(actual_table.iterSampleData())))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        with open(expected_pc_fp, "U") as expected_pc_f:
            expected_pc = list(expected_pc_f)

        ## run Procrustes analysis with monte carlo simulation;
        ## the per-trial M2 values and the count of better trials are not used
        actual_m_squared, _, _, mc_p_value = procrustes_monte_carlo(
            expected_pc,
            actual_pc,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=None,
            trial_output_dir=None,
        )

        yield (dataset_id, reference_id, method_id, params,
               actual_m_squared, mc_p_value)
def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data, biom_object,
                                            metric, sequences, iterations,
                                            axes, tree_object=None):
    """Rarefy repeatedly, run PCoA each time and build the PCoA plot

    Input:
    map_headers: mapping file headers, forwarded to the plotting helpers
    map_data: mapping file data, forwarded to the plotting helpers
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of rarefaction/PCoA rounds to run
    axes: number of axes to account for (not referenced in this function)
    tree_object: tree to perform the beta diversity calculation

    Output:
    the value returned by make_pcoa_plot (described by the original
    documentation as a WebGL string representing the PCoA plot)
    """
    field_names = ('pcoa_headers', 'pcoa_values', 'eigenvalues', 'coords_pct')
    pcoa_input = dict((name, []) for name in field_names)

    for _ in range(iterations):
        rarefied_table = get_rare_data(biom_object, sequences)
        distance_matrix = single_object_beta(rarefied_table, metric,
                                             tree_object)

        # parse_coords is handed a file-like object, so the PCoA output
        # goes through an in-memory buffer
        buf = StringIO()
        buf.write(pcoa(distance_matrix))
        buf.seek(0)
        headers, values, eigvals, pcts = parse_coords(buf)
        buf.close()

        for name, parsed in zip(field_names, (headers, values, eigvals, pcts)):
            pcoa_input[name].append(parsed)

    if iterations != 1:
        # several replicates: summarise them into one set of coordinates
        # plus low/high bounds
        (coords_headers, coords_data, coords_eigenvalues, coords_pct,
         coords_low, coords_high, _clones) = preprocess_coords_file(
            pcoa_input['pcoa_headers'], pcoa_input['pcoa_values'],
            pcoa_input['eigenvalues'], pcoa_input['coords_pct'],
            map_headers, map_data, custom_axes=None,
            jackknifing_method='IQR', is_comparison=False)
    else:
        # a single replicate is plotted as-is, without bounds
        coords_headers = pcoa_input['pcoa_headers'][0]
        coords_data = pcoa_input['pcoa_values'][0]
        coords_eigenvalues = pcoa_input['eigenvalues'][0]
        coords_pct = pcoa_input['coords_pct'][0]
        coords_low = coords_high = None

    return make_pcoa_plot(coords_headers, coords_data, coords_eigenvalues,
                          coords_pct, map_headers, map_data, coords_low,
                          coords_high, True)
def main():
    """Run PCoA on the input given on the command line.

    * input path is a directory: delegate to ``multiple_file_pcoa`` with the
      input and output paths.
    * input path is a regular file: pass the open file to ``pcoa`` and write
      the returned string to the output path.
    * anything else: print an error message and exit with status 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if os.path.isdir(opts.input_path):
        multiple_file_pcoa(opts.input_path, opts.output_path)
    elif os.path.isfile(opts.input_path):
        # context managers close the handles even if pcoa() or write() raises
        with open(opts.input_path, 'U') as in_f:
            pcoa_res_string = pcoa(in_f)
        with open(opts.output_path, 'w') as out_f:
            out_f.write(pcoa_res_string)
    else:
        print("io error, check input file path")
        # SystemExit directly: the exit() helper only exists when the site
        # module has been loaded
        raise SystemExit(1)
def main():
    """Entry point: dispatch PCoA over a directory or a single file.

    Reads ``opts.input_path`` and ``opts.output_path`` from the parsed
    command line. A directory input is handed to ``multiple_file_pcoa``;
    a regular file is passed to ``pcoa`` and the resulting string is
    written to the output path. Any other input prints an error message
    and terminates the process with exit status 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if os.path.isdir(opts.input_path):
        multiple_file_pcoa(opts.input_path, opts.output_path)
    elif os.path.isfile(opts.input_path):
        # 'with' guarantees both files are closed on error paths too
        with open(opts.input_path, 'U') as distance_matrix_f:
            pcoa_res_string = pcoa(distance_matrix_f)
        with open(opts.output_path, 'w') as result_f:
            result_f.write(pcoa_res_string)
    else:
        print("io error, check input file path")
        # same exit status as exit(1), without depending on the site module
        raise SystemExit(1)
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations,
                                            axes, tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping
    file; items 0 and 1 are passed, in that order, to mapping_file_to_dict
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot (the value returned by
    make_pcoa_plot)

    Raises:
    ValueError if get_pcoa_ellipsoid_coords returns a string in place of
    the ellipse coordinates
    """
    # get a list of the SampleIds (materialized so it is a real list on
    # every Python version)
    full_id_list = list(mapping_file_to_dict(mapping_file_tuple[0],
                                             mapping_file_tuple[1]).keys())

    # one PCoA result per rarefaction round
    pcoa_list = []
    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_list.append(pcoa(beta_dm))

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = \
        get_pcoa_ellipsoid_coords(pcoa_list, axes, full_id_list)

    # check the ellipses are created correctly
    # NOTE(review): a string result is treated as a failure marker;
    # presumably the helper returns an error message - confirm.
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    return make_pcoa_plot(ellipse_coords_by_sampleId, mapping_file_tuple,
                          sampleId_to_coords['variation explained'])
def test_pcoa(self):
    """pcoa runs on the distance matrix fixture and gives a truthy result"""
    # smoke test only: output formatting is tested elsewhere
    assert pcoa(self.distmtx_txt)