def test_permanova_selection():
    """Smoke-test PERMANOVA column selection against two reference metadata tables."""
    import metadata_permanova_prioritizer

    # First table: expect exactly four candidate columns.
    selected = metadata_permanova_prioritizer.permanova_validation(
        "reference_data/permanova/metadata_table-00000.txt")
    assert len(selected) == 4

    # Second table: the BDI group attribute must survive selection.
    selected = metadata_permanova_prioritizer.permanova_validation(
        "reference_data/permanova/kelly_metadata.txt")
    print(selected)
    assert "ATTRIBUTE_bdi_group" in selected
def test_metadata_test():
    """Parse a test metadata table and print which columns were selected."""
    import metadata_permanova_prioritizer

    columns = metadata_permanova_prioritizer.permanova_validation(
        "reference_data/test_metadata_permanova_parse.tsv")
    print(columns)
def calculate_statistics(input_quant_filename, input_metadata_file, input_summary_file, output_summary_folder, output_plots_folder=None, metadata_column=None, condition_first=None, condition_second=None, metadata_facet_column=None, run_stats=True, PARALLELISM=8, libraryidentifications_df=None):
    """Merge an MZmine2-style feature table with sample metadata and write summary tables.

    Always writes ``data_long.csv`` and ``data_long_visualize.tsv`` into
    ``output_summary_folder`` and stores the long-form table in the module-level
    ``GLOBAL_DF``. When ``run_stats`` is true and ``metadata_column`` names a
    metadata column, it additionally queues box plots and, when two conditions
    are given, computes per-feature Mann-Whitney U statistics.

    Parameters
    ----------
    input_quant_filename : str
        CSV feature table with "row ID", "row retention time", "row m/z" and
        per-file "<name> Peak area" columns.
    input_metadata_file : str
        Tab-separated metadata table with a "filename" column.
    input_summary_file : str
        Optional ProteoSAFe file summary TSV (merged best-effort).
    output_summary_folder : str
        Destination folder for the summary tables.
    output_plots_folder : str, optional
        Destination folder for box-plot images; plots are skipped when None.
    metadata_column, condition_first, condition_second, metadata_facet_column : str, optional
        Column and condition pair to run the two-group statistics on.
    run_stats : bool
        When False, only the long-form tables are produced.
    PARALLELISM : int
        Worker count for the parallel plotting job.
    libraryidentifications_df : pandas.DataFrame, optional
        Library-search identifications keyed by "#Scan#" (merged best-effort).
    """
    ## Loading feature table
    features_df = pd.read_csv(input_quant_filename, sep=",")
    metadata_df = pd.read_csv(input_metadata_file, sep="\t")
    metadata_df["filename"] = metadata_df["filename"].apply(lambda x: x.rstrip())

    ## Determining if we can even do anything
    print(len(features_df), len(features_df.columns), len(features_df) * len(features_df.columns))
    if len(features_df) * len(features_df.columns) > 10000000:
        print("Feature Table Too Big To Generate")
        return

    # removing peak area from columns
    feature_information_df = features_df[["row ID", "row retention time", "row m/z"]]
    features_df.index = features_df["row ID"]
    metabolite_id_list = list(features_df["row ID"])
    headers_to_keep = [header for header in features_df.columns if "Peak area" in header]
    features_df = features_df[headers_to_keep]
    column_mapping = {headers: headers.replace(" Peak area", "").rstrip() for headers in features_df.columns}
    features_df = features_df.rename(columns=column_mapping)

    # Transpose so rows are files and columns are feature (row ID) values
    features_df = features_df.T

    # Merging with Metadata
    features_df["filename"] = features_df.index
    features_df = features_df.merge(metadata_df, how="inner", on="filename")

    # Format Long version for later plotting
    long_form_df = pd.melt(features_df, id_vars=metadata_df.columns, value_vars=metabolite_id_list)
    long_form_df = long_form_df.rename(columns={"variable": "featureid", "value": "featurearea"})

    # Adding in feature information
    feature_information_df = feature_information_df.rename(columns={"row ID": "featureid", "row retention time": "featurert", "row m/z": "featuremz"})
    long_form_df = long_form_df.merge(feature_information_df, how="left", on="featureid")

    # Adding Library Search Information (best effort: libraryidentifications_df
    # may be None or lack the "#Scan#" column, in which case we skip it)
    try:
        long_form_df = long_form_df.merge(libraryidentifications_df, how="left", left_on="featureid", right_on="#Scan#")
        long_form_df = long_form_df.drop(columns=["#Scan#"])
    except Exception:
        pass

    long_form_df.to_csv(os.path.join(output_summary_folder, "data_long.csv"), index=False)

    # Trying to add in summary to proteosafe output (best effort: the summary
    # file may be missing or lack full_CCMS_path)
    try:
        file_summary_df = pd.read_csv(input_summary_file, sep="\t")
        file_summary_df["filename"] = file_summary_df["full_CCMS_path"].apply(lambda x: os.path.basename(x))
        enriched_long_df = long_form_df.merge(file_summary_df, how="left", on="filename")
        columns_to_keep = list(long_form_df.columns)
        columns_to_keep.append("full_CCMS_path")
        enriched_long_df = enriched_long_df[columns_to_keep]
    except Exception:
        enriched_long_df = long_form_df

    # Visualization in ProteoSAFe
    enriched_long_df.to_csv(os.path.join(output_summary_folder, "data_long_visualize.tsv"), sep="\t", index=False)

    global GLOBAL_DF
    GLOBAL_DF = long_form_df

    if not run_stats:
        return

    param_candidates = []

    # If we do not select a column, we don't calculate stats or do any plots
    if metadata_column in features_df:
        output_boxplot_list = []
        columns_to_consider = metadata_permanova_prioritizer.permanova_validation(input_metadata_file)  # Ignore
        columns_to_consider = [metadata_column]  # HACK TO MAKE FASTER
        if len(columns_to_consider) > 0:
            columns_to_consider = columns_to_consider[:5]

        for column_to_consider in columns_to_consider:
            # Loop through all metabolites, and create plots
            if output_plots_folder is not None:
                for metabolite_id in metabolite_id_list:
                    output_filename = os.path.join(output_plots_folder, "{}_{}.png".format(column_to_consider, metabolite_id))

                    input_params = {}
                    input_params["metadata_column"] = column_to_consider
                    input_params["output_filename"] = output_filename
                    input_params["variable_value"] = metabolite_id
                    param_candidates.append(input_params)

                    output_dict = {}
                    output_dict["metadata_column"] = column_to_consider
                    output_dict["boxplotimg"] = os.path.basename(output_filename)
                    output_dict["scan"] = metabolite_id
                    output_boxplot_list.append(output_dict)

        metadata_all_columns_summary_df = pd.DataFrame(output_boxplot_list)
        metadata_all_columns_summary_df.to_csv(os.path.join(output_summary_folder, "all_columns.tsv"), sep="\t", index=False)

    # plotting on a specific column
    if not metadata_column in features_df:
        pass
    elif condition_first is None or condition_second is None:
        pass
    elif condition_first == "None" or condition_second == "None":
        pass
    else:
        output_stats_list = []

        features_df = features_df[features_df[metadata_column].isin([condition_first, condition_second])]
        data_first_df = features_df[features_df[metadata_column] == condition_first]
        data_second_df = features_df[features_df[metadata_column] == condition_second]

        for metabolite_id in metabolite_id_list:
            # Skip features whose groups are degenerate (mannwhitneyu raises on
            # identical/empty samples); Exception does not catch KeyboardInterrupt,
            # so Ctrl-C still aborts the loop.
            try:
                stat, pvalue = mannwhitneyu(data_first_df[metabolite_id], data_second_df[metabolite_id])
            except Exception:
                continue

            output_filename = os.path.join(output_plots_folder, "chosen_{}_{}.png".format(metadata_column, metabolite_id))

            input_params = {}
            input_params["metadata_column"] = metadata_column
            input_params["output_filename"] = output_filename
            input_params["variable_value"] = metabolite_id
            input_params["metadata_facet"] = metadata_facet_column
            input_params["metadata_conditions"] = condition_first + ";" + condition_second
            param_candidates.append(input_params)

            output_stats_dict = {}
            output_stats_dict["metadata_column"] = metadata_column
            output_stats_dict["condition_first"] = condition_first
            output_stats_dict["condition_second"] = condition_second
            output_stats_dict["stat"] = stat
            output_stats_dict["pvalue"] = pvalue
            output_stats_dict["boxplotimg"] = os.path.basename(output_filename)
            output_stats_dict["scan"] = metabolite_id
            output_stats_list.append(output_stats_dict)

        metadata_columns_summary_df = pd.DataFrame(output_stats_list)
        metadata_columns_summary_df.to_csv(os.path.join(output_summary_folder, "chosen_columns.tsv"), sep="\t", index=False)

    print("Calculate Plots", len(param_candidates))
    ming_parallel_library.run_parallel_job(plot_box, param_candidates, PARALLELISM, backend="multiprocessing")
def main():
    """Build qiime2 metadata/manifest files from an MZmine2 quantification table,
    then run the qiime2 beta-diversity / PCoA / emperor / PERMANOVA pipeline by
    shelling out to a conda-activated qiime2 environment.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('input_metadata_filename', help='input_metadata_filename')
    parser.add_argument('input_quantification_table', help='input_quantification_table')
    parser.add_argument('output_folder', help='output_folder')
    parser.add_argument("conda_activate_bin")
    parser.add_argument("conda_environment")
    parser.add_argument('--distance_metric', default="cosine", help='Enter Distance Metric')
    args = parser.parse_args()

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    df_quantification = pd.read_csv(args.input_quantification_table, sep=",")

    # Reading Metadata Filename and filling in empty entries.
    # A path shorter than two characters is treated as "no metadata given".
    if len(args.input_metadata_filename) < 2:
        df_metadata = pd.DataFrame([{"filename": "placeholder"}])
    elif os.path.isfile(args.input_metadata_filename):
        df_metadata = pd.read_csv(args.input_metadata_filename, sep="\t")
    else:
        # It is a directory
        metadata_files = glob.glob(os.path.join(args.input_metadata_filename, "*"))
        if len(metadata_files) > 1:
            print("Enter only a single metadata file")
            exit(1)
        elif len(metadata_files) == 0:
            df_metadata = pd.DataFrame([{"filename": "placeholder"}])
        else:
            df_metadata = pd.read_csv(metadata_files[0], sep="\t")

    if "sample_name" not in df_metadata:
        df_metadata["sample_name"] = df_metadata["filename"]

    # Checking if the set of filenames are fully covered, if not then we'll
    # provide a place holder entry per missing file.
    all_quantification_filenames = [
        key.replace("Peak area", "").rstrip()
        for key in df_quantification.keys() if "Peak area" in key
    ]

    try:
        metadata_filenames = list(df_metadata["filename"])
    except Exception:
        # Metadata has no "filename" column; treat it as covering nothing.
        metadata_filenames = []

    metadata_object_list = df_metadata.to_dict(orient="records")
    known_filenames = set(metadata_filenames)  # O(1) membership tests below
    for quantification_filename in all_quantification_filenames:
        if quantification_filename not in known_filenames:
            print(quantification_filename, "not found")
            metadata_object = {}
            metadata_object["filename"] = quantification_filename
            metadata_object["sample_name"] = quantification_filename
            metadata_object_list.append(metadata_object)

    # Adding in missing filenames into the metadata
    new_output_metadata = pd.DataFrame(metadata_object_list)

    # Removing protected headers (qiime2 reserves these)
    new_output_metadata = new_output_metadata.drop(columns=["feature", "#SampleID"], errors="ignore")

    # sample_name must be the first column for qiime2 metadata
    output_columns = list(new_output_metadata.keys())
    output_columns.remove("sample_name")
    output_columns.insert(0, "sample_name")

    new_output_metadata.to_csv(output_metadata_filename, index=False, sep="\t", columns=output_columns, na_rep="NaN")

    # Outputting Manifest Filename
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = new_output_metadata["sample_name"]
    manifest_df["filepath"] = new_output_metadata["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    # Running Qiime2: output artifact paths
    local_qza_table = os.path.join(args.output_folder, "qiime2_table.qza")
    local_qza_relative_table = os.path.join(args.output_folder, "qiime2_relative_table.qza")
    local_qza_distance = os.path.join(args.output_folder, "qiime2_distance.qza")
    local_qza_pcoa = os.path.join(args.output_folder, "qiime2_pcoa.qza")
    local_qzv_emperor = os.path.join(args.output_folder, "qiime2_emperor.qzv")
    local_qza_biplot = os.path.join(args.output_folder, "qiime2_biplot.qza")
    local_qzv_biplot_emperor = os.path.join(args.output_folder, "qiime2_biplot_emperor.qzv")

    all_cmd = []
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime metabolomics import-mzmine2 \
        --p-manifest {} \
        --p-quantificationtable {} \
        --o-feature-table {}".format(args.conda_activate_bin, args.conda_environment, output_manifest_filename, args.input_quantification_table, local_qza_table))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta \
        --i-table {} \
        --p-metric {} \
        --o-distance-matrix {}".format(args.conda_activate_bin, args.conda_environment, local_qza_table, args.distance_metric, local_qza_distance))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa \
        --i-distance-matrix {} \
        --o-pcoa {}".format(args.conda_activate_bin, args.conda_environment, local_qza_distance, local_qza_pcoa))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor plot \
        --i-pcoa {} \
        --m-metadata-file {} \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin, args.conda_environment, local_qza_pcoa, output_metadata_filename, local_qzv_emperor))

    # Biplotting
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime feature-table relative-frequency \
        --i-table {} \
        --o-relative-frequency-table {}".format(args.conda_activate_bin, args.conda_environment, local_qza_table, local_qza_relative_table))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa-biplot \
        --i-pcoa {} \
        --i-features {} \
        --o-biplot {}".format(args.conda_activate_bin, args.conda_environment, local_qza_pcoa, local_qza_relative_table, local_qza_biplot))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor biplot \
        --i-biplot {} \
        --m-sample-metadata-file {} \
        --p-number-of-features 10 \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin, args.conda_environment, local_qza_biplot, output_metadata_filename, local_qzv_biplot_emperor))

    # Running Permanova
    import metadata_permanova_prioritizer
    import pathvalidate

    selected_columns = metadata_permanova_prioritizer.permanova_validation(output_metadata_filename)

    for column in selected_columns:
        print(column)
        output_qiime2_permanova_qzv = os.path.join(args.output_folder, "permanova_{}.qzv".format(column))
        # Column names may contain path-hostile characters; sanitize the output path.
        output_qiime2_permanova_qzv = pathvalidate.sanitize_filepath(output_qiime2_permanova_qzv)

        cmd = "LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta-group-significance \
        --i-distance-matrix {} \
        --m-metadata-file {} \
        --m-metadata-column \"{}\" \
        --p-pairwise \
        --o-visualization {}".format(args.conda_activate_bin, args.conda_environment, local_qza_distance, output_metadata_filename, column, output_qiime2_permanova_qzv)
        all_cmd.append(cmd)

    # NOTE(review): os.system with interpolated paths/column names is shell
    # injection-prone; consider subprocess.run with shell=False if inputs can
    # ever be untrusted.
    for cmd in all_cmd:
        print(cmd)
        os.system(cmd)