def subset_df_for_each_comparison(df: pd.DataFrame, base_df: pd.DataFrame, comparisons_dict: dict):
    values_prefix = rule_params["all"]["values_cols_prefix"]
    abundance_df = df.filter(regex=values_prefix)

    for comparison in comparisons_dict.keys():
        # get abundance values for the reference
        reference_abundances_col = abundance_df.filter(
            regex=comparisons_dict[comparison]["reference"])

        # Add a suffix to the column names: useful if several different controls
        # are used, and more flexible than handling this in the config file
        reference_abundances_col = reference_abundances_col.add_suffix('_reference')

        # get abundance values for the condition to compare with the reference
        condition_abundances_col = abundance_df.filter(
            regex=comparisons_dict[comparison]["condition"])

        # create the complete dataframe
        result_df = pd.concat(
            [base_df, reference_abundances_col, condition_abundances_col], axis=1)

        # export result
        output_result = os.path.join(os.path.dirname(args.output_file),
                                     '{}'.format(comparison))
        logger.debug('Path to file: {}'.format(output_result))
        h.export_result_to_csv(result_df, output_result)
        logger.info('Data for {} exported to csv'.format(comparison))
    return
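# A minimal usage sketch for subset_df_for_each_comparison. The regexes and the
# identifier column below are illustrative assumptions, not values from a real
# config: each comparison maps to the regexes selecting its reference and
# condition abundance columns.
comparisons_dict_example = {
    "treated_vs_control": {
        "reference": "control",  # would match e.g. VAL_control_1, VAL_control_2
        "condition": "treated",  # would match e.g. VAL_treated_1, VAL_treated_2
    }
}
base_df_example = data_df[["protein_id"]]  # hypothetical identifier column
subset_df_for_each_comparison(data_df, base_df_example, comparisons_dict_example)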
args = get_args()
filename = h.filename(args.input_file)
rule_params = h.load_json_parameter(args.file_id)

logpath = os.path.join(paths.global_data_dir, args.file_id, 'log/mapping.log')
logger = h.get_logger(logpath)
logger.info('Starting mapping file: ' + args.input_file)

# get data
data_df = pd.read_csv(args.input_file, header=0, index_col=None)
mapping_df = pd.read_csv(args.mapping_file, header=0, index_col=None)

# get parameters
values_cols_prefix = rule_params['all']['values_cols_prefix']
col_for_mapping = rule_params['mapping']['col_for_mapping']
col_label = rule_params['mapping']['col_label']

# rename the abundance-value columns in the data frame based on metadata in the mapping df
result_df = fpreprocessing.rename_col_abundance_withjson(
    mapping_df, data_df, values_cols_prefix, col_for_mapping, col_label)

# build the json corresponding to the new column names
json_for_groups = "metadata_{}.json".format(filename)
path_to_json = os.path.join(paths.global_data_dir, args.file_id, json_for_groups)
d = fpreprocessing.build_json(mapping_df, path_to_json, col_for_mapping)

# export results
h.export_result_to_csv(result_df, args.output_file)
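# A sketch of the kind of rename rename_col_abundance_withjson performs,
# assuming the mapping file pairs an original column name (col_for_mapping)
# with a human-readable label (col_label). The real helper lives in
# fpreprocessing and may differ.
def rename_abundance_columns_sketch(mapping_df, data_df, prefix, col_for_mapping, col_label):
    # Build {old_name: "<prefix>_<label>"} from the mapping file, then rename
    # only the columns that actually appear in the data frame.
    rename_map = {
        row[col_for_mapping]: "{}_{}".format(prefix, row[col_label])
        for _, row in mapping_df.iterrows()
        if row[col_for_mapping] in data_df.columns
    }
    return data_df.rename(columns=rename_map)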
# create json with information on % of NaN for samples
out = os.path.join(paths.global_data_dir, args.file_id, 'missing_values',
                   'samples_{}.json'.format(filename))
fqc.export_json_sample(stats_per_sample, out, values_cols_prefix)

# filter dataframe for the following analysis
# remove rows to discard
filtered_df = fqc.remove_flagged_rows(result_df, 'exclude_na', 1)

# remove samples to discard AND keep only the base df (as defined in the
# config file) and the abundance-value columns
filtered_df = fqc.remove_flagged_samples(
    df=filtered_df,
    boolean_mask=stats_per_sample['to_exclude'],
    metadata_col=rule_params['all']['metadata_col'],
    values_col_prefix=rule_params['all']['values_cols_prefix'],
    keep_specific=keep_specific,
    col_name=col_name)

# Export dataframe with only the proteins/samples compliant with the thresholds
h.export_result_to_csv(filtered_df, args.output_file_filtered)

# Export dataframe with all data and information on NaN percentage per group and protein
h.export_result_to_csv(result_df, args.output_file_complete)

logging.info("Keeping " + str(len(filtered_df)) + " proteins with current parameters.")
logging.info("Keeping " + str(len(stats_per_sample[~stats_per_sample['to_exclude']]))
             + " samples with current parameters.")
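# For illustration, the per-sample NaN statistics feeding export_json_sample
# could be computed along these lines. stats_per_sample is really built in
# fqc, and the 80% threshold here is purely an assumed example.
def nan_stats_per_sample_sketch(df, values_cols_prefix, max_nan_fraction=0.8):
    # one abundance column per sample; isna().mean() gives the NaN fraction
    abundance = df.filter(regex=values_cols_prefix)
    nan_fraction = abundance.isna().mean()
    return pd.DataFrame({
        'nan_percentage': nan_fraction * 100,
        # flag samples whose missing-value fraction exceeds the threshold
        'to_exclude': nan_fraction > max_nan_fraction,
    })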
# load data
if args.input_file:
    data_df = pd.read_csv(args.input_file, header=0, index_col=None)
    result = update_overlap(data_df)

    # get the subsets of data compliant with the criteria defined in the
    # config file - returns a dictionary of dataframes
    subsets_data = h.subset_data(data_df, subset_filters)
    for subset_name in subsets_data:
        df = subsets_data[subset_name]
        # strip the extension from the output path before suffixing
        output_file = re.sub(r'\.csv$', '', args.output_file) + '_{}.csv'.format(subset_name)
        h.export_result_to_csv(df, output_file)

elif args.input_directory:
    files = list_files_in_dir(args.input_directory, '.csv')
    for file in files:
        data_df = pd.read_csv(file, header=0, index_col=None)
        # update overlap
        result = update_overlap(data_df)

        # get the subsets of data on which to perform enrichment - returns a
        # dictionary of dataframes
        subsets_data = h.subset_data(data_df, subset_filters)
        for subset_name in subsets_data:
            df = subsets_data[subset_name]
            # export each subset; deriving the output name from the input file
            # mirrors the single-file branch above (the naming is an assumption,
            # the original snippet ends before the export)
            output_file = re.sub(r'\.csv$', '', file) + '_{}.csv'.format(subset_name)
            h.export_result_to_csv(df, output_file)
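# A minimal sketch of what h.subset_data plausibly does, assuming
# subset_filters maps a subset name to a (column, value) criterion; the real
# helper's filter format may differ.
def subset_data_sketch(df, subset_filters):
    # Return {subset_name: filtered dataframe} for each configured criterion.
    subsets = {}
    for name, (col, value) in subset_filters.items():
        subsets[name] = df[df[col] == value].copy()
    return subsets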
if rule_params['all']['specific_proteins']:
    # Split the data frame into specific and non-specific protein rows
    result, specific_proteins_df = extract_specific_proteins(data_df)

    # Add an arbitrary p-value for the specific proteins
    reference = rule_params["all"]["reference"]
    specific_proteins_pval = update_res_with_specific_proteins(
        specific_proteins_df, reference, test)
else:
    result = data_df.copy()

# Compute z-scores
res_zscore = compute_z_score(result)

# Find which distribution fits best
best_dist, args_param = find_best_distribution(res_zscore, args.histogramm_distribution)

# Compute p-values from the fitted distribution
res_pval = compute_p_value(result, rule_params['distribution']['test_type'],
                           best_dist, args_param)

# Concatenate the results on non-specific and specific proteins
if rule_params['all']['specific_proteins']:
    res_pval = pd.concat([res_pval, specific_proteins_pval], axis=0)

# log results
significant = len(res_pval[res_pval['pvalue'] < 0.05])
logger.info("{} proteins are significant (p-value < 0.05).".format(significant))

# export results
h.export_result_to_csv(res_pval, args.output_file)
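# For intuition: a best-fit search like find_best_distribution is commonly
# implemented by fitting candidate scipy.stats distributions and comparing
# their pdf to the z-score histogram. The candidate list and the
# sum-of-squared-errors criterion below are assumptions, not the project's
# actual implementation.
import numpy as np
import scipy.stats as st

def find_best_distribution_sketch(values, candidates=('norm', 'laplace', 'cauchy'), bins=100):
    data = np.asarray(values, dtype=float)
    data = data[~np.isnan(data)]
    hist, bin_edges = np.histogram(data, bins=bins, density=True)
    centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    best_name, best_params, best_sse = None, None, np.inf
    for name in candidates:
        dist = getattr(st, name)
        params = dist.fit(data)          # maximum-likelihood fit
        sse = np.sum((hist - dist.pdf(centers, *params)) ** 2)
        if sse < best_sse:
            best_name, best_params, best_sse = name, params, sse
    return best_name, best_params

# A one-sided p-value would then come from the fitted distribution's survival
# function, e.g. st.norm.sf(z, *params) for each z-score.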
path2analysis_folder = os.path.join(paths.global_data_dir, args.analysis_id)
try:
    os.mkdir(path2analysis_folder)
    logger.debug("Creating folder for this analysis")
except FileExistsError:
    logger.debug("Analysis folder already created")

path2error_file = os.path.join(paths.global_data_dir, args.analysis_id, args.error_file)
logger.debug("Exporting import_error file to: {}".format(path2error_file))
with open(path2error_file, 'w+') as json_file:
    json.dump(errors, json_file, indent=True)

# export header
output_sample_name = os.path.join(paths.global_data_dir, args.analysis_id,
                                  args.output_sample_name)
logger.debug("Exporting header file to: {}".format(output_sample_name))
fi.get_sample_name(df, output_sample_name)

# export data
if args.output_file:
    output_csv = args.output_file
else:
    output_csv = os.path.join(paths.global_data_dir, args.analysis_id,
                              "csv/{}.csv".format(h.filename(args.input_file)))
logger.debug("Exporting converted file to: {}".format(output_csv))
h.export_result_to_csv(df, output_csv, index_col=True)
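# Since h.export_result_to_csv appears throughout, here is a plausible minimal
# implementation consistent with its call sites (the optional index_col flag
# and the directory creation are assumptions about the real helper).
def export_result_to_csv_sketch(df, output_file, index_col=False):
    # create the target directory if needed, then write the csv
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    df.to_csv(output_file, index=index_col)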