def isotope_label_detector(
    input_spec: InputSpec,
    source_dir: Path,
    exp_name: str,
    min_scans: int = 5,
    num_cond: int = 1,
    n_jobs: int = -1,
):
    """
    Detect isotope labels for each condition from the per-scan ion data,
    write per-condition slope data, and summarize labeling across the experiment.
    """
    scan_dir = source_dir.joinpath("all_scan_data")
    slope_dir = source_dir.joinpath("all_slope_data")
    slope_dir.mkdir(parents=True, exist_ok=True)
    out_dir = source_dir.joinpath("all_isotope_analysis")
    out_dir.mkdir(parents=True, exist_ok=True)

    features = utils.get_featurelist(source_dir=source_dir, exp_name=exp_name)
    conditions = input_spec.get_conditions()

    def run_label_detector(cond):
        out_file = slope_dir.joinpath(f"all_slope_data_{cond}.csv")
        print(f"Detecting labels in {cond}")

        # Load both unlabeled and labeled scan data for this condition
        scan_files = list(scan_dir.glob(f"all_ions_*{cond}.csv"))
        scan_dfs = []
        for s in scan_files:
            assert s.exists()
            scan_dfs.append(pd.read_csv(s))
        df = utils.combine_dfs(scan_dfs)

        # Compute replicate statistics per feature/isotope/condition group
        grouped = df.groupby(["exp_id", "isotope", "condition"])
        data = []
        for (e_id, iso, c), g in grouped:
            res = utils.calc_rep_stats(g, e_id, iso, c, min_scans=min_scans)
            if len(res) < 1:
                continue
            data.extend(res)

        if data:
            agg_df = pd.DataFrame(data)
            res_df = utils.aggregate_results(agg_df)
            res_df.to_csv(out_file, index=False)
            utils.run_label_analysis(res_df, cond, out_dir)
        else:
            print(f"No labels to detect for {cond}")

    # Run processing of each condition in a separate process
    # (for serial debugging: [run_label_detector(c) for c in conditions])
    joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(run_label_detector)(c) for c in conditions)

    sum_df = utils.summarize_labels(out_dir, features, conditions)
    sum_df.to_csv(source_dir.joinpath(f"{exp_name}_data_summary.csv"))
    filtered_df = utils.filter_summary(sum_df, num_cond)
    filtered_df.to_csv(
        source_dir.joinpath(f"{exp_name}_data_summary_filtered.csv"))
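
# Usage sketch (illustrative values, not taken from the real pipeline).
# isotope_label_detector expects scan-level ion CSVs to already exist under
# source_dir/"all_scan_data" as all_ions_*<condition>.csv, each containing at least
# the exp_id, isotope, and condition columns grouped on above. The spec object,
# paths, and names below are assumptions.
#
#   isotope_label_detector(
#       input_spec=my_input_spec,          # an InputSpec built elsewhere in this package
#       source_dir=Path("results/exp1"),   # assumed experiment output directory
#       exp_name="exp1",
#       min_scans=5,
#       num_cond=2,                        # forwarded to utils.filter_summary
#       n_jobs=4,
#   )
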
def generate_featurelist(
    input_spec: InputSpec,
    source_dir: Path,
    exp_name: str,
    config: Dict,
    n_jobs: int = -1,
    blank_remove: bool = True,
):
    """
    Create an m/z ground-truth list of all features detected in the experiment,
    basketed so that features align between conditions and replicates.
    Optionally subtracts features that also appear in blanks.
    """
    print("Collecting feature list files")

    # Defined here so input_spec and config stay in scope and are not needed as params
    def do_munge_featurelist(cond: str):
        out_dir = source_dir.joinpath(cond)
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"Working on {cond}")
        all_collapsed = utils.munge_featurelist(inp_spec=input_spec,
                                                cond=cond,
                                                out_dir=out_dir,
                                                config=config)
        return all_collapsed

    # Collect peaks in blanks to subtract later.
    # This should not break when there are no blanks,
    # but the printing may be misleading in that case.
    if input_spec.get_feature_filepaths("blank"):
        blanks = do_munge_featurelist("blank")
    else:
        print("No blanks found")
        blank_remove = False

    # Run pre-processing on conditions in separate processes
    conditions = input_spec.get_conditions()
    cond_dfs = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(do_munge_featurelist)(c) for c in conditions)

    all_cond_df = utils.combine_dfs(cond_dfs)
    if blank_remove:
        print("Subtracting blanks")
        utils.blank_subtract(blanks, all_cond_df, config=config)
    else:
        print("Not subtracting blanks")

    print("Grouping all features")
    all_cond_df.reset_index(inplace=True, drop=True)
    # Final grouping - exp_id is used later for scan munging
    dereplicator.group_features(all_cond_df, exp_name, "exp_id", config=config)

    all_cond_df.to_csv(source_dir.joinpath(f"{exp_name}_all_features.csv"),
                       index=False)
    return all_cond_df
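
# Pipeline-order sketch (hypothetical wiring; the real entry point lives elsewhere in
# this package). generate_featurelist writes <exp_name>_all_features.csv, which
# isotope_label_detector presumably picks up via utils.get_featurelist. Scan-level
# ion data under source_dir/"all_scan_data" must be produced by a separate extraction
# step (not shown here) before label detection can run.
#
#   features = generate_featurelist(input_spec, source_dir, exp_name, config)
#   # ... extract per-scan ion data into source_dir/"all_scan_data" ...
#   isotope_label_detector(input_spec, source_dir, exp_name)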