def write_output(out_dir: Path, df_T11: DataFrame, df_T10: DataFrame) -> None:
    """
    Store the final T11 and T10 tables as csv files in the main results folder.

    Args:
        out_dir (Path): Path to main results folder.
        df_T11 (DataFrame): Dataframe T11, saved under the name "T11".
        df_T10 (DataFrame): Dataframe T10, saved under the name "T10".
    """
    # Column lists handed to save_df_as_csv together with each table.
    t11_columns = [
        "cont_descriptor_vector_id",
        "descriptor_vector_id",
        "fp_json",
        "fp_val_json",
        "fold_id",
    ]
    t10_columns = [
        "cont_descriptor_vector_id",
        "cont_classification_task_id",
        "class_label",
        "fold_id",
    ]

    save_df_as_csv(out_dir, df_T11, "T11", t11_columns)
    save_df_as_csv(out_dir, df_T10, "T10", t10_columns)
def write_failed_output(
    out_dir: Path,
    df_T4c_failed: pd.DataFrame,
    columns_T4c: list,
) -> None:
    """Save the activity data with ambiguous class labels as csv file.

    Args:
        out_dir (Path): output Path object
        df_T4c_failed (DataFrame): dataframe containing the classified activity
            data that could not be labelled; saved under the name "T4c.FAILED"
        columns_T4c (list): column names forwarded to save_df_as_csv
    """
    failed_table_name = "T4c.FAILED"
    save_df_as_csv(out_dir, df_T4c_failed, failed_table_name, columns_T4c)
def write_mappting_tables(out_dir: Path, df: DataFrame) -> None:
    """
    Save the mapping table as csv file (thin wrapper around save_df_as_csv).

    Args:
        out_dir (Path): Path to mapping table subfolder
        df (DataFrame): mapping table dataframe T3, saved under the name "T3_mapping"
    """
    mapping_table_name = "T3_mapping"
    save_df_as_csv(out_dir, df, mapping_table_name)
def do_prepare_prediction(args):
    """Wrapper to run the entire pipeline for prediction.

    Loads config and key, runs the consistency check, standardizes the input
    structures, calculates descriptors, assigns descriptor vector ids, maps
    them to continuous ids and writes the "T11_pred" table and the "pred_x"
    matrix.

    Args:
        args (Namespace): Subparser arguments
    """
    start = time.time()
    _args = vars(args)
    overwriting = _args["non_interactive"] is True

    # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    print("Prepare for prediction.")
    ######
    df = read_input_file(_args["structure_file"])

    # Make directories, load input files.
    # Only the processing objects are used afterwards; the returned output
    # directories and the "failed" dataframes are not needed in this wrapper.
    _, dt_std = standardize_smiles.prepare(_args)
    df_smi, _ = standardize_smiles.run(df, dt_std)

    _, dt_desc = calculate_descriptors.prepare(_args, overwriting)
    df_desc, _ = calculate_descriptors.run(df_smi, dt_desc)

    df_desc_c = df_desc.copy()
    # One 1-based id per input_compound_id group. ngroup() numbers groups from
    # 0; a -1 (presumably rows without a valid group key) is turned into NaN
    # before the shift so it does not become id 0.
    df_desc_c.loc[:, "descriptor_vector_id"] = (
        df_desc_c.groupby("input_compound_id")
        .ngroup()
        .replace(-1, np.nan)
        .add(1)
    )
    df_T6 = df_desc_c[["descriptor_vector_id", "fp_feat", "fp_val"]]

    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T11 = map_2_cont_id(
        df_T6, "descriptor_vector_id").sort_values("cont_descriptor_vector_id")
    save_df_as_csv(results_dir, df_T11, "T11_pred")

    x_matrix = csv_2_mtx.matrix_from_strucutres(df_T11, bit_size)
    save_mtx_as_npy(x_matrix, out_dir_matrices, "pred_x")

    print(f"Preparation took {time.time() - start:.08} seconds.")
    print("Prediction preparation done.")
def write_tmp_output(
    out_dir: Path,
    df_T4c: pd.DataFrame,
    df_T3c: pd.DataFrame,
    columns_T4c: list,
    columns_T3c: list,
) -> None:
    """Save csv files of classified activity data.

    Args:
        out_dir (Path): output Path object
        df_T4c (DataFrame): dataframe containing classified activity data, saved as "T4c"
        df_T3c (DataFrame): dataframe containing classification threshold definitions, saved as "T3c"
        columns_T4c (list): column names forwarded to save_df_as_csv for T4c
        columns_T3c (list): column names forwarded to save_df_as_csv for T3c
    """
    # (dataframe, output name, columns) in the order they are written.
    outputs = (
        (df_T4c, "T4c", columns_T4c),
        (df_T3c, "T3c", columns_T3c),
    )
    for frame, table_name, columns in outputs:
        save_df_as_csv(out_dir, frame, table_name, columns)
def write_tmp_output(
    out_dir: Path,
    data_failed: DataFrame,
    data_duplicated_id_pairs: DataFrame,
    data_excluded: DataFrame,
) -> None:
    """
    Write the intermediate output files to the additional results folder.

    Args:
        out_dir (Path): Path to additional results folder
        data_failed (DataFrame): Dataframe containing activity data from failed structures,
            saved as "T4_failed_structures"
        data_duplicated_id_pairs (DataFrame): Dataframe containing duplicated pairs of ids,
            saved as "T4_duplicates"
        data_excluded (DataFrame): Dataframe containing the excluded data,
            saved as "T4_excluded_data"
    """
    # Failed structures are written with the input compound id.
    save_df_as_csv(
        out_dir,
        data_failed,
        "T4_failed_structures",
        ["input_compound_id", "classification_task_id", "class_label"],
    )

    # The remaining two tables share the same descriptor-level column set.
    descriptor_level_tables = (
        (data_duplicated_id_pairs, "T4_duplicates"),
        (data_excluded, "T4_excluded_data"),
    )
    for frame, table_name in descriptor_level_tables:
        save_df_as_csv(
            out_dir,
            frame,
            table_name,
            ["classification_task_id", "descriptor_vector_id", "class_label"],
        )
def main(args: dict = None):
    """
    Main function reading input files, executing functions and writing output files.

    Loads config and key, runs the consistency check, reads the structure,
    activity and weight tables, maps them to continuous ids, saves the mapped
    tables and weight files and finally builds and saves the matrices.

    Args:
        args (dict, optional): Argument dictionary. When None, the arguments
            are taken from init_arg_parser().

    Raises:
        SystemExit: with exit status 1 if args["tag"] is neither "cls" nor "clsaux".
    """
    # Function-scope import: only needed for the error exit below.
    import sys

    start = time.time()

    if args is None:
        args = vars(init_arg_parser())

    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    print("Generate sparse matrices from given dataframes.")
    fp_param = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]
    bit_size = fp_param["fold_size"]
    output_dir, results_dir = prepare(args, overwriting)

    tag = args["tag"]
    if tag not in ("cls", "clsaux"):
        print("Please choose a different tag. Only cls or clsaux are allowed.")
        # sys.exit instead of the site-provided exit(); non-zero status so
        # calling scripts can detect the invalid tag.
        sys.exit(1)

    df_T6 = read_input_file(args["structure_file"])
    df_T10c = read_input_file(args["activity_file_clf"])
    df_T10r = read_input_file(args["activity_file_reg"])
    df_T6_cont, T10c_cont, T10r_cont = get_cont_id(df_T6, df_T10c, df_T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    df_T9c = read_input_file(args["weight_table_clf"])
    df_T9r = read_input_file(args["weight_table_reg"])

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    save_csv_output(output_dir, tag, df_T9c, df_T9r)

    x_matrix, fold_vector, y_matrix_clf, y_matrix_reg, censored_mask = make_matrices(
        df_T11, T10c_cont, T10r_cont, bit_size)
    # Replace NaN entries of the classification matrix by 0 and drop the
    # resulting explicit zeros from the sparse structure.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    save_npy_matrices(
        output_dir,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )
    end = time.time()
    print(f"Formatting to matrices took {end - start:.08} seconds.")
    print("Files are ready for SparseChem.")
def save_csv_output(out_dir: Path, tag: str, df_T9c: DataFrame,
                    df_T9r: DataFrame) -> None:
    """
    Wrapper to save the weight tables as csv files to the matrix output folder.

    Both weight tables are renamed to the common column names ("task_id",
    "task_type", "training_weight"), rows without a task id are dropped, the
    rows are sorted by task id and the task id is cast to int.
    For tag "cls" the classification weights are saved as "cls_weights" in
    out_dir/cls and the regression weights as "reg_weights" in out_dir/reg.
    For tag "clsaux" the classification weights are saved as "clsaux_weights"
    in out_dir/clsaux. Any other tag writes nothing.

    Args:
        out_dir (Path): path to matrix output folder
        tag (str): "cls" or "clsaux", selects which files are written
        df_T9c (DataFrame): classification weight table, identified by "cont_classification_task_id"
        df_T9r (DataFrame): regression weight table, identified by "cont_regression_task_id"
    """

    def _as_weight_table(table, cont_id_column):
        # Rename to the common column names, keep only rows with a task id,
        # order by task id and store the id as integer.
        renamed = table.rename(
            columns={
                cont_id_column: "task_id",
                "assay_type": "task_type",
                "weight": "training_weight",
            })
        renamed = renamed.dropna(subset=["task_id"])
        renamed = renamed.sort_values("task_id")
        renamed["task_id"] = renamed["task_id"].astype(int)
        return renamed

    cls_weights = _as_weight_table(df_T9c, "cont_classification_task_id")
    reg_weights = _as_weight_table(df_T9r, "cont_regression_task_id")

    cls_columns = [
        "task_id", "task_type", "training_weight", "aggregation_weight"
    ]

    if tag == "cls":
        cls_folder = out_dir / "cls"
        cls_folder.mkdir(exist_ok=True)
        save_df_as_csv(cls_folder, cls_weights, "cls_weights", cls_columns)

        reg_folder = out_dir / "reg"
        reg_folder.mkdir(exist_ok=True)
        reg_columns = [
            "task_id",
            "task_type",
            "training_weight",
            "aggregation_weight",
            "censored_weight",
        ]
        save_df_as_csv(reg_folder, reg_weights, "reg_weights", reg_columns)
    elif tag == "clsaux":
        clsaux_folder = out_dir / "clsaux"
        clsaux_folder.mkdir(exist_ok=True)
        save_df_as_csv(clsaux_folder, cls_weights, "clsaux_weights",
                       cls_columns)
def do_prepare_training(args):
    """Wrapper to run the entire pipeline for training.

    Steps, each timed and reported on stdout: sanity checks of the input
    tables, structure standardization, descriptor calculation and fold
    assignment (scaffold or LSH based), replicate aggregation, thresholding,
    filtering of classification and regression data, and creation of the
    matrices. Intermediate tables are saved after every step.

    Args:
        args (Namespace): Subparser arguments

    Raises:
        SystemExit: with exit status 1 if the folding method is neither
            "scaffold" nor "lsh".
    """
    # Function-scope import: only needed for the error exit below.
    import sys

    start_total = time.time()
    _args = vars(args)
    overwriting = _args["non_interactive"] is True

    num_cpu = _args["number_cpu"]
    # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    start = time.time()
    tag = _args["tag"]

    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")
    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)

    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)
    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)
    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print("Sanity checks passed.")

    start = time.time()
    print("Start standardizing structures.")

    # Make directories, load input files.
    # The returned results path is not needed here; it is obtained again from
    # csv_2_mtx.prepare() before the matrices are written.
    make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)

    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print("Standardization done.")

    # Placeholders; both folding branches below overwrite them.
    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        print("Using scaffold-based fold assignment.")

        output_dir_desc, dt_desc = calculate_descriptors.prepare(
            _args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")

        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed,
                       "T2_descriptors.FAILED")
        del df_smi, df_desc_failed

        print(
            f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print("Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = calculate_scaffold_folds.prepare(
            _args)

        df_fold, df_fold_failed = calculate_scaffold_folds.run(
            df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc

        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(output_dir_desc, df_duplicates,
                       "T2_descriptor_vector_id.DUPLICATES")
        del df_duplicates

        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print("Fold calculation done.")

    elif _args["folding_method"] == "lsh":
        print("Using LSH based fold assignment.")
        output_dir_lsh, mapping_table_dir, dt_lsh = calculate_lsh_folds.prepare(
            _args, overwriting)

        # Restart the timer so the reported duration covers only descriptor
        # calculation and LSH folding, not the preceding standardization.
        start = time.time()

        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh,
                                  "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh,
                                 "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")

        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        # Release the intermediates, as the scaffold branch does.
        del df_duplicates, df_smi, df_desc_lsh, df_desc_lsh_failed
        end = time.time()
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print("Descriptor calculation and LSH folding done.")

    else:
        print("Please use scaffold or lsh as folding method.")
        # sys.exit instead of the site-provided quit(); non-zero status so
        # calling scripts can detect the invalid folding method.
        sys.exit(1)

    start = time.time()

    print("Start aggregating values.")

    output_dir_agg = aggregate_values.prepare(_args, overwriting)

    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5,
        ConfigDict.get_parameters()["credibility_range"], num_cpu)
    df_T4r = df_T4r[[
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]]
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        [
            "input_compound_id", "input_assay_id", "standard_qualifier",
            "standard_value"
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    del df_T5, df_failed_range, df_failed_aggr, df_failed_std, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print("Replicate aggregation done.")

    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]

    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]

    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")

    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print("Thresholding done.")

    print("Start filter classification data.")
    start = time.time()
    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )

    filter_classification.write_tmp_output(output_dir_filter_clf, T10c, T8c,
                                           T4c_filtered_out, T4c_dedup)

    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print("Classification filtering done.")

    print("Start filter regression data.")
    #####
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)

    T10r, T8r, T4r_filtered_out, T4r_dedup = filter_regression.filter_regression_tasks(
        df_T0_upd,
        df_T4r,
        ConfigDict.get_parameters()["training_quorum"]["regression"],
        ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
        ConfigDict.get_parameters()["initial_task_weights"],
        ConfigDict.get_parameters()["censored_downweighting"],
    )
    filter_regression.write_tmp_output(out_dir_filter_reg, T10r, T8r,
                                       T4r_filtered_out, T4r_dedup)
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print("Filtering regression data done.")

    print("Start creating sparse matrices.")
    start = time.time()
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c

    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    # Replace NaN entries of the classification matrix by 0 and drop the
    # resulting explicit zeros from the sparse structure.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )

    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print("Files are ready for SparseChem.")
def write_tmp_output(
    out_dir: Path,
    df: pd.DataFrame,
    df_failed_range: pd.DataFrame,
    df_failed_aggr: pd.DataFrame,
    df_failed_std: pd.DataFrame,
    df_dup: pd.DataFrame,
    T0_upd: pd.DataFrame,
) -> None:
    """Save csv files of the aggregation step.

    Written in this order: "T4r", "T0_upd", "failed_range_T1",
    "failed_aggr_T1", "failed_std_T1" and "duplicates_T1".

    Args:
        out_dir (Path): output Path object
        df (DataFrame): dataframe containing aggregated activity data
        df_failed_range (DataFrame): dataframe containing activity data outside the credibility range
        df_failed_aggr (DataFrame): dataframe containing activity data that failed at aggregation step
        df_failed_std (DataFrame): dataframe containing activity data that failed due to low std per task per fold
        df_dup (DataFrame): dataframe saved as "duplicates_T1"
        T0_upd (DataFrame): dataframe with updated T0 info
    """
    aggregated_columns = [
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]
    save_df_as_csv(out_dir, df, "T4r", aggregated_columns)

    # T0_upd is saved without an explicit column list.
    save_df_as_csv(out_dir, T0_upd, "T0_upd")

    range_columns = [
        "input_compound_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
    ]
    save_df_as_csv(out_dir, df_failed_range, "failed_range_T1", range_columns)

    # Both failure tables use the same columns; a fresh list is built per call.
    failure_tables = (
        (df_failed_aggr, "failed_aggr_T1"),
        (df_failed_std, "failed_std_T1"),
    )
    for frame, table_name in failure_tables:
        save_df_as_csv(
            out_dir,
            frame,
            table_name,
            [
                "descriptor_vector_id",
                "input_assay_id",
                "standard_qualifier",
                "standard_value",
                "fold_id",
            ],
        )

    duplicate_columns = [
        "input_assay_id",
        "input_compound_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]
    save_df_as_csv(out_dir, df_dup, "duplicates_T1", duplicate_columns)
def write_tmp_output(
    out_dir: Path,
    T10c: pd.DataFrame,
    T8c: pd.DataFrame,
    T4c_filtered_out: pd.DataFrame,
    T4c_dedup: pd.DataFrame,
) -> None:
    """Save csv files of the classification filtering step.

    Written in this order: "filtered_out_T4c", "duplicates_T4c", "T10c"
    and "T8c".

    Args:
        out_dir (Path): output Path object
        T10c (DataFrame): dataframe saved as "T10c"
        T8c (DataFrame): dataframe saved as "T8c"
        T4c_filtered_out (DataFrame): dataframe containing activity data filtered out in training
        T4c_dedup (DataFrame): dataframe containing duplicated activity data
    """
    # The two T4c-level tables share one column set; a fresh list is built per call.
    t4c_level_tables = (
        (T4c_filtered_out, "filtered_out_T4c"),
        (T4c_dedup, "duplicates_T4c"),
    )
    for frame, table_name in t4c_level_tables:
        save_df_as_csv(
            out_dir,
            frame,
            table_name,
            [
                "classification_task_id",
                "input_assay_id",
                "descriptor_vector_id",
                "fold_id",
                "standard_qualifier",
                "standard_value",
                "threshold",
                "class_label",
            ],
        )

    t10c_columns = [
        "cont_classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "class_label",
    ]
    save_df_as_csv(out_dir, T10c, "T10c", t10c_columns)

    t8c_columns = [
        "cont_classification_task_id",
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "is_auxiliary",
        "use_in_regression",
        "threshold",
        "threshold_method",
        "direction",
        "training_quorum_OK",
        "evaluation_quorum_OK",
        "aggregation_weight",
        "weight",
        "num_total_actives",
        "num_fold_min_actives",
        "num_total_inactives",
        "num_fold_min_inactives",
        "n_tasks",
    ]
    save_df_as_csv(out_dir, T8c, "T8c", t8c_columns)
def write_tmp_output(
    out_dir: Path,
    T10r: pd.DataFrame,
    T8r: pd.DataFrame,
    T4r_filtered_out: pd.DataFrame,
    T4r_dedup: pd.DataFrame,
) -> None:
    """Save csv files of the regression filtering step.

    Written in this order: "T10r", "T8r", "filtered_out_T4r" and
    "duplicates_T4r".

    Args:
        out_dir (Path): output Path object
        T10r (DataFrame): dataframe containing deduplicated regression task data
        T8r (DataFrame): dataframe containing deduplicated regression task definitions and metadata
        T4r_filtered_out (DataFrame): dataframe containing regression activity data filtered out in training
        T4r_dedup (DataFrame): dataframe containing duplicated regression activity data
    """
    # Activity-level columns; the task-level tables append the continuous task id.
    activity_columns = [
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]
    task_id_column = "cont_regression_task_id"

    save_df_as_csv(out_dir, T10r, "T10r", activity_columns + [task_id_column])

    t8r_columns = [
        "cont_regression_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "is_auxiliary",
        "use_in_regression",
        "expert_threshold_1",
        "expert_threshold_2",
        "expert_threshold_3",
        "expert_threshold_4",
        "expert_threshold_5",
        "direction",
        "training_quorum_OK",
        "evaluation_quorum_OK",
        "aggregation_weight",
        "weight",
        "censored_weight",
    ]
    save_df_as_csv(out_dir, T8r, "T8r", t8r_columns)

    save_df_as_csv(out_dir, T4r_filtered_out, "filtered_out_T4r",
                   list(activity_columns))

    save_df_as_csv(out_dir, T4r_dedup, "duplicates_T4r",
                   activity_columns + [task_id_column])