def write_output(out_dir: Path, df_T11: DataFrame, df_T10: DataFrame) -> None:
    """
    Save the T11 and T10 result tables as csv files in the main results folder.

    Both tables are handed to ``save_df_as_csv`` together with their file
    name stem and the column list belonging to that table.

    Args:
        out_dir (Path): Path to main results folder.
        df_T11 (DataFrame): Dataframe T11, saved under the name "T11".
        df_T10 (DataFrame): Dataframe T10, saved under the name "T10".
    """
    t11_columns = [
        "cont_descriptor_vector_id",
        "descriptor_vector_id",
        "fp_json",
        "fp_val_json",
        "fold_id",
    ]
    t10_columns = [
        "cont_descriptor_vector_id",
        "cont_classification_task_id",
        "class_label",
        "fold_id",
    ]

    save_df_as_csv(out_dir, df_T11, "T11", t11_columns)
    save_df_as_csv(out_dir, df_T10, "T10", t10_columns)
示例#2
0
def write_failed_output(out_dir: Path, df_T4c_failed: pd.DataFrame,
                        columns_T4c: list) -> None:
    """Save activity data with ambiguous class labels as "T4c.FAILED".

    Args:
        out_dir (Path): output Path object
        df_T4c_failed (DataFrame): dataframe containing the activity data to be saved
        columns_T4c (list): column list handed on to save_df_as_csv
    """
    failed_table_name = "T4c.FAILED"
    save_df_as_csv(out_dir, df_T4c_failed, failed_table_name, columns_T4c)
def write_mappting_tables(out_dir: Path, df: DataFrame) -> None:
    """
    Save the mapping table as csv file under the name "T3_mapping".

    Args:
        out_dir (Path): Path to mapping table subfolder
        df (DataFrame): mapping table dataframe T3
    """
    mapping_table_name = "T3_mapping"
    save_df_as_csv(out_dir, df, mapping_table_name)
示例#4
0
def do_prepare_prediction(args):
    """Wrapper to run the entire pipeline for prediction.

    Loads config and key, runs the consistency check, standardizes the input
    structures, calculates descriptors, assigns descriptor vector ids and
    writes the mapped table ("T11_pred") plus the feature matrix ("pred_x").

    Args:
        args (Namespace): Subparser arguments. The keys read directly here
            are "non_interactive" and "structure_file"; the full dict is
            also passed on to the individual pipeline steps.
    """
    start = time.time()
    _args = vars(args)
    # Only a literal True enables overwriting.
    overwriting = _args["non_interactive"] is True

    # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    print("Prepare for prediction.")

    ######
    df = read_input_file(_args["structure_file"])
    # Make directories, load input files.
    # The output folder and the failed records are not used in this wrapper.
    _output_dir_std, dt_std = standardize_smiles.prepare(_args)

    df_smi, _df_smi_failed = standardize_smiles.run(df, dt_std)
    _output_dir_desc, dt_desc = calculate_descriptors.prepare(
        _args, overwriting)
    df_desc, _df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

    # Work on a copy so the descriptor frame itself is left untouched.
    df_desc_c = df_desc.copy()
    # One id per input compound: group numbers start at 0, a -1 marker is
    # turned into NaN, and the remaining ids are shifted to start at 1.
    df_desc_c.loc[:, "descriptor_vector_id"] = (
        df_desc_c.groupby("input_compound_id").ngroup().replace(-1,
                                                                np.nan).add(1))
    df_T6 = df_desc_c[["descriptor_vector_id", "fp_feat", "fp_val"]]
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T11 = map_2_cont_id(
        df_T6, "descriptor_vector_id").sort_values("cont_descriptor_vector_id")

    save_df_as_csv(results_dir, df_T11, "T11_pred")
    x_matrix = csv_2_mtx.matrix_from_strucutres(df_T11, bit_size)
    save_mtx_as_npy(x_matrix, out_dir_matrices, "pred_x")
    print(f"Preparation took {time.time() - start:.08} seconds.")
    print("Prediction preparation done.")
示例#5
0
def write_tmp_output(
    out_dir: Path,
    df_T4c: pd.DataFrame,
    df_T3c: pd.DataFrame,
    columns_T4c: list,
    columns_T3c: list,
) -> None:
    """Save the classified activity data ("T4c") and thresholds ("T3c") as csv.

    Args:
        out_dir (Path): output Path object
        df_T4c (DataFrame): dataframe containing classified activity data
        df_T3c (DataFrame): dataframe containing classification threshold definitions
        columns_T4c (list): column list handed to save_df_as_csv for T4c
        columns_T3c (list): column list handed to save_df_as_csv for T3c
    """
    outputs = (
        (df_T4c, "T4c", columns_T4c),
        (df_T3c, "T3c", columns_T3c),
    )
    # Written in the listed order: T4c first, then T3c.
    for frame, file_stem, columns in outputs:
        save_df_as_csv(out_dir, frame, file_stem, columns)
def write_tmp_output(
    out_dir: Path,
    data_failed: DataFrame,
    data_duplicated_id_pairs: DataFrame,
    data_excluded: DataFrame,
) -> None:
    """
    Save the three side tables to the additional results folder.

    Args:
        out_dir (Path): Path to additional results folder
        data_failed (DataFrame): Dataframe containing activity data from failed structures,
            saved as "T4_failed_structures"
        data_duplicated_id_pairs (DataFrame): Dataframe containing duplicated pairs of ids,
            saved as "T4_duplicates"
        data_excluded (DataFrame): Dataframe saved as "T4_excluded_data"
    """
    failed_columns = ["input_compound_id", "classification_task_id", "class_label"]
    # Duplicates and excluded data use the same columns; each call receives
    # its own list object built from this tuple.
    task_columns = ("classification_task_id", "descriptor_vector_id", "class_label")

    save_df_as_csv(out_dir, data_failed, "T4_failed_structures", failed_columns)
    save_df_as_csv(out_dir, data_duplicated_id_pairs, "T4_duplicates",
                   list(task_columns))
    save_df_as_csv(out_dir, data_excluded, "T4_excluded_data",
                   list(task_columns))
示例#7
0
def main(args: dict = None):
    """
    Main function reading input files, executing functions and writing output files.

    Loads config and key, runs the consistency check, reads the structure,
    activity and weight tables, maps them to continuous ids, writes the
    mapped tables and weight csv files, and finally builds and saves the
    matrices.

    Args:
        args (dict): Argument dictionary. When None, the arguments are taken
            from ``init_arg_parser()``. Keys read directly here are
            "non_interactive", "tag", "structure_file", "activity_file_clf",
            "activity_file_reg", "weight_table_clf" and "weight_table_reg".

    Raises:
        SystemExit: with status 1 if "tag" is neither "cls" nor "clsaux".
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Only a literal True enables overwriting.
    overwriting = args["non_interactive"] is True
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Generate sparse matrices from given dataframes.")
    fp_param = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]
    bit_size = fp_param["fold_size"]
    output_dir, results_dir = prepare(args, overwriting)
    tag = args["tag"]

    if tag not in ("cls", "clsaux"):
        print("Please choose a different tag. Only cls or clsaux are allowed.")
        # Exit with a failure status. The bare exit() used before is a helper
        # from the site module and reported status 0 for this error.
        raise SystemExit(1)
    df_T6 = read_input_file(args["structure_file"])
    df_T10c = read_input_file(args["activity_file_clf"])
    df_T10r = read_input_file(args["activity_file_reg"])
    df_T6_cont, T10c_cont, T10r_cont = get_cont_id(df_T6, df_T10c, df_T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    df_T9c = read_input_file(args["weight_table_clf"])
    df_T9r = read_input_file(args["weight_table_reg"])

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    save_csv_output(output_dir, tag, df_T9c, df_T9r)

    x_matrix, fold_vector, y_matrix_clf, y_matrix_reg, censored_mask = make_matrices(
        df_T11, T10c_cont, T10r_cont, bit_size)
    # Turn NaN entries into 0 and then drop the explicit zeros
    # (presumably y_matrix_clf is a scipy sparse matrix - confirm).
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    save_npy_matrices(
        output_dir,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )
    end = time.time()
    print(f"Formatting to matrices took {end - start:.08} seconds.")
    print("Files are ready for SparseChem.")
示例#8
0
def _prepare_weight_table(weights: DataFrame, task_id_column: str) -> DataFrame:
    """Rename the weight table columns, drop rows without task id and sort by it."""
    renamed = weights.rename(
        columns={
            task_id_column: "task_id",
            "assay_type": "task_type",
            "weight": "training_weight",
        })
    ordered = renamed.dropna(subset=["task_id"]).sort_values("task_id")
    ordered["task_id"] = ordered["task_id"].astype(int)
    return ordered


def save_csv_output(out_dir: Path, tag: str, df_T9c: DataFrame,
                    df_T9r: DataFrame) -> None:
    """
    Wrapper to save the weight csv files to the matrix output folder.

    Both weight tables are always normalised first: the task id, assay type
    and weight columns are renamed to "task_id", "task_type" and
    "training_weight", rows without a task id are dropped, the rows are
    sorted by task id and the id is cast to int. Which files are written
    depends on the tag; any other tag writes nothing.

    Args:
        out_dir (Path): path to matrix output folder
        tag (str): "cls" writes "cls_weights" into the subfolder "cls" and
            "reg_weights" into the subfolder "reg"; "clsaux" writes
            "clsaux_weights" into the subfolder "clsaux".
        df_T9c (DataFrame): classification weight table, with the task id in
            the column "cont_classification_task_id".
        df_T9r (DataFrame): regression weight table, with the task id in the
            column "cont_regression_task_id".
    """
    cls_weights = _prepare_weight_table(df_T9c, "cont_classification_task_id")
    reg_weights = _prepare_weight_table(df_T9r, "cont_regression_task_id")

    if tag == "cls":
        cls_folder = out_dir / "cls"
        cls_folder.mkdir(exist_ok=True)
        save_df_as_csv(
            cls_folder,
            cls_weights,
            "cls_weights",
            ["task_id", "task_type", "training_weight", "aggregation_weight"],
        )

        reg_folder = out_dir / "reg"
        reg_folder.mkdir(exist_ok=True)
        save_df_as_csv(
            reg_folder,
            reg_weights,
            "reg_weights",
            [
                "task_id",
                "task_type",
                "training_weight",
                "aggregation_weight",
                "censored_weight",
            ],
        )
    elif tag == "clsaux":
        clsaux_folder = out_dir / "clsaux"
        clsaux_folder.mkdir(exist_ok=True)
        save_df_as_csv(
            clsaux_folder,
            cls_weights,
            "clsaux_weights",
            ["task_id", "task_type", "training_weight", "aggregation_weight"],
        )
示例#9
0
def do_prepare_training(args):
    """Wrapper to run the entire pipeline for training.

    Runs, in order: config/key loading and consistency check, sanity checks
    of the input tables, structure standardization, descriptor calculation
    with fold assignment (scaffold- or LSH-based), replicate aggregation,
    thresholding, classification and regression filtering, and finally the
    creation of the matrices. Intermediate tables are written as csv files
    along the way.

    Args:
        args (Namespace): Subparser arguments. The keys read directly here
            are "non_interactive", "number_cpu", "tag", "weight_table",
            "activity_file", "structure_file" and "folding_method"; the
            full dict is also passed on to the individual pipeline steps.

    Raises:
        SystemExit: with status 1 if "folding_method" is neither "scaffold"
            nor "lsh".
    """
    start_total = time.time()

    _args = vars(args)
    # Only a literal True enables overwriting.
    overwriting = _args["non_interactive"] is True

    num_cpu = _args["number_cpu"]
    # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    start = time.time()
    tag = _args["tag"]

    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")
    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)

    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)

    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)

    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print("Sanity checks passed.")

    start = time.time()
    print("Start standardizing structures.")

    # Make directories, load input files.
    # The return value of make_dir is not needed: the results path used
    # further down comes from csv_2_mtx.prepare (presumably make_dir creates
    # the results folder - confirm).
    make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)

    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print("Standardization done.")
    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        print("Using scaffold-based fold assignment.")

        output_dir_desc, dt_desc = calculate_descriptors.prepare(
            _args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")

        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed,
                       "T2_descriptors.FAILED")
        del df_smi, df_desc_failed

        print(
            f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print("Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = calculate_scaffold_folds.prepare(
            _args)

        df_fold, df_fold_failed = calculate_scaffold_folds.run(
            df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(output_dir_desc, df_duplicates,
                       "T2_descriptor_vector_id.DUPLICATES")
        del df_duplicates

        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print("Fold calculation done.")

    elif _args["folding_method"] == "lsh":
        print("Using LSH based fold assignment.")
        # Restart the timer here; otherwise the duration reported below
        # would also include the standardization step.
        start = time.time()
        output_dir_lsh, mapping_table_dir, dt_lsh = calculate_lsh_folds.prepare(
            _args, overwriting)

        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh,
                                  "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh,
                                 "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")

        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        del df_duplicates
        end = time.time()
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print("Descriptor calculation and LSH folding done.")
    else:
        print("Please use scaffold or lsh as folding method.")
        # Exit with a failure status. The bare quit() used before is a helper
        # from the site module and reported status 0 for this error.
        raise SystemExit(1)

    start = time.time()

    print("Start aggregating values.")

    output_dir_agg = aggregate_values.prepare(_args, overwriting)

    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5,
        ConfigDict.get_parameters()["credibility_range"], num_cpu)
    df_T4r = df_T4r[[
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]]
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        [
            "input_compound_id", "input_assay_id", "standard_qualifier",
            "standard_value"
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    del df_T5, df_failed_range, df_failed_aggr, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print("Replicate aggregation done.")

    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels: rows without a class label go to the
    # FAILED table, the rest stays in T4c.
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]

    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]

    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")

    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print("Thresholding done.")

    print("Start filter classification data.")
    start = time.time()

    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )

    filter_classification.write_tmp_output(output_dir_filter_clf, T10c, T8c,
                                           T4c_filtered_out, T4c_dedup)

    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print("Classification filtering done.")
    print("Start filter regression data.")
    #####
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)

    T10r, T8r, T4r_filtered_out, T4r_dedup = filter_regression.filter_regression_tasks(
        df_T0_upd,
        df_T4r,
        ConfigDict.get_parameters()["training_quorum"]["regression"],
        ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
        ConfigDict.get_parameters()["initial_task_weights"],
        ConfigDict.get_parameters()["censored_downweighting"],
    )
    filter_regression.write_tmp_output(out_dir_filter_reg, T10r, T8r,
                                       T4r_filtered_out, T4r_dedup)
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print("Filtering regression data done.")

    print("Start creating sparse matrices.")

    start = time.time()
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c

    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    # Turn NaN entries into 0 and then drop the explicit zeros
    # (presumably y_matrix_clf is a scipy sparse matrix - confirm).
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )

    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print("Files are ready for SparseChem.")
示例#10
0
def write_tmp_output(
    out_dir: Path,
    df: pd.DataFrame,
    df_failed_range: pd.DataFrame,
    df_failed_aggr: pd.DataFrame,
    df_failed_std: pd.DataFrame,
    df_dup: pd.DataFrame,
    T0_upd: pd.DataFrame,
) -> None:
    """Save the aggregated activity values and all side tables of the aggregation step.

    Args:
        out_dir (Path): output Path object
        df (DataFrame): dataframe containing aggregated activity data, saved as "T4r"
        df_failed_range (DataFrame): dataframe containing activity data outside the credibility range
        df_failed_aggr (DataFrame): dataframe containing activity data that failed at aggregation step
        df_failed_std (DataFrame): dataframe containing activity data that failed due to low std per task per fold
        df_dup (DataFrame): dataframe saved as "duplicates_T1"
        T0_upd (DataFrame): dataframe with updated T0 info, saved without a column list
    """
    aggregated_columns = [
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]
    range_columns = [
        "input_compound_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
    ]
    # Both failure tables use the same columns; each call receives its own
    # list object built from this tuple.
    failure_columns = (
        "descriptor_vector_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "fold_id",
    )
    duplicate_columns = [
        "input_assay_id",
        "input_compound_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]

    save_df_as_csv(out_dir, df, "T4r", aggregated_columns)
    save_df_as_csv(out_dir, T0_upd, "T0_upd")
    save_df_as_csv(out_dir, df_failed_range, "failed_range_T1", range_columns)
    save_df_as_csv(out_dir, df_failed_aggr, "failed_aggr_T1",
                   list(failure_columns))
    save_df_as_csv(out_dir, df_failed_std, "failed_std_T1",
                   list(failure_columns))
    save_df_as_csv(out_dir, df_dup, "duplicates_T1", duplicate_columns)
def write_tmp_output(
    out_dir: Path,
    T10c: pd.DataFrame,
    T8c: pd.DataFrame,
    T4c_filtered_out: pd.DataFrame,
    T4c_dedup: pd.DataFrame,
) -> None:
    """Save the csv files produced by the classification filtering step.

    Args:
        out_dir (Path): output Path object
        T10c (DataFrame): dataframe saved as "T10c"
        T8c (DataFrame): dataframe saved as "T8c"
        T4c_filtered_out (DataFrame): dataframe containing classification activity data filtered out in training
        T4c_dedup (DataFrame): dataframe containing duplicated classification activity data
    """
    # The filtered-out and the duplicated records use the same columns; each
    # call receives its own list object built from this tuple.
    activity_columns = (
        "classification_task_id",
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    )
    t10c_columns = [
        "cont_classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "class_label",
    ]
    t8c_columns = [
        "cont_classification_task_id",
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "is_auxiliary",
        "use_in_regression",
        "threshold",
        "threshold_method",
        "direction",
        "training_quorum_OK",
        "evaluation_quorum_OK",
        "aggregation_weight",
        "weight",
        "num_total_actives",
        "num_fold_min_actives",
        "num_total_inactives",
        "num_fold_min_inactives",
        "n_tasks",
    ]

    save_df_as_csv(out_dir, T4c_filtered_out, "filtered_out_T4c",
                   list(activity_columns))
    save_df_as_csv(out_dir, T4c_dedup, "duplicates_T4c",
                   list(activity_columns))
    save_df_as_csv(out_dir, T10c, "T10c", t10c_columns)
    save_df_as_csv(out_dir, T8c, "T8c", t8c_columns)
示例#12
0
def write_tmp_output(
    out_dir: Path,
    T10r: pd.DataFrame,
    T8r: pd.DataFrame,
    T4r_filtered_out: pd.DataFrame,
    T4r_dedup: pd.DataFrame,
) -> None:
    """Save the csv files produced by the regression filtering step.

    Args:
        out_dir (Path): output Path object
        T10r (DataFrame): dataframe containing deduplicated regression task data
        T8r (DataFrame): dataframe containing deduplicated regression task definitions and metadata
        T4r_filtered_out (DataFrame): dataframe containing regression activity data filtered out in training
        T4r_dedup (DataFrame): dataframe containing duplicated regression activity data
    """
    # Columns shared by all activity tables written here.
    activity_columns = (
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    )
    # T10r and the duplicates additionally carry the continuous task id.
    activity_with_task_columns = activity_columns + ("cont_regression_task_id",)
    t8r_columns = [
        "cont_regression_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "is_auxiliary",
        "use_in_regression",
        "expert_threshold_1",
        "expert_threshold_2",
        "expert_threshold_3",
        "expert_threshold_4",
        "expert_threshold_5",
        "direction",
        "training_quorum_OK",
        "evaluation_quorum_OK",
        "aggregation_weight",
        "weight",
        "censored_weight",
    ]

    save_df_as_csv(out_dir, T10r, "T10r", list(activity_with_task_columns))
    save_df_as_csv(out_dir, T8r, "T8r", t8r_columns)
    save_df_as_csv(out_dir, T4r_filtered_out, "filtered_out_T4r",
                   list(activity_columns))
    save_df_as_csv(out_dir, T4r_dedup, "duplicates_T4r",
                   list(activity_with_task_columns))