def main(args):
    """Run the classification filtering step end to end.

    Loads config and key, runs the reference-set consistency check, reads the
    classification weight table and activity file, filters them with
    ``filter_clf`` and passes the four results to ``write_tmp_output``.

    Args:
        args (dict): Dictionary of arguments from argparser. Keys read
            directly here: ``non_interactive``,
            ``classification_weight_table`` and
            ``classification_activity_file``.

    Returns:
        None: Nothing is returned; the results go to ``write_tmp_output``.
    """
    t_begin = time.time()

    # Only the literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start classification filtering.")
    output_dir = prepare(args, overwriting)
    weight_table = read_input_file(args["classification_weight_table"])
    activity_table = read_input_file(args["classification_activity_file"])

    # Each setting is looked up separately, in the order it is passed on.
    training_quorum = ConfigDict.get_parameters()["training_quorum"][
        "classification"]
    evaluation_quorum = ConfigDict.get_parameters()["evaluation_quorum"][
        "classification"]
    initial_task_weights = ConfigDict.get_parameters()["initial_task_weights"]

    T10c, T8c, filtered_out, deduplicated = filter_clf(
        weight_table,
        activity_table,
        training_quorum,
        evaluation_quorum,
        initial_task_weights,
    )
    write_tmp_output(output_dir, T10c, T8c, filtered_out, deduplicated)

    elapsed = time.time() - t_begin
    print(f"Classification filtering took {elapsed:.08} seconds.")
    print("Classification filtering done.")
Exemplo n.º 2
0
def main(args: dict = None):
    """
    Build and save the sparse matrices from the structure, activity and
    weight tables.

    Args:
        args (dict): Argument dictionary. When ``None`` it is taken from
            ``init_arg_parser()``. Keys read directly here:
            ``non_interactive``, ``tag``, ``structure_file``,
            ``activity_file_clf``, ``activity_file_reg``,
            ``weight_table_clf`` and ``weight_table_reg``.

    Returns:
        None: Results are written through the ``save_*`` helpers. If ``tag``
        is neither ``cls`` nor ``clsaux`` a message is printed and the
        process exits via ``exit()``.
    """
    t_begin = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Only the literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Generate sparse matrices from given dataframes.")
    fingerprint_settings = (
        melloddy_tuner.utils.config.parameters.get_parameters()["fingerprint"])
    bit_size = fingerprint_settings["fold_size"]
    output_dir, results_dir = prepare(args, overwriting)

    tag = args["tag"]
    if tag not in ("cls", "clsaux"):
        print("Please choose a different tag. Only cls or clsaux are allowed.")
        exit()

    structures = read_input_file(args["structure_file"])
    activities_clf = read_input_file(args["activity_file_clf"])
    activities_reg = read_input_file(args["activity_file_reg"])
    df_T6_cont, T10c_cont, T10r_cont = get_cont_id(
        structures, activities_clf, activities_reg)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    weights_clf = read_input_file(args["weight_table_clf"])
    weights_reg = read_input_file(args["weight_table_reg"])

    # Persist the continuous-id tables before the matrices are built.
    for frame, label in (
        (T10c_cont, "T10c_cont"),
        (T10r_cont, "T10r_cont"),
        (df_T6_cont, "T6_cont"),
    ):
        save_df_as_csv(results_dir, frame, label)

    save_csv_output(output_dir, tag, weights_clf, weights_reg)

    matrices = make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    x_matrix, fold_vector, y_matrix_clf, y_matrix_reg, censored_mask = matrices

    # Clean the stored classification values in place (NaN handling by
    # np.nan_to_num), then drop entries that are explicitly zero.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    save_npy_matrices(
        output_dir,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )

    elapsed = time.time() - t_begin
    print(f"Formatting to matrices took {elapsed:.08} seconds.")
    print("Files are ready for SparseChem.")
Exemplo n.º 3
0
def main(args):
    """Run the replicate aggregation step end to end.

    Loads config and key, runs the reference-set consistency check, performs
    the sanity checks on T0/T1, aggregates replicates and passes the six
    results to ``write_tmp_output``.

    Args:
        args (dict): Dictionary of arguments from argparser. Keys read
            directly here: ``non_interactive``, ``assay_file``,
            ``activity_file``, ``mapping_table`` and ``number_cpu``.

    Returns:
        None: Nothing is returned; the results go to ``write_tmp_output``.
    """
    t_begin = time.time()

    # Only the literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start aggregation.")
    output_dir = prepare(args, overwriting)
    assay_path = args["assay_file"]
    T0 = read_input_file(assay_path)
    T1 = read_input_file(args["activity_file"])

    print("Check assay types in T0.")
    sanity_check_assay_type(T0)

    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(T0, T1)

    print("Check uniqueness of T0.")
    sanity_check_uniqueness(T0, colname="input_assay_id", filename=assay_path)

    # The timer started at function entry, so this figure also covers the
    # config/key loading and file reading above.
    print(f"Sanity checks took {time.time() - t_begin:.08} seconds.")
    print("Sanity checks passed.")

    T5 = read_input_file(args["mapping_table"])
    credibility_range = ConfigDict.get_parameters()["credibility_range"]
    (
        aggregated,
        failed_range,
        failed_aggr,
        failed_std,
        duplicates,
        T0_updated,
    ) = aggregate_replicates(T0, T1, T5, credibility_range, args["number_cpu"])

    write_tmp_output(
        output_dir,
        aggregated,
        failed_range,
        failed_aggr,
        failed_std,
        duplicates,
        T0_updated,
    )

    print(f"Replicate aggregation took {time.time() - t_begin:.08} seconds.")
    print("Replicate aggregation done.")
def main(args: dict = None):
    """
    Format the activity data and write the temporary, mapping and result
    tables.

    Args:
        args (dict): Argument dictionary. When ``None`` it is taken from
            ``init_arg_parser()``. Keys read directly here:
            ``non_interactive``, ``activity_file``, ``weight_table`` and
            ``dir_mapping_tables``.

    Returns:
        None: All results are written through the ``write_*`` helpers.
    """
    t_begin = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Only the literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start activity data formatting.")
    output_dir, mapping_table_dir = prepare(args, overwriting)
    results_dir = make_results_dir(args, overwriting)

    # Inputs: activity data, weight table and the mapping tables (T5, T6, T10).
    activity_df = read_input_file(args["activity_file"])
    weight_df = read_input_file(args["weight_table"])
    map_T5, map_T6, map_T10 = load_mapping_tables(args["dir_mapping_tables"])

    # Process-wide pandas option: chained assignment raises from here on.
    pd.options.mode.chained_assignment = "raise"

    formatted_df = do_actvity_formattting(activity_df, map_T5, map_T10)

    failed, duplicated_pairs, excluded = output_tmp_results(formatted_df)
    write_tmp_output(output_dir, failed, duplicated_pairs, excluded)
    # Drop the temporary frames before the final tables are built.
    del failed, duplicated_pairs, excluded

    df_T11, df_T10, df_T3_mapped = output_results(
        formatted_df, weight_df, map_T6)
    write_mappting_tables(mapping_table_dir, df_T3_mapped)
    write_output(results_dir, df_T11, df_T10)
    del formatted_df, df_T11, df_T10, df_T3_mapped

    elapsed = time.time() - t_begin
    print(f"Formatting of activity data took {elapsed:.08} seconds.")
    print("Activity data processing done.")
Exemplo n.º 5
0
def do_prepare_prediction(args):
    """Wrapper to run the preparation pipeline for prediction.

    Standardizes the structures, calculates descriptors, assigns a
    ``descriptor_vector_id`` per ``input_compound_id`` group, maps it to a
    continuous id and saves the resulting table (``T11_pred``) and feature
    matrix (``pred_x``).

    Args:
        args (Namespace): Subparser arguments. Entries read directly here:
            ``non_interactive``, ``number_cpu`` and ``structure_file``.
    """
    t_begin = time.time()
    _args = vars(args)

    # Only the literal ``True`` switches overwriting on.
    overwriting = _args["non_interactive"] is True

    # Looked up as in the original flow; the value is not used further here.
    num_cpu = _args["number_cpu"]

    # Load parameters and key, then fetch the fingerprint fold size.
    load_config(_args)
    load_key(_args)
    fingerprint_settings = (
        melloddy_tuner.utils.config.parameters.get_parameters()["fingerprint"])
    bit_size = fingerprint_settings["fold_size"]

    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)

    print("Prepare for prediction.")
    structures = read_input_file(_args["structure_file"])

    # Standardization; the output directory and failed rows are not used here.
    _std_dir, dt_std = standardize_smiles.prepare(_args)
    df_smi, _smi_failed = standardize_smiles.run(structures, dt_std)

    # Descriptors; the output directory and failed rows are not used here.
    _desc_dir, dt_desc = calculate_descriptors.prepare(_args, overwriting)
    df_desc, _desc_failed = calculate_descriptors.run(df_smi, dt_desc)

    # Number the compound groups starting at 1; a group number of -1 is
    # turned into NaN before the shift.
    df_desc_c = df_desc.copy()
    group_numbers = df_desc_c.groupby("input_compound_id").ngroup()
    df_desc_c.loc[:, "descriptor_vector_id"] = (
        group_numbers.replace(-1, np.nan).add(1))
    df_T6 = df_desc_c[["descriptor_vector_id", "fp_feat", "fp_val"]]

    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    mapped = map_2_cont_id(df_T6, "descriptor_vector_id")
    df_T11 = mapped.sort_values("cont_descriptor_vector_id")

    save_df_as_csv(results_dir, df_T11, "T11_pred")
    x_matrix = csv_2_mtx.matrix_from_strucutres(df_T11, bit_size)
    save_mtx_as_npy(x_matrix, out_dir_matrices, "pred_x")

    print(f"Preparation took {time.time() - t_begin:.08} seconds.")
    print("Prediction preparation done.")
Exemplo n.º 6
0
def main(args: dict = None):
    """
    Process the reference set through the pipeline, write the results and
    compare the hash of the output with the expected one.

    Args:
        args (dict): Argument dictionary. When ``None`` it is taken from
            ``init_arg_parser()``. If ``reference_set`` is missing or
            ``None``, the default reference file under
            ``unit_test/reference_files/`` (relative to this module) is used.

    Returns:
        None: Tables go to ``write_output``; the hash steps are delegated to
        ``hash_reference_dir`` and ``compare_hash_keys``.
    """
    t_begin = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # A missing key and an explicit ``None`` both select the default file.
    if args.get("reference_set") is None:
        print(
            "Default reference files from unit_test/reference_files/ loaded.")
        module_dir = os.path.dirname(os.path.realpath(__file__))
        path_structure_file = os.path.join(
            module_dir, "../../unit_test/reference_files/reference_set.csv")
    else:
        path_structure_file = args["reference_set"]

    output_dir, dt_standarizer, dt_fold, dt_descriptor = prepare(args)
    reference_df = read_input_file(path_structure_file)

    # Each stage yields a (processed, failed) pair and feeds the next stage.
    ref_smi, ref_smi_failed = dt_standarizer.process_dataframe(reference_df)
    ref_fold, ref_fold_failed = dt_fold.process_dataframe(ref_smi)
    ref_desc, ref_desc_failed = dt_descriptor.process_dataframe(ref_fold)
    ref_desc_grouped, ref_desc_dupl = calculate_descriptors.format_dataframe(
        ref_desc)

    t5_columns = ["input_compound_id", "descriptor_vector_id"]
    t6_columns = ["descriptor_vector_id", "fp_feat", "fp_val", "fold_id"]
    ref_T5 = ref_desc_grouped[t5_columns]
    ref_T6 = ref_desc_grouped[t6_columns]
    ref_T11 = map_2_cont_id(ref_T6, "descriptor_vector_id")
    ref_T11 = ref_T11.sort_values("cont_descriptor_vector_id")

    # NOTE(review): the key "T2_desciptors.FAILED" is misspelt in the
    # original and kept byte-for-byte; presumably it names an output that
    # the hash comparison depends on - confirm before renaming.
    dict_df = {
        "T2_standardized": ref_smi,
        "T2_standardized.FAILED": ref_smi_failed,
        "T2_folds": ref_fold,
        "T2_folds.FAILED": ref_fold_failed,
        "T2_descriptors": ref_desc,
        "T2_desciptors.FAILED": ref_desc_failed,
        "T2_descriptors.DUPLICATES": ref_desc_dupl,
        "T5": ref_T5,
        "T6": ref_T6,
        "T11": ref_T11,
    }
    write_output(output_dir, dict_df)
    hash_reference_dir(args, output_dir)
    compare_hash_keys(args, output_dir)

    elapsed = time.time() - t_begin
    print(f"Hashing reference data finished after {elapsed:.08} seconds.")
def load_input_file(path: str):
    """Read a single input file after checking that it exists.

    Args:
        path (str): Path to the file to load.

    Returns:
        The object returned by ``read_input_file`` for the given path.

    Raises:
        SystemExit: With exit status 1 when ``path`` does not point to an
            existing regular file. A message is printed to stdout first.
    """
    T2_file = Path(path)
    if not T2_file.is_file():
        print("Given file does not exist.")
        # Signal the failure with a non-zero status. The interactive
        # ``quit()`` helper exits with status 0 and is only available when
        # the ``site`` module has been loaded.
        raise SystemExit(1)

    return read_input_file(T2_file)
Exemplo n.º 8
0
def do_prepare_training(args):
    """Wrapper to run the entire pipeline for training.

    The steps run in this order: config/key loading and consistency check,
    input sanity checks, structure standardization, descriptor calculation
    and fold assignment (scaffold or LSH), replicate aggregation,
    thresholding, classification filtering, regression filtering and
    sparse-matrix creation. Tables are saved after each step, and large
    intermediates are deleted once they are no longer needed.

    Args:
        args (Namespace): Subparser arguments. Entries read directly in this
            function: ``non_interactive``, ``number_cpu``, ``tag``,
            ``weight_table``, ``activity_file``, ``structure_file`` and
            ``folding_method``.

    Returns:
        None. If ``folding_method`` is neither ``scaffold`` nor ``lsh`` a
        message is printed and the process exits via ``quit()``.
    """
    # Reference point for the overall timing printed at the very end.
    start_total = time.time()

    # NOTE(review): this value is overwritten below before it is ever read.
    start = time.time()
    _args = vars(args)
    # Only the literal True switches overwriting on.
    if _args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False

    num_cpu = _args["number_cpu"]
    # Load parameters and key, then fetch the fingerprint fold size.
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    # Timer for reading the inputs and running the sanity checks.
    start = time.time()
    tag = _args["tag"]

    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")
    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)

    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)

    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)

    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print(f"Sanity checks passed.")

    # Timer for the standardization step.
    start = time.time()
    print("Start standardizing structures.")

    # Make directories, load input files
    # NOTE(review): results_dir is reassigned by csv_2_mtx.prepare further
    # down before it is read; presumably this call is kept for creating the
    # directory - confirm.
    results_dir = make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)

    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print(f"Standardization done.")
    # Placeholders; both are replaced in whichever folding branch runs.
    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        print("Using scaffold-based fold assignment.")

        output_dir_desc, dt_desc = calculate_descriptors.prepare(
            _args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")

        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed,
                       "T2_descriptors.FAILED")
        del df_smi, df_desc_failed

        print(
            f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print(f"Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = calculate_scaffold_folds.prepare(
            _args)

        df_fold, df_fold_failed = calculate_scaffold_folds.run(
            df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc
        # Split the fold table into the T5/T6 mapping tables plus duplicates.
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(output_dir_desc, df_duplicates,
                       "T2_descriptor_vector_id.DUPLICATES")
        del df_duplicates

        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print(f"Fold calculation done.")

    elif _args["folding_method"] == "lsh":
        print("Using LSH based fold assignment.")
        output_dir_lsh, mapping_table_dir, dt_lsh = calculate_lsh_folds.prepare(
            _args, overwriting)

        # This branch writes its CSV files directly with DataFrame.to_csv.
        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh,
                                  "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh,
                                 "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")

        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        del df_duplicates
        end = time.time()
        # NOTE(review): `start` was last reset before standardization, so the
        # figure printed here also includes the standardization time.
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print(f"Descriptor calculation and LSH folding done.")
    else:
        # Unknown folding method: reached only after standardization has
        # already run and its output has been saved.
        print("Please use scaffold or lsh as folding method.")
        quit()

    # Timer for the replicate aggregation step.
    start = time.time()

    print("Start aggregating values.")

    output_dir_agg = aggregate_values.prepare(_args, overwriting)

    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5,
        ConfigDict.get_parameters()["credibility_range"], num_cpu)
    # Keep only these five columns of T4r for the following steps.
    df_T4r = df_T4r[[
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]]
    # Each aggregation output is saved with an explicit column list.
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        [
            "input_compound_id", "input_assay_id", "standard_qualifier",
            "standard_value"
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    # df_T0_upd and df_T4r are still needed below; df_failed_std is not part
    # of this cleanup.
    del df_T5, df_failed_range, df_failed_aggr, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print(f"Replicate aggregation done.")

    # Timer for the thresholding step.
    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    # Cast input_assay_id to int in both tables before thresholding.
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Columns (and their order) kept in the final T3c and T4c tables
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Rows without a class label go to the FAILED table; the rest stay in T4c
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]

    # The FAILED table keeps all of its columns; T4c and T3c are restricted.
    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]

    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")

    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print(f"Thresholding done.")

    print("Start filter classification data.")
    start = time.time()

    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )

    filter_classification.write_tmp_output(output_dir_filter_clf, T10c, T8c,
                                           T4c_filtered_out, T4c_dedup)

    # T10c and T8c are kept for the matrix step below.
    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print(f"Classification filtering done.")
    print("Start filter regression data.")
    # Timer for the regression filtering step.
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)

    T10r, T8r, T4r_filtered_out, T4r_dedup = filter_regression.filter_regression_tasks(
        df_T0_upd,
        df_T4r,
        ConfigDict.get_parameters()["training_quorum"]["regression"],
        ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
        ConfigDict.get_parameters()["initial_task_weights"],
        ConfigDict.get_parameters()["censored_downweighting"],
    )
    filter_regression.write_tmp_output(out_dir_filter_reg, T10r, T8r,
                                       T4r_filtered_out, T4r_dedup)
    # df_T0 was last used in the aggregation call; it is released here
    # together with the regression intermediates.
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print(f"Filtering regression data done.")

    print("Start creating sparse matrices.")

    start = time.time()
    # results_dir is (re)assigned here and used for the *_cont tables below.
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c

    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    # Clean the stored classification values in place (NaN handling by
    # np.nan_to_num), then drop entries that are explicitly zero.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )

    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    # Overall duration measured from function entry.
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print(f"Files are ready for SparseChem.")
Exemplo n.º 9
0
def main(args):
    """Run the thresholding step end to end.

    Loads config and key, runs the reference-set consistency check, reads
    the assay and activity files, calls ``run`` and writes the resulting
    T4c/T3c tables plus the rows without a class label.

    Args:
        args (dict): Dictionary of arguments from argparser. Keys read
            directly here: ``non_interactive``, ``number_cpu``,
            ``assay_file`` and ``activity_file``.

    Returns:
        None: Nothing is returned; the tables go to ``write_failed_output``
        and ``write_tmp_output``.
    """
    t_begin = time.time()

    # Only the literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    num_cpu = args["number_cpu"]
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    print("Start thresholding.")

    # Load files
    output_dir = prepare(args, overwriting)
    assays = read_input_file(args["assay_file"])
    activities = read_input_file(args["activity_file"])

    df_T4c, df_T3c = run(assays, activities, num_cpu)

    # Column selections handed to the writer helpers.
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    for table in (df_T4c, df_T3c):
        table.sort_values("classification_task_id", inplace=True)

    # Rows without a class label are split off into the failed table.
    label_missing = df_T4c.class_label.isna()
    df_T4c_failed = df_T4c[label_missing]
    df_T4c = df_T4c[~label_missing]

    write_failed_output(output_dir, df_T4c_failed, columns_T4c)
    write_tmp_output(output_dir, df_T4c, df_T3c, columns_T4c, columns_T3c)

    print(f"Thresholding took {time.time() - t_begin:.08} seconds.")
    print("Thresholding done.")
Exemplo n.º 10
0
    def test_aggregation(self):
        """Compare ``aggregate_replicates`` output with the stored reference CSVs."""
        input_dir = os.path.join(curDir, "input", "test_aggr")
        expected_dir = os.path.join(curDir, "output", "test_aggr")

        T0 = read_input_file(os.path.join(input_dir, "T0.csv"))
        T1 = read_input_file(os.path.join(input_dir, "T1.csv"))
        T5 = read_input_file(os.path.join(input_dir, "T5.csv"))

        (
            aggregated,
            failed_range,
            failed_aggr,
            failed_std,
            duplicates,
            t0upd,
        ) = aggregate_replicates(T0, T1, T5, self.config["credibility_range"],
                                 1)

        # Column selections, in the order used by the reference files.
        t4r_columns = [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ]
        failed_range_columns = [
            "input_compound_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
        ]
        failed_columns = [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ]
        duplicate_columns = [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ]

        T4r = aggregated[t4r_columns].reset_index(drop=True)
        failed_range = failed_range[failed_range_columns].reset_index(
            drop=True)
        failed_aggr = failed_aggr[failed_columns].reset_index(drop=True)
        failed_std = failed_std[failed_columns].reset_index(drop=True)
        duplicates = duplicates[duplicate_columns].reset_index(drop=True)

        # Expected tables
        duplicates_exp = read_input_file(
            os.path.join(expected_dir, "duplicates_T1.csv"))
        failed_aggr_exp = read_input_file(
            os.path.join(expected_dir, "failed_aggr_T1.csv"))
        failed_range_exp = read_input_file(
            os.path.join(expected_dir, "failed_range_T1.csv"))
        failed_std_exp = read_input_file(
            os.path.join(expected_dir, "failed_std_T1.csv"))
        T4r_exp = read_input_file(os.path.join(expected_dir, "T4r.csv"))

        pd_testing.assert_frame_equal(T4r, T4r_exp)
        pd_testing.assert_frame_equal(failed_range, failed_range_exp)
        pd_testing.assert_frame_equal(failed_std, failed_std_exp)
        # The failed-aggregation frames are loaded and column-selected but
        # not compared: they differ in column type (object vs float64).
        pd_testing.assert_frame_equal(duplicates, duplicates_exp)