def main(args):
    """Wrapper for the classification-task filtering step.

    Reads the classification weight table (T3c) and activity file (T4c),
    applies the training/evaluation quorum filters, and writes the
    temporary outputs (T10c, T8c, filtered-out and deduplicated rows).

    Args:
        args (dict): Dictionary of arguments from argparser
    """
    start = time.time()

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    print("Start classification filtering.")
    output_dir = prepare(args, overwriting)

    weight_table = read_input_file(args["classification_weight_table"])
    activity_data = read_input_file(args["classification_activity_file"])
    params = ConfigDict.get_parameters()
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_clf(
        weight_table,
        activity_data,
        params["training_quorum"]["classification"],
        params["evaluation_quorum"]["classification"],
        params["initial_task_weights"],
    )
    write_tmp_output(output_dir, T10c, T8c, T4c_filtered_out, T4c_dedup)

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print("Classification filtering done.")
def prepare(args: dict, overwriting: bool):
    """Setup run by creating directories and log files.

    Args:
        args (dict): argparser arguments
        overwriting (bool): overwriting flag

    Returns:
        Tuple(Path, Path, DfTransformer): paths to the LSH output and
            mapping_table subdirectories, plus the configured DfTransformer
            that computes fingerprints and LSH fold assignments.
    """
    output_dir_lsh = make_dir(args, "results_tmp", "lsh_folding", overwriting)
    mapping_table_dir = make_dir(args, "mapping_table", None, overwriting)
    create_log_files(output_dir_lsh)
    create_log_files(mapping_table_dir)
    load_config(args)
    load_key(args)
    # Merge fingerprint and LSH settings into a single parameter dict;
    # LSH keys win on collision because they are unpacked last.
    method_params_fp = ConfigDict.get_parameters()["fingerprint"]
    method_params_lsh = ConfigDict.get_parameters()["lsh"]
    method_params = {**method_params_fp, **method_params_lsh}
    key = SecretDict.get_secrets()["key"]
    lshf = LSHFoldingCalculator.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0)
    # Output schema of the transformer: descriptor features/values, the
    # assigned fold, plus per-row success flag and error message.
    outcols = ["fp_feat", "fp_val", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "object", "bool", "object"]
    dt = DfTransformer(
        lshf,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir_lsh, mapping_table_dir, dt
# --- Exemplo n.º 3 (score: 0) ---
def main(args: dict = None):
    """Read input files, build sparse matrices and write SparseChem-ready output.

    Args:
        args (dict, optional): argparser arguments; parsed from the command
            line when None.

    Raises:
        SystemExit: with a non-zero status when the tag is not cls/clsaux.
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Generate sparse matrices from given dataframes.")
    fp_param = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]
    bit_size = fp_param["fold_size"]
    output_dir, results_dir = prepare(args, overwriting)
    tag = args["tag"]

    if tag not in ("cls", "clsaux"):
        print("Please choose a different tag. Only cls or clsaux are allowed.")
        # BUG FIX: the original called exit(), which is the interactive
        # site-module helper and terminates with status 0 (success) for a
        # failed run; raise SystemExit(1) to signal the error properly.
        raise SystemExit(1)
    df_T6 = read_input_file(args["structure_file"])
    df_T10c = read_input_file(args["activity_file_clf"])
    df_T10r = read_input_file(args["activity_file_reg"])
    # Re-index the three tables with shared continuous ids.
    df_T6_cont, T10c_cont, T10r_cont = get_cont_id(df_T6, df_T10c, df_T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    df_T9c = read_input_file(args["weight_table_clf"])
    df_T9r = read_input_file(args["weight_table_reg"])

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    save_csv_output(output_dir, tag, df_T9c, df_T9r)

    x_matrix, fold_vector, y_matrix_clf, y_matrix_reg, censored_mask = make_matrices(
        df_T11, T10c_cont, T10r_cont, bit_size)
    # NaN class labels become 0 and are then dropped from the sparse storage.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    save_npy_matrices(
        output_dir,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )
    end = time.time()
    print(f"Formatting to matrices took {end - start:.08} seconds.")
    print("Files are ready for SparseChem.")
def main(args: dict = None):
    """Compute descriptors and LSH-based fold assignments for input structures.

    Args:
        args (dict, optional): argparser dict; parsed from the command line
            when None.
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True
    num_cpu = args["number_cpu"]
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    print("Start calculating descriptors and assign LSH folds.")
    output_dir_lsh, mapping_table_dir, dt = prepare(args, overwriting)

    out_path = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
    failed_path = os.path.join(output_dir_lsh,
                               "T2_descriptors_lsh.FAILED.csv")
    dupl_path = os.path.join(output_dir_lsh,
                             "T2_descriptors_lsh.DUPLICATES.csv")
    t5_path = os.path.join(mapping_table_dir, "T5.csv")
    t6_path = os.path.join(mapping_table_dir, "T6.csv")

    structures = pd.read_csv(args["structure_file"])
    df_ok, df_failed = dt.process_dataframe(structures)
    df_ok.to_csv(out_path, index=False)
    df_failed.to_csv(failed_path, index=False)

    df_grouped, df_desc_dupl = format_dataframe(df_ok)
    # Attach the deduplicated descriptor_vector_id back onto every input
    # compound via its (fp_feat, fp_val, fold_id) triple.
    df_T5 = pd.merge(
        df_ok[["input_compound_id", "fp_feat", "fp_val", "fold_id"]],
        df_grouped[["fp_feat", "fp_val", "descriptor_vector_id", "fold_id"]],
        on=["fp_feat", "fp_val", "fold_id"],
        how="left",
    )[["input_compound_id", "fold_id",
       "descriptor_vector_id"]].reset_index(drop=True)
    df_T6 = df_grouped[[
        "descriptor_vector_id", "fp_feat", "fp_val", "fold_id"
    ]]
    df_desc_dupl.to_csv(dupl_path, index=False)
    df_T5.to_csv(t5_path, index=False)
    df_T6.to_csv(t6_path, index=False)

    print(
        f"Fingerprint calculation and LSH folding took {time.time() - start:.08} seconds."
    )
    print("Descriptor calculation and LSH folding done.")
# --- Exemplo n.º 5 (score: 0) ---
def main(args):
    """General wrapper function for replicate aggregation.

    Runs sanity checks on T0/T1, aggregates replicate measurements, and
    writes the temporary output files.

    Args:
        args (dict): Dictionary of arguments from argparser
    """
    start = time.time()

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    print("Start aggregation.")
    output_dir = prepare(args, overwriting)

    assay_df = read_input_file(args["assay_file"])
    activity_df = read_input_file(args["activity_file"])

    print("Check assay types in T0.")
    sanity_check_assay_type(assay_df)
    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(assay_df, activity_df)
    print("Check uniqueness of T0.")
    sanity_check_uniqueness(assay_df, colname="input_assay_id",
                            filename=args["assay_file"])
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print("Sanity checks passed.")

    mapping_df = read_input_file(args["mapping_table"])
    # results = (df_aggr, df_failed_range, df_failed_aggr, df_failed_std,
    #            df_dup, T0_upd)
    results = aggregate_replicates(
        assay_df,
        activity_df,
        mapping_df,
        ConfigDict.get_parameters()["credibility_range"],
        args["number_cpu"],
    )
    write_tmp_output(output_dir, *results)

    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print("Replicate aggregation done.")
# --- Exemplo n.º 6 (score: 0) ---
def prepare(args):
    """Create the reference-set output directory and the processing transformers.

    Args:
        args (dict): argparser arguments

    Returns:
        Tuple(Path, DfTransformer, DfTransformer, DfTransformer): output
            directory plus the standardizer, scaffold-fold-assignment and
            descriptor-calculation transformers.
    """
    # The reference set is always (re)built, so overwriting is forced on.
    overwriting = True

    load_config(args)
    load_key(args)
    output_dir = make_dir(args, "reference_set", None, overwriting)
    key = SecretDict.get_secrets()["key"]
    params = ConfigDict.get_parameters()

    def _transformer(calculator, in_col, out_cols, out_types):
        # All reference-set transformers share the same single-process,
        # quiet DfTransformer configuration.
        return DfTransformer(
            calculator,
            input_columns={in_col: "smiles"},
            output_columns=out_cols,
            output_types=out_types,
            success_column="success",
            nproc=1,
            verbosity=0,
        )

    st = Standardizer.from_param_dict(
        method_param_dict=params["standardization"], verbosity=0)
    dt_standarizer = _transformer(
        st,
        "smiles",
        ["canonical_smiles", "success", "error_message"],
        ["object", "bool", "object"],
    )

    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=params["scaffold_folding"], verbosity=0)
    dt_fold = _transformer(
        sa,
        "canonical_smiles",
        ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"],
        ["object", "object", "int", "bool", "object"],
    )

    dc = DescriptorCalculator.from_param_dict(
        secret=key, method_param_dict=params["fingerprint"], verbosity=0)
    dt_descriptor = _transformer(
        dc,
        "canonical_smiles",
        ["fp_feat", "fp_val", "success", "error_message"],
        ["object", "object", "bool", "object"],
    )

    return output_dir, dt_standarizer, dt_fold, dt_descriptor
# --- Exemplo n.º 7 (score: 0) ---
def do_prepare_prediction(args):
    """Run the full preparation pipeline for prediction input.

    Standardizes structures, computes descriptors, assigns continuous
    descriptor ids, and writes the prediction feature matrix.

    Args:
        args (Namespace): Subparser arguments
    """
    start = time.time()
    _args = vars(args)
    # Non-interactive runs always overwrite existing output.
    overwriting = _args["non_interactive"] is True

    num_cpu = _args["number_cpu"]
    # Load parameters and encryption key.
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]

    # Consistency check of config and key files.
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)

    print("Prepare for prediction.")

    df = read_input_file(_args["structure_file"])
    # Standardize structures, then compute descriptors on the survivors.
    output_dir_std, dt_std = standardize_smiles.prepare(_args)
    df_smi, df_smi_failed = standardize_smiles.run(df, dt_std)
    output_dir_desc, dt_desc = calculate_descriptors.prepare(
        _args, overwriting)
    df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

    # Assign a 1-based descriptor_vector_id per input compound group.
    df_desc_c = df_desc.copy()
    df_desc_c.loc[:, "descriptor_vector_id"] = (
        df_desc_c.groupby("input_compound_id").ngroup().replace(-1,
                                                                np.nan).add(1))
    df_T6 = df_desc_c[["descriptor_vector_id", "fp_feat", "fp_val"]]
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    df_T11 = map_2_cont_id(
        df_T6, "descriptor_vector_id").sort_values("cont_descriptor_vector_id")

    save_df_as_csv(results_dir, df_T11, "T11_pred")
    x_matrix = csv_2_mtx.matrix_from_strucutres(df_T11, bit_size)
    save_mtx_as_npy(x_matrix, out_dir_matrices, "pred_x")
    print(f"Preparation took {time.time() - start:.08} seconds.")
    print("Prediction preparation done.")
def main(args: dict = None):
    """Read input files, format activity data and write output files.

    Args:
        args (dict, optional): argparser dict; parsed from the command line
            when None.
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start activity data formatting.")
    output_dir, mapping_table_dir = prepare(args, overwriting)
    results_dir = make_results_dir(args, overwriting)

    # Load activity data (T4), weight table (T3) and mapping tables
    # (T5, T6, T10).
    df_activity_data = read_input_file(args["activity_file"])
    df_weight_table = read_input_file(args["weight_table"])
    mapping_table_T5, mapping_table_T6, mapping_table_T10 = load_mapping_tables(
        args["dir_mapping_tables"])

    # Fail loudly on pandas chained-assignment issues during formatting.
    pd.options.mode.chained_assignment = "raise"

    df_activity_data_formatted = do_actvity_formattting(
        df_activity_data, mapping_table_T5, mapping_table_T10)

    data_failed, data_duplicated_id_pairs, data_excluded = output_tmp_results(
        df_activity_data_formatted)
    write_tmp_output(output_dir, data_failed, data_duplicated_id_pairs,
                     data_excluded)
    del data_failed, data_duplicated_id_pairs, data_excluded

    df_T11, df_T10, df_T3_mapped = output_results(df_activity_data_formatted,
                                                  df_weight_table,
                                                  mapping_table_T6)
    write_mappting_tables(mapping_table_dir, df_T3_mapped)
    write_output(results_dir, df_T11, df_T10)
    del df_activity_data_formatted, df_T11, df_T10, df_T3_mapped

    print(f"Formatting of activity data took {time.time() - start:.08} seconds.")
    print("Activity data processing done.")
# --- Exemplo n.º 9 (score: 0) ---
def main(args: dict = None):
    """Compute descriptors for the input structures and write the results.

    Args:
        args (dict, optional): argparser dict; parsed from the command line
            when None.
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    output_dir, dt = prepare(args, overwriting)

    print("Start calculating descriptors.")

    structures = pd.read_csv(args["structure_file"])
    df_processed, df_failed = dt.process_dataframe(structures)
    # Successful rows and failures are written to separate CSV files.
    df_processed.to_csv(os.path.join(output_dir, "T2_descriptors.csv"),
                        index=False)
    df_failed.to_csv(os.path.join(output_dir, "T2_descriptors.FAILED.csv"),
                     index=False)

    print(f"Fingerprint calculation took {time.time() - start:.08} seconds.")
    print("Descriptor calculation done.")
# --- Exemplo n.º 10 (score: 0) ---
def prepare(args):
    """
    Prepare output directories and instantiate a df transformer object for
    scaffold-based folding.

    Args:
        args (dict): argparser arguments

    Returns:
        Tuple(Path, Path, DfTransformer): paths to the folding output and
            mapping_table directories, plus the instantiated DfTransformer
            for scaffold folding.
    """
    # Overwrite behaviour follows the non-interactive flag directly.
    output_dir = make_dir(args, "results_tmp", "folding", args["non_interactive"])
    mapping_table_dir = make_dir(args, "mapping_table", None, args["non_interactive"])

    create_log_files(output_dir)
    create_log_files(mapping_table_dir)

    load_config(args)
    load_key(args)
    key = SecretDict.get_secrets()["key"]
    method_params = ConfigDict.get_parameters()["scaffold_folding"]
    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0
    )
    # Output schema: Murcko scaffold, scaffold-network smiles, assigned fold,
    # plus per-row success flag and error message.
    outcols = ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "int", "bool", "object"]
    dt = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir, mapping_table_dir, dt
# --- Exemplo n.º 11 (score: 0) ---
def do_prepare_training(args):
    """Wrapper to run the entire data-preparation pipeline for training.

    Steps: sanity checks of T0/T1/T2, structure standardization, descriptor
    calculation, fold assignment (scaffold- or LSH-based), replicate
    aggregation, thresholding, classification/regression filtering, and
    sparse-matrix creation for SparseChem.

    Args:
        args (Namespace): Subparser arguments
    """
    start_total = time.time()

    start = time.time()
    _args = vars(args)
    # Non-interactive runs always overwrite existing output.
    if _args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False

    num_cpu = _args["number_cpu"]
    # # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    start = time.time()
    tag = _args["tag"]

    # --- Load input tables and run sanity checks ---
    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")
    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)

    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)

    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)

    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print(f"Sanity checks passed.")

    # --- Standardize structures ---
    start = time.time()
    print("Start standardizing structures.")

    # Make directories, load input files
    results_dir = make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)

    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print(f"Standardization done.")
    # df_T5/df_T6 are populated by whichever folding branch runs below.
    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        # --- Scaffold-based folding: descriptors first, then fold assignment ---
        print("Using scaffold-based fold assignment.")

        output_dir_desc, dt_desc = calculate_descriptors.prepare(
            _args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")

        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed,
                       "T2_descriptors.FAILED")
        del df_smi, df_desc_failed

        print(
            f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print(f"Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = calculate_scaffold_folds.prepare(
            _args)

        df_fold, df_fold_failed = calculate_scaffold_folds.run(
            df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(output_dir_desc, df_duplicates,
                       "T2_descriptor_vector_id.DUPLICATES")
        del df_duplicates

        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print(f"Fold calculation done.")

    elif _args["folding_method"] == "lsh":
        # --- LSH-based folding: one pass computes descriptors and folds ---
        print("Using LSH based fold assignment.")
        output_dir_lsh, mapping_table_dir, dt_lsh = calculate_lsh_folds.prepare(
            _args, overwriting)

        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh,
                                  "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh,
                                 "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")

        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        del df_duplicates
        end = time.time()
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print(f"Descriptor calculation and LSH folding done.")
    else:
        # NOTE(review): quit() exits with status 0 even though this is an
        # error path — confirm whether a non-zero exit is intended.
        print("Please use scaffold or lsh as folding method.")
        quit()

    # --- Aggregate replicate measurements ---
    start = time.time()

    print("Start aggregating values.")

    output_dir_agg = aggregate_values.prepare(_args, overwriting)

    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5,
        ConfigDict.get_parameters()["credibility_range"], num_cpu)
    df_T4r = df_T4r[[
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]]
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        [
            "input_compound_id", "input_assay_id", "standard_qualifier",
            "standard_value"
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    del df_T5, df_failed_range, df_failed_aggr, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print(f"Replicate aggregation done.")

    # --- Thresholding: derive classification tasks from aggregated values ---
    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]

    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]

    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")

    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print(f"Thresholding done.")

    # --- Filter classification tasks by training/evaluation quorum ---
    print("Start filter classification data.")
    start = time.time()

    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )

    filter_classification.write_tmp_output(output_dir_filter_clf, T10c, T8c,
                                           T4c_filtered_out, T4c_dedup)

    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print(f"Classification filtering done.")
    # --- Filter regression tasks by training/evaluation quorum ---
    print("Start filter regression data.")
    #####
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)

    T10r, T8r, T4r_filtered_out, T4r_dedup = filter_regression.filter_regression_tasks(
        df_T0_upd,
        df_T4r,
        ConfigDict.get_parameters()["training_quorum"]["regression"],
        ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
        ConfigDict.get_parameters()["initial_task_weights"],
        ConfigDict.get_parameters()["censored_downweighting"],
    )
    filter_regression.write_tmp_output(out_dir_filter_reg, T10r, T8r,
                                       T4r_filtered_out, T4r_dedup)
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print(f"Filtering regression data done.")

    # --- Build and save the sparse matrices for SparseChem ---
    print("Start creating sparse matrices.")

    start = time.time()
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)

    # Re-index tables with shared continuous ids before matrix creation.
    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]

    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")

    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c

    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    # NaN class labels become 0 and are then dropped from the sparse storage.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()

    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )

    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print(f"Files are ready for SparseChem.")
# --- Exemplo n.º 12 (score: 0) ---
def main(args):
    """General wrapper function for thresholding.

    Reads the assay table (T0) and aggregated activity data (T4r), derives
    classification labels, and writes T4c/T3c plus the failed rows.

    Args:
        args (dict): Dictionary of arguments from argparser
    """
    start = time.time()

    # Non-interactive runs always overwrite existing output.
    overwriting = args["non_interactive"] is True
    load_config(args)
    load_key(args)
    num_cpu = args["number_cpu"]
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)
    print("Start thresholding.")

    # Load input files.
    output_dir = prepare(args, overwriting)
    T0 = read_input_file(args["assay_file"])
    T4r = read_input_file(args["activity_file"])

    df_T4c, df_T3c = run(T0, T4r, num_cpu)

    # Output column orders for the final T3c and T4c files.
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    df_T4c = df_T4c.sort_values("classification_task_id")
    df_T3c = df_T3c.sort_values("classification_task_id")

    # Rows with ambiguous (NaN) class labels are reported separately.
    has_label = df_T4c.class_label.notna()
    df_T4c_failed = df_T4c[~has_label]
    df_T4c = df_T4c[has_label]

    write_failed_output(output_dir, df_T4c_failed, columns_T4c)
    write_tmp_output(output_dir, df_T4c, df_T3c, columns_T4c, columns_T3c)

    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print("Thresholding done.")