Example #1
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.referenceFilePath = curDir + "/output/smiles_prepared.csv"
        self.referenceFilePathDuplicates = curDir + "/input/smiles_with_duplicates.csv"
        self.path_mapping_table_T5 = curDir + "/output/chembl/results_tmp/mapping_table_T5.csv"
        self.path_mapping_table_T10 = curDir + "/output/chembl/results_tmp/mapping_table_T10.csv"
        self.path_T2_T6 = curDir + "/output/chembl/results_tmp/T6.csv"
        self.activity_file = curDir + "/input/chembl/chembl_23_example_T4.csv"
        self.defineConfig()

        mapping_table_T5 = read_csv(self.path_mapping_table_T5)
        activity_data = read_csv(self.activity_file)
        mapping_table_T10 = read_csv(self.path_mapping_table_T10)
        self.act_data_format = ActivityDataFormatting(activity_data,
                                                      mapping_table_T5,
                                                      mapping_table_T10)
        del activity_data, mapping_table_T5, mapping_table_T10
        self.act_data_format.run_formatting()
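A minimal sketch of how the fixture above might be wired up and run. Only the class name is taken from the super() call; using unittest.TestCase as the base class and invoking unittest.main() are assumptions:

import unittest

class ActivityFormattingTests(unittest.TestCase):
    # __init__ as shown above, plus the test_* methods of the next examples
    ...

if __name__ == '__main__':
    unittest.main()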
Example #2
    def test_activity_formatting_chembl_t11(self):
        self.act_data_format.remapping_2_cont_ids()
        structure_data_T6 = read_csv(self.path_T2_T6)
        structure_data_T11 = self.act_data_format.make_T11(
            structure_data_T6).sort_values('cont_descriptor_vector_id')
        # structure_data_T11.to_pickle(curDir + "/output/test_activity_formatting_t11.pkl")  # save reference data
        structure_data_T11_ref = pd.read_pickle(
            curDir + "/output/test_activity_formatting_t11.pkl")
        self.assertTrue(structure_data_T11.equals(structure_data_T11_ref))
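Because equals() only returns a boolean, a failing assertion reveals nothing about where the frames differ. A hedged alternative using pandas' built-in test helper, under the same fixture assumptions as above (the method name is hypothetical):

    def test_activity_formatting_chembl_t11_verbose(self):
        self.act_data_format.remapping_2_cont_ids()
        structure_data_T11 = self.act_data_format.make_T11(
            read_csv(self.path_T2_T6)).sort_values('cont_descriptor_vector_id')
        structure_data_T11_ref = pd.read_pickle(
            curDir + "/output/test_activity_formatting_t11.pkl")
        # raises an AssertionError pinpointing the first mismatch
        pd.testing.assert_frame_equal(structure_data_T11, structure_data_T11_ref)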
Example #3
def main(args, overwriting=True):
    """
    Main function reading input files, executing functions and writing output files.
    """
    ####################################
    # get parameters
    start = time.time()

    activity_file = Path(args['activity_file'])
    weight_table = Path(args['weight_table'])
    config_file = Path(args['config_file'])
    if not config_file.is_file():
        print('Config file does not exist.')
        quit()

    config.parameters.get_parameters(path=config_file)
    if not activity_file.is_file():
        print('Activity file does not exist.')
        quit()

    output_path = Path(args['output_dir'])
    output_path.mkdir(exist_ok=True)
    run_name = args['run_name']
    output_dir = output_path / run_name
    path_results_intern = output_dir / 'results_tmp'
    path_results_intern.mkdir(exist_ok=True)
    path_results_extern = output_dir / 'results'
    path_results_extern.mkdir(exist_ok=True)
    output_dir_standardization = path_results_intern / 'standardization'
    output_dir_descriptors = path_results_intern / 'descriptors'
    output_dir_activity_data = path_results_intern / 'activity_data'
    output_dir_activity_data.mkdir(exist_ok=True)
    if not overwriting:
        if os.listdir(output_dir_activity_data):
            override = input(
                f'Do you want to overwrite files in {output_dir_activity_data}? (type y or Y) \n The script will be aborted if you type anything else. '
            )
            if override in ('y', 'Y'):
                print(f'Files for run name {run_name} will be overwritten.')
            else:
                print(
                    'Processing aborted. Please change the run name and re-run the script.'
                )
                quit()

    ###################################
    # read mapping table T5 and activity file
    path_mapping_table_T5 = output_dir_descriptors / 'mapping_table_T5.csv'
    path_mapping_table_T10 = output_dir_descriptors / 'mapping_table_T10.csv'
    path_T6_structure_data = output_dir_descriptors / 'T6.csv'

    # check that mapping tables T5/T10 and structure data T6 exist
    if not (path_mapping_table_T5.is_file() and path_mapping_table_T10.is_file()
            and path_T6_structure_data.is_file()):
        print(
            'Structure data file T6 or mapping table T5 or T10 was not found. Please first run the standardization and fingerprint calculation.'
        )
        quit()
    else:
        print('Start activity data formatting.')

        # read input files: mapping tables T5 and T10, and activity data T4 (weight table T3 is read further below)
        mapping_table_T5 = read_csv(path_mapping_table_T5)
        activity_data = read_csv(activity_file)
        mapping_table_T10 = read_csv(path_mapping_table_T10)
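        # Make pandas raise instead of warn on chained assignment so that
        # silent SettingWithCopy bugs surface during the formatting steps below.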
        import pandas as pd
        pd.options.mode.chained_assignment = 'raise'
        act_data_format = ActivityDataFormatting(activity_data,
                                                 mapping_table_T5,
                                                 mapping_table_T10)
        del activity_data, mapping_table_T5, mapping_table_T10
        act_data_format.run_formatting()

        # identify and write output file for failed activity entries
        data_failed = act_data_format.filter_failed_structures()
        path_failed_data = output_dir_activity_data / 'T4_failed_structures.csv'
        data_failed.to_csv(path_failed_data,
                           sep=',',
                           columns=[
                               'input_compound_id', 'classification_task_id',
                               'class_label'
                           ],
                           index=False)

        # identify duplicated id pairs and save them.
        data_duplicated_id_pairs = act_data_format.data_duplicates
        path_duplicated_structures = output_dir_activity_data / 'T4_duplicates.csv'
        data_duplicated_id_pairs.to_csv(path_duplicated_structures,
                                        sep=',',
                                        columns=[
                                            'classification_task_id',
                                            'descriptor_vector_id',
                                            'class_label'
                                        ],
                                        index=False)
        del data_failed, data_duplicated_id_pairs

        # save excluded data
        data_excluded = act_data_format.select_excluded_data()
        path_excluded_data = output_dir_activity_data / 'T4_excluded_data.csv'
        data_excluded.to_csv(path_excluded_data,
                             sep=',',
                             columns=[
                                 'classification_task_id',
                                 'descriptor_vector_id', 'class_label'
                             ],
                             index=False)

        # save T11
        act_data_format.remapping_2_cont_ids()
        structure_data_T6 = read_csv(path_T6_structure_data)
        structure_data_T11 = act_data_format.make_T11(
            structure_data_T6).sort_values('cont_descriptor_vector_id')
        path_structure_data_final_T11 = path_results_extern / 'T11.csv'
        structure_data_T11.to_csv(path_structure_data_final_T11,
                                  sep=',',
                                  columns=[
                                      'cont_descriptor_vector_id',
                                      'descriptor_vector_id', 'fp_json',
                                      'fp_val_json', 'fold_id'
                                  ],
                                  index=False)
        del structure_data_T6, structure_data_T11

        # save T10
        data_remapped = act_data_format.data_remapped.sort_values(
            'cont_classification_task_id')
        path_data_final_T10 = path_results_extern / 'T10.csv'
        data_remapped.to_csv(path_data_final_T10,
                             sep=',',
                             columns=[
                                 'cont_descriptor_vector_id',
                                 'cont_classification_task_id', 'class_label',
                                 'fold_id'
                             ],
                             index=False)
        # count labels per fold and save it.
        data_final_counts = act_data_format.count_labels_per_fold(
            data_remapped).sort_values('cont_classification_task_id')
        path_final_counts = path_results_extern / 'T10_counts.csv'
        data_final_counts.to_csv(path_final_counts,
                                 sep=',',
                                 columns=[
                                     'cont_classification_task_id',
                                     'class_label', 'fold_id', 'label_counts'
                                 ],
                                 index=False)

        del data_remapped, data_final_counts

        # update weight table T3 with cont_task_ids and save reduced table T3 as T9
        weight_table_T3 = read_csv(weight_table)
        weight_table_T3_mapped = act_data_format.map_T3(weight_table_T3)

        path_weight_table_T3_mapped = path_results_extern / 'weight_table_T3_mapped.csv'
        weight_table_T3_mapped.to_csv(path_weight_table_T3_mapped,
                                      sep=',',
                                      index=False)

        path_weight_table_T9 = path_results_extern / 'weight_table_T9.csv'
        col_to_keep = ['cont_classification_task_id', 'assay_type', 'weight']
        weight_table_T3_mapped.to_csv(path_weight_table_T9,
                                      sep=',',
                                      index=False,
                                      columns=col_to_keep)

        end = time.time()
        print(f'Formatting of activity data took {end - start:.08} seconds.')
        print(f'Activity data processing of run name {run_name} done.')
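A minimal sketch of how this main could be driven from the command line. The argument keys mirror those read from args above; the argparse wiring itself is an assumption, not taken from the source:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Format activity data.')
    for arg in ('activity_file', 'weight_table', 'config_file',
                'output_dir', 'run_name'):
        parser.add_argument(f'--{arg}', required=True)
    main(vars(parser.parse_args()))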
Example #4
def main(args, overwriting=True):
    """
    Main function reading input files, executing functions and writing output files.
    """
    ####################################
    # get parameters

    start = time.time()

    s_path = Path(args['output_dir'])
    s_path.mkdir(exist_ok=True)
    run_name = args['run_name']
    output_dir = s_path / run_name

    config_file = Path(args['config_file'])
    if not config_file.is_file():
        print('Config file does not exist.')
        quit()

    if args['ref_hash'] is None:
        print(
            'No reference hash given. Comparison of generated and reference hash keys will be skipped.'
        )
    else:
        with open(args['ref_hash']) as ref_hash_f:
            ref_hash = json.load(ref_hash_f)
        key_ref = ref_hash['unit_test_hash']
        path_gen_hash = output_dir / 'generated_hash.json'
        with open(path_gen_hash) as hash_f:
            key = json.load(hash_f)
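        # Expected shape of both hash files, inferred from the keys read here:
        #   {"unit_test_hash": "<digest string>"}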
        if key['unit_test_hash'] != key_ref:
            print(
                'Different reference key. Please check the parameters you used for structure preparation.'
            )
            quit()

    if args['prediction_only']:
        print('Formatting data ready for predictions with a ML model.')

        T11_structure_file = output_dir / 'results' / 'T11_prediction_only.csv'
        path_files_4_ml = output_dir / 'files_4_ml_pred_only'
        path_files_4_ml.mkdir(exist_ok=True)
    else:
        print('Formatting data ready for training a ML model.')

        T11_structure_file = output_dir / 'results' / 'T11.csv'
        T10_activity_file = output_dir / 'results' / 'T10.csv'
        T10_activity_file_counts = output_dir / 'results' / 'T10_counts.csv'
        T9_weight_table_file = output_dir / 'results' / 'weight_table_T9.csv'
        if not T10_activity_file.is_file():
            print('Activity file does not exist.')
            quit()
        if not T10_activity_file_counts.is_file():
            print('Activity count file does not exist.')
            quit()
        if not T9_weight_table_file.is_file():
            print('Weight table file does not exist.')
            quit()
        path_files_4_ml = output_dir / 'files_4_ml'
        path_files_4_ml.mkdir(exist_ok=True)

    if not T11_structure_file.is_file():
        print('Structure file does not exist.')
        quit()

    if not overwriting:
        if os.listdir(path_files_4_ml):
            override = input(
                f'Do you want to overwrite files in {path_files_4_ml}? (type y or Y) \n The script will be aborted if you type anything else. '
            )
            if override in ('y', 'Y'):
                print(f'Files for run name {run_name} will be overwritten.')
            else:
                print(
                    'Processing aborted. Please change the run name and re-run the script.'
                )
                quit()

    # Read config file and get fold size as matrix dimension
    fp_param = tuner.config.parameters.get_parameters(
        path=config_file)['fingerprint']
    bit_size = fp_param['fold_size']

    # Preparing structure-related X matrix
    T11_structure_df = tuner.read_csv(T11_structure_file)
    structure_matrix = matrix_from_strucutres_for_prediction(
        T11_structure_df, bit_size)
    path_structure_matrix = path_files_4_ml / 'T11_x.mtx'
    path_structure_matrix_npy = path_files_4_ml / 'T11_x.npy'
    mmwrite(str(path_structure_matrix),
            structure_matrix,
            field='integer')
    np.save(path_structure_matrix_npy, structure_matrix)
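    # The structure matrix is written twice (inferred intent): MatrixMarket
    # (.mtx) for interoperability with other tools, .npy for fast NumPy reloads.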

    if args['prediction_only']:
        end = time.time()
        print(f'Formatting to matrices took {end - start:.08} seconds.')
        print(
            f'Files in {path_files_4_ml} are ready for prediction with ML model.'
        )
    else:
        shutil.copy(T10_activity_file_counts, path_files_4_ml)
        T9_weight_table = tuner.read_csv(T9_weight_table_file)
        T10_activity_df = tuner.read_csv(T10_activity_file)
        T9_weight_table_red = pd.DataFrame(T9_weight_table, columns=['cont_classification_task_id', 'weight']) \
            .rename(columns={"cont_classification_task_id": "task_id"}).dropna(subset=['task_id'])
        T9_weight_table_red.to_csv(path_files_4_ml / 'T9_red.csv',
                                   sep=',',
                                   index=False)

        activity_matrix = matrix_from_activity(T10_activity_df)
        path_activity_matrix = path_files_4_ml / 'T10_y.mtx'
        path_activity_matrix_npy = path_files_4_ml / 'T10_y.npy'
        mmwrite(str(path_activity_matrix),
                activity_matrix,
                field='integer')
        np.save(path_activity_matrix_npy, activity_matrix)

        folding_vector = folding_from_structure(T11_structure_df)
        path_structure_fold_vector = path_files_4_ml / 'T11_fold_vector'
        np.save(path_structure_fold_vector, folding_vector)
        end = time.time()
        print(f'Formatting to matrices took {end - start:.08} seconds.')
        print(f'Files in {path_files_4_ml} are ready for ML model.')
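For a quick sanity check, the artifacts written above can be loaded back before training. scipy.io.mmread and numpy.load are standard APIs; the relative paths are illustrative and mirror the file names used in the function:

import numpy as np
from scipy.io import mmread

# illustrative paths, mirroring the names written by main above
x = mmread('files_4_ml/T11_x.mtx').tocsr()    # sparse structure matrix
y = mmread('files_4_ml/T10_y.mtx').tocsr()    # sparse activity matrix
folds = np.load('files_4_ml/T11_fold_vector.npy')  # np.save appended .npy
print(x.shape, y.shape, folds.shape)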
Example #5
def main(args, overwriting=True, process_reference_set=False):
    ####################################
    # start timing
    start = time.time()

    output_dir = Path(args['output_dir'])
    output_dir.mkdir(exist_ok=True)
    run_name = args['run_name']
    num_cpu = args['number_cpu']

    if num_cpu < 1:
        print('Please use a positive number of CPUs.')
        quit()
    config_file = Path(args['config_file'])
    if not config_file.is_file():
        print('Config file does not exist.')
        quit()

    tuner.config.parameters.get_parameters(path=config_file)
    if process_reference_set:
        structure_file = Path(args['reference_set'])
        dir_out_run = output_dir / run_name
        dir_out_run.mkdir(exist_ok=True)
        dir_run_name = dir_out_run / 'reference_set'

    else:
        structure_file = Path(args['structure_file'])
        dir_run_name = output_dir / run_name

    if not structure_file.is_file():
        print('Structure file does not exist.')
        quit()

    s_filename = structure_file.stem
    dir_run_name.mkdir(exist_ok=True)
    path_results_extern = dir_run_name / 'results'
    path_results_extern.mkdir(exist_ok=True)
    path_results_intern = dir_run_name / 'results_tmp'
    path_results_intern.mkdir(exist_ok=True)
    output_dir_standardization = path_results_intern / 'standardization'
    output_dir_standardization.mkdir(exist_ok=True)
    if not overwriting:
        if os.listdir(output_dir_standardization):
            override = input(
                f'Do you want to overwrite files in {output_dir_standardization}? (type y or Y) \n The script will be aborted if you type anything else. '
            )
            if override in ('y', 'Y'):
                print(f'Files for run name {run_name} will be overwritten.')
            else:
                print(
                    'Processing aborted. Please change the run name and re-run the script.'
                )
                quit()

    # Configure the log file
    log_file_path = output_dir_standardization / 'log_standardization.log'
    logging.basicConfig(filename=log_file_path,
                        filemode='w',
                        format='',
                        level=logging.ERROR)
    print(f'Start processing run name {run_name}.')
    ####################################
    # read input file
    structure_data = tuner.read_csv(structure_file)
    input_file_len = len(structure_data)
    if input_file_len == 0:
        print(
            'Structure input is empty. Please provide a suitable structure file.'
        )
        quit()
    ####################################
    # standardize structures with RDKit
    print('Start standardizing molecules.')
    smiles_standardized = tuner.run_standardize(structure_data['smiles'],
                                                num_cpu)
    # formatting data to output dataframes
    df_failed_smi = tuner.output_failed_smiles(smiles_standardized,
                                               structure_data)

    df_processed_smi = tuner.output_processed_smiles(smiles_standardized,
                                                     structure_data)

    ####################################

    ####################################
    # write output file as csv file
    col_to_keep = ['input_compound_id', 'canonical_smiles']
    output_path = output_dir_standardization / 'T2_standardized.csv'
    output_failed_mol_path = output_dir_standardization / 'T2_failed_mols.csv'
    df_processed_smi[col_to_keep].to_csv(output_path, sep=',', index=False)
    df_failed_smi[['input_compound_id',
                   'smiles']].to_csv(output_failed_mol_path,
                                     sep=',',
                                     index=False)

    print(
        f'Overall processing time of {len(structure_data.index)}/{input_file_len} molecules: {time.time() - start:.08} seconds.'
    )
    print(f'Structure standardization of run name {run_name} done.')
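The interactive overwrite check appears almost verbatim in all three main functions above; a small helper that could replace the repeated block is sketched below (the function name and exact wording are suggestions, not part of the source):

import os


def confirm_overwrite(directory, run_name):
    """Abort unless the user confirms overwriting existing files in directory."""
    if not os.listdir(directory):
        return  # nothing to overwrite
    answer = input(
        f'Do you want to overwrite files in {directory}? (type y or Y) \n'
        ' The script will be aborted if you type anything else. ')
    if answer in ('y', 'Y'):
        print(f'Files for run name {run_name} will be overwritten.')
    else:
        print('Processing aborted. Please change the run name and re-run the script.')
        quit()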