def __init__(self, *args, **kwargs):
    """Build the test fixture: resolve all data paths, load the mapping
    tables (T5, T10) and the raw activity file (T4), then run the full
    activity formatting once so every test method can inspect the result.

    NOTE(review): this work happens per-instance in __init__ rather than in
    setUp(); behavior is preserved here.
    """
    super(ActivityFormattingTests, self).__init__(*args, **kwargs)
    # Reference/input file locations, all relative to the test directory.
    self.referenceFilePath = curDir + "/output/smiles_prepared.csv"
    self.referenceFilePathDuplicates = curDir + "/input/smiles_with_duplicates.csv"
    self.path_mapping_table_T5 = curDir + "/output/chembl/results_tmp/mapping_table_T5.csv"
    self.path_mapping_table_T10 = curDir + "/output/chembl/results_tmp/mapping_table_T10.csv"
    self.path_T2_T6 = curDir + "/output/chembl/results_tmp/T6.csv"
    self.activity_file = curDir + "/input/chembl/chembl_23_example_T4.csv"
    self.defineConfig()
    # Load the three inputs the formatter needs.
    t5_map = read_csv(self.path_mapping_table_T5)
    t10_map = read_csv(self.path_mapping_table_T10)
    raw_activity = read_csv(self.activity_file)
    self.act_data_format = ActivityDataFormatting(raw_activity, t5_map, t10_map)
    # Drop the raw frames; only the formatter's copies are needed from here on.
    del (raw_activity, t5_map, t10_map)
    self.act_data_format.run_formatting()
def test_activity_formatting_chembl_t11(self):
    """T11 built from the T6 structure data must match the pickled reference.

    Remaps ids to continuous ids, builds T11, sorts it for a deterministic
    row order, and compares against the stored reference frame.
    """
    self.act_data_format.remapping_2_cont_ids()
    structure_data_T6 = read_csv(self.path_T2_T6)
    structure_data_T11 = self.act_data_format.make_T11(
        structure_data_T6).sort_values('cont_descriptor_vector_id')
    # To regenerate the reference data, uncomment the next line once:
    # structure_data_T11.to_pickle(curDir + "/output/test_activity_formatting_t11.pkl")
    structure_data_T11_ref = pd.read_pickle(
        curDir + "/output/test_activity_formatting_t11.pkl")
    # assertTrue instead of assertEqual(..., True): same check, clearer
    # failure message and intent.
    self.assertTrue(structure_data_T11.equals(structure_data_T11_ref))
def main(args, overwriting=True):
    """
    Main function reading input files, executing functions and writing output files.

    Reads mapping tables T5/T10 and activity data T4, formats activities,
    and writes T4 failure/duplicate/excluded reports plus T11, T10,
    T10_counts, and the mapped/reduced weight tables (T3 mapped, T9).

    :param args: dict of command-line arguments (activity_file, weight_table,
        config_file, output_dir, run_name)
    :param overwriting: if False, prompt before overwriting existing output
    """
    ####################################
    # get parameters
    start = time.time()
    activity_file = Path(args['activity_file'])
    weight_table = Path(args['weight_table'])
    config_file = Path(args['config_file'])
    if not config_file.is_file():
        print('Config file does not exist.')
        quit()
    config.parameters.get_parameters(path=config_file)
    if not activity_file.is_file():
        print('Activity file does not exist.')
        quit()
    output_path = Path(args['output_dir'])
    output_path.mkdir(exist_ok=True)
    run_name = args['run_name']
    output_dir = output_path / run_name
    path_results_intern = output_dir / 'results_tmp'
    path_results_intern.mkdir(exist_ok=True)
    path_results_extern = output_dir / 'results'
    path_results_extern.mkdir(exist_ok=True)
    # Descriptors dir is produced by the fingerprint step; activity dir is ours.
    output_dir_descriptors = path_results_intern / 'descriptors'
    output_dir_activity_data = path_results_intern / 'activity_data'
    output_dir_activity_data.mkdir(exist_ok=True)
    if overwriting is False:
        if os.listdir(output_dir_activity_data):
            override = input(
                f'Do you want to override files in {output_dir_activity_data} ? (type y or Y) \n The script will be aborted if you type anything else. '
            )
            if override == 'y' or override == 'Y':
                print(f'Files for run name {run_name} will be overwritten.')
            else:
                print(
                    'Processing aborted. Please change the run name and re-run the script.'
                )
                quit()
    ###################################
    # read mapping table T5 and activity file
    path_mapping_table_T5 = output_dir_descriptors / 'mapping_table_T5.csv'
    path_mapping_table_T10 = output_dir_descriptors / 'mapping_table_T10.csv'
    path_T6_structure_data = output_dir_descriptors / 'T6.csv'
    # check if mapping table exists
    # BUG FIX: the original condition was
    #   T5.is_file() and T10.is_file() and T6.is_file() is False
    # where `is False` bound only to the T6 check, so a missing T5 or T10
    # was never detected and the all-files-present case was rejected.
    # The intent is: abort when ANY of the three files is missing.
    if not (path_mapping_table_T5.is_file()
            and path_mapping_table_T10.is_file()
            and path_T6_structure_data.is_file()):
        print(
            'Structure data file T6, or mapping table T5 or T10 was not found. Please perform first the standardization and fingerprint calculation.'
        )
        quit()
    print('Start activity data formatting.')
    # read input files (mapping table T5, T10) activity data T4, and weight table T3
    mapping_table_T5 = read_csv(path_mapping_table_T5)
    activity_data = read_csv(activity_file)
    mapping_table_T10 = read_csv(path_mapping_table_T10)
    import pandas as pd
    # NOTE(review): raising on chained assignment looks like a debugging aid
    # left in place; it changes pandas behavior process-wide — confirm intended.
    pd.options.mode.chained_assignment = 'raise'
    act_data_format = ActivityDataFormatting(activity_data, mapping_table_T5,
                                             mapping_table_T10)
    del (activity_data, mapping_table_T5, mapping_table_T10)
    act_data_format.run_formatting()
    # identify and write output file for failed activity entries
    data_failed = act_data_format.filter_failed_structures()
    path_failed_data = output_dir_activity_data / 'T4_failed_structures.csv'
    data_failed.to_csv(path_failed_data,
                       sep=',',
                       columns=[
                           'input_compound_id', 'classification_task_id',
                           'class_label'
                       ],
                       index=False)
    # identify duplicated id pairs and save them.
    data_duplicated_id_pairs = act_data_format.data_duplicates
    path_duplicated_structures = output_dir_activity_data / 'T4_duplicates.csv'
    data_duplicated_id_pairs.to_csv(path_duplicated_structures,
                                    sep=',',
                                    columns=[
                                        'classification_task_id',
                                        'descriptor_vector_id', 'class_label'
                                    ],
                                    index=False)
    del (data_failed, data_duplicated_id_pairs)
    # save excluded data
    data_excluded = act_data_format.select_excluded_data()
    path_excluded_data = output_dir_activity_data / 'T4_excluded_data.csv'
    data_excluded.to_csv(path_excluded_data,
                         sep=',',
                         columns=[
                             'classification_task_id', 'descriptor_vector_id',
                             'class_label'
                         ],
                         index=False)
    # save T11
    act_data_format.remapping_2_cont_ids()
    structure_data_T6 = read_csv(path_T6_structure_data)
    structure_data_T11 = act_data_format.make_T11(
        structure_data_T6).sort_values('cont_descriptor_vector_id')
    path_structure_data_final_T11 = path_results_extern / 'T11.csv'
    structure_data_T11.to_csv(path_structure_data_final_T11,
                              sep=',',
                              columns=[
                                  'cont_descriptor_vector_id',
                                  'descriptor_vector_id', 'fp_json',
                                  'fp_val_json', 'fold_id'
                              ],
                              index=False)
    del (structure_data_T6, structure_data_T11)
    # save T10
    data_remapped = act_data_format.data_remapped.sort_values(
        'cont_classification_task_id')
    path_data_final_T10 = path_results_extern / 'T10.csv'
    data_remapped.to_csv(path_data_final_T10,
                         sep=',',
                         columns=[
                             'cont_descriptor_vector_id',
                             'cont_classification_task_id', 'class_label',
                             'fold_id'
                         ],
                         index=False)
    # count labels per fold and save it.
    data_final_counts = act_data_format.count_labels_per_fold(
        data_remapped).sort_values('cont_classification_task_id')
    path_final_counts = path_results_extern / 'T10_counts.csv'
    data_final_counts.to_csv(path_final_counts,
                             sep=',',
                             columns=[
                                 'cont_classification_task_id', 'class_label',
                                 'fold_id', 'label_counts'
                             ],
                             index=False)
    del (data_remapped, data_final_counts)
    # update weight table T3 with cont_task_ids and save reduced table T3 as T9
    weight_table_T3 = read_csv(weight_table)
    weight_table_T3_mapped = act_data_format.map_T3(weight_table_T3)
    path_weight_table_T3_mapped = path_results_extern / 'weight_table_T3_mapped.csv'
    weight_table_T3_mapped.to_csv(path_weight_table_T3_mapped,
                                  sep=',',
                                  index=False)
    path_weight_table_T9 = path_results_extern / 'weight_table_T9.csv'
    col_to_keep = ['cont_classification_task_id', 'assay_type', 'weight']
    weight_table_T3_mapped.to_csv(path_weight_table_T9,
                                  sep=',',
                                  index=False,
                                  columns=col_to_keep)
    end = time.time()
    print(f'Formatting of activity data took {end - start:.08} seconds.')
    print(f'Activity data processing of run name {run_name} done.')
def main(args, overwriting=True):
    """
    Main function reading input files, executing functions and writing output files.

    Converts the prepared csv files (T11 structures; and for training also
    T10 activities and T9 weights) into sparse matrices / numpy arrays ready
    for the ML model, after optionally verifying a reference hash.

    :param args: dict of command-line arguments (output_dir, run_name,
        config_file, ref_hash, prediction_only)
    :param overwriting: if False, prompt before overwriting existing output
    """
    ####################################
    # get parameters
    start = time.time()
    s_path = Path(args['output_dir'])
    s_path.mkdir(exist_ok=True)
    run_name = args['run_name']
    output_dir = s_path / run_name
    config_file = Path(args['config_file'])
    if not config_file.is_file():
        print('Config file does not exist.')
        quit()
    # Optional sanity check: generated parameter hash must match the reference.
    if args['ref_hash'] is None:
        print(
            'No reference hash given. Comparison of generated and reference hash keys will be skipped.'
        )
    else:
        with open(args['ref_hash']) as ref_hash_f:
            ref_hash = json.load(ref_hash_f)
        key_ref = ref_hash['unit_test_hash']
        path_gen_hash = output_dir / 'generated_hash.json'
        with open(path_gen_hash) as hash_f:
            key = json.load(hash_f)
        if key['unit_test_hash'] != key_ref:
            print(
                'Different reference key. Please check the parameters you used for structure preparation.'
            )
            quit()
    if args['prediction_only'] is True:
        print('Formatting data ready for predictions with a ML model.')
        T11_structure_file = output_dir / 'results' / 'T11_prediction_only.csv'
        path_files_4_ml = output_dir / 'files_4_ml_pred_only'
        path_files_4_ml.mkdir(exist_ok=True)
    else:
        print('Formatting data ready for training a ML model.')
        T11_structure_file = output_dir / 'results' / 'T11.csv'
        T10_activity_file = output_dir / 'results' / 'T10.csv'
        T10_activity_file_counts = output_dir / 'results' / 'T10_counts.csv'
        T9_weight_table_file = output_dir / 'results' / 'weight_table_T9.csv'
        if not T10_activity_file.is_file():
            print('Activity file does not exist.')
            quit()
        if not T10_activity_file_counts.is_file():
            print('Activity count file does not exist.')
            quit()
        if not T9_weight_table_file.is_file():
            print('Weight table file does not exist.')
            quit()
        path_files_4_ml = output_dir / 'files_4_ml'
        path_files_4_ml.mkdir(exist_ok=True)
    if not T11_structure_file.is_file():
        print('Structure file does not exist.')
        quit()
    if overwriting is False:
        if os.listdir(path_files_4_ml):
            override = input(
                f'Do you want to override files in {path_files_4_ml}?(type y or Y) \n The script will be aborted if you type anything else. '
            )
            if override == 'y' or override == 'Y':
                print(f'Files for run name {run_name} will be overwritten.')
            else:
                print(
                    'Processing aborted. Please change the run name and re-run the script.'
                )
                quit()
    # Read config file and get fold size as matrix dimension
    fp_param = tuner.config.parameters.get_parameters(
        path=config_file)['fingerprint']
    bit_size = fp_param['fold_size']
    # Preparing structure-related X matrix
    T11_structure_df = tuner.read_csv(T11_structure_file)
    structure_matrix = matrix_from_strucutres_for_prediction(
        T11_structure_df, bit_size)
    path_structure_matrix = path_files_4_ml / 'T11_x.mtx'
    path_structure_matrix_npy = path_files_4_ml / 'T11_x.npy'
    # str() instead of the original no-op single-argument os.path.join().
    mmwrite(str(path_structure_matrix), structure_matrix, field='integer')
    # NOTE(review): np.save on a scipy sparse matrix stores a 0-d object
    # array (pickled) — confirm downstream loads it with allow_pickle.
    np.save(path_structure_matrix_npy, structure_matrix)
    if args['prediction_only'] is True:
        end = time.time()
        print(f'Formatting to matrices took {end - start:.08} seconds.')
        print(
            f'Files in {path_files_4_ml} are ready for prediction with ML model.'
        )
    else:
        shutil.copy(T10_activity_file_counts, path_files_4_ml)
        T9_weight_table = tuner.read_csv(T9_weight_table_file)
        T10_activity_df = tuner.read_csv(T10_activity_file)
        # Reduce T9 to (task_id, weight) and drop unmapped tasks.
        T9_weight_table_red = pd.DataFrame(T9_weight_table, columns=['cont_classification_task_id', 'weight']) \
            .rename(columns={"cont_classification_task_id": "task_id"}).dropna(subset=['task_id'])
        T9_weight_table_red.to_csv(path_files_4_ml / 'T9_red.csv',
                                   sep=',',
                                   index=False)
        activity_matrix = matrix_from_activity(T10_activity_df)
        path_activity_matrix = path_files_4_ml / 'T10_y.mtx'
        path_activity_matrix_npy = path_files_4_ml / 'T10_y.npy'
        mmwrite(str(path_activity_matrix), activity_matrix, field='integer')
        np.save(path_activity_matrix_npy, activity_matrix)
        folding_vector = folding_from_structure(T11_structure_df)
        path_structure_fold_vector = path_files_4_ml / 'T11_fold_vector'
        np.save(path_structure_fold_vector, folding_vector)
        end = time.time()
        print(f'Formatting to matrices took {end - start:.08} seconds.')
        print(f'Files in {path_files_4_ml} are ready for ML model.')
def main(args, overwriting=True, process_reference_set=False):
    """
    Main function reading input files, executing functions and writing output files.

    Standardizes the input SMILES with RDKit (via tuner helpers) and writes
    the processed structures (T2_standardized.csv) and failures
    (T2_failed_mols.csv).

    :param args: dict of command-line arguments (output_dir, run_name,
        number_cpu, config_file, structure_file / reference_set)
    :param overwriting: if False, prompt before overwriting existing output
    :param process_reference_set: if True, process the reference set into a
        'reference_set' subdirectory instead of the regular structure file
    """
    ####################################
    # start timing
    start = time.time()
    output_dir = Path(args['output_dir'])
    output_dir.mkdir(exist_ok=True)
    run_name = args['run_name']
    num_cpu = args['number_cpu']
    if num_cpu < 1:
        print('Please use a positive number of CPUs.')
        quit()
    config_file = Path(args['config_file'])
    if not config_file.is_file():
        print('Config file does not exist.')
        quit()
    tuner.config.parameters.get_parameters(path=config_file)
    # Select input file and output layout depending on the processing mode.
    if process_reference_set is True:
        structure_file = Path(args['reference_set'])
        dir_out_run = output_dir / run_name
        dir_out_run.mkdir(exist_ok=True)
        dir_run_name = dir_out_run / 'reference_set'
    else:
        structure_file = Path(args['structure_file'])
        dir_run_name = output_dir / run_name
    if not structure_file.is_file():
        print('Structure file does not exist.')
        quit()
    s_filename = structure_file.stem
    dir_run_name.mkdir(exist_ok=True)
    path_results_extern = dir_run_name / 'results'
    path_results_extern.mkdir(exist_ok=True)
    path_results_intern = dir_run_name / 'results_tmp'
    path_results_intern.mkdir(exist_ok=True)
    output_dir_standardization = path_results_intern / 'standardization'
    output_dir_standardization.mkdir(exist_ok=True)
    if overwriting is False:
        if os.listdir(output_dir_standardization):
            override = input(
                f'Do you want to override files in {output_dir_standardization}? (type y or Y) \n The script will be aborted if you type anything else. '
            )
            if override == 'y' or override == 'Y':
                print(f'Files for run name {run_name} will be overwritten.')
            else:
                print(
                    'Processing aborted. Please change the run name and re-run the script.'
                )
                quit()
    # Configure the log file
    log_file_path = output_dir_standardization / 'log_standardization.log'
    logging.basicConfig(filename=log_file_path,
                        filemode='w',
                        format='',
                        level=logging.ERROR)
    print(f'Start processing run name {run_name}.')
    ####################################
    # read input file
    structure_data = tuner.read_csv(structure_file)
    input_file_len = len(structure_data)
    if input_file_len == 0:
        print(
            'Structure input is empty. Please provide a suitable structure file.'
        )
        quit()
    ####################################
    # standardize structures with RDKit
    print('Start standardizing molecules.')
    smiles_standardized = tuner.run_standardize(structure_data['smiles'],
                                                num_cpu)
    # formatting data to output dataframes
    df_failed_smi = tuner.output_failed_smiles(smiles_standardized,
                                               structure_data)
    df_processed_smi = tuner.output_processed_smiles(smiles_standardized,
                                                     structure_data)
    ####################################
    # write output file as csv file
    col_to_keep = ['input_compound_id', 'canonical_smiles']
    output_path = output_dir_standardization / 'T2_standardized.csv'
    output_failed_mol_path = output_dir_standardization / 'T2_failed_mols.csv'
    df_processed_smi[col_to_keep].to_csv(output_path, sep=',', index=False)
    df_failed_smi[['input_compound_id',
                   'smiles']].to_csv(output_failed_mol_path,
                                     sep=',',
                                     index=False)
    print(
        f'Overall processing time of {len(structure_data.index)}/{input_file_len} molecules: {time.time() - start:.08} seconds.'
    )
    print(f'Structure standardization of run name {run_name} done.')