def do_data_analysis(data_rdkit, descriptor_name, save_dir, verbose=False):
    """
    Function to analyze a dataset. Will compute: the descriptor
    specified in descriptor_name, Morgan fingerprints, and Murcko
    and generic scaffolds.

    Parameters:
    - data_rdkit: list of RDKit mol.
    - descriptor_name (string): name of the descriptor to compute.
    - save_dir (string): path to save the output of the analysis.
    - verbose (bool): whether to print progress information.
    """

    # Compute the descriptors with rdkit; the caller passes the
    # descriptor names defined in the fixed parameter file
    desc_names = re.compile(descriptor_name)
    functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
    descriptors = hp_chem.rdkit_desc(data_rdkit, functions, names)
    hp.save_obj(descriptors, f'{save_dir}desc')

    # Compute fingerprints
    fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
    fp_dict = {'fingerprint': fingerprint}
    hp.save_obj(fp_dict, f'{save_dir}fp')

    # Extract Murcko and generic scaffolds
    scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
    desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
    hp.save_obj(desc_scaf, f'{save_dir}scaf')
    hp.write_in_file(f'{save_dir}generic_scaffolds.txt', generic_scaf)
    hp.write_in_file(f'{save_dir}scaffolds.txt', scaf)
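
# Hedged usage sketch for do_data_analysis (not part of the original
# pipeline). The SMILES strings and the 'results/demo/' output directory
# are hypothetical illustration values; hp, hp_chem and FP are the
# module's own helpers, assumed importable here.
def _demo_do_data_analysis():
    import os
    from rdkit import Chem
    smiles = ['CCO', 'c1ccccc1', 'CC(=O)Oc1ccccc1C(=O)O']
    # keep only SMILES that RDKit can parse
    mols = [m for m in (Chem.MolFromSmiles(s) for s in smiles) if m is not None]
    save_dir = 'results/demo/'
    os.makedirs(save_dir, exist_ok=True)
    do_data_analysis(mols, FP.DESCRIPTORS['names'], save_dir, verbose=True)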
def do_processing(split, data_path, augmentation, min_len, max_len, save_dir, verbose=True):
    """
    Function to process a dataset.

    Parameters:
    - split (float): value used to split the dataset between
    the training set and the validation set. E.g., if split is 0.8,
    80% of the data will go to the training set, and 20% to the
    validation set.
    - data_path (string): path to the dataset.
    - augmentation (int): value to augment the dataset. E.g., if
    augmentation is 10, the SMILES enumeration will add 10 different
    SMILES encodings for each SMILES (i.e. resulting in a total of 11
    representations for a given SMILES in the dataset).
    - min_len (int): minimum length of SMILES to be kept in the dataset.
    - max_len (int): maximum length of SMILES to be kept in the dataset.
    - save_dir (string): directory to save the processed dataset.
    - verbose (bool): whether to print progress information.
    """

    # load the data with right SMILES limits,
    # both in a list and in rdkit mol format
    data_ori, data_rdkit = load_data(data_path, min_len, max_len, verbose=verbose)

    # we save the data without augmentation if it was
    # not already saved. We will need it to check the novelty
    # of the generated SMILES
    if not os.path.isfile(f'{save_dir}pruned.txt'):
        hp.write_in_file(f'{save_dir}pruned.txt', data_ori)

    if verbose:
        print('Start data analysis')
    do_data_analysis(data_rdkit, FP.DESCRIPTORS['names'], save_dir)

    # draw top scaffolds
    if verbose:
        print('Start drawing scaffolds')
    top_common = 20
    draw_scaffolds(top_common, save_dir)

    if verbose:
        print('Start data processing')

    # define indices for the tr-val split
    # and shuffle them
    all_idx = np.arange(len(data_ori))
    idx_split = int(split * len(all_idx))
    np.random.shuffle(all_idx)

    # we need to be careful about the case where
    # idx_split = 0 when there is only one
    # SMILES in the data, e.g. for fine-tuning
    if idx_split == 0:
        # in this case, we use the single SMILES both
        # for training and validation
        idx_tr_canon = [0]
        idx_val_canon = [0]
    else:
        idx_tr_canon = all_idx[:idx_split]
        idx_val_canon = all_idx[idx_split:]

    assert len(idx_tr_canon) != 0
    assert len(idx_val_canon) != 0

    if verbose:
        print(f'Size of the training set after split: {len(idx_tr_canon)}')
        print(f'Size of the validation set after split: {len(idx_val_canon)}')

    d = dict(enumerate(data_ori))
    data_tr = [d.get(item) for item in idx_tr_canon]
    data_val = [d.get(item) for item in idx_val_canon]
    hp.write_in_file(f'{save_dir}data_tr.txt', data_tr)
    hp.write_in_file(f'{save_dir}data_val.txt', data_val)

    # filename stem for the saved full dataset; defined here because the
    # original fragment references save_name without defining it (assumed value)
    save_name = f'{min_len}_{max_len}_x{augmentation}'

    if augmentation > 0:
        if verbose:
            print(f'Data augmentation {augmentation}-fold start')

        # Augment the training and validation splits separately,
        # in order to avoid having the same molecule represented
        # in both splits
        tr_aug = augment_dataset(data_tr, augmentation, min_len, max_len, verbose=False)
        val_aug = augment_dataset(data_val, augmentation, min_len, max_len, verbose=False)

        # Merge with the original data and shuffle
        full_training_set = list(set(data_tr + tr_aug))
        shuffle(full_training_set)
        full_validation_set = list(set(data_val + val_aug))
        shuffle(full_validation_set)
        full_datalist = full_training_set + full_validation_set

        if verbose:
            print(f'Size of the training set after augmentation: {len(full_training_set)}')
            print(f'Size of the validation set after augmentation: {len(full_validation_set)}')

        # Create the partitions for the data generators
        # with the full augmented dataset
        idx_tr = np.arange(len(full_training_set))
        idx_val = np.arange(len(full_training_set),
                            len(full_training_set) + len(full_validation_set))

        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', full_datalist)
        hp.save_obj(list(idx_tr), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val), f'{save_dir}idx_val')
    else:
        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', data_ori)
        hp.save_obj(list(idx_tr_canon), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val_canon), f'{save_dir}idx_val')
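
# Hedged sketch of what augment_dataset plausibly does (its definition is
# not shown in this file): SMILES enumeration via RDKit's randomized atom
# ordering. The filtering behavior is inferred from the call site above;
# this is an assumption, not the original implementation.
def _augment_dataset_sketch(smiles_list, augmentation, min_len, max_len):
    from rdkit import Chem
    augmented = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        for _ in range(augmentation):
            # doRandom=True yields a random, non-canonical SMILES
            rand_smi = Chem.MolToSmiles(mol, canonical=False, doRandom=True)
            # keep the same length limits as the rest of the pipeline
            if min_len <= len(rand_smi) <= max_len:
                augmented.append(rand_smi)
    return augmented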
        if mol is not None:
            data_rdkit.append(mol)

    save_name = name.split('_')[1] + '_' + name.split('_')[2]

    # descriptors
    desc_names = re.compile(FP.DESCRIPTORS['names'])
    functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
    desc_dict = hp_chem.rdkit_desc(data_rdkit, functions, names)
    hp.save_obj(desc_dict, f'{save_path}desc_{save_name}')

    # scaffolds
    scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
    desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
    hp.save_obj(desc_scaf, f'{save_path}scaf_{save_name}')
    hp.write_in_file(f'{save_path}{save_name}_scaffolds.txt', scaf)
    hp.write_in_file(f'{save_path}{save_name}_generic_scaffolds.txt', generic_scaf)

    # fingerprints
    fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
    fp_dict = {'fingerprint': fingerprint}
    hp.save_obj(fp_dict, f'{save_path}fp_{save_name}')

    end = time.time()
    if verbose:
        print(f'EXTRACTING DESCRIPTORS, SCAFFOLDS AND FINGERPRINTS DONE in {end - start:.04} seconds')

####################################
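
# Hedged sketch of what hp_chem.extract_murcko_scaffolds plausibly wraps
# (the helper module is not shown): RDKit's MurckoScaffold, for both the
# exact scaffold and the generic (all-carbon skeleton) variant. Returning
# two parallel lists of SMILES matches how the result is used above; the
# details are an assumption.
def _extract_murcko_scaffolds_sketch(mols):
    from rdkit import Chem
    from rdkit.Chem.Scaffolds import MurckoScaffold
    scaf, generic_scaf = [], []
    for mol in mols:
        core = MurckoScaffold.GetScaffoldForMol(mol)
        scaf.append(Chem.MolToSmiles(core))
        # MakeScaffoldGeneric replaces all atoms by carbons
        # and all bonds by single bonds
        generic = MurckoScaffold.MakeScaffoldGeneric(core)
        generic_scaf.append(Chem.MolToSmiles(generic))
    return scaf, generic_scaf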
ft_smiles = ft_smiles_tr + ft_smiles_val
ft_fp, ft_smiles = get_fp(ft_smiles)
####################################

####################################
# path to the saved de novo generated molecules
path_novo = f'results/{name_data}/novo_molecules/'
# path to save the UMAP plots
save_path = f'results/{name_data}/umap/'
os.makedirs(save_path, exist_ok=True)
####################################

####################################
# save SMILES used for the interactive UMAP
hp.write_in_file(f'{save_path}smiles_src.txt', src_smiles)
hp.write_in_file(f'{save_path}smiles_tgt.txt', tgt_smiles)
hp.write_in_file(f'{save_path}smiles_ft.txt', ft_smiles)
####################################

####################################
# iterate over the generated data
path_epoch_start = f'../models/molecules_start_{temp}.txt'
e_start_smiles = get_n_random(path_epoch_start, n_gen)
e_start_fp, e_start_smiles = get_fp(e_start_smiles)
hp.write_in_file(f'{save_path}smiles_start_{temp}.txt', e_start_smiles)

path_epoch_end = f'{path_novo}molecules_{e_end}_{temp}.txt'
e_end_smiles = get_n_random(path_epoch_end, n_gen)
e_end_fp, e_end_smiles = get_fp(e_end_smiles)
hp.write_in_file(f'{save_path}smiles_end_{temp}.txt', e_end_smiles)
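
# Hedged sketches of the two helpers used above (their definitions are not
# shown). get_fp plausibly computes Morgan fingerprints and drops unparsable
# SMILES; get_n_random plausibly samples n random lines from a SMILES file.
# Signatures are inferred from the call sites; the bodies are assumptions,
# not the original implementations.
def _get_fp_sketch(smiles_list):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    fps, kept = [], []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        # radius-2 Morgan fingerprint, 2048 bits (common defaults)
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))
        kept.append(smi)
    return fps, kept

def _get_n_random_sketch(path, n):
    import random
    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    # sample without replacement, capped at the file size
    return random.sample(lines, min(n, len(lines)))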