def do_data_analysis(data_rdkit, descriptor_name, save_dir, verbose=False):
    """
    Function to analize a dataset. Will compute: descritpor as specify in
    descriptors_name, Morgan fingerprint, Murcko and generic scaffolds.
    
    Parameters:
    - data_rdkit: list of RDKit mol.
    - descriptor_name (string): contain name of descriptor to compute.
    - save_dir (string): Path to save the output of the analysis.
    """

    # Compute the descriptors with RDKit, as specified by
    # descriptor_name (the caller passes FP.DESCRIPTORS['names'],
    # i.e. the names from the fixed parameter file)
    desc_names = re.compile(descriptor_name)
    functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
    descriptors = hp_chem.rdkit_desc(data_rdkit, functions, names)
    hp.save_obj(descriptors, f'{save_dir}desc')

    # Compute fingerprints
    fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
    fp_dict = {'fingerprint': fingerprint}
    hp.save_obj(fp_dict, f'{save_dir}fp')

    # Extract Murcko and generic scaffolds
    scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
    desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
    hp.save_obj(desc_scaf, f'{save_dir}scaf')
    hp.write_in_file(f'{save_dir}generic_scaffolds.txt', generic_scaf)
    hp.write_in_file(f'{save_dir}scaffolds.txt', scaf)
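
# For illustration only: a minimal sketch of what helpers like
# hp_chem.get_rdkit_desc_functions and hp_chem.rdkit_desc could do,
# assuming RDKit's Descriptors.descList as the descriptor registry.
# The actual hp_chem implementation may differ.
def sketch_rdkit_desc(data_rdkit, desc_pattern):
    from rdkit.Chem import Descriptors

    # keep the (name, function) pairs whose name matches the compiled
    # pattern, e.g. re.compile('MolWt|MolLogP')
    selected = [(name, fn) for name, fn in Descriptors.descList
                if desc_pattern.search(name)]
    # compute every selected descriptor for every molecule
    return {name: [fn(mol) for mol in data_rdkit]
            for name, fn in selected}
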
def do_processing(split,
                  data_path,
                  augmentation,
                  min_len,
                  max_len,
                  save_dir,
                  verbose=True):
    """
    Function to process a dataset.
    
    Parameters:
    - split (float): value used to split the dataset between
    the training set and the validation set. E.g., if split is 0.8,
    80% of the data will go in the training set, and 20% in the 
    validation set.
    - data_path (string): path to the dataset.
    - augmentation (int): value to augment the dataset. E.g., if augmentation
    is 10, the SMILES enumeration will be done to add 10 different 
    SMILES encoding for each SMILES (i.e. resulting in a total of 11 representations)
    for a given SMILES in the dataset.
    - min_len (int): minimum length of SMILES to be kept in the dataset.
    - max_len (int): maximum length of SMILES to be kept in the dataset.
    - save_dir (string): directory to save the processed dataset.
    """

    # load the data with right SMILES limits,
    # both in a list and in rdkit mol format
    data_ori, data_rdkit = load_data(data_path,
                                     min_len,
                                     max_len,
                                     verbose=verbose)

    # we save the data without augmentation if it was
    # not already saved; we will need it to check the
    # novelty of the generated SMILES
    if not os.path.isfile(f'{save_dir}pruned.txt'):
        hp.write_in_file(f'{save_dir}pruned.txt', data_ori)

    if verbose: print('Start data analysis')
    do_data_analysis(data_rdkit, FP.DESCRIPTORS['names'], save_dir)

    # draw top scaffolds
    if verbose: print('Start drawing scaffolds')
    top_common = 20
    draw_scaffolds(top_common, save_dir)

    if verbose: print('Start data processing')
    # define index for the tr-val split
    # and shuffle them
    all_idx = np.arange(len(data_ori))
    idx_split = int(split * len(all_idx))
    np.random.shuffle(all_idx)

    # we need to be careful about the case where
    # idx_split = 0, which happens when there is only
    # one SMILES in the data, e.g. for fine-tuning
    if idx_split == 0:
        # in this case, we use the single SMILES both
        # for training and for validation
        idx_tr_canon = [0]
        idx_val_canon = [0]
    else:
        idx_tr_canon = all_idx[:idx_split]
        idx_val_canon = all_idx[idx_split:]

    assert len(idx_tr_canon) != 0
    assert len(idx_val_canon) != 0

    if verbose:
        print(f'Size of the training set after split: {len(idx_tr_canon)}')
        print(f'Size of the validation set after split: {len(idx_val_canon)}')

    data_tr = [data_ori[i] for i in idx_tr_canon]
    data_val = [data_ori[i] for i in idx_val_canon]
    hp.write_in_file(f'{save_dir}data_tr.txt', data_tr)
    hp.write_in_file(f'{save_dir}data_val.txt', data_val)

    if augmentation > 0:
        if verbose:
            print(f'Data augmentation {augmentation}-fold start')

        # Augment the training and validation splits separately;
        # doing these steps independently avoids having the same
        # molecule represented in both splits
        tr_aug = augment_dataset(data_tr,
                                 augmentation,
                                 min_len,
                                 max_len,
                                 verbose=False)
        val_aug = augment_dataset(data_val,
                                  augmentation,
                                  min_len,
                                  max_len,
                                  verbose=False)

        # Merge with the original data and shuffle
        full_training_set = list(set(data_tr + tr_aug))
        shuffle(full_training_set)
        full_validation_set = list(set(data_val + val_aug))
        shuffle(full_validation_set)
        full_datalist = full_training_set + full_validation_set

        if verbose:
            print(
                f'Size of the training set after augmentation: {len(full_training_set)}'
            )
            print(
                f'Size of the validation set after augmentation: {len(full_validation_set)}'
            )

        # Create the partitions for the data generators
        # with the full augmented dataset
        idx_tr = np.arange(len(full_training_set))
        idx_val = np.arange(len(full_training_set),
                            len(full_training_set) + len(full_validation_set))

        # Save (save_name is expected to be defined at module level)
        hp.write_in_file(f'{save_dir}{save_name}.txt', full_datalist)
        hp.save_obj(list(idx_tr), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val), f'{save_dir}idx_val')
    else:
        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', data_ori)
        hp.save_obj(list(idx_tr_canon), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val_canon), f'{save_dir}idx_val')
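
# For illustration only: a minimal sketch of SMILES enumeration as
# performed by augment_dataset above, assuming RDKit's randomized
# SMILES writer (doRandom=True). The actual augment_dataset helper may
# differ, e.g. in how duplicates or invalid strings are handled.
def sketch_augment(smiles_list, augmentation, min_len, max_len):
    from rdkit import Chem

    augmented = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        for _ in range(augmentation):
            # a random, non-canonical SMILES encoding the same molecule
            rand = Chem.MolToSmiles(mol, canonical=False, doRandom=True)
            if min_len <= len(rand) <= max_len:
                augmented.append(rand)
    return augmented
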
                if mol is not None:
                    data_rdkit.append(mol)

            save_name = name.split('_')[1] + '_' + name.split('_')[2]

            # descriptors
            desc_names = re.compile(FP.DESCRIPTORS['names'])
            functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
            desc_dict = hp_chem.rdkit_desc(data_rdkit, functions, names)
            hp.save_obj(desc_dict, f'{save_path}desc_{save_name}')

            # scaffolds
            scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
            desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
            hp.save_obj(desc_scaf, f'{save_path}scaf_{save_name}')
            hp.write_in_file(f'{save_path}{save_name}_scaffolds.txt', scaf)
            hp.write_in_file(f'{save_path}{save_name}_generic_scaffolds.txt',
                             generic_scaf)

            # fingerprints
            fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
            fp_dict = {'fingerprint': fingerprint}
            hp.save_obj(fp_dict, f'{save_path}fp_{save_name}')

    end = time.time()
    if verbose:
        print(
            f'EXTRACTING DESCRIPTORS, SCAFFOLDS AND FINGERPRINTS DONE in {end - start:.4f} seconds'
        )
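
# For illustration only: minimal sketches of what fingerprint_calc and
# extract_murcko_scaffolds compute, assuming standard RDKit calls
# (Morgan fingerprints, Bemis-Murcko scaffolds). The hp_chem helpers
# may use a different radius, bit size, or error handling.
def sketch_fp_and_scaffolds(data_rdkit):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit.Chem.Scaffolds import MurckoScaffold

    fingerprints, scaffolds, generic_scaffolds = [], [], []
    for mol in data_rdkit:
        # 2048-bit Morgan fingerprint with radius 2 (ECFP4-like)
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))
        # Bemis-Murcko scaffold and its generic (all-carbon) form
        scaf = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds.append(Chem.MolToSmiles(scaf))
        generic_scaffolds.append(
            Chem.MolToSmiles(MurckoScaffold.MakeScaffoldGeneric(scaf)))
    return fingerprints, scaffolds, generic_scaffolds
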
    ####################################
    ft_smiles = ft_smiles_tr + ft_smiles_val
    ft_fp, ft_smiles = get_fp(ft_smiles)
    ####################################

    ####################################
    # path to the saved de novo molecules
    path_novo = f'results/{name_data}/novo_molecules/'

    # Path to save the UMAP plots
    save_path = f'results/{name_data}/umap/'
    os.makedirs(save_path, exist_ok=True)
    ####################################

    ####################################
    # save SMILES used for the interactive UMAP
    hp.write_in_file(f'{save_path}smiles_src.txt', src_smiles)
    hp.write_in_file(f'{save_path}smiles_tgt.txt', tgt_smiles)
    hp.write_in_file(f'{save_path}smiles_ft.txt', ft_smiles)
    ####################################

    ####################################
    # process the generated data at the start and end epochs
    path_epoch_start = f'../models/molecules_start_{temp}.txt'
    e_start_smiles = get_n_random(path_epoch_start, n_gen)
    e_start_fp, e_start_smiles = get_fp(e_start_smiles)
    hp.write_in_file(f'{save_path}smiles_start_{temp}.txt', e_start_smiles)

    path_epoch_end = f'{path_novo}molecules_{e_end}_{temp}.txt'
    e_end_smiles = get_n_random(path_epoch_end, n_gen)
    e_end_fp, e_end_smiles = get_fp(e_end_smiles)
    hp.write_in_file(f'{save_path}smiles_end_{temp}.txt', e_end_smiles)
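
# For illustration only: minimal sketches of the get_n_random and
# get_fp helpers used above, assuming get_n_random draws n random
# SMILES lines from a file and get_fp returns Morgan fingerprints
# together with the SMILES that RDKit could parse. The actual helpers
# in this script may differ.
def sketch_get_n_random(path, n):
    import random

    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    # sample without replacement, capping n at the file size
    return random.sample(lines, min(n, len(lines)))


def sketch_get_fp(smiles_list):
    from rdkit import Chem
    from rdkit.Chem import AllChem

    fps, kept = [], []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue  # drop SMILES that RDKit cannot parse
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))
        kept.append(smi)
    return fps, kept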