Exemplo n.º 1
0
def create_testing_dataset_files(name_to_prepend, dataset,
                                 reactants_to_reactant_id_dict):
    """Split *dataset* into reachable/unreachable reactions and write them out.

    A reaction is "reachable" when every one of its reactants appears in
    ``reactants_to_reactant_id_dict``. Reachable bags are written as
    comma-separated reactant ids (file E) with their products (file F);
    unreachable reactants/products are written as '.'-joined SMILES
    (files G and H).

    :param name_to_prepend: prefix for the four output file names.
    :param dataset: iterable of (reactant_smi_frozen_set, product_smi_frozen_set).
    :param reactants_to_reactant_id_dict: maps reactant SMILES -> integer id.
    """
    print(f"Going through dataset {name_to_prepend}")

    reactants_interested_in_set = set(reactants_to_reactant_id_dict.keys())

    reactant_bags = []
    corresponding_products = []
    unreachable_reactants = []
    unreachable_products = []

    for reaction_smi_frozen_set, product_smi_frozen_set in tqdm.tqdm(
            dataset, desc=f"Going through {name_to_prepend}"):
        if reaction_smi_frozen_set.issubset(reactants_interested_in_set):
            reactant_bags.append(','.join(
                str(reactants_to_reactant_id_dict[react])
                for react in reaction_smi_frozen_set))
            corresponding_products.append('.'.join(
                sorted(product_smi_frozen_set)))
        else:
            unreachable_reactants.append('.'.join(
                sorted(reaction_smi_frozen_set)))
            unreachable_products.append('.'.join(
                sorted(product_smi_frozen_set)))

    # The list lengths play the role of the previous num_reachable /
    # num_unreachable counters.
    print(
        f"For dataset {name_to_prepend} have found {len(reactant_bags)} and {len(unreachable_reactants)}"
    )

    def _write_lines(file_name, lines):
        # All four outputs share the same newline-joined-text format.
        with open(
                path.join(mchef_config.get_processed_data_dir(), file_name),
                'w') as fo:
            fo.write('\n'.join(lines))

    # Create files E, F, G and H.
    _write_lines(f'{name_to_prepend}_react_bags.txt', reactant_bags)
    _write_lines(f'{name_to_prepend}_products.txt', corresponding_products)
    _write_lines(f'{name_to_prepend}_unreachable_reactants.txt',
                 unreachable_reactants)
    _write_lines(f'{name_to_prepend}_unreachable_products.txt',
                 unreachable_products)
    def __init__(self):
        """Hyper-parameters and data paths for a training run."""
        # Name of this run, taken from the environment.
        self.run_name = str(os.getenv("MCHEF_NAME"))
        print(f"Run name is {self.run_name}")

        data_dir = mchef_config.get_processed_data_dir()

        # Pre-computed reactant features pickle.
        self.path_mol_details = path.join(data_dir, 'reactants_feats.pick')

        # Train/validation reactant-bag and product files.
        self.path_react_bags_train = path.join(data_dir, 'train_react_bags.txt')
        self.path_react_bags_val = path.join(data_dir, 'valid_react_bags.txt')
        self.path_products_train = path.join(data_dir, 'train_products.txt')
        self.path_products_val = path.join(data_dir, 'valid_products.txt')

        # Optimisation settings.
        self.num_epochs = 100
        self.batch_size = 25
        self.learning_rate = 0.001
        self.lr_reduction_interval = 40
        self.lr_reduction_factor = 0.1

        # Device placement.
        self.cuda_details = gnn_utils.CudaDetails(
            use_cuda=torch.cuda.is_available(), gpu_id=0)

        # Model/objective hyper-parameters.
        self.lambda_value = 10.  # see WAE paper, section 4
        self.property_pred_factor = 50.
        self.latent_dim = 25
Exemplo n.º 3
0
def plot_reachable(params: Params, return_result_or_product):
    """Produce the KDE plot for the reachable part of the test set.

    Reads the ground-truth products/reactants and the retrosynthesized
    reactant suggestions, filters out any (reactants, product) pair already
    present in the training set, and hands the remainder to
    ``produce_the_kde_plot``.

    :param params: Params object holding the relevant file paths and the
        reactant SMILES -> id map.
    :param return_result_or_product: callable mapping a reactant SMILES string
        to a suggested product.
    """
    print("Doing reachable")
    grnd_truth_products = _read_in_smiles_file(params.path_reachable_products_ground_truth)
    grnd_truth_reactants = _react_bags_to_smi_list(params.path_react_bags_test, params.reactant_smi_to_id)
    suggested_reactants = _read_in_smiles_file(params.path_reachable_reactants_restrosynthezed)
    suggested_products = [return_result_or_product(reactants) for reactants in suggested_reactants]
    bundle_reachable = _zip_together_cycle(grnd_truth_products, grnd_truth_reactants, suggested_reactants, suggested_products)

    # We also read in the training set, so to exclude those from this set.
    processed_data_dir = mchef_config.get_processed_data_dir()
    train_reactants = _react_bags_to_smi_list(path.join(processed_data_dir, 'train_react_bags.txt'), params.reactant_smi_to_id)
    train_products = _read_in_smiles_file(path.join(processed_data_dir, 'train_products.txt'))
    assert len(train_reactants) == len(train_products), "train reactant/product files out of sync"

    # Canonicalised (reactants, product) pairs of the training set, built once
    # as a set so the membership test in should_filter below is O(1).
    train_reactants_products = {
        (rdkit_general_ops.form_canonical_smi_frozenmultiset(react),
         rdkit_general_ops.form_canonical_smi_frozenmultiset(prod))
        for react, prod in tqdm.tqdm(zip(train_reactants, train_products),
                                     total=len(train_reactants),
                                     desc="putting train set into a set")
    }

    def should_filter(elem):
        # True when this ground-truth pair was already seen during training.
        reactants_set = rdkit_general_ops.form_canonical_smi_frozenmultiset(elem['ground_truth_reactant'])
        products_set = rdkit_general_ops.form_canonical_smi_frozenmultiset(elem['ground_truth_product'])
        return (reactants_set, products_set) in train_reactants_products

    bundle_reachable = [elem for elem in tqdm.tqdm(bundle_reachable) if not should_filter(elem)]

    produce_the_kde_plot(bundle_reachable, '#56dcd6', 'reachable_qed')
    print(bundle_reachable[:5])
    print("\n\n")
Exemplo n.º 4
0
    def __init__(self):
        """Training, molecule and data-path settings collected in one place."""
        # Training details
        self.batch_size = 50
        self.num_epochs = 30
        self.log_interval = 5
        self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

        # Molecule details
        self.gnn_hidden_size: int = 101  # our molecule features have this dimensionality.
        self.edge_names = ['single', 'double', 'triple']
        self.gnn_time_steps = 4
        self.gnn_embedding_dim = 50

        #  Data paths
        data_dir = mchef_config.get_processed_data_dir()
        self.path_mol_details = path.join(data_dir, 'reactants_feats.pick')
        self.path_react_bags_train = path.join(data_dir, 'train_react_bags.txt')
        self.path_react_bags_val = path.join(data_dir, 'valid_react_bags.txt')
        self.path_products_train = path.join(data_dir, 'train_products.txt')
        self.path_products_val = path.join(data_dir, 'valid_products.txt')

        # Command line arguments.
        cli_args = docopt(__doc__)
        self.weights_to_use = cli_args['<input_weights>']
Exemplo n.º 5
0
def create_shared_dataset_files(reactants_to_reactant_id_dict):
    """Write the dataset files shared between splits.

    Creates file A ('reactants_to_reactant_id.json', the SMILES -> id vocab)
    and file B ('reactants_feats.pick', a pickle mapping reactant id to the
    atom features and adjacency list of that reactant's molecule).

    :param reactants_to_reactant_id_dict: maps reactant SMILES -> integer id.
    """
    print("creating shared files")
    # Create file A
    with open(
            path.join(mchef_config.get_processed_data_dir(),
                      'reactants_to_reactant_id.json'), 'w') as fo:
        json.dump(reactants_to_reactant_id_dict, fo)

    # Create file B
    print(f"Creating reactant smi to reactant_id map.")
    reactant_feats = {}
    # Named `reactant_id` rather than `id` so as not to shadow the builtin.
    for smiles, reactant_id in tqdm.tqdm(reactants_to_reactant_id_dict.items()):
        mol = rdkit_general_ops.get_molecule(smiles, kekulize=True)
        mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
        reactant_feats[reactant_id] = \
            rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(
                mol, am_to_indx_map)
    with open(
            path.join(mchef_config.get_processed_data_dir(),
                      'reactants_feats.pick'), 'wb') as fo:
        pickle.dump(reactant_feats, fo)
Exemplo n.º 6
0
    def __init__(self):
        """Settings for sampling reactant bags from a trained model."""
        # Sampling budget: total count and per-batch size.
        self.num_to_generate = 20000
        self.batch_size = 2000

        # Pre-computed reactant features pickle.
        self.path_mol_details = path.join(
            mchef_config.get_processed_data_dir(), 'reactants_feats.pick')

        self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

        # Command line arguments.
        cli_args = docopt(__doc__)
        self.weights_to_use = cli_args['<input_weights>']
        self.location_for_tokenized_reactants = cli_args['<output_name>']
Exemplo n.º 7
0
def create_training_dataset_files_and_reactant_vocab(
        uspto_train_dataset, num_times_reactant_should_occur: int):
    """Build the reactant vocabulary from the train split and write its files.

    Bundles the training reactions, keeps the reactant sets occurring at least
    ``num_times_reactant_should_occur`` times, writes the shared files (via
    ``create_shared_dataset_files``) and then the training reactant-bag and
    product files (files C and D).

    :param uspto_train_dataset: iterable of
        (reaction_smi_frozen_set, product_smi_frozen_set) pairs.
    :param num_times_reactant_should_occur: popularity threshold for the vocab.
    :return: dict mapping reactant SMILES -> integer id.
    """
    reactant_bundler = ReactantBundler()
    for reaction_smi_frozen_set, product_smi_frozen_set in tqdm.tqdm(
            uspto_train_dataset, desc="Adding reactions to bundler"):
        reactant_bundler.add_reactant(reaction_smi_frozen_set,
                                      product_smi_frozen_set)

    (reactant_bags, product_bags, reactant_vocab,
     _) = reactant_bundler.get_most_popular_reactant_sets_and_equiv_products(
         num_times_reactant_should_occur)

    print(f"Creating reactant smi to reactant_id map.")
    # Ids are simply each reactant's position in the vocabulary ordering.
    reactants_to_reactant_id_dict = {
        react: i for i, react in enumerate(reactant_vocab)}
    create_shared_dataset_files(reactants_to_reactant_id_dict)

    print("create training files.")
    # Create file C: one comma-separated bag of reactant ids per line.
    lines = [
        ','.join(str(reactants_to_reactant_id_dict[react]) for react in r_bag)
        for r_bag in reactant_bags
    ]
    with open(
            path.join(mchef_config.get_processed_data_dir(),
                      'train_react_bags.txt'), 'w') as fo:
        fo.write('\n'.join(lines))

    # Create file D: the matching '.'-joined product SMILES per line.
    product_lines = ['.'.join(sorted(p_bag)) for p_bag in product_bags]
    with open(
            path.join(mchef_config.get_processed_data_dir(),
                      'train_products.txt'), 'w') as fo:
        fo.write('\n'.join(product_lines))

    return reactants_to_reactant_id_dict
Exemplo n.º 8
0
    def __init__(self):
        """Settings for the molecule-optimisation experiment."""
        # Optimisation budget and step size.
        self.num_molecules_to_optimize: int = 250
        self.num_distinct_molecule_steps: int = 10
        self.epsilon: float = 0.5

        self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

        # Data paths.
        data_dir = mchef_config.get_processed_data_dir()
        self.path_mol_details = path.join(data_dir, 'reactants_feats.pick')
        self.path_react_bags_train = path.join(data_dir, 'train_react_bags.txt')

        # Command line arguments.
        cli_args = docopt(__doc__)
        self.weights_to_use = cli_args['<input_weights>']
Exemplo n.º 9
0
    def __init__(self):
        """Settings for evaluating weights on the test product files."""
        self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

        # GNN details
        self.gnn_args = dict(output_dim=25,
                             hidden_layer_size=101,
                             edge_names=['single', 'double', 'triple'],
                             embedding_dim=50,
                             T=4)

        # Data Paths
        data_dir = mchef_config.get_processed_data_dir()
        self.path_mol_details = path.join(data_dir, 'reactants_feats.pick')

        # (label, path) pairs of product files to evaluate on.
        self.product_files_to_try = [
            ('test_reachable', path.join(data_dir, 'test_products.txt')),
            ('test_unreachable',
             path.join(data_dir, 'test_unreachable_products.txt')),
        ]

        # Command line arguments.
        cli_args = docopt(__doc__)
        self.weights_to_use_mchef = cli_args['<input_weights_mchef>']
        self.weights_to_use_regressor = cli_args['<input_weights_regressor>']
Exemplo n.º 10
0
    def __init__(self):
        """Paths and CLI arguments for evaluating retrosynthesized reactants."""
        data_dir = mchef_config.get_processed_data_dir()
        self.reactant_smi_to_id = mchef_config.get_reactant_smi_to_reactant_id_dict()

        # Reachable part of the test set.
        # NOTE(review): attribute names keep the existing "restrosynthezed"
        # spelling because other code refers to them by that name.
        self.path_reachable_products_ground_truth = path.join(data_dir, "test_products.txt")
        self.path_react_bags_test = path.join(data_dir, "test_react_bags.txt")
        self.path_reachable_reactants_restrosynthezed = "./op/test_reachable_retrosynthesized_reactants.txt"

        # Unreachable part of the test set.
        self.path_unreachable_products_ground_truth = path.join(data_dir, "test_unreachable_products.txt")
        self.path_unreachable_reactants = path.join(data_dir, "test_unreachable_reactants.txt")
        self.path_unreachable_reactants_restrosynthezed = "./op/test_unreachable_retrosynthesized_reactants.txt"

        # Command line arguments.
        cli_args = docopt(__doc__)
        self.tokenized_reactants = cli_args['<tokenized_reactants_path>']
        self.tokenized_products = cli_args['<tokenized_products_path>']
        self.nbest_for_tokenized = int(cli_args['--nbest'])
Exemplo n.º 11
0
    def __init__(self):
        """Read the experiment config path and locate the training data."""
        # Config to read:
        cli_args = docopt(__doc__)
        self.experiments_config = cli_args['--config']

        data_dir = mchef_config.get_processed_data_dir()

        # Reactants file (SMILES -> id vocabulary json).
        self.training_reactants_path = path.join(
            data_dir, 'reactants_to_reactant_id.json')

        # Training Products file
        self.training_products_path = path.join(data_dir, 'train_products.txt')

        # Get training data smiles strings (see _get_training_data).
        self.training_data_smi_list = self._get_training_data()