def create_testing_dataset_files(name_to_prepend, dataset, reactants_to_reactant_id_dict):
    """Split a dataset into reactions whose reactants all lie in the reactant vocabulary
    (reachable) and those that do not (unreachable), and write both out as text files."""
    print(f"Going through dataset {name_to_prepend}")
    reactants_interested_in_set = set(reactants_to_reactant_id_dict.keys())

    reactant_bags = []
    corresponding_products = []
    unreachable_reactants = []
    unreachable_products = []
    num_reachable = 0
    num_unreachable = 0

    for reaction_smi_frozen_set, product_smi_frozen_set in tqdm.tqdm(
            dataset, desc=f"Going through {name_to_prepend}"):
        if reaction_smi_frozen_set.issubset(reactants_interested_in_set):
            reactant_bags.append(','.join([
                str(reactants_to_reactant_id_dict[react])
                for react in reaction_smi_frozen_set
            ]))
            corresponding_products.append('.'.join(
                sorted(list(product_smi_frozen_set))))
            num_reachable += 1
        else:
            unreachable_reactants.append('.'.join(
                sorted(list(reaction_smi_frozen_set))))
            unreachable_products.append('.'.join(
                sorted(list(product_smi_frozen_set))))
            num_unreachable += 1

    print(f"For dataset {name_to_prepend} found {num_reachable} reachable "
          f"and {num_unreachable} unreachable reactions")

    # Create file E
    with open(path.join(mchef_config.get_processed_data_dir(),
                        f'{name_to_prepend}_react_bags.txt'), 'w') as fo:
        fo.write('\n'.join(reactant_bags))

    # Create file F
    with open(path.join(mchef_config.get_processed_data_dir(),
                        f"{name_to_prepend}_products.txt"), 'w') as fo:
        fo.write('\n'.join(corresponding_products))

    # Create file G
    with open(path.join(mchef_config.get_processed_data_dir(),
                        f"{name_to_prepend}_unreachable_reactants.txt"), 'w') as fo:
        fo.write('\n'.join(unreachable_reactants))

    # Create file H
    with open(path.join(mchef_config.get_processed_data_dir(),
                        f"{name_to_prepend}_unreachable_products.txt"), 'w') as fo:
        fo.write('\n'.join(unreachable_products))
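# Hedged usage sketch (not part of the original code): shows how the reactant-bag and
# product files written above could be read back. The file names and mchef_config helper
# follow the function above; the function name, default split name, and parsing details
# are illustrative assumptions.
def _example_read_back_testing_files(name_to_prepend='test'):
    data_dir = mchef_config.get_processed_data_dir()
    with open(path.join(data_dir, f'{name_to_prepend}_react_bags.txt')) as fi:
        # Each line is a comma-separated list of integer reactant ids.
        react_bags = [[int(tok) for tok in line.split(',')] for line in fi.read().splitlines()]
    with open(path.join(data_dir, f'{name_to_prepend}_products.txt')) as fi:
        # Each matching line is a dot-joined, sorted multiset of product SMILES.
        product_bags = [line.split('.') for line in fi.read().splitlines()]
    assert len(react_bags) == len(product_bags)
    return react_bags, product_bags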
def __init__(self):
    self.run_name = str(os.getenv("MCHEF_NAME"))
    print(f"Run name is {self.run_name}")

    # Data paths
    processed_data_dir = mchef_config.get_processed_data_dir()
    self.path_mol_details = path.join(processed_data_dir, 'reactants_feats.pick')
    self.path_react_bags_train = path.join(processed_data_dir, 'train_react_bags.txt')
    self.path_react_bags_val = path.join(processed_data_dir, 'valid_react_bags.txt')
    self.path_products_train = path.join(processed_data_dir, 'train_products.txt')
    self.path_products_val = path.join(processed_data_dir, 'valid_products.txt')

    # Training details
    self.num_epochs = 100
    self.batch_size = 25
    self.learning_rate = 0.001
    self.lr_reduction_interval = 40
    self.lr_reduction_factor = 0.1
    self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available(),
                                              gpu_id=0)

    # Model and loss details
    self.lambda_value = 10.  # see WAE paper, section 4
    self.property_pred_factor = 50.
    self.latent_dim = 25
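# Hedged sketch (not in the original code): one plausible way the learning-rate fields above
# could be wired into a PyTorch scheduler. The choice of Adam and the model/params arguments
# are illustrative assumptions, not the project's actual training loop.
def _example_lr_schedule(model, params):
    # Cut the learning rate by lr_reduction_factor every lr_reduction_interval epochs.
    optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=params.lr_reduction_interval, gamma=params.lr_reduction_factor)
    return optimizer, scheduler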
def plot_reachable(params: Params, return_result_or_product):
    print("Doing reachable")
    grnd_truth_products = _read_in_smiles_file(params.path_reachable_products_ground_truth)
    grnd_truth_reactants = _react_bags_to_smi_list(params.path_react_bags_test,
                                                   params.reactant_smi_to_id)
    suggested_reactants = _read_in_smiles_file(params.path_reachable_reactants_restrosynthezed)
    suggested_products = [return_result_or_product(reactants) for reactants in suggested_reactants]
    bundle_reachable = _zip_together_cycle(grnd_truth_products, grnd_truth_reactants,
                                           suggested_reactants, suggested_products)

    # We also read in the training set so that we can exclude its reactions from this set.
    processed_data_dir = mchef_config.get_processed_data_dir()
    train_reactants = _react_bags_to_smi_list(path.join(processed_data_dir, 'train_react_bags.txt'),
                                              params.reactant_smi_to_id)
    train_products = _read_in_smiles_file(path.join(processed_data_dir, 'train_products.txt'))
    assert len(train_reactants) == len(train_products)
    train_reactants_products = set([
        (rdkit_general_ops.form_canonical_smi_frozenmultiset(react),
         rdkit_general_ops.form_canonical_smi_frozenmultiset(prod))
        for react, prod in tqdm.tqdm(zip(train_reactants, train_products),
                                     total=len(train_reactants),
                                     desc="putting train set into a set")
    ])

    def should_filter(elem):
        # Filter out any test reaction whose (reactants, product) pair also appears in training.
        reactants_set = rdkit_general_ops.form_canonical_smi_frozenmultiset(elem['ground_truth_reactant'])
        products_set = rdkit_general_ops.form_canonical_smi_frozenmultiset(elem['ground_truth_product'])
        return (reactants_set, products_set) in train_reactants_products

    bundle_reachable = [elem for elem in tqdm.tqdm(bundle_reachable) if not should_filter(elem)]

    produce_the_kde_plot(bundle_reachable, '#56dcd6', 'reachable_qed')
    print(bundle_reachable[:5])
    print("\n\n")
def __init__(self):
    # Training details
    self.batch_size = 50
    self.num_epochs = 30
    self.log_interval = 5
    self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

    # Molecule details
    self.gnn_hidden_size: int = 101  # our molecule features have this dimensionality.
    self.edge_names = ['single', 'double', 'triple']
    self.gnn_time_steps = 4
    self.gnn_embedding_dim = 50

    # Data paths
    processed_data_dir = mchef_config.get_processed_data_dir()
    self.path_mol_details = path.join(processed_data_dir, 'reactants_feats.pick')
    self.path_react_bags_train = path.join(processed_data_dir, 'train_react_bags.txt')
    self.path_react_bags_val = path.join(processed_data_dir, 'valid_react_bags.txt')
    self.path_products_train = path.join(processed_data_dir, 'train_products.txt')
    self.path_products_val = path.join(processed_data_dir, 'valid_products.txt')

    # Command line arguments.
    arguments = docopt(__doc__)
    self.weights_to_use = arguments['<input_weights>']
def create_shared_dataset_files(reactants_to_reactant_id_dict):
    print("Creating shared files")

    # Create file A
    with open(path.join(mchef_config.get_processed_data_dir(),
                        'reactants_to_reactant_id.json'), 'w') as fo:
        json.dump(reactants_to_reactant_id_dict, fo)

    # Create file B
    print("Creating reactant features for each reactant id.")
    reactant_feats = {}
    for smiles, reactant_id in tqdm.tqdm(reactants_to_reactant_id_dict.items()):
        mol = rdkit_general_ops.get_molecule(smiles, kekulize=True)
        mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
        reactant_feats[reactant_id] = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(
            mol, am_to_indx_map)
    with open(path.join(mchef_config.get_processed_data_dir(),
                        'reactants_feats.pick'), 'wb') as fo:
        pickle.dump(reactant_feats, fo)
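# Hedged sketch (not in the original code): how the two shared files written above can be
# reloaded elsewhere. The json/pickle calls mirror the dumps above; the function name and
# return value are illustrative assumptions.
def _example_load_shared_dataset_files():
    data_dir = mchef_config.get_processed_data_dir()
    with open(path.join(data_dir, 'reactants_to_reactant_id.json')) as fi:
        reactants_to_reactant_id_dict = json.load(fi)  # reactant SMILES -> integer id
    with open(path.join(data_dir, 'reactants_feats.pick'), 'rb') as fi:
        reactant_feats = pickle.load(fi)  # integer id -> atom features and adjacency list
    return reactants_to_reactant_id_dict, reactant_feats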
def __init__(self):
    self.num_to_generate = 20000
    self.batch_size = 2000

    processed_data_dir = mchef_config.get_processed_data_dir()
    self.path_mol_details = path.join(processed_data_dir, 'reactants_feats.pick')

    self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

    arguments = docopt(__doc__)
    self.weights_to_use = arguments['<input_weights>']
    self.location_for_tokenized_reactants = arguments['<output_name>']
def create_training_dataset_files_and_reactant_vocab(
        uspto_train_dataset, num_times_reactant_should_occur: int):
    reactant_bundler = ReactantBundler()
    for reaction_smi_frozen_set, product_smi_frozen_set in tqdm.tqdm(
            uspto_train_dataset, desc="Adding reactions to bundler"):
        reactant_bundler.add_reactant(reaction_smi_frozen_set, product_smi_frozen_set)
    (reactant_bags, product_bags, reactant_vocab,
     _) = reactant_bundler.get_most_popular_reactant_sets_and_equiv_products(
         num_times_reactant_should_occur)

    print("Creating reactant smi to reactant_id map.")
    reactants_to_reactant_id_dict = dict(
        zip(reactant_vocab, range(len(reactant_vocab))))

    create_shared_dataset_files(reactants_to_reactant_id_dict)

    print("Creating training files.")
    # Create file C
    lines = []
    for r_bag in reactant_bags:
        line_str = ','.join(
            [str(reactants_to_reactant_id_dict[react]) for react in r_bag])
        lines.append(line_str)
    with open(path.join(mchef_config.get_processed_data_dir(),
                        'train_react_bags.txt'), 'w') as fo:
        fo.write('\n'.join(lines))

    # Create file D
    product_lines = ['.'.join(sorted(list(p_bag))) for p_bag in product_bags]
    with open(path.join(mchef_config.get_processed_data_dir(),
                        'train_products.txt'), 'w') as fo:
        fo.write('\n'.join(product_lines))

    return reactants_to_reactant_id_dict
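# Hedged sketch (not in the original code) of how the two dataset-creation functions above
# fit together: the training split defines the reactant vocabulary and files A-D, then the
# held-out splits are written as files E-H. The dataset variables and the occurrence
# threshold of 15 are illustrative assumptions; each dataset is assumed to be an iterable of
# (reactant SMILES frozenset, product SMILES frozenset) pairs.
def _example_create_all_dataset_files(uspto_train_dataset, uspto_valid_dataset, uspto_test_dataset):
    reactants_to_reactant_id_dict = create_training_dataset_files_and_reactant_vocab(
        uspto_train_dataset, num_times_reactant_should_occur=15)
    create_testing_dataset_files('valid', uspto_valid_dataset, reactants_to_reactant_id_dict)
    create_testing_dataset_files('test', uspto_test_dataset, reactants_to_reactant_id_dict)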
def __init__(self):
    self.num_molecules_to_optimize: int = 250
    self.num_distinct_molecule_steps: int = 10
    self.epsilon: float = 0.5
    self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

    processed_data_dir = mchef_config.get_processed_data_dir()
    self.path_mol_details = path.join(processed_data_dir, 'reactants_feats.pick')
    self.path_react_bags_train = path.join(processed_data_dir, 'train_react_bags.txt')

    # Command line arguments.
    arguments = docopt(__doc__)
    self.weights_to_use = arguments['<input_weights>']
def __init__(self):
    self.cuda_details = gnn_utils.CudaDetails(use_cuda=torch.cuda.is_available())

    # GNN details
    self.gnn_args = dict(output_dim=25, hidden_layer_size=101,
                         edge_names=['single', 'double', 'triple'],
                         embedding_dim=50, T=4)

    # Data Paths
    processed_data_dir = mchef_config.get_processed_data_dir()
    self.path_mol_details = path.join(processed_data_dir, 'reactants_feats.pick')
    self.product_files_to_try = [
        ('test_reachable', path.join(processed_data_dir, 'test_products.txt')),
        ('test_unreachable', path.join(processed_data_dir, 'test_unreachable_products.txt'))
    ]

    # Command line arguments.
    arguments = docopt(__doc__)
    self.weights_to_use_mchef = arguments['<input_weights_mchef>']
    self.weights_to_use_regressor = arguments['<input_weights_regressor>']
def __init__(self):
    processed_data_dir = mchef_config.get_processed_data_dir()
    self.reactant_smi_to_id = mchef_config.get_reactant_smi_to_reactant_id_dict()

    self.path_reachable_products_ground_truth = path.join(processed_data_dir, "test_products.txt")
    self.path_react_bags_test = path.join(processed_data_dir, "test_react_bags.txt")
    self.path_reachable_reactants_restrosynthezed = "./op/test_reachable_retrosynthesized_reactants.txt"

    self.path_unreachable_products_ground_truth = path.join(processed_data_dir, "test_unreachable_products.txt")
    self.path_unreachable_reactants = path.join(processed_data_dir, "test_unreachable_reactants.txt")
    self.path_unreachable_reactants_restrosynthezed = "./op/test_unreachable_retrosynthesized_reactants.txt"

    arguments = docopt(__doc__)
    self.tokenized_reactants = arguments['<tokenized_reactants_path>']
    self.tokenized_products = arguments['<tokenized_products_path>']
    self.nbest_for_tokenized = int(arguments['--nbest'])
def __init__(self):
    # Config to read:
    arguments = docopt(__doc__)
    self.experiments_config = arguments['--config']

    processed_data_dir = mchef_config.get_processed_data_dir()

    # Reactants file
    self.training_reactants_path = path.join(processed_data_dir, 'reactants_to_reactant_id.json')

    # Training Products file
    self.training_products_path = path.join(processed_data_dir, 'train_products.txt')

    # Get training data smiles strings
    self.training_data_smi_list = self._get_training_data()