Example #1
def opt_fn(hyperparams,
           data_path,
           test_set,
           save_dir,
           args_file,
           metric='percent_improved_mae',
           input_file=None,
           num_decode=20,
           seed=1):
    
    # Get args:
    args = read_args(args_file)

    # Keyword arguments here: passed positionally, input_file and hyperparams
    # would land in run_training's chemprop_path / constraint_file slots
    # (see the signature in Example #4)
    run_training(data_path, save_dir, args_file,
                 input_file=input_file,
                 hyperparams=hyperparams)
    
    vocab_file = os.path.join(save_dir, 'inputs', 'vocab.txt')
    model_file = os.path.join(save_dir, 'models',
                              'model.' + str(args['epoch'] - 1))

    eval_dir = os.path.join(save_dir, 'eval')
    os.makedirs(eval_dir, exist_ok=True)

    output_file = os.path.join(eval_dir, 'decoded_mols.csv')
    stats_file = os.path.join(eval_dir, 'stats.pkl')
    
    decode(test_set, vocab_file, model_file, output_file, args,
           atom_vocab=common_atom_vocab,
           num_decode=num_decode,  # Will not come from run input
           seed=seed,
           hyperparams=hyperparams)

    stats, _ = evaluate_chemprop(output_file, fold_path=args['fold_path'])

    with open(stats_file, 'wb') as f:
        pickle.dump(stats, f, pickle.HIGHEST_PROTOCOL)
        
    return stats[metric]
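
A minimal driver sketch for opt_fn. Everything here is illustrative: the grid values and paths are hypothetical, opt_fn and its dependencies are assumed importable, and treating a higher percent_improved_mae as better is an assumption, not something the source states.

import os

# Hypothetical grid search over latent_size, the one hyperparameter
# Example #4 shows being consumed from the hyperparams dict.
best_score, best_hyperparams = None, None
for latent_size in [16, 32, 64]:
    hyperparams = {'latent_size': latent_size}
    save_dir = os.path.join('checkpoints', 'latent_%d' % latent_size)
    os.makedirs(save_dir, exist_ok=True)  # make sure save_dir exists before the run
    score = opt_fn(hyperparams,
                   data_path='data/solvation_open',
                   test_set='data/test_mols.txt',
                   save_dir=save_dir,
                   args_file='input.dat')
    if best_score is None or score > best_score:  # assumes higher is better
        best_score, best_hyperparams = score, hyperparams

print(best_hyperparams, best_score)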
Example #2
def iterate_round(args_file,
                  save_dir,
                  data_path,
                  chemprop_path,
                  constraint_file=None,
                  iteration_num=1,
                  solvent=None):

    args = read_args(args_file)
    save_dir1 = os.path.join(save_dir, 'iteration' + str(iteration_num))
    os.makedirs(save_dir1, exist_ok=True)

    # Train model:
    run_training(data_path,
                 save_dir1,
                 args_file,
                 chemprop_path=chemprop_path,
                 constraint_file=constraint_file)

    # Make augment folder
    augment_folder = os.path.join(save_dir1, 'augment')
    os.makedirs(augment_folder, exist_ok=True)

    molfile = os.path.join(save_dir1, 'inputs', 'mols.txt')
    vocab = os.path.join(save_dir1, 'inputs', 'vocab.txt')
    model = os.path.join(save_dir1, 'models',
                         'model.' + str(args['epoch'] - 1))
    args_file = os.path.join(save_dir1, 'input.dat')  # copy written by run_training

    # Generate new molecules based on original dataset:
    decode(
        molfile,
        vocab,
        model,
        os.path.join(augment_folder, 'gen_out.csv'),
        args,  # Args are needed to make sure the network architecture is correct
        atom_vocab=common_atom_vocab,
        num_decode=args['num_decode'])

    # Assign/predict molecule properties:
    gen_out = os.path.join(augment_folder, 'gen_out.csv')
    if solvent is None:
        _, preds_tot = evaluate_chemprop(gen_out,
                                         fold_path=args['fold_path'],
                                         chemprop_path=chemprop_path)
    else:
        _, preds_tot = evaluate_chemprop_sol(gen_out,
                                             solvent=solvent,
                                             fold_path=args['fold_path'],
                                             chemprop_path=chemprop_path)
    preds_tot.to_csv(os.path.join(augment_folder, 'gen_evaluated.csv'),
                     index=False)

    # Apply filters and create new datafile
    update_dataset(os.path.join(augment_folder, 'gen_evaluated.csv'),
                   os.path.join(augment_folder, 'data.csv'),
                   target=args['target'],
                   threshold=args['cutoff_iterations'],
                   min_mol_wt=args['min_mol_wt'],
                   pairing_method=args['pairing_method'],
                   n_clusters=args['n_clusters'],
                   tan_threshold=args['tan_threshold']
                   )  # Reusing cutoff criteria defined for pairing algorithm

    # Return locations of folders
    return augment_folder
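
A sketch of chaining iterate_round across rounds. The feedback step is an assumption: run_training reads data.csv from data_path by default and update_dataset writes a filtered data.csv into the returned augment folder, so each round's output can plausibly seed the next. All paths are illustrative.

# Hypothetical multi-round driver.
data_path = 'data/solvation_open'
for i in range(1, 4):
    augment_folder = iterate_round('input.dat',
                                   'checkpoints',
                                   data_path,
                                   chemprop_path='/path/to/chemprop/',
                                   iteration_num=i)
    # Next round trains on this round's filtered generations.
    data_path = augment_folder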
Example #3
import argparse

from rdkit import RDLogger

from g2g_optimization.train.decode import decode
from g2g_optimization.train.args import read_args
from g2g_optimization.hgraph import common_atom_vocab

# Silence RDKit's verbose logging
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

parser = argparse.ArgumentParser()
parser.add_argument('--test', required=True)
parser.add_argument('--vocab', required=True)
parser.add_argument('--model', required=True)
parser.add_argument('--output_file', required=True)
parser.add_argument('--args_file', type=str, default=None)  # Without an args file, many parameters revert to defaults
parser.add_argument('--num_decode', type=int, default=20)
parser.add_argument('--seed', type=int, default=1)

args = parser.parse_args()

if args.args_file is None:
    print('WARNING: You are running without an args_file')
    args_file = {}  # decode falls back to its defaults when given an empty args dict
else:
    args_file = read_args(args.args_file)
    
decode(args.test, args.vocab, args.model, args.output_file, args_file,
       atom_vocab=common_atom_vocab,
       num_decode=args.num_decode,  # Will not come from run input
       seed=args.seed)
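
For reference, a hypothetical invocation of this script (the script name and all paths are illustrative; model.4 assumes a 5-epoch run, matching the model.<epoch-1> naming used in Examples #1 and #2):

python decode_script.py --test checkpoints/inputs/mols.txt \
    --vocab checkpoints/inputs/vocab.txt \
    --model checkpoints/models/model.4 \
    --output_file decoded_mols.csv \
    --args_file input.dat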
Example #4
def run_training(data_path='data/solvation_open',
                 save_dir='checkpoints',
                 args_file=None,
                 chemprop_path='/data/rsg/chemistry/cbilod/chemprop/',
                 constraint_file=None,
                 input_file=None,
                 hyperparams=None):

    args = {}
    if args_file is not None:
        args = read_args(args_file)

    if input_file is None:
        input_file = os.path.join(data_path, 'data.csv')
        
    # Hyperparameters: values passed in explicitly override those from args_file
    if hyperparams is not None:
        if 'latent_size' in hyperparams:
            args['latent_size'] = hyperparams['latent_size']

    input_dir = os.path.join(save_dir, 'inputs')
    os.makedirs(input_dir, exist_ok=True)

    mol_file = os.path.join(input_dir, 'mols.txt')
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    train_file = os.path.join(input_dir, 'train_pairs.txt')
    adjacency_file = os.path.join(input_dir, 'adjacency.pkl')

    tensor_dir = os.path.join(save_dir, 'tensors')
    os.makedirs(tensor_dir, exist_ok=True)

    model_dir = os.path.join(save_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    log_file = os.path.join(save_dir, 'run.log')
    f = open(log_file, 'w')
    f.write('Arguments:\n')
    f.write(str(args) + '\n')
    f.write('\n')

    f.write('Starting Pair Generation \n')
    start = time.time()
    generate_pairs(input_file, train_file, mol_file, args, chemprop_path,
                   constraint_file, adjacency_file)
    end = time.time()
    f.write('Ending Pair Generation \n')
    f.write('Time Elapsed: ' + str(end - start) + '\n')
    f.write('\n')

    f.write('Starting Vocab \n')
    start = time.time()
    get_vocab(mol_file, vocab_file)
    end = time.time()
    f.write('Ending Vocab \n')
    f.write('Time Elapsed: ' + str(end - start) + '\n')
    f.write('\n')

    f.write('Starting Preprocessing \n')
    start = time.time()
    generate_tensors(train_file, vocab_file, tensor_dir, args)
    end = time.time()
    f.write('Ending Preprocessing \n')
    f.write('Time Elapsed: ' + str(end - start) + '\n')
    f.write('\n')
    
    f.write('Starting Model Training \n')
    start = time.time()
    gnn_train(tensor_dir, vocab_file, model_dir, args)
    end = time.time()
    f.write('Ending Model Training \n')
    f.write('Time Elapsed: ' + str(end - start) + '\n')
    f.write('\n')

    f.close()

    # Keep a copy of the args file alongside the run outputs
    if args_file is not None:
        copyfile(args_file, os.path.join(save_dir, 'input.dat'))
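
The four timed stages in run_training repeat the same log-and-time pattern; a small helper along these lines (a sketch, not part of the source) would factor it out:

import time

def timed_stage(f, name, fn, *fn_args, **fn_kwargs):
    # Run fn, writing start/end markers and wall-clock time to the
    # open log handle f, mirroring run_training's log format.
    f.write('Starting %s \n' % name)
    start = time.time()
    result = fn(*fn_args, **fn_kwargs)
    f.write('Ending %s \n' % name)
    f.write('Time Elapsed: ' + str(time.time() - start) + '\n')
    f.write('\n')
    return result

# Usage inside run_training, e.g.:
# timed_stage(f, 'Vocab', get_vocab, mol_file, vocab_file)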