def train_neural_fingerprint(train_directory, labels_mapping, tmp_dir, n_epochs=15): global task_params task_params['N_train'] = int(len(os.listdir(train_directory)) * 0.7) task_params['N_valid'] = int(len(os.listdir(train_directory)) * 0.01) task_params['N_test'] = int(len(os.listdir(train_directory)) * 0.01) task_params['data_file'] = tmp_dir global num_epochs num_epochs = n_epochs directory = train_directory output = open(tmp_dir, 'wb+') files = os.listdir(directory) output.write('graph,label\n') for f in files: output.write(directory + '/' + f + ',' + str(labels_mapping[f]) + '\n') output.close() print "Loading data..." traindata, valdata, testdata = load_data(task_params['data_file'], (task_params['N_train'], task_params['N_valid'], task_params['N_test']), input_name='graph', target_name=task_params['target_name']) train_inputs, train_targets = traindata val_inputs, val_targets = valdata print "Regression on", task_params['N_train'], "training points." def print_performance(pred_func): train_preds = pred_func(train_inputs) val_preds = pred_func(val_inputs) print "\nPerformance (RMSE) on " + task_params['target_name'] + ":" print "Train:", rmse(train_preds, train_targets) print "Test: ", rmse(val_preds, val_targets) print "-" * 80 return rmse(val_preds, val_targets) print "-" * 80 print "Mean predictor" y_train_mean = np.mean(train_targets) print_performance(lambda x : y_train_mean) print "Task params", params nn_train_params, vanilla_net_params = parse_training_params(params) conv_arch_params['return_atom_activations'] = False loss_fun, pred_fun, conv_parser = \ build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty']) num_weights = len(conv_parser) predict_func, trained_weights, conv_training_curve = \ train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets, nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets) print_performance(predict_func) return trained_weights
def main(_):
    """Run the neural-fingerprint experiment end to end.

    Loads data from task_params['data_file'], trains a convolutional
    fingerprint network, and prints its test RMSE. Relies on module-level
    task_params, model_params, train_params and vanilla_net_params.
    """
    print("Loading data...")
    traindata, valdata, testdata = load_data(
        task_params['data_file'], input_name='smile',
        target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata
    test_inputs, test_targets = testdata

    def print_performance(pred_func):
        # Report train/validation RMSE; returns the validation RMSE.
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print("\nPerformance (RMSE) on " + task_params['target_name'] + ":")
        print("Train:", rmse(train_preds, train_targets))
        print("Test: ", rmse(val_preds, val_targets))
        print("-" * 80)
        return rmse(val_preds, val_targets)

    def run_morgan_experiment():
        # Baseline: fixed Morgan fingerprints feeding a vanilla net.
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs,
                     train_targets, train_params,
                     validation_smiles=val_inputs,
                     validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment():
        # Learned (neural) fingerprints; returns test-set RMSE.
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                            'fp_length': model_params['fp_length'],
                            'normalize': 1}
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params,
                                model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs,
                     train_targets, train_params,
                     validation_smiles=val_inputs,
                     validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return rmse(test_predictions, test_targets)

    print("Task params", task_params)
    print()
    print("Starting Morgan fingerprint experiment...")
    # test_loss_morgan = run_morgan_experiment()
    print("Starting neural fingerprint experiment...")
    test_loss_neural = run_conv_experiment()
    print()
    # print("Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural)
    # Fixed: the original formatted this line with an undefined name `p_i`,
    # which raised NameError. Print the neural test RMSE directly instead.
    print("Neural test RMSE:", test_loss_neural)
def main(): # pdb.set_trace() print "Loading data..." traindata, valdata, testdata = load_data( task_params['data_file'], (N_train, N_val, N_test), input_name='smiles', target_name=task_params['target_name']) train_inputs, train_targets = traindata val_inputs, val_targets = valdata test_inputs, test_targets = testdata def print_performance(pred_func): train_preds = pred_func(train_inputs) val_preds = pred_func(val_inputs) print "\nPerformance (RMSE) on " + task_params['target_name'] + ":" print "Train:", rmse(train_preds, train_targets) print "Test: ", rmse(val_preds, val_targets) print "-" * 80 return rmse(val_preds, val_targets) def run_morgan_experiment(): loss_fun, pred_fun, net_parser = \ build_morgan_deep_net(model_params['fp_length'], model_params['fp_depth'], vanilla_net_params) num_weights = len(net_parser) predict_func, trained_weights, conv_training_curve = \ train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets, train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets) return print_performance(predict_func) def run_conv_experiment(): conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth'] print("conv_layer_sizes ",conv_layer_sizes) conv_arch_params = {'num_hidden_features' : conv_layer_sizes, 'fp_length' : model_params['fp_length'], 'normalize' : 1} loss_fun, pred_fun, conv_parser = \ build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg']) # import pdb; pdb.set_trace() num_weights = len(conv_parser) predict_func, trained_weights, conv_training_curve = \ train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets, train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets) test_predictions = predict_func(test_inputs) return rmse(test_predictions, test_targets) print "Task params", task_params print print "Starting Morgan fingerprint experiment..." 
test_loss_morgan = run_morgan_experiment() # test_loss_morgan = 0.0 print "Starting neural fingerprint experiment..." test_loss_neural = run_conv_experiment() print print "Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural
def train_neural_fingerprint(): print "Loading data..." traindata, valdata, testdata = load_data( task_params['data_file'], (task_params['N_train'], task_params['N_valid'], task_params['N_test']), input_name='smiles', target_name=task_params['target_name']) train_inputs, train_targets = traindata val_inputs, val_targets = valdata print "Regression on", task_params['N_train'], "training points." def print_performance(pred_func): train_preds = pred_func(train_inputs) val_preds = pred_func(val_inputs) print "\nPerformance (RMSE) on " + task_params['target_name'] + ":" print "Train:", rmse(train_preds, train_targets) print "Test: ", rmse(val_preds, val_targets) print "-" * 80 return rmse(val_preds, val_targets) print "-" * 80 print "Mean predictor" y_train_mean = np.mean(train_targets) print_performance(lambda x: y_train_mean) print "Task params", params nn_train_params, vanilla_net_params = parse_training_params(params) conv_arch_params['return_atom_activations'] = False print "Convnet fingerprints with neural net" loss_fun, pred_fun, conv_parser = \ build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty']) num_weights = len(conv_parser) predict_func, trained_weights, conv_training_curve = \ train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets, nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets) print_performance(predict_func) return trained_weights
def train_neural_fingerprint(): print "Loading data..." traindata, valdata, testdata = load_data(task_params['data_file'], (task_params['N_train'], task_params['N_valid'], task_params['N_test']), input_name='smiles', target_name=task_params['target_name']) train_inputs, train_targets = traindata val_inputs, val_targets = valdata print "Regression on", task_params['N_train'], "training points." def print_performance(pred_func): train_preds = pred_func(train_inputs) val_preds = pred_func(val_inputs) print "\nPerformance (RMSE) on " + task_params['target_name'] + ":" print "Train:", rmse(train_preds, train_targets) print "Test: ", rmse(val_preds, val_targets) print "-" * 80 return rmse(val_preds, val_targets) print "-" * 80 print "Mean predictor" y_train_mean = np.mean(train_targets) print_performance(lambda x : y_train_mean) print "Task params", params nn_train_params, vanilla_net_params = parse_training_params(params) conv_arch_params['return_atom_activations'] = False print "Convnet fingerprints with neural net" loss_fun, pred_fun, conv_parser = \ build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty']) num_weights = len(conv_parser) predict_func, trained_weights, conv_training_curve = \ train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets, nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets) print_performance(predict_func) return trained_weights
def neural_graph_fps(target_name, input_path, len_smi):
    """Grid-search neural fingerprint hyperparameters on a SMILES data file.

    Args:
        target_name: name of the regression target column.
        input_path: path to the data file consumed by load_data.
        len_smi: total number of molecules in the file; used to size the
            train/validation/test splits.

    Prints the best hyperparameter combination and its test-set R^2.
    """
    task_params = {
        'target_name': target_name,
        'data_file': input_path,
    }

    # Round the training-set size up to a multiple of 100.
    # NOTE(review): original comment says sizes "must be in hundreds" without
    # a known reason — presumably a batching constraint downstream; confirm.
    N_train = int(len_smi * 0.7) - int(len_smi * 0.7) % 100 + 100
    N_val = int(len_smi * 0.1)
    N_test = len_smi - N_train - N_val

    train_params = dict(num_iters=100, batch_size=100,
                        init_scale=np.exp(-4), step_size=np.exp(-6))

    traindata, valdata, testdata = load_data(
        task_params['data_file'], (N_train, N_val, N_test),
        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata
    test_inputs, test_targets = testdata

    def print_performance(pred_func):
        # Returns the validation R^2 for the given predictor.
        val_preds = pred_func(val_inputs)
        return r2(val_preds, val_targets)

    def run_morgan_experiment():
        # NOTE(review): unused in this function, and it references
        # `model_params` / `vanilla_net_params`, which are not defined in this
        # scope — it would raise NameError if called. Kept for reference.
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs,
                     train_targets, train_params,
                     validation_smiles=val_inputs,
                     validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment(model_params):
        # Train one neural-fingerprint model and return its test-set R^2.
        # Architecture of the network on top of the fingerprints
        # (one hidden layer).
        vanilla_net_params = dict(
            layer_sizes=[model_params['fp_length'], model_params['h1_size']],
            normalize=True, L2_reg=model_params['L2_reg'], nll_func=rmse)
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                            'fp_length': model_params['fp_length'],
                            'normalize': 1}
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params,
                                model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs,
                     train_targets, train_params,
                     validation_smiles=val_inputs,
                     validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return r2(test_predictions, test_targets)

    fp_lengths = [7, 8, 9]
    fp_depths = [6, 7, 8]
    conv_widths = [30, 40, 50, 60]
    h1_sizes = [100]
    # Best found previously:
    # fp_length: 7 fp_depth: 8 conv_width: 60 h1_size: 100
    # Neural test R2: 0.9314066399774756

    max_r2 = 0
    # Fixed: the original initialized `fp_length_opt` twice and never
    # initialized `fp_depth_opt`, so the final print raised NameError when no
    # run improved on max_r2; it also printed the last-seen h1_size rather
    # than the best one. Track all four optima explicitly.
    fp_length_opt = fp_lengths[0]
    fp_depth_opt = fp_depths[0]
    conv_width_opt = conv_widths[0]
    h1_size_opt = h1_sizes[0]
    for fp_length in fp_lengths:
        for fp_depth in fp_depths:
            for conv_width in conv_widths:
                for h1_size in h1_sizes:
                    model_params = dict(
                        # Neural fps usually need far fewer dims than Morgan.
                        fp_length=fp_length,
                        # Network depth equals the fingerprint radius.
                        fp_depth=fp_depth,
                        # Only the neural fps need this parameter.
                        conv_width=conv_width,
                        # Hidden layer of the net on top of the fps.
                        h1_size=h1_size,
                        L2_reg=np.exp(-2))
                    test_r2_neural = run_conv_experiment(model_params)
                    if max_r2 < test_r2_neural:
                        fp_length_opt = fp_length
                        fp_depth_opt = fp_depth
                        conv_width_opt = conv_width
                        h1_size_opt = h1_size
                        max_r2 = test_r2_neural
    print("fp_length:", fp_length_opt, "fp_depth:", fp_depth_opt,
          "conv_width:", conv_width_opt, "h1_size:", h1_size_opt)
    print("Neural test R2:", max_r2)
def plot(trained_weights): print "Loading training data..." traindata, valdata, testdata = load_data(task_params['data_file'], (task_params['N_train'], task_params['N_valid'], task_params['N_test']), input_name='smiles', target_name=task_params['target_name']) train_smiles, train_targets = traindata print "Convnet fingerprints with neural net" conv_arch_params['return_atom_activations'] = True output_layer_fun, parser, compute_atom_activations = \ build_convnet_fingerprint_fun(**conv_arch_params) atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles) if not os.path.exists('figures'): os.makedirs('figures') parent_molecule_dict = {} for mol_ix, atom_ixs in enumerate(array_rep['atom_list']): for atom_ix in atom_ixs: parent_molecule_dict[atom_ix] = mol_ix atom_neighbor_list = construct_atom_neighbor_list(array_rep) def get_neighborhood_ixs(array_rep, cur_atom_ix, radius): # Recursive function to get indices of all atoms in a certain radius. if radius == 0: return set([cur_atom_ix]) else: cur_set = set([cur_atom_ix]) for n_ix in atom_neighbor_list[cur_atom_ix]: cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1)) return cur_set # Recreate trained network. 
nn_train_params, vanilla_net_params = parse_training_params(params) conv_arch_params['return_atom_activations'] = False _, _, combined_parser = \ build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty']) net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params) net_weights = combined_parser.get(trained_weights, 'net weights') last_layer_weights = net_parser.get(net_weights, ('weights', 0)) for fp_ix in range(params['fp_length']): print "FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0]) combined_list = [] for radius in all_radii: fp_activations = atom_activations[radius][:, fp_ix] combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)] unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0]) combined_list = sorted(unique_list, key=lambda x: -x[0]) for fig_ix in range(num_figs_per_fp): # Find the most-activating atoms for this fingerprint index, across all molecules and depths. activation, most_active_atom_ix, cur_radius = combined_list[fig_ix] most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix] highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius) highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs] print "radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation draw_molecule_with_highlights( "figures/fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix), train_smiles[most_activating_mol_ix], highlight_atoms=highlight_list_rdkit)
def fit_fingerprints(task_params, model_params, train_params, verbose):
    """Train a neural-fingerprint model and return its fingerprint weights.

    Args:
        task_params: dict with data_fname, smiles_column, target_column,
            the N_train/N_validate/N_test split sizes, and seed.
        model_params: dict with fp_length, fp_width, fp_depth, h1_size,
            log_l2_penalty, nll_func and nll_func_name.
        train_params: optimizer settings forwarded to train_nn.
        verbose: when truthy, print progress and final performance.

    Returns:
        Tuple of (trained fingerprint weights, training curve).
    """
    if verbose:
        print(
            "Loading data from '{data_fname}' with\n\tsmiles column: '{smiles_column}'\n\ttarget column: '{target_column}'\n\tN_train: {N_train}\n\tN_validate: {N_validate}\n\tN_test: {N_test}\n"
            .format(**task_params))

    data = load_data(
        filename=task_params['data_fname'],
        sizes=(task_params['N_train'], task_params['N_validate'],
               task_params['N_test']),
        input_name=task_params['smiles_column'],
        target_name=task_params['target_column'])

    if verbose:
        print(
            "Building fingerprint function of length {fp_length} as a convolutional network with width {fp_width} and depth {fp_depth} ..."
            .format(**model_params))

    # Convolutional network that, once instantiated with weights, maps a list
    # of SMILES to one fingerprint vector per molecule.
    #   fp_func: Callable[[weights, smiles], ndarray]
    #   fp_parser: WeightsParser
    fp_func, fp_parser = build_convnet_fingerprint_fun(
        num_hidden_features=[model_params['fp_width']] * model_params['fp_depth'],
        fp_length=model_params['fp_length'],
        normalize=True)

    if verbose:
        print("Building regression network ...")

    # Stack the fingerprint network under a vanilla regression net with an
    # L2-regularized loss.
    #   loss_fun: Callable[[weights, smiles, targets], numeric]
    #   pred_fun: Callable[[weights, smiles], np.array]
    #   combined_parser: WeightsParser
    net_params = dict(
        layer_sizes=[model_params['fp_length'], model_params['h1_size']],
        normalize=True,
        L2_reg=np.exp(model_params['log_l2_penalty']),
        nll_func=model_params['nll_func'])
    loss_fun, pred_fun, combined_parser = build_fingerprint_deep_net(
        net_params=net_params,
        fingerprint_func=fp_func,
        fp_parser=fp_parser,
        fp_l2_penalty=np.exp(model_params['log_l2_penalty']))

    if verbose:
        print("Training model ...")

    # Optimize the full network on the training split while monitoring the
    # loss on the validation split.
    #   predict_func: Callable[[smiles], np.array]
    predict_func, trained_weights, training_curve = train_nn(
        pred_fun=pred_fun,
        loss_fun=loss_fun,
        nll_func_name=model_params['nll_func_name'],
        nll_func=model_params['nll_func'],
        num_weights=len(combined_parser),
        train_smiles=data[0][0],
        train_raw_targets=data[0][1],
        train_params=train_params,
        seed=task_params['seed'],
        validation_smiles=data[1][0],
        validation_raw_targets=data[1][1])

    if verbose:
        print_performance(target_name=task_params['target_column'],
                          predict_func=predict_func,
                          nll_func_name=model_params['nll_func_name'],
                          data=data)

    trained_fp_weights = combined_parser.get(trained_weights,
                                             'fingerprint weights')
    return trained_fp_weights, training_curve