def build_predictor(net_type, fp_length, fp_depth, conv_width, h1_size, L2_reg, nll_func):
    """Dispatch on net_type and build the corresponding model."""
    if net_type == 'mean':
        return build_mean_predictor(nll_func)
    elif net_type == 'conv_plus_linear':
        # Neural fingerprint with a single linear layer on top.
        vanilla_net_params = dict(layer_sizes=[fp_length], normalize=True,
                                  L2_reg=L2_reg, nll_func=nll_func)
        conv_params = dict(num_hidden_features=[conv_width] * fp_depth,
                           fp_length=fp_length)
        return build_conv_deep_net(conv_params, vanilla_net_params)
    elif net_type == 'morgan_plus_linear':
        # Fixed Morgan (circular) fingerprint with a single linear layer on top.
        vanilla_net_params = dict(layer_sizes=[fp_length], normalize=True,
                                  L2_reg=L2_reg, nll_func=nll_func)
        return build_morgan_deep_net(fp_length, fp_depth, vanilla_net_params)
    elif net_type == 'conv_plus_net':
        # Neural fingerprint with one hidden layer on top.
        vanilla_net_params = dict(layer_sizes=[fp_length, h1_size], normalize=True,
                                  L2_reg=L2_reg, nll_func=nll_func)
        conv_params = dict(num_hidden_features=[conv_width] * fp_depth,
                           fp_length=fp_length)
        return build_conv_deep_net(conv_params, vanilla_net_params)
    elif net_type == 'morgan_plus_net':
        # Fixed Morgan fingerprint with one hidden layer on top.
        vanilla_net_params = dict(layer_sizes=[fp_length, h1_size], normalize=True,
                                  L2_reg=L2_reg, nll_func=nll_func)
        return build_morgan_deep_net(fp_length, fp_depth, vanilla_net_params)
    else:
        raise ValueError("Unknown network type: {0}".format(net_type))
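
# A minimal, hypothetical driver for build_predictor above. It assumes the
# non-'mean' branches return (loss_fun, pred_fun, parser) the way
# build_conv_deep_net does, and that rmse is in scope; every hyperparameter
# value below is illustrative, not tuned.
def example_build_predictor():
    loss_fun, pred_fun, parser = build_predictor(
        net_type='conv_plus_net', fp_length=50, fp_depth=4, conv_width=20,
        h1_size=100, L2_reg=0.01, nll_func=rmse)
    return loss_fun, pred_fun, parser
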
def run_conv_experiment(model_params):
    # Define the architecture of the network that sits on top of the fingerprints.
    vanilla_net_params = dict(
        layer_sizes=[model_params['fp_length'], model_params['h1_size']],  # One hidden layer.
        normalize=True,
        L2_reg=model_params['L2_reg'],
        nll_func=rmse)
    conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
    conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                        'fp_length': model_params['fp_length'],
                        'normalize': 1}
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params,
                            model_params['L2_reg'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
        train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 train_params, validation_smiles=val_inputs,
                 validation_raw_targets=val_targets)
    test_predictions = predict_func(test_inputs)
    return r2(test_predictions, test_targets)
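
# run_conv_experiment reads its hyperparameters from a model_params dict; a
# sketch of the expected keys (every value below is illustrative, not tuned):
example_model_params = {'fp_length': 50,    # size of the final fingerprint vector
                        'fp_depth': 4,      # number of graph-conv layers (bond radius)
                        'conv_width': 20,   # hidden features per conv layer
                        'h1_size': 100,     # hidden-layer size of the top network
                        'L2_reg': 0.01}     # L2 penalty
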
def compute_fingerprints(dataset, train_file, test_file, learning_rate):
    train, val, test = dataset
    X_train, y_train = train
    X_val, y_val = val
    X_test, y_test = test
    X_train_val = np.concatenate((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))

    global train_params
    # train_params["num_iters"] = int(len(X_train) / train_params["batch_size"])
    train_params["step_size"] = learning_rate

    # The conv net fills this dict with a fingerprint for every SMILES string
    # it sees, as a side effect of each forward pass.
    smiles_to_fps = {}
    conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
    conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                        'fp_length': model_params['fp_length'],
                        'normalize': 1,
                        'smiles_to_fps': smiles_to_fps}
    loss_fun, pred_fun, conv_parser = build_conv_deep_net(
        conv_arch_params, vanilla_net_params, model_params['L2_reg'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = train_nn(
        pred_fun, loss_fun, num_weights, X_train, y_train, train_params,
        validation_smiles=X_val, validation_raw_targets=y_val)

    # Run forward passes to populate smiles_to_fps, then dump the fingerprints.
    pred_fun(trained_weights, X_train_val)
    with open(train_file, "w+") as smiles_fps_file:
        writer = csv.writer(smiles_fps_file)
        writer.writerow(["smiles", "fingerprints", "target"])
        for smile, target in zip(X_train_val, y_train_val):
            writer.writerow([smile, smiles_to_fps[smile], target])

    predict_func(X_test)
    with open(test_file, "w+") as smiles_fps_file:
        writer = csv.writer(smiles_fps_file)
        writer.writerow(["smiles", "fingerprints", "target"])
        for smile, target in zip(X_test, y_test):
            writer.writerow([smile, smiles_to_fps[smile], target])
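
# Hypothetical call to compute_fingerprints above: dataset is a triple of
# (SMILES array, target array) pairs, and the two file arguments name the
# CSVs to write (file names here are placeholders).
def example_compute_fingerprints(X_train, y_train, X_val, y_val, X_test, y_test):
    dataset = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    compute_fingerprints(dataset, train_file='train_fps.csv',
                         test_file='test_fps.csv', learning_rate=1e-3)
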
def train_neural_fingerprint(train_directory, labels_mapping, tmp_dir, n_epochs=15):
    global task_params
    task_params['N_train'] = int(len(os.listdir(train_directory)) * 0.7)
    task_params['N_valid'] = int(len(os.listdir(train_directory)) * 0.01)
    task_params['N_test'] = int(len(os.listdir(train_directory)) * 0.01)
    task_params['data_file'] = tmp_dir

    global num_epochs
    num_epochs = n_epochs

    # Write a graph,label CSV that load_data can consume.
    with open(tmp_dir, 'w+') as output:
        output.write('graph,label\n')
        for f in os.listdir(train_directory):
            output.write(train_directory + '/' + f + ',' + str(labels_mapping[f]) + '\n')

    print "Loading data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'],
        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
        input_name='graph', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata
    print "Regression on", task_params['N_train'], "training points."

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:     ", rmse(train_preds, train_targets)
        print "Validation:", rmse(val_preds, val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    print "-" * 80
    print "Mean predictor"
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x: y_train_mean)

    print "Task params", params
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params,
                            params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
        train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs,
                 validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights
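
# Hypothetical invocation of the directory-based trainer above: labels_mapping
# must map every file name in train_directory to its label, and tmp_dir is the
# path of the graph,label CSV the function writes. The constant label and the
# tmp path below are placeholders.
def example_train_from_directory(train_directory):
    labels_mapping = {f: 1.0 for f in os.listdir(train_directory)}  # placeholder labels
    return train_neural_fingerprint(train_directory, labels_mapping,
                                    tmp_dir='/tmp/graphs.csv', n_epochs=15)
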
def run_conv_experiment():
    conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
    conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                        'fp_length': model_params['fp_length'],
                        'normalize': 1}
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params,
                            model_params['L2_reg'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
        train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 train_params, validation_smiles=val_inputs,
                 validation_raw_targets=val_targets)
    test_predictions = predict_func(test_inputs)
    return rmse(test_predictions, test_targets)
def fit(self, smiles_list, logS_list, seed=0):
    train_smiles = list(smiles_list)
    train_logS = list(logS_list)
    conv_layer_sizes = [self.model_params['conv_width']] * self.model_params['fp_depth']
    conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                        'fp_length': self.model_params['fp_length'],
                        'normalize': 1}
    # Neural net architecture on top of the fingerprints.
    net_arch_params = dict(
        layer_sizes=[self.model_params['fp_length'], self.model_params['h1_size']],
        normalize=True,
        L2_reg=self.model_params['L2_reg'],
        nll_func=rmse)
    loss_fun, pred_fun, conv_parser = build_conv_deep_net(
        conv_arch_params, net_arch_params, self.model_params['L2_reg'])
    num_weights = len(conv_parser)
    init_weights = npr.RandomState(seed).randn(num_weights) * self.train_params['init_scale']

    train_logS_norm, undo_norm = normalize_array(train_logS)

    # Build the gradient of the loss using autograd.
    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun,
                                            self.train_params['batch_size'],
                                            train_smiles, train_logS_norm)

    # Optimize the weights with Adam.
    trained_weights = adam(grad_fun_with_data, init_weights,
                           num_iters=self.train_params['num_iters'],
                           step_size=self.train_params['step_size'])
    self.model = (undo_norm, trained_weights, pred_fun)
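
# A matching predict() sketch for the fit() above (hypothetical; not part of
# the original class): apply the trained network to new SMILES strings and
# undo the target normalization learned during fit().
def predict(self, smiles_list):
    undo_norm, trained_weights, pred_fun = self.model
    return undo_norm(pred_fun(trained_weights, smiles_list))
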
def train_neural_fingerprint():
    print "Loading data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'],
        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata
    print "Regression on", task_params['N_train'], "training points."

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:     ", rmse(train_preds, train_targets)
        print "Validation:", rmse(val_preds, val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    print "-" * 80
    print "Mean predictor"
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x: y_train_mean)

    print "Task params", params
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    print "Convnet fingerprints with neural net"
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params,
                            params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
        train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs,
                 validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights
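
# The pipeline above reads everything from a module-level task_params dict; an
# illustrative sketch of the keys it needs (the file name, split sizes, and
# target column below are placeholders, not the original experiment's values):
example_task_params = {'data_file': 'molecules.csv',
                       'N_train': 800, 'N_valid': 100, 'N_test': 100,
                       'target_name': 'measured_solubility'}
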
def plot(trained_weights):
    print "Loading training data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'],
        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
        input_name='smiles', target_name=task_params['target_name'])
    train_smiles, train_targets = traindata

    print "Convnet fingerprints with neural net"
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
        build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)

    if not os.path.exists('figures'):
        os.makedirs('figures')

    # Map each atom index back to the molecule it came from.
    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix

    atom_neighbor_list = construct_atom_neighbor_list(array_rep)

    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms within a given radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius - 1))
            return cur_set

    # Recreate the trained network to read out the last layer's weights.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params,
                            params['l2_penalty'])
    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))

    for fp_ix in range(params['fp_length']):
        print "FP {0} has linear regression coefficient {1}".format(
            fp_ix, last_layer_weights[fp_ix][0])
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius)
                              for atom_ix, fp_activation in enumerate(fp_activations)]

        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        combined_list = sorted(unique_list, key=lambda x: -x[0])

        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index,
            # across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(
                array_rep, most_active_atom_ix, cur_radius)
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix]
                                    for our_ix in highlight_list_our_ixs]
            print "radius:", cur_radius, "atom list:", highlight_list_rdkit, \
                "activation", activation
            draw_molecule_with_highlights(
                "figures/fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)
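
# Standalone sketch of the neighborhood recursion used by plot() above, on a
# hypothetical three-atom chain 0-1-2: collect every atom reachable within
# `radius` bonds of a starting atom.
def example_neighborhood(neighbors, start_ix, radius):
    reached = set([start_ix])
    if radius > 0:
        for n_ix in neighbors[start_ix]:
            reached.update(example_neighborhood(neighbors, n_ix, radius - 1))
    return reached

assert example_neighborhood({0: [1], 1: [0, 2], 2: [1]}, 0, 2) == set([0, 1, 2])
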
def conv_fp_func(conv_params):
    # Return a loss closure over the ambient (smiles, targets) data, plus the
    # parser that maps a flat weight vector onto the network's parameters.
    loss, _, parser = build_conv_deep_net(conv_params, vanilla_net_params,
                                          fp_l2_penalty=0.0)
    return lambda weights: loss(weights, smiles, targets), parser
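
# Example use of conv_fp_func above with autograd (the conv-layer sizes and
# fingerprint length are illustrative): the returned closure is differentiated
# with respect to the network weights, which is how the fingerprint network is
# trained end to end.
def example_conv_fp_grad():
    conv_params = {'num_hidden_features': [20, 20], 'fp_length': 50}
    loss_wrt_weights, parser = conv_fp_func(conv_params)
    return grad(loss_wrt_weights)  # callable: weights -> d(loss)/d(weights)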