def embed_nf_original(input_directory, output_directory, weights, verbose = False, overwrite = True):
    """Embed every file in input_directory with a trained neural-fingerprint net.

    Loads pickled trained weights, builds the convnet fingerprint function
    (with atom activations enabled), runs one forward pass over all input
    file paths, and writes each embedding as space-separated text to
    ``<output_directory>/<basename>.nemb``.

    Parameters
    ----------
    input_directory : str
        Directory whose entries (as paths) are fed to the fingerprint function.
        NOTE(review): the paths themselves are passed as the network input —
        presumably the files contain SMILES-like data read downstream; confirm.
    output_directory : str
        Existing directory where one ``.nemb`` file per input is written.
    weights : str
        Path to a pickle file containing the trained weights.
    verbose : bool
        Show the tqdm progress bar when True.
    overwrite : bool
        Accepted for interface compatibility but never consulted; existing
        ``.nemb`` files are always overwritten.

    Returns
    -------
    dict
        Maps each input file's base name (text before the first ``.``) to its
        embedding vector.
    """
    # BUG FIX: pickles are binary data — text-mode open corrupts the stream on
    # some platforms and raises on Python 3. Open 'rb'.
    with open(weights, 'rb') as f:
        trained_weights = pickle.load(f)

    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
        build_convnet_fingerprint_fun(**conv_arch_params)

    progress = tqdm(total=count_files(input_directory), disable=not verbose)
    files = np.asarray([os.path.join(input_directory, name)
                        for name in os.listdir(input_directory)])
    # Single batched forward pass over every input at once.
    embeddings = output_layer_fun(trained_weights, np.asarray(files))

    output_map = {}
    for i, path in enumerate(files):
        # Base name = text before the first dot (matches the original format).
        base = os.path.basename(path).split('.')[0]
        output_map[base] = embeddings[i]
        # BUG FIX: text is written, so open in text mode ('w'), not 'wb+'.
        # Output format: each value followed by a space (trailing space kept).
        with open(os.path.join(output_directory, base + '.nemb'), 'w') as out:
            for q in embeddings[i]:
                out.write(str(q))
                out.write(' ')
        progress.update(1)
    progress.close()
    return output_map
def plot(trained_weights):
    """Visualize which substructures most activate each fingerprint index.

    Loads the training SMILES, computes per-atom activations with the trained
    convnet, recreates the trained prediction network to read off the linear
    regression coefficient of each fingerprint index, and for each index draws
    the molecules containing its most-activating atom neighborhoods into
    ``figures/fp_<i>_highlight_<j>.pdf``.

    Relies on module-level globals: ``task_params``, ``conv_arch_params``,
    ``params``, ``all_radii``, ``num_figs_per_fp`` and the project helpers
    (``load_data``, ``build_convnet_fingerprint_fun``, ``build_conv_deep_net``,
    ``build_standard_net``, ``construct_atom_neighbor_list``,
    ``remove_duplicates``, ``draw_molecule_with_highlights``).
    """
    print "Loading training data..."
    traindata, valdata, testdata = load_data(task_params['data_file'], (task_params['N_train'], task_params['N_valid'], task_params['N_test']), input_name='smiles', target_name=task_params['target_name'])
    train_smiles, train_targets = traindata
    print "Convnet fingerprints with neural net"
    # Atom activations are needed to rank substructures per fingerprint index.
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
        build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)
    if not os.path.exists('figures'): os.makedirs('figures')
    # Map each (global) atom index back to the molecule it came from.
    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix
    atom_neighbor_list = construct_atom_neighbor_list(array_rep)
    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms in a certain radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1))
            return cur_set
    # Recreate trained network.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    # NOTE: this flips the global flag set above back to False for the
    # rebuilt network.
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    # First-layer weights of the prediction net: one coefficient per
    # fingerprint index (assumes a single output column — TODO confirm).
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))
    for fp_ix in range(params['fp_length']):
        print "FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0])
        # Gather (activation, atom index, radius) triples across all depths.
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)]
        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        # Sort by activation, descending.
        combined_list = sorted(unique_list, key=lambda x: -x[0])
        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index, across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius)
            # Translate internal atom indices back to RDKit atom indices for drawing.
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs]
            print "radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation
            draw_molecule_with_highlights(
                "figures/fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)
def invariant_conv_fp_func():
    """Return a fingerprint closure over one fixed random weight draw.

    Builds a convolutional fingerprint function (three 100-unit hidden
    layers, fp_length=64, unnormalized), samples a weight vector once, and
    returns a function mapping a single SMILES string to its fingerprint
    under those frozen weights.
    """
    compute_fp, weight_parser = build_convnet_fingerprint_fun(
        num_hidden_features=[100, 100, 100], fp_length=64, normalize=False)
    fixed_weights = npr.randn(len(weight_parser))

    def fingerprint(smiles):
        # The underlying function is batched: wrap the input in a 1-tuple.
        return compute_fp(fixed_weights, (smiles, ))

    return fingerprint