예제 #1
0
def embed_nf_original(input_directory, 
                      output_directory, 
                      weights,
                      verbose = False,
                      overwrite = True):

    trained_weights = None
    with open(weights) as f:
        trained_weights = pickle.load(f)

    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
       build_convnet_fingerprint_fun(**conv_arch_params)
   
    progress = tqdm(total = count_files(input_directory), disable = not verbose)
    files = np.asarray([input_directory + "/" +  i for i in os.listdir(input_directory)])
    embeddings = output_layer_fun(trained_weights, np.asarray(files))

    output_map = {}
    for i, f in enumerate(files):
        output_map[f.split('/')[-1].split('.')[0]] = embeddings[i]
        with open(output_directory + '/' + f.split('/')[-1].split('.')[0] + '.nemb', 'wb+') as f2:
            for q in embeddings[i]:
                f2.write(str(q))
                f2.write(' ')
        progress.update(1)
    progress.close()

    return output_map
def plot(trained_weights):
    print "Loading training data..."
    traindata, valdata, testdata = load_data(task_params['data_file'],
                        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
                        input_name='smiles', target_name=task_params['target_name'])
    train_smiles, train_targets = traindata

    print "Convnet fingerprints with neural net"
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
       build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)

    if not os.path.exists('figures'): os.makedirs('figures')

    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix

    atom_neighbor_list = construct_atom_neighbor_list(array_rep)

    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms in a certain radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1))
            return cur_set

    # Recreate trained network.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])

    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))

    for fp_ix in range(params['fp_length']):
        print "FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0])
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)]

        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        combined_list = sorted(unique_list, key=lambda x: -x[0])

        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index, across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius)
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs]

            print "radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation
            draw_molecule_with_highlights(
                "figures/fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)
def invariant_conv_fp_func():
    fp_func, parser = build_convnet_fingerprint_fun(
        num_hidden_features=[100, 100, 100], fp_length=64, normalize=False)
    weights = npr.randn(len(parser))
    return lambda smiles: fp_func(weights, (smiles, ))