Exemplo n.º 1
0
def test_scheme_C2():
    """Exercise selection scheme C2 as a learning curve on the pickled fixtures.

    Each round grows ``chosen`` by calling ``select_molecules_C2`` with the
    previous selection, splits the data with ``get_split`` and checks that:

    * the train split holds ``num * counter`` graphs and the test split the rest,
    * molecule names are unique within each split and disjoint across splits,
    * every train molecule has at least as many entries equal to 2 in its
      ``conn`` rows as every test molecule (presumably a double-bond count --
      confirm against how ``conn`` is built).
    """

    # Need to test it as a learning curve, not just pick one

    num = 3

    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    def count_twos(molname):
        """Count the entries equal to 2 over all ``conn`` rows of *molname*."""
        conn_rows = atom_df.loc[atom_df.molecule_name == molname]["conn"]
        return [bond for conn in conn_rows.values for bond in conn].count(2)

    total = len(graphs)
    counter = 0
    chosen = []
    while len(chosen) < len(graphs) - num:

        # Iterations continue until there are not enough graphs to select
        # Each iteration we want (num*n) graphs in train, and (total-(num*n)) graphs in test
        # Make the selection the same each time, we just want to select chosen from the df
        # We then just grow chosen each time using the selection scheme

        counter += 1
        chosen = select_molecules_C2(mol_df,
                                     atom_df,
                                     pair_df,
                                     prev_chosen=chosen,
                                     num=num)
        train_graphs, train_mol_df, test_graphs, test_mol_df = get_split(
            chosen, mol_df, graphs)

        assert len(train_graphs) == num * counter
        assert len(test_graphs) == total - (num * counter)
        assert len(train_mol_df.molecule_name.unique()) == len(
            train_mol_df.molecule_name)
        assert len(test_mol_df.molecule_name.unique()) == len(
            test_mol_df.molecule_name)
        assert counter < total / num

        test_names = test_mol_df.molecule_name.unique()
        for molname in train_mol_df.molecule_name.unique():
            assert molname not in test_names

        # The test-side counts must come from the *test* molecules' own rows;
        # compute them once per round rather than once per train molecule.
        test_counts = [count_twos(name) for name in test_mol_df.molecule_name]
        for train_name in train_mol_df.molecule_name:
            train_count = count_twos(train_name)
            for test_count in test_counts:
                assert train_count >= test_count
Exemplo n.º 2
0
def test_scheme_D6():
    """Exercise selection scheme D6 as a learning curve on the pickled fixtures.

    Every round extends the selection through ``select_molecules_D6``, splits
    the data with ``get_split`` and verifies the split sizes, the uniqueness of
    molecule names within each split, and that each train molecule has at
    least as many atoms with a ``typestr`` outside H, C, N, O, F as each test
    molecule.
    """
    num = 3

    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    def uncommon_atom_count(name):
        """Count the atoms of molecule *name* whose typestr is not H/C/N/O/F."""
        atom_types = atom_df.loc[(atom_df.molecule_name == name)]["typestr"]
        return sum(1 for atom_type in atom_types.values
                   if atom_type not in ['H', 'C', 'N', 'O', 'F'])

    total = len(graphs)
    rounds = 0
    chosen = []
    while len(chosen) < len(graphs) - num:
        # Keep going until fewer than `num` unselected graphs would remain.
        # After round n the train split should hold num*n graphs and the
        # test split the remaining total - num*n; the selection only grows.
        rounds += 1

        chosen = select_molecules_D6(mol_df,
                                     atom_df,
                                     pair_df,
                                     prev_chosen=chosen,
                                     num=num)
        split = get_split(chosen, mol_df, graphs)
        train_graphs, train_mol_df, test_graphs, test_mol_df = split

        expected_train = num * rounds
        assert len(train_graphs) == expected_train
        assert len(test_graphs) == total - expected_train

        train_names = train_mol_df.molecule_name
        test_names = test_mol_df.molecule_name
        assert len(train_names.unique()) == len(train_names)
        assert len(test_names.unique()) == len(test_names)
        assert rounds < total / num

        for train_name in train_names:
            train_count = uncommon_atom_count(train_name)
            for test_name in test_names:
                assert train_count >= uncommon_atom_count(test_name)
Exemplo n.º 3
0
def test_scheme_B2():
    """Exercise selection scheme B2 as a learning curve on the pickled fixtures.

    The selection is grown round by round via ``select_molecules_B2``. After
    each round the test checks the train/test split sizes, the uniqueness and
    disjointness of molecule names, and that no train graph has more nodes
    than any test graph.
    """
    # Walk the whole learning curve instead of checking a single selection.
    num = 3

    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    total = len(graphs)
    rounds = 0
    chosen = []
    while len(chosen) < len(graphs) - num:
        # Stop once fewer than `num` unselected graphs would remain.
        # Round n expects num*n train graphs and total - num*n test graphs;
        # `chosen` is fed back in so the selection only ever grows.
        rounds += 1

        chosen = select_molecules_B2(mol_df,
                                     atom_df,
                                     pair_df,
                                     prev_chosen=chosen,
                                     num=num)
        split = get_split(chosen, mol_df, graphs)
        train_graphs, train_mol_df, test_graphs, test_mol_df = split

        expected_train = num * rounds
        assert len(train_graphs) == expected_train
        assert len(test_graphs) == total - expected_train

        unique_train = train_mol_df.molecule_name.unique()
        unique_test = test_mol_df.molecule_name.unique()
        assert len(unique_train) == len(train_mol_df.molecule_name)
        assert len(unique_test) == len(test_mol_df.molecule_name)
        assert rounds < total / num

        for name in unique_train:
            assert name not in unique_test

        for train_graph in train_graphs:
            for test_graph in test_graphs:
                assert (train_graph.number_of_nodes()
                        <= test_graph.number_of_nodes())
Exemplo n.º 4
0
def add_IMP_to_df(atom_df, pair_df, model_dir):
    """Run the saved ``all_model.torch`` model and assign its predictions.

    Graphs are built from the two dataframes with ``graph_in.make_graph_df``,
    a ``GTNmodel`` is loaded from ``all_model.torch`` inside ``model_dir``,
    predictions are made with a batch size of 4, and the results are written
    back through ``model.assign_preds``.

    Args:
        atom_df: Atom dataframe passed to ``make_graph_df`` and ``assign_preds``.
        pair_df: Pair dataframe passed to ``make_graph_df`` and ``assign_preds``.
        model_dir: Directory containing ``all_model.torch``; a trailing path
            separator is optional.

    Returns:
        The ``(atom_df, pair_df)`` pair returned by ``model.assign_preds``.
    """
    import os

    mol_df, graphs = graph_in.make_graph_df(atom_df, pair_df)

    print('Making predictions:')

    # Join rather than concatenate so that model_dir works with or without a
    # trailing separator.
    modelfile = os.path.join(model_dir, 'all_model.torch')

    model = GTNmodel()
    model.load_model(modelfile)
    model.params['batch_size'] = 4
    test_loader = model.get_input(graphs, mol_df)
    graphs_out = model.predict(test_loader, progress=True)

    # NOTE(review): the meaning of assign_to="" is defined by
    # GTNmodel.assign_preds, which is not visible here -- confirm.
    atom_df, pair_df = model.assign_preds(graphs_out,
                                          mol_df,
                                          atom_df,
                                          pair_df,
                                          assign_to="",
                                          progress=True)

    return atom_df, pair_df
Exemplo n.º 5
0
def test_scheme_I7():
    """Exercise selection scheme I7 as a learning curve on the pickled fixtures.

    Before every round ``atom_df`` is passed through ``add_randomised_FEPs``
    and ``add_randomised_FEP_vars``. The selection is then grown with
    ``select_molecules_I7`` and the resulting split is checked for the
    expected sizes, unique molecule names per split and disjoint train/test
    names. After the loop the number of rounds and the final selection size
    are checked.
    """

    num = 3

    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    total = len(graphs)
    counter = 0
    chosen = []
    while len(chosen) < len(graphs) - num:

        atom_df = add_randomised_FEPs(atom_df)
        atom_df = add_randomised_FEP_vars(atom_df)

        counter += 1
        chosen = select_molecules_I7(mol_df,
                                     atom_df,
                                     pair_df,
                                     prev_chosen=chosen,
                                     num=num)
        train_graphs, train_mol_df, test_graphs, test_mol_df = get_split(
            chosen, mol_df, graphs)

        # Use a real message: print() returns None, so it would leave the
        # AssertionError without any text.
        assert len(train_graphs) == num * counter, (
            f"train size {len(train_graphs)} != num {num} * counter {counter}")
        assert len(test_graphs) == total - (num * counter)
        assert len(train_mol_df.molecule_name.unique()) == len(
            train_mol_df.molecule_name)
        assert len(test_mol_df.molecule_name.unique()) == len(
            test_mol_df.molecule_name)
        assert counter < total / num

        test_names = test_mol_df.molecule_name.unique()
        for molname in train_mol_df.molecule_name.unique():
            assert molname not in test_names

    # NOTE(review): the round count is pinned to the current fixture size with
    # num = 3 -- update if the pickled test molecules change.
    assert counter == 2
    # Everything except the remainder that cannot fill a batch of `num`.
    assert len(chosen) == len(graphs) - len(graphs) % num