示例#1
0
def getGraphX(AID, data_dir='/home/gabriel/Dropbox/UCL/Thesis/Data/', save_df=True):
    """Attach graph representations to an assay's processed activity table.

    Loads ``<data_dir>/<AID>/<AID>mol_processed.pkl``, converts every entry
    of its ``MOL`` column with ``mol2graph.mol2vec`` (graph representations
    for the PyTorch implementation), and merges the result onto the table
    stored in ``<AID>_processed.pkl`` using ``PUBCHEM_CID`` as the key.

    Args:
        AID: Assay identifier string; used both as the sub-directory name
            and as the file-name prefix.
        data_dir: Root directory that holds one sub-directory per AID.
            Defaults to the location that was previously hard-coded.
        save_df: When true (the default, matching the previous behaviour),
            pickle the merged table to ``<AID>graph_processed.pkl``.

    Returns:
        The merged activity table, including a ``Graph Rep`` column.
    """
    aid_dir = os.path.join(data_dir, AID)
    # File names are built exactly as before (note: no underscore before
    # 'mol_processed' / 'graph_processed').
    file_prefix = aid_dir + '/' + AID

    # Context managers guarantee the handles are closed even if unpickling fails.
    with open(file_prefix + 'mol_processed.pkl', 'rb') as mol_file:
        activity_table = pickle.load(mol_file)

    graph_rep_list = [mol2graph.mol2vec(m) for m in activity_table['MOL']]
    AID_and_graph_rep = pd.DataFrame()
    AID_and_graph_rep['Graph Rep'] = graph_rep_list
    # graph_rep_list is in row order, so the CIDs must be attached by position.
    # Assigning the Series itself would align on index labels and mis-pair (or
    # NaN-fill) rows whenever activity_table does not carry a default 0..n-1 index.
    AID_and_graph_rep['PUBCHEM_CID'] = activity_table['PUBCHEM_CID'].to_numpy()

    with open(file_prefix + '_processed.pkl', 'rb') as main_file:
        main_activity_table = pickle.load(main_file)

    main_activity_table = main_activity_table.merge(AID_and_graph_rep,
                                                    on='PUBCHEM_CID')

    if save_df:
        main_activity_table.to_pickle(file_prefix + 'graph_processed.pkl')
    return main_activity_table

#%%
    '''Classifier Section'''
    '''SVM'''
                                          test_size=0.2,
                                          random_state=2562)
 labels = np.array([
     1 if x == 'Active' else 0
     for x in activity_table['PUBCHEM_ACTIVITY_OUTCOME']
 ])
 for big_train_ind, big_test_ind in big_splitter.split(
         activity_table, activity_table['PUBCHEM_ACTIVITY_OUTCOME']):
     train_X = np.atleast_2d(activity_table['MOL'].iloc[big_train_ind]).T
     train_y = labels[big_train_ind]
     test_X = np.atleast_2d(activity_table['MOL'].iloc[big_test_ind]).T
     test_y = labels[big_test_ind]
     train_X_oversampled, train_y_oversampled = ros.fit_resample(
         train_X, train_y)
     train_X = [
         mol2graph.mol2vec(m) for m in np.squeeze(train_X_oversampled)
     ]
     test_X = [mol2graph.mol2vec(m) for m in np.squeeze(test_X)]
     #attach train labels to data
     for data, label in zip(train_X, train_y_oversampled):
         data.y = torch.tensor([[label]], dtype=torch.float)
     for data, label in zip(test_X, test_y):
         data.y = torch.tensor([[label]], dtype=torch.float)
     train_loader = DataLoader(train_X,
                               batch_size=128,
                               shuffle=True,
                               drop_last=False,
                               num_workers=8)
     test_loader = DataLoader(test_X,
                              batch_size=128,
                              shuffle=True,
示例#3
0
            f'Epoch: {epoch}, Loss: {epoch_loss:.3f}, Train acc: {train_acc:.3f}, Val acc: {val_acc:.3f}',
        )

    return hist


if __name__ == '__main__':
    # get_data() is defined elsewhere in this file; presumably it makes the
    # solubility SDF files used below available -- confirm against its definition.
    get_data()

    # SDMolSupplier yields None for any record RDKit fails to parse; skip those
    # so that mol2vec and GetProp below never receive None.
    train_mols = [
        m for m in Chem.SDMolSupplier('solubility.train.sdf') if m is not None
    ]
    test_mols = [
        m for m in Chem.SDMolSupplier('solubility.test.sdf') if m is not None
    ]
    # Maps the 'SOL_classification' property strings to integer class ids.
    sol_cls_dict = {'(A) low': 0, '(B) medium': 1, '(C) high': 2}

    print(sol_cls_dict)

    # Convert each molecule to its graph representation and attach the class
    # label as a 1-element long tensor on the graph's ``y`` attribute.
    train_x = [mol2graph.mol2vec(m) for m in train_mols]
    for mol, data in zip(train_mols, train_x):
        y = sol_cls_dict[mol.GetProp('SOL_classification')]
        data.y = torch.tensor([y], dtype=torch.long)

    test_x = [mol2graph.mol2vec(m) for m in test_mols]
    for mol, data in zip(test_mols, test_x):
        y = sol_cls_dict[mol.GetProp('SOL_classification')]
        data.y = torch.tensor([y], dtype=torch.long)

    # Quick sanity report on the first training graph.
    print(f'Number of graphs: {len(train_x)}')
    first_sample = train_x[0]
    print('Looking at first example ..')
    print(f'\t{first_sample}')
    print(f'\t # of nodes: {first_sample.x.shape[0]}')
    print(f'\t # of features per node: {first_sample.x.shape[1]}')