Example #1
0
def main(args):
    """Train and evaluate a QSAR regressor over several independent runs.

    Reads train/val/test CSV splits from ``args.datafolder``, normalizes
    targets with statistics fitted on the training split, trains a fresh
    network per run, and reports per-run and aggregate test MSE.

    Returns:
        (avg_mse, std_mse): mean and std of per-class test MSE across runs.
    """
    BSZ, RUNS, LR, N_EPOCH = args.batch_size, args.runs, args.lr, args.epochs
    DATAFOLDER = Path(args.datafolder)
    assert DATAFOLDER.exists()
    # NOTE(review): plain string concatenation — assumes args.output_dir
    # already ends with a path separator; verify against callers.
    OUTPUT = args.output_dir + DATAFOLDER.resolve().stem
    Path(OUTPUT).mkdir(parents=True, exist_ok=True)

    # load data
    NCLASS = len(args.target_name)
    if NCLASS > 1:
        OUTPUT += '_'.join(args.target_name)
    else:
        OUTPUT += args.target_name[0]
    train_data, norm_fn, rstr_fn = \
        construct_dataset(DATAFOLDER / 'train.csv', args.target_name,
                          args.sample, args.use_tqdm)
    val_smiles, val_y = load_csv(DATAFOLDER / 'val.csv', args.target_name,
                                 args.sample)
    test_smiles, test_y = load_csv(DATAFOLDER / 'test.csv', args.target_name,
                                   args.sample)
    if NCLASS == 1:
        # PyTorch MSELoss treats one class differently from multi-class.
        val_y, test_y = np.squeeze(val_y), np.squeeze(test_y)
    # Validation/test targets are normalized with train-split statistics.
    val_data = MolData(val_smiles, norm_fn(val_y), args.use_tqdm)
    test_data = MolData(test_smiles, norm_fn(test_y), args.use_tqdm)

    if args.fp_method == FP_METHODS[0]:
        raise NotImplementedError

    res = []
    for run in range(RUNS):
        train_loader = DataLoader(train_data,
                                  batch_size=BSZ,
                                  shuffle=True,
                                  drop_last=True,
                                  pin_memory=True)
        valid_loader = DataLoader(val_data,
                                  batch_size=BSZ,
                                  shuffle=False,
                                  pin_memory=True)
        test_loader = DataLoader(test_data, batch_size=BSZ, shuffle=False)
        net = create_net(hid_dim=128,
                         n_class=NCLASS,
                         pre_trained=args.fine_tune)
        model_path = OUTPUT + "_"
        net = net.fit(train_loader,
                      valid_loader,
                      epochs=N_EPOCH,
                      path=model_path,
                      criterion=nn.MSELoss(),
                      lr=LR)
        gt = test_y
        # Predictions come back in normalized space; restore before scoring.
        prd = rstr_fn(net.predict(test_loader))
        mse_percls = mse(gt, prd, 0)  # compute once, reuse below
        res.append(mse_percls)
        print(f"mse_{DATAFOLDER.stem}_RUN_{run}: {mse(gt, prd)}")
        print(f"mse_percls_{DATAFOLDER.stem}_RUN_{run}: {mse_percls}")

    res = np.asarray(res)  # convert once instead of twice
    avg_mse, std_mse = res.mean(), res.std()
    return avg_mse, std_mse
Example #2
0
def main(reg=False, is_extra=True):
    """Cross-validated QSAR training on ChEMBL androgen-receptor data.

    Builds a (compound x target) label matrix from numeric pChEMBL values
    plus records that only imply inactivity, runs 5-fold CV with a GCN
    model, and writes CV and independent-set scores to output/gcn*.

    Args:
        reg: if True, keep continuous pChEMBL labels (regression) and use
            the binary records as the independent set; otherwise binarize
            at pChEMBL > 6.5 and hold out 1/8 of the rows.
        is_extra: if True, train on numeric + binary records combined;
            otherwise only on the numeric records.
    """
    pair = ['TARGET_CHEMBLID', 'CMPD_CHEMBLID', 'PCHEMBL_VALUE',
            'CANONICAL_SMILES', 'ACTIVITY_COMMENT', 'STANDARD_TYPE', 'RELATION']
    df = pd.read_csv('data/AR_ALL.csv')
    cmps = df.set_index(pair[1])[pair[3]].drop_duplicates()
    df = df[pair].set_index(pair[0:2])
    # Average duplicate (target, compound) measurements.
    df['PCHEMBL_VALUE'] = df.groupby(pair[0:2]).mean()
    numery = df[pair[2:4]].dropna().drop_duplicates()

    # Records without a numeric pChEMBL value that still imply inactivity.
    comments = df[(df.ACTIVITY_COMMENT.str.contains('Not Active') == True)]
    inhibits = df[(df.STANDARD_TYPE == 'Inhibition') & df.RELATION.isin(['<', '<='])]
    relations = df[df.STANDARD_TYPE.isin(['EC50', 'IC50', 'Kd', 'Ki']) & df.RELATION.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary['PCHEMBL_VALUE'] = 3.99  # surrogate value below the activity cutoff
    binary = binary[pair[2:4]].dropna().drop_duplicates()

    # pd.concat replaces DataFrame.append (removed in pandas >= 2.0).
    df = pd.concat([numery, binary])
    df = df[pair[2]].unstack(pair[0])
    df = df.sample(len(df))  # shuffle rows

    if reg:
        test = binary[pair[2]].sample(len(binary)).unstack(pair[0])
    else:
        df = (df > 6.5).astype(float)
        test = df.sample(len(df)//8)
        df = df.drop(test.index)
    data = df if is_extra else numery.sample(len(numery))

    indep_set = MolData(cmps.loc[test.index], test.values)
    indep_loader = DataLoader(indep_set, batch_size=BATCH_SIZE)
    folds = KFold(5).split(data)
    cvs = np.zeros(data.shape)
    inds = np.zeros(test.shape)
    # NOTE(review): `subset` is not defined in this function — presumably a
    # module-level global; confirm it exists before running.
    out = 'output/gcn%s' % ('_' + subset if subset else '')
    for i, (train_idx, valid_idx) in enumerate(folds):
        # BUG FIX: the original rebound `valided` to the validation
        # DataFrame and then did `cvs[valided] = ...`, indexing the numpy
        # score array with a frame of labels instead of the fold's row
        # positions. Keep the positional index arrays separate.
        trained, valided = data.iloc[train_idx], data.iloc[valid_idx]
        train_set = MolData(cmps.loc[trained.index], trained.values)
        valid_set = MolData(cmps.loc[valided.index], valided.values)
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        net = QSAR(hid_dim=128, n_class=data.shape[1]).to(util.dev)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH, path='%s_%d' % (out, i))
        print('Evaluation of Loss in validation Set: %f' % net.evaluate(valid_loader))
        print('Evaluation of Loss in independent Set: %f' % net.evaluate(indep_loader))
        cvs[valid_idx] = net.predict(valid_loader)
        inds += net.predict(indep_loader)  # accumulated over folds (ensemble sum)

    data_score, test_score = pd.DataFrame(), pd.DataFrame()
    data_score['LABEL'] = data.stack()
    test_score['LABEL'] = test.stack()
    data_score['SCORE'] = pd.DataFrame(cvs, index=data.index, columns=data.columns).stack()
    test_score['SCORE'] = pd.DataFrame(inds, index=test.index, columns=test.columns).stack()
    data_score.to_csv(out + '.cv.txt')
    test_score.to_csv(out + '.ind.txt')
Example #3
0
def construct_dataset(data_file, target_name, sample=None, use_tqdm=None):
    """Build a MolData dataset with normalized targets.

    Returns the dataset together with the normalization function and its
    inverse, so callers can map predictions back to the original scale.
    """
    smiles, labels = load_csv(data_file, target_name, sample)
    if len(target_name) == 1:
        # PyTorch MSELoss treats one class differently from multi-class,
        # so collapse the trailing axis for single-target problems.
        labels = np.squeeze(labels)
    norm_fn, rstr_fn = normalize_array(labels)
    dataset = MolData(smiles, norm_fn(labels), use_tqdm=use_tqdm)
    return dataset, norm_fn, rstr_fn
Example #4
0
 def build_data_net(args, target):
     """Return (dataset, net factory) for the configured fingerprint method.

     Raises:
         NotImplementedError: for any fp_method beyond the first two.
     """
     method = args.fp_method
     if method == FP_METHODS[0]:
         # CFP: circular fingerprints fed to a plain MLP.
         dataset = SmileData(SMILES, target, fp_len=FP_LEN, radius=4)
         return dataset, (lambda: MLP(hid_dim=FP_LEN, n_class=1))
     if method == FP_METHODS[1]:
         # NFP: neural fingerprints via the graph-convolution model.
         dataset = MolData(SMILES, target)
         return dataset, (lambda: QSAR(hid_dim=128, n_class=1))
     raise NotImplementedError
Example #5
0
    # --- Evaluate a pre-trained model on the held-out test split. ---
    INPUT = Path(args.input_file)
    if not INPUT.exists(): raise FileNotFoundError
    SMILES, TARGET, KEYS = load_multiclass_csv(INPUT,
                                               target_name=args.target_name)
    print(f"column names {INPUT.stem} with {len(KEYS)} columns:\
          {KEYS.tolist()}")
    NCLASS = len(KEYS)
    print(f"NCLASS: {NCLASS}")
    net = try_load_net(args.model)
    train_idx, valid_idx, test_idx = \
        split_train_valid_test(len(TARGET), seed=args.split_seed)
    print(f"split_sig: {SHUFFLE_SIG}")
    # Fit normalization statistics on train+valid targets only, so the
    # test split stays unseen.
    norm_func, restore_func = normalize_array(
        np.concatenate([TARGET[train_idx], TARGET[valid_idx]], axis=0))
    target = norm_func(TARGET)
    test_data = MolData(SMILES[test_idx], target[test_idx], use_tqdm=args.tqdm)
    test_loader = DataLoader(test_data, batch_size=BSZ, shuffle=False)
    score = net.predict(test_loader)
    gt = TARGET[test_idx]  # ground truth in the original (unnormalized) scale
    prd = restore_func(score)  # map predictions back to the original scale

    # Per-target metric accumulators. NOTE(review): the visible lines only
    # fill res_r2 — the rest of this loop appears truncated below.
    res_r2 = []
    res_cor = []
    res_mae = []
    res_mse = []
    if len(prd.shape) == 1:  # for single class
        prd = np.expand_dims(prd, 1)
    for idx, k in enumerate(KEYS):
        print(f"idx, k, {idx}, {k}, {prd.shape}, {gt.shape}")
        gt_i, prd_i = gt[:, idx], prd[:, idx]
        res_r2.append(r2_score(gt_i, prd_i))
Example #6
0
                alpha=0.3)
    plt.xlabel("circular fingerprint distance")
    plt.ylabel("neural fingerprint distance")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title("Correlation = {:.4f}".format(np.corrcoef(res[0], res[1])[0, 1]))
    plt.savefig(filename, dpi=300, bbox_inches='tight')


if __name__ == '__main__':
    # Load Data
    DATAFILE = Path('./dataset/solubility/delaney-processed.csv')
    df = pd.read_csv(DATAFILE)
    target = df['measured log solubility in mols per litre'].values
    target, restore = normalize_array(target)
    data = MolData(df['smiles'], target)
    print(type(df['smiles'][0]), df['smiles'][0])
    tmp = df['smiles'][0]
    print(get_circular_fp(tmp))
    exit()

    # Plot with a random weight and 2048 length as in Figure3Left
    gcn_act = ['sigmoid', 'relu', 'tanh']
    gop_act = ['sigmoid', 'tanh', 'softmax']
    large_weights = [(-1e7, 1e7), (0, 1e7), (-1e3, 1e3), (-10, 10)]
    max_degs = [1, 6]
    res = {}
    for a1, a2, bnds, rd in its.product(gcn_act, gop_act, large_weights,
                                        max_degs):
        SEED, FP_LEN = 7, 1 << 11
        net = QSAR(hid_dim=FP_LEN,