def fit_clfs(chid, n_estimators, n_jobs):
    """Fit random forests on two stratified halves of an assay dataset.

    Args:
        chid: assay identifier; data is read from ./assays/processed/{chid}.csv
        n_estimators: number of trees per random forest
        n_jobs: parallel jobs passed to RandomForestClassifier

    Returns:
        clfs: dict of fitted classifiers ('Split1', 'Split1_alt', 'Split2')
        aucs: dict of AUCs, each classifier scored on the opposite split
        balance: tuple with the fraction of actives in split 1 / split 2
        df1: data in split 1 (fingerprint column removed)
        df2: data in split 2 (fingerprint column removed)
    """
    # Load the assay and compute ECFP fingerprints.
    assay_file = f'./assays/processed/{chid}.csv'
    print(f'Reading data from: {assay_file}')
    df = pd.read_csv(assay_file)

    df['ecfp'] = ecfp(df.smiles)
    df1, df2 = train_test_split(df, test_size=0.5, stratify=df['label'])

    X1, X2 = np.array(list(df1['ecfp'])), np.array(list(df2['ecfp']))
    y1, y2 = np.array(list(df1['label'])), np.array(list(df2['label']))

    # Fingerprints are no longer needed in the returned frames.
    del df1['ecfp']
    del df2['ecfp']

    balance = (np.mean(y1), np.mean(y2))

    # Train one forest per entry. 'Split1_alt' is a second forest fitted on
    # the same data as 'Split1' (captures model variability, not data split).
    clfs = {}
    for name, X, y in (('Split1', X1, y1),
                       ('Split1_alt', X1, y1),
                       ('Split2', X2, y2)):
        clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)
        clf.fit(X, y)
        clfs[name] = clf

    # Score each classifier on the half it was NOT trained on.
    aucs = {
        'Split1': calc_auc(clfs['Split1'], X2, y2),
        'Split1_alt': calc_auc(clfs['Split1_alt'], X2, y2),
        'Split2': calc_auc(clfs['Split2'], X1, y1),
    }
    print("AUCs:")
    for name, score in aucs.items():
        print(f'{name}: {score}')

    return clfs, aucs, balance, df1, df2
示例#2
0
def train_epoch(model, optimizer, criterion, x_train, x_train_external,
                y_train):
    """Run one training epoch and return (mean loss, mean AUC).

    Iterates over full batches only; a trailing partial batch is dropped.
    Relies on module-level globals: `config` (batch_size), `device`, `utils`.

    Args:
        model: the network; called with two input tensors.
        optimizer: torch optimizer stepping the model's parameters.
        criterion: loss function taking (output, target).
        x_train, x_train_external: index-aligned input arrays.
        y_train: labels aligned with x_train.

    Returns:
        (average loss per batch, average AUC per batch)
    """
    model.train()
    auc_meter, loss_meter, it_count = 0, 0, 0
    batch_size = config.batch_size

    for i in range(0, len(x_train) - batch_size + 1, batch_size):
        inputs1 = torch.tensor(x_train[i:i + batch_size],
                               dtype=torch.float,
                               device=device)
        inputs2 = torch.tensor(x_train_external[i:i + batch_size],
                               dtype=torch.float,
                               device=device)
        target = torch.tensor(y_train[i:i + batch_size],
                              dtype=torch.float,
                              device=device)
        # Call the module, not .forward(), so registered hooks still run
        # (and to match val_epoch's convention).
        output = model(inputs1, inputs2)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        loss_meter += loss.item()
        it_count += 1
        auc_meter = auc_meter + utils.calc_auc(target, torch.sigmoid(output))

    return loss_meter / it_count, auc_meter / it_count
示例#3
0
def evalute(sess: tf.Session, test_data: TrainDataIter,
            model: Model) -> typing.Tuple:
    """Evaluate `model` over every batch yielded by `test_data`.

    Returns:
        (auc, recall, precision, f1,
         mean loss, mean accuracy, mean auxiliary loss)
    """
    loss_sum, accuracy_sum, aux_loss_sum = 0.0, 0.0, 0.0
    cnt = 0
    store_arr = []

    for feature, target in test_data:
        cnt += 1

        # NOTE: `target` is re-bound to the prepared version returned here.
        (user_ids, ad_ids, code_ids, ad_his, code_his, ad_mask,
         lengths_xx, target) = prepare_data(feature, target, choose_len=0)

        prob, loss, acc, aux_loss = model.calculate(sess, [
            user_ids, ad_ids, code_ids, ad_his, code_his, ad_mask, target,
            lengths_xx
        ])

        loss_sum += loss
        accuracy_sum += acc
        aux_loss_sum += aux_loss

        # Collect (positive-class probability, positive-class label) pairs.
        store_arr.extend(
            [p, t]
            for p, t in zip(prob[:, 1].tolist(), target[:, 1].tolist()))

    all_auc, r, p, f1 = calc_auc(store_arr)

    return (all_auc, r, p, f1,
            loss_sum / cnt, accuracy_sum / cnt, aux_loss_sum / cnt)
示例#4
0
def evaluate(sess, test_data, model):
    """Evaluate `model` over `test_data` and return aggregate metrics.

    Relies on a module-level `temp` that is forwarded to model.calculate
    (presumably a temperature/config value -- defined outside this view).

    Returns:
        (auc, mean loss, mean accuracy, mean auxiliary loss)
    """
    test_loss_sum = 0.0
    test_accuracy_sum = 0.0
    test_aux_loss_sum = 0.0
    nums = 0
    stored_arr = []
    for src, tgt in test_data:
        nums += 1
        uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(
            src, tgt, return_neg=True)
        prob, loss, acc, aux_loss = model.calculate(sess, [
            uids, mids, cats, mid_his, cat_his, mid_mask, target, sl,
            noclk_mids, noclk_cats, temp
        ])

        test_loss_sum += loss
        test_accuracy_sum += acc
        # BUG FIX: was `= aux_loss`, which kept only the last batch's value
        # even though it is averaged over `nums` below. Accumulate instead,
        # matching the other two meters.
        test_aux_loss_sum += aux_loss
        # Collect (positive-class probability, positive-class label) pairs.
        prob_1 = prob[:, 0].tolist()
        target_1 = target[:, 0].tolist()
        for p, t in zip(prob_1, target_1):
            stored_arr.append([p, t])

    test_auc = utils.calc_auc(stored_arr)
    test_loss_avg = test_loss_sum / nums
    test_accuracy_avg = test_accuracy_sum / nums
    test_aux_loss_avg = test_aux_loss_sum / nums
    return test_auc, test_loss_avg, test_accuracy_avg, test_aux_loss_avg
示例#5
0
def val_epoch(model, criterion, x_val, x_val_external, y_val):
    """Run one validation epoch and return (mean loss, mean AUC).

    Mirrors train_epoch but without gradient updates. Relies on
    module-level globals: `config` (batch_size), `device`, `utils`.

    Args:
        model: the network; called with two input tensors.
        criterion: loss function taking (output, target).
        x_val, x_val_external: index-aligned input arrays.
        y_val: labels aligned with x_val.

    Returns:
        (average loss per batch, average AUC per batch)
    """
    model.eval()
    auc_meter, loss_meter, it_count = 0, 0, 0
    batch_size = config.batch_size

    with torch.no_grad():
        # BUG FIX: the bound was `len(x_val) - batch_size` (no `+ 1`), which
        # skipped the final full batch whenever len(x_val) is an exact
        # multiple of batch_size. Now matches train_epoch.
        for i in range(0, len(x_val) - batch_size + 1, batch_size):
            inputs1 = torch.tensor(x_val[i:i + batch_size],
                                   dtype=torch.float,
                                   device=device)
            inputs2 = torch.tensor(x_val_external[i:i + batch_size],
                                   dtype=torch.float,
                                   device=device)
            target = torch.tensor(y_val[i:i + batch_size],
                                  dtype=torch.float,
                                  device=device)
            output = model(inputs1, inputs2)
            loss = criterion(output, target)
            loss_meter += loss.item()
            it_count += 1
            auc_meter = auc_meter + utils.calc_auc(target,
                                                   torch.sigmoid(output))
    return loss_meter / it_count, auc_meter / it_count
示例#6
0
def _eval(sess, model, test_data, label):
    """Score `test_data`, update the global bests, and checkpoint the model
    whenever a new best AUPRC is reached.

    Returns:
        (precision, recall, f1, auprc) for this evaluation pass.
    """
    # Accumulate per-batch anomaly scores into one (N, 1) column vector.
    ano_scores = []
    for _, batch_test_data in DataInput(test_data, test_batch_size):
        _ano_score, _, _ = model.eval(sess, batch_test_data)
        ano_scores.extend(list(_ano_score))
    ano_scores = np.array(ano_scores).reshape((-1, 1))

    # Highest 80% are anomalous
    prec, rec, f1 = calc_metric(label, ano_scores)
    # Calculate auprc
    _auprc = calc_auc(label, ano_scores)

    global best_f1, best_auprc
    best_f1 = max(best_f1, f1)

    if _auprc > best_auprc:
        # New best AUPRC: remember it and persist the model weights.
        best_auprc = _auprc
        model.save(sess, '{}/ckpt'.format(save_path))
    return prec, rec, f1, _auprc
示例#7
0
    test_set = pickle.load(f)

# Evaluation script: restore a trained BiWGAN checkpoint and report anomaly
# detection metrics on the held-out test set loaded above.
x_test, y_test = test_set

print('test set', x_test.shape)

with tf.Session() as sess:
    # Rebuild the model graph, initialize variables, then overwrite the
    # weights from the saved checkpoint.
    model = BiWGAN(input_dim, method, weight, degree)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    model.restore(sess, '{}/ckpt'.format(save_path))

    # Score the test set batch by batch; one anomaly score per sample.
    ano_scores = []
    for _, batch_test_data in DataInput(x_test, test_batch_size):
        _ano_score, _, _ = model.eval(sess, batch_test_data)
        # extend
        ano_scores += list(_ano_score)
    ano_scores = np.array(ano_scores).reshape((-1, 1))

    # Highest 80% are anomalous
    prec, rec, f1 = calc_metric(y_test, ano_scores, percentile=80)

    # Calculate auc
    auprc = calc_auc(y_test, ano_scores)
    print('Prec:{:.4f}  |  Rec:{:.4f}  |  F1:{:.4f}  |  AUPRC:{:.4f}'.format(
        prec, rec, f1, auprc))

    # draw prc curve
    # draw_prc(y_test, ano_scores)
示例#8
0
                        fp, tp = parsers.eval_hof(
                            [gp.compile(i, evo.pset) for i in hof], X[test],
                            y[test])

                elif (cond[0] == 'rf') or (cond[0] == 'svm'):
                    fp, tp = classifier.eval(X[train],
                                             X[test],
                                             y[train],
                                             y[test],
                                             clf=cond[0],
                                             seed=seed)

                tprs.append(tp)
                fprs.append(fp)

            auc_scores[r, :] = utils.calc_auc(fprs,
                                              tprs,
                                              figure,
                                              plot_roc=True)
            plt.savefig(r"./results/images/TrimmedDataset/" + method[n] +
                        "-AUC" + "_reps" + str(r))
            figure += 1
            print('-' * 75)

        utils.csv_save(method[n], auc_scores)
        print(feat)

    print('Done')

    # plt.show() # show figures