Exemplo n.º 1
0
def get_score():
    """Train the model with 5-fold CV and score it with a bias-aware AUC metric.

    Everything is configured through the module-level ``args`` namespace.
    The function

    1. loads the cached training arrays from ``args.temporary_file``;
    2. builds per-sample loss weights from the toxicity target and the
       identity columns of the competition training file;
    3. trains one model per fold (fastai ``Learner``) with a decaying
       learning rate, evaluating after each epoch;
    4. optionally saves the model and a small JSON description of it when
       ``args.model_file`` is non-empty (training then uses *all* rows and
       stops after the first fold).

    The score is the mean of the overall ROC-AUC and the power means
    (p=-5) of the per-identity subgroup, BPSN and BNSP ROC-AUCs.

    Returns:
        dict | float: normally a dict mapping epoch index to a list with one
        ``(ensemble_score, single_epoch_score)`` tuple per evaluated fold,
        where the ensemble is the ``2**epoch``-weighted average of the
        per-epoch predictions. When ``args.minimize >= 2`` only the last
        epoch of the first fold is evaluated and its single-epoch score is
        returned directly as a float.
    """
    print('Make Train Features.')
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # args.temporary_file at a trusted, locally produced cache.
    with open(args.temporary_file, 'rb') as f:
        (x_train, x_feat_train, y_train_o, y_aux_train,
         embedding_matrix) = pickle.load(f)

    def power_mean(series, p=-5):
        """Generalized (power) mean of ``series`` with exponent ``p``."""
        total = sum(np.power(series, p))
        return np.power(total / len(series), 1 / p)

    def sigmoid(x):
        """Element-wise logistic function."""
        return 1 / (1 + np.exp(-x))

    # all, sub, s&t, !s&t, s&!t, !s&!t
    weight_factor = list(map(float, args.weight_factor.split(',')))
    identity_factor_1 = list(map(float, args.identity_factor_1.split(',')))
    identity_factor_2 = list(map(float, args.identity_factor_2.split(',')))
    model_factor = list(map(int, args.model_factor.split(',')))
    print('weight_factor =', weight_factor)
    print('identity_factor_1 = ', identity_factor_1)
    print('identity_factor_2 = ', identity_factor_2)
    print('model_factor = ', model_factor)
    train = read_competision_file(train=True)
    identity_columns = [
        'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
        'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
    ]

    # Row-level boolean masks on the target. Both are computed explicitly
    # (rather than negating one) so NaN targets fall in neither mask.
    is_positive = (train['target'].values >= 0.5).astype(bool)
    is_negative = (train['target'].values < 0.5).astype(bool)

    # Per-identity masks used for the evaluation metric (subgroup/BPSN/BNSP)
    # and for the per-identity weight bumps (id1: positive & in group,
    # id2: negative & in group).
    index_subgroup, index_bpsn, index_bnsp = dict(), dict(), dict()
    index_id1, index_id2 = dict(), dict()
    for col in identity_columns:
        col_values = train[col].fillna(0).values
        in_group = (col_values >= 0.5).astype(bool)
        out_group = (col_values < 0.5).astype(bool)
        index_subgroup[col] = in_group
        index_bpsn[col] = (is_negative & in_group) | (is_positive & out_group)
        index_bnsp[col] = (is_positive & in_group) | (is_negative & out_group)
        index_id1[col] = is_positive & in_group
        index_id2[col] = is_negative & in_group

    identity_values = train[identity_columns].fillna(0).values
    # True where at least one identity column is >= 0.5.
    any_identity = (identity_values >= 0.5).any(axis=1)
    # True where at least one identity column is < 0.5.
    # NOTE(review): this is NOT the complement of ``any_identity`` (it is
    # true for almost every row); the "!s" labels in the comment above
    # suggest "no identity mentioned" may have been intended. Behaviour is
    # kept as-is because changing it alters the training weights - confirm.
    any_below = (identity_values < 0.5).any(axis=1)

    # Overall
    weights = np.ones((len(x_train), )) * weight_factor[0]
    # Subgroup
    weights += any_identity.astype(int) * weight_factor[1]
    weights += (is_positive & any_identity).astype(int) * weight_factor[2]
    weights += (is_positive & any_below).astype(int) * weight_factor[3]
    weights += (is_negative & any_identity).astype(int) * weight_factor[4]
    weights += (is_negative & any_below).astype(int) * weight_factor[5]
    for col, id1 in zip(identity_columns, identity_factor_1):
        weights[index_id1[col]] += id1
    for col, id2 in zip(identity_columns, identity_factor_2):
        weights[index_id2[col]] += id2

    # Rescales the main loss so its weights average to 1.
    loss_weight = 1.0 / weights.mean()

    aux_impact_factor = list(map(float, args.aux_impact_factor.split(',')))
    aux_identity_factor = list(map(float, args.aux_identity_factor.split(',')))
    print('aux_impact_factor =', aux_impact_factor)
    print('aux_identity_factor =', aux_identity_factor)

    # The masks overlap, so later assignments override earlier ones; the
    # order below is significant.
    weights_aux = np.ones((len(x_train), ))
    weights_aux[is_positive & any_below] = aux_identity_factor[0]
    weights_aux[is_positive & any_identity] = aux_identity_factor[1]
    weights_aux[is_negative & any_below] = aux_identity_factor[2]
    weights_aux[is_negative & any_identity] = aux_identity_factor[3]

    # Columns: 0 = target, 1 = main-loss weight, 2 = aux-loss weight.
    y_train = np.vstack([y_train_o, weights, weights_aux]).T

    del train

    def custom_loss_aux(data, targets):
        """Weighted BCE on the main target plus six weighted auxiliary BCEs.

        ``targets`` columns: 0 = target, 1 = main weight, 2 = aux weight,
        3..8 = auxiliary targets. ``data`` columns: 0 = main logit,
        1..6 = auxiliary logits.
        """
        total = nn.BCEWithLogitsLoss(weight=targets[:, 1:2])(
            data[:, :1], targets[:, :1]) * loss_weight
        aux_weight = targets[:, 2:3]
        for k in range(6):
            aux_loss = nn.BCEWithLogitsLoss(weight=aux_weight)(
                data[:, k + 1:k + 2], targets[:, k + 3:k + 4])
            total = total + aux_loss * aux_impact_factor[k]
        return total

    from sklearn.model_selection import KFold, train_test_split
    from sklearn.metrics import classification_report, roc_auc_score
    batch_size = args.batch_size
    lr = args.learning_ratio
    max_features = np.max(x_train)
    num_units = list(map(int, args.num_units.split(',')))
    # Main-target columns followed by the auxiliary targets (see
    # custom_loss_aux for the column layout).
    y_all = np.hstack([y_train, y_aux_train])
    # (print label, per-identity mask dict) for each family of the metric.
    metric_families = (
        ("Subgroup#", index_subgroup),
        ("BPSN#", index_bpsn),
        ("BNSP#", index_bnsp),
    )
    kf = KFold(n_splits=5, random_state=12, shuffle=True)
    final_epoch_score_cv = dict()
    for fold_id, (big_index, small_index) in enumerate(kf.split(y_train)):
        # args.minimize selects progressively smaller training subsets.
        if args.minimize == 1:
            train_index, test_index = train_test_split(np.arange(len(y_train)),
                                                       test_size=0.5,
                                                       random_state=1234,
                                                       shuffle=True)
        elif args.minimize == 2:
            train_index, test_index = train_test_split(np.arange(len(y_train)),
                                                       test_size=0.666,
                                                       random_state=1234,
                                                       shuffle=True)
        elif args.minimize == 3:
            train_index, test_index = big_index[:25600], small_index[:12800]
        else:
            train_index, test_index = big_index, small_index

        # When a model is going to be saved, train on every row.
        if len(args.model_file) > 0:
            train_index = np.arange(len(x_train))

        if args.use_feats_url:
            x_train_train = np.hstack(
                [x_feat_train[train_index], x_train[train_index]])
            x_train_test = np.hstack(
                [x_feat_train[test_index], x_train[test_index]])
            feats_nums = x_feat_train.shape[1]
        else:
            x_train_train = x_train[train_index]
            x_train_test = x_train[test_index]
            feats_nums = 0

        x_train_torch = torch.tensor(x_train_train, dtype=torch.long)
        x_test_torch = torch.tensor(x_train_test, dtype=torch.long)
        y_train_torch = torch.tensor(y_all[train_index], dtype=torch.float32)
        y_test_torch = torch.tensor(y_all[test_index], dtype=torch.float32)

        train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
        valid_dataset = data.TensorDataset(x_test_torch, y_test_torch)

        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False)

        databunch = DataBunch(train_dl=train_loader, valid_dl=valid_loader)

        checkpoint_predictions = []
        # Named distinctly from the sample-weight array ``weights`` above.
        checkpoint_epoch_weights = []
        seed_everything(args.random_seed + fold_id)
        model = get_model(model_factor, num_units[0], num_units[1],
                          embedding_matrix, max_features,
                          y_aux_train.shape[-1], args.num_words, feats_nums)
        model = model.cuda(device=cuda)
        if args.optimizer == 'Nadam':
            from NadamLocal import Nadam
            learn = Learner(databunch,
                            model,
                            loss_func=custom_loss_aux,
                            opt_func=Nadam)
        else:
            learn = Learner(databunch, model, loss_func=custom_loss_aux)
        test_loader = valid_loader
        n = len(learn.data.train_dl)
        # One phase per epoch; the learning rate decays by 0.6x each epoch.
        phases = [(TrainingPhase(n).schedule_hp('lr', lr * (0.6**(i))))
                  for i in range(args.num_epochs)]
        sched = GeneralScheduler(learn, phases)
        learn.callbacks.append(sched)
        final_epoch_score = 0
        for global_epoch in range(args.num_epochs):
            print("Fold#", fold_id, "epoch#", global_epoch)
            learn.fit(1)
            # With minimize >= 2 only the last epoch is evaluated.
            is_last_epoch = global_epoch == int(args.num_epochs - 1)
            if not (args.minimize < 2 or is_last_epoch):
                continue

            # 7 columns: one main output plus six auxiliary outputs.
            test_preds = np.zeros((len(test_index), 7))
            with torch.no_grad():
                for i, x_batch in enumerate(test_loader):
                    X = x_batch[0].cuda()
                    y_pred = sigmoid(learn.model(X).detach().cpu().numpy())
                    test_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

            prediction_one = test_preds[:, 0].flatten()
            checkpoint_predictions.append(prediction_one)

            # Later epochs count exponentially more in the ensemble.
            checkpoint_epoch_weights.append(2**global_epoch)
            predictions = np.average(checkpoint_predictions,
                                     weights=checkpoint_epoch_weights,
                                     axis=0)
            y_true = (y_train[test_index, 0]).reshape((-1, )).astype(int)

            # AUCs of the epoch-ensemble and of this epoch alone, keyed by
            # metric-family label.
            roc_ensemble = {label: [] for label, _ in metric_families}
            roc_single = {label: [] for label, _ in metric_families}
            for col in identity_columns:
                for label, index_by_col in metric_families:
                    mask = index_by_col[col][test_index]
                    if args.vervose:
                        print(label, col, ":")
                        print(
                            classification_report(
                                y_true[mask],
                                (predictions[mask] >= 0.5).astype(int)))
                    if args.minimize < 2:
                        roc_ensemble[label].append(
                            roc_auc_score(y_true[mask], predictions[mask]))
                    roc_single[label].append(
                        roc_auc_score(y_true[mask], prediction_one[mask]))

            if args.minimize < 2:
                roc_all = roc_auc_score(y_true, predictions)
                pm_roc_sub = power_mean(roc_ensemble["Subgroup#"])
                pm_roc_bpsn = power_mean(roc_ensemble["BPSN#"])
                pm_roc_bnsp = power_mean(roc_ensemble["BNSP#"])
                final_epoch_score = (roc_all + pm_roc_sub + pm_roc_bpsn +
                                     pm_roc_bnsp) / 4
            roc_all_one = roc_auc_score(y_true, prediction_one)
            pm_roc_sub_one = power_mean(roc_single["Subgroup#"])
            pm_roc_bpsn_one = power_mean(roc_single["BPSN#"])
            pm_roc_bnsp_one = power_mean(roc_single["BNSP#"])
            final_epoch_score_one = (roc_all_one + pm_roc_sub_one +
                                     pm_roc_bpsn_one + pm_roc_bnsp_one) / 4
            if args.minimize >= 2:
                return final_epoch_score_one
            if args.vervose:
                print("roc_sub:", pm_roc_sub)
                print("roc_bpsn:", pm_roc_bpsn)
                print("roc_bnsp:", pm_roc_bnsp)
                print("final score:", final_epoch_score)
            final_epoch_score_cv.setdefault(global_epoch, []).append(
                (final_epoch_score, final_epoch_score_one))
        if len(args.model_file) > 0:
            if args.model_file.endswith('.bz2'):
                model_file = args.model_file
            else:
                model_file = args.model_file + '.bz2'
            # Sidecar JSON with the hyper-parameters passed to get_model.
            model_json_file = model_file[:-4] + '.json'
            model.save_model(model_file)
            with open(model_json_file, 'w') as pf:
                pf.write('{')
                pf.write('\"model_factor\":[' +
                         ','.join(list(map(str, model_factor))) + ']')
                pf.write(',')
                pf.write('\"num_units\":[' +
                         ','.join(list(map(str, num_units))) + ']')
                pf.write(',')
                pf.write('\"num_aux_targets\":%d' % y_aux_train.shape[-1])
                pf.write(',')
                pf.write('\"feats_nums\":%d' % feats_nums)
                pf.write(',')
                pf.write('\"max_seq_len\":%d' % args.num_words)
                pf.write('}')
            break
        if args.minimize > 0:
            break
    return final_epoch_score_cv
Exemplo n.º 2
0


# Fit the LSTM learner; train_model returns the raw validation predictions.
LSTM_valid_raw_preds = train_model(learn, output_dim=num_targets, lr=1.0e-3)


# Test-set inference: keep both the per-class probabilities and the
# arg-max class label for every test row.
LSTM_pred_raw = torch.zeros(len(X_test), num_targets)
test_preds = np.zeros((len(X_test)))
learn.model.eval()

to_probabilities = nn.Softmax(dim=-1)
for i, x_batch in enumerate(test_loader):
    # Rows of the output arrays covered by this batch.
    batch_start = i * batch_size
    batch_stop = batch_start + batch_size
    X = x_batch[0].cuda()
    y_pred = to_probabilities(learn.model(X).detach())
    LSTM_pred_raw[batch_start:batch_stop] = y_pred
    predicted_labels = y_pred.argmax(dim=-1).cpu().numpy()
    test_preds[batch_start:batch_stop] = predicted_labels

# Persist the predicted class labels, one integer per line.
np.savetxt("LSTM_ytest.txt", test_preds.astype(int), fmt='%d')


# CNN model class
class CNN_Text(nn.Module):

    def __init__(self, max_features=28, e_char = 50,  kernel_sizes = [3,4,5,6, 10], num_filters =64, dropout_rate = 0.2, num_targets = 12):

        super(CNN_Text, self).__init__()