# Imports assumed by these snippets (not shown in the original listing):
import pickle

import numpy as np
import optuna
import pandas
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

# stanfordnlp_model, bert_estimator, gpt2_estimator, calculate_rate,
# calcurate_likelihood and calculate_syntax_likelihood are project-local
# helpers referenced throughout; they are not defined in this listing.


def train(use_preprocessdata=True):
    df = pandas.read_csv('dataset/gap-test.tsv', sep='\t')
    X, Y = _preprocess_data(df, use_preprocessdata=use_preprocessdata, save_path='preprocess_traindata.pkl')
    Y_labels = stanfordnlp_model._get_classify_labels(df)
    validation_df = pandas.read_csv('dataset/gap-validation.tsv', sep='\t')
    validation_X, validation_Y = _preprocess_data(validation_df, use_preprocessdata=use_preprocessdata, save_path='preprocess_valdata.pkl')
    validation_Y_labels = stanfordnlp_model._get_classify_labels(validation_df)

    def objective(trial):
        eta = trial.suggest_loguniform('eta', 0.001, 0.1)
        max_depth = trial.suggest_int('max_depth', 3, 25)
        gamma = trial.suggest_loguniform('gamma', 0.05, 1.0)
        min_child_weight = trial.suggest_int('min_child_weight', 1, 7)
        subsample = trial.suggest_discrete_uniform('subsample', 0.6, 1.0, 0.1)
        colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1.0, 0.1)
        model = xgb.XGBClassifier(
            max_depth=max_depth,
            eta=eta,
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            n_jobs=1,
            random_state=0)

        def _log_loss(y_pred, y):
            """Custom XGBoost eval metric: log loss after calculate_rate.

            Note: the ``y`` argument is ignored; the metric is computed
            against the closed-over ``validation_Y_labels``.
            """
            y_pred = calculate_rate(y_pred)
            return 'logloss', log_loss(validation_Y_labels, y_pred)

        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation_0-logloss')
        model.fit(
            X,
            Y.flatten(),
            eval_set=[(validation_X, validation_Y.flatten())],
            eval_metric=_log_loss,
            callbacks=[pruning_callback],
            verbose=False)
        return log_loss(validation_Y_labels, calculate_rate(model.predict_proba(validation_X)))

    study = optuna.create_study(
        study_name='gap-conf-kaggle',
        pruner=optuna.pruners.MedianPruner(),
        sampler=optuna.samplers.TPESampler(seed=0))
    study.optimize(objective, n_trials=100, n_jobs=-1)
    print("Best Params", study.best_params)
    print("Best Validation Value", study.best_value)

    model = xgb.XGBClassifier(n_jobs=-1, random_state=0, **study.best_params)
    model.fit(
        np.concatenate([X, validation_X]),
        np.concatenate([Y, validation_Y]).flatten())
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
    y_pred = calculate_rate(model.predict_proba(X))
    print("Train Accuracy:", accuracy_score(Y_labels, np.argmax(y_pred, axis=1)))
Example #2
def evaluate(test_data, use_preprocessdata=True):
    gpt2_estimator.build()
    # NOTE: this version scores against the validation split; the commented
    # lines below switch it back to ``test_data``. As written, the returned
    # DataFrame pairs validation predictions with ``test_data['ID']``.
    validation_df = pandas.read_csv('dataset/gap-validation.tsv', sep='\t')
    data = stanfordnlp_model._load_data(validation_df, use_preprocessdata, 'preprocess_valdata.pkl')
    Y = stanfordnlp_model._get_classify_labels(validation_df)
    predicts = np.ndarray([len(validation_df), 3], dtype=np.float32)
    # data = stanfordnlp_model._load_data(test_data, use_preprocessdata, 'preprocess_testdata.pkl')
    # Y = stanfordnlp_model._get_classify_labels(test_data)
    # predicts = np.ndarray([len(test_data), 3], dtype=np.float32)
    for i, (words, indexes) in enumerate(data):
        # predicts[i] = calculate_syntax_likelihood(words, indexes)
        predicts[i] = calcurate_likelihood(words, indexes)
        if np.argmax(predicts[i]) == 2:
            predicts[i] = calculate_syntax_likelihood(words, indexes)
            # predicts[i] = calcurate_likelihood(words, indexes, Y[i])

    print("A predict", sum(np.argmax(predicts, axis=1) == 0))
    print("B predict", sum(np.argmax(predicts, axis=1) == 1))
    print("Non predict", sum(np.argmax(predicts, axis=1) == 2))
    print("Test Accuracy:", accuracy_score(Y, np.argmax(predicts, axis=1)))
    print("Confusion Matrix:\n", confusion_matrix(Y, np.argmax(predicts, axis=1)))

    non_neithers = ((Y.flatten() != 2) & (np.argmax(predicts, axis=1) != 2))
    print("Non Neithers Counts", sum(non_neithers))
    print("Non Neithers Test Accuracy:", accuracy_score(Y[non_neithers], np.argmax(predicts[non_neithers], axis=1)))

    corrects = (Y.flatten() == np.argmax(predicts, axis=1))
    print("Correct loss", log_loss(Y[corrects], predicts[corrects]))
    print("Loss", log_loss(Y, predicts))

    out_df = pandas.DataFrame(data=predicts, columns=['A', 'B', 'NEITHER'])
    out_df['ID'] = test_data['ID']
    return out_df
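# The frame returned above matches the Kaggle GAP submission layout
# (ID, A, B, NEITHER). A sketch of writing it out; the path below is reused
# from the train() snippet and is an assumption, not part of the original:
if __name__ == '__main__':
    test_df = pandas.read_csv('dataset/gap-test.tsv', sep='\t')
    submission = evaluate(test_df)
    submission[['ID', 'A', 'B', 'NEITHER']].to_csv('submission.csv', index=False)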
Example #3
def _preprocess_data(df, use_preprocessdata=False, save_path=None):
    """Preprocess task speccific pipeline.
    Args:
        df (DataFrame): target pandas DataFrame object.
        use_preprocessdata (bool): Wheter or not to use local preprocess file loading
        save_path (str): local preprocess file path
    Return:
        X (array): explanatory variables in task. shape is (n_sumples, n_features)
        Y (array): objective variables in task. shape is (n_sumples, 1)
    """
    data = stanfordnlp_model._load_data(df, use_preprocessdata, save_path)
    X = []
    X2 = []
    for i, (words, indexes) in enumerate(data):
        X.append(
            stanfordnlp_model._vectorise_bag_of_pos_with_position(
                words,
                indexes,
                stanfordnlp_model.DEFAULT_WINDOW_SIZE,
                targets=[df['Pronoun'][i], df['A'][i], df['B'][i]]))
        X2.append(
            stanfordnlp_model._vectorise_bag_of_pos_with_dependency(
                words, indexes))
    X5 = bert_estimator.embed_by_bert(df)
    X5 = np.array(X5)
    X = np.concatenate([X, X2, X5], axis=-1)
    Y = stanfordnlp_model._get_classify_labels(df)
    return X, Y
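# The _vectorise_bag_of_pos_* helpers live in the project's stanfordnlp_model
# module and are not shown in this listing. Purely as an illustration of the
# bag-of-POS idea (the function name and the tag set below are invented):
POS_TAGS = ['NOUN', 'VERB', 'PRON', 'ADJ', 'ADP', 'DET']  # toy tag set

def toy_bag_of_pos(pos_sequence, index, window_size):
    """Count POS tags in a window around ``index`` (illustration only)."""
    lo = max(0, index - window_size)
    hi = min(len(pos_sequence), index + window_size + 1)
    counts = [0] * len(POS_TAGS)
    for tag in pos_sequence[lo:hi]:
        if tag in POS_TAGS:
            counts[POS_TAGS.index(tag)] += 1
    return counts

# toy_bag_of_pos(['DET', 'NOUN', 'VERB', 'PRON'], index=3, window_size=2)
# returns [1, 1, 1, 0, 0, 0]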
def evaluate(test_data, use_preprocessdata=True):
    train()
    X, Y = _preprocess_data(test_data, use_preprocessdata=use_preprocessdata, save_path='preprocess_testdata.pkl')
    Y_labels = stanfordnlp_model._get_classify_labels(test_data)
    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)
    pred = model.predict_proba(X)
    y_pred = calculate_rate(pred)
    print("Test Accuracy:", accuracy_score(Y_labels, np.argmax(y_pred, axis=1)))
    a = (Y_labels.flatten()[:20] != np.argmax(y_pred[:20], axis=1))  # misclassified among the first 20 examples
    for label, name in ((0, 'A'), (1, 'B'), (2, 'Pronoun')):
        mask = (Y_labels.flatten() == label)
        errors = (Y_labels[mask].flatten() != np.argmax(y_pred[mask], axis=1))
        print("Error %s count" % name, mask.sum(), errors.sum())
    print("Error Case", Y_labels[:20][a])
    print("Error Case Label", np.argmax(y_pred[:20][a], axis=1))
    print("Error Case Rate", y_pred[:20][a])
    print("A predictions", pred[:20][a])
    print("B predictions", pred[int(len(pred)/2):int(len(pred)/2)+20][a])

    data = stanfordnlp_model._load_data(test_data, True, 'preprocess_testdata.pkl')
    for i, (words, indexes) in enumerate(data):
        if i in np.where(a)[0]:
            print("Index", i)
            print("Pronoun position", stanfordnlp_model._get_bag_of_pos_with_position(words, indexes[0], stanfordnlp_model.DEFAULT_WINDOW_SIZE, target_len=len(test_data['Pronoun'][i].split())))
            print("A position", stanfordnlp_model._get_bag_of_pos_with_position(words, indexes[1], stanfordnlp_model.DEFAULT_WINDOW_SIZE, target_len=len(test_data['A'][i].split())))
            print("B position", stanfordnlp_model._get_bag_of_pos_with_position(words, indexes[2], stanfordnlp_model.DEFAULT_WINDOW_SIZE, target_len=len(test_data['B'][i].split())))
            print("Pronoun dependency", stanfordnlp_model._get_bag_of_pos_with_dependency(words, indexes[0]))
            print("A dependency", stanfordnlp_model._get_bag_of_pos_with_dependency(words, indexes[1]))
            print("B dependency", stanfordnlp_model._get_bag_of_pos_with_dependency(words, indexes[2]))

    predicts = calculate_rate(model.predict_proba(X))
    out_df = pandas.DataFrame(data=predicts, columns=['A', 'B', 'NEITHER'])
    out_df['ID'] = test_data['ID']
    return out_df
Example #5
def _preprocess_data(df, use_preprocessdata=False, save_path=None):
    """Preprocess task speccific pipeline.
    Args:
        df (DataFrame): target pandas DataFrame object.
        use_preprocessdata (bool): Wheter or not to use local preprocess file loading
        save_path (str): local preprocess file path
    Return:
        X (array): explanatory variables in task. shape is (n_sumples, n_features)
        Y (array): objective variables in task. shape is (n_sumples, 1)
    """
    data = stanfordnlp_model._load_data(df, use_preprocessdata, save_path)
    X = []

    texts = [df['Text'][i] for i in range(len(df))]
    words = [[words[j].text.replace('`', '') for j in indexes] for words, indexes in data]
    indexes = [indexes for words, indexes in data]
    X = bert_estimator._get_token_attentions(texts, words, indexes)
    X = np.array(X)
    print(X.shape)
    Y = stanfordnlp_model._get_classify_labels(df)
    return X, Y
def test_get_classify_labels():
    data = pandas.DataFrame(
        data=[[True, False], [False, True], [False, False]],
        columns=['A-coref', 'B-coref'])
    labels = stanfordnlp_model._get_classify_labels(data)
    np.testing.assert_array_equal([[0], [1], [2]], labels)
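# The test above pins down the contract of
# stanfordnlp_model._get_classify_labels: label 0 for A-coref, 1 for B-coref,
# 2 for neither, returned with shape (n, 1). A minimal sketch consistent with
# that contract (the project's actual implementation is not shown here):
def get_classify_labels_sketch(df):
    """Hypothetical: map A-coref/B-coref flags to column-vector labels."""
    labels = np.full((len(df), 1), 2, dtype=np.int64)  # default: neither (2)
    labels[df['A-coref'].values] = 0  # rows where A is the antecedent
    labels[df['B-coref'].values] = 1  # rows where B is the antecedent
    return labels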