Example #1
              file=results_file)
        print("Support: %d" % (support if support is not None else -1),
              file=results_file)

    report = classification_report(y_true, y_pred)
    with directory.joinpath("classification_report").open(
            mode='w') as report_file:
        print(report, file=report_file)

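    # Save the raw confusion matrix as whitespace-separated integer counts.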
    cm = confusion_matrix(y_true, y_pred)
    with directory.joinpath("confusion_matrix").open(mode='w') as cm_file:
        np.savetxt(cm_file, cm, fmt="%d")


if __name__ == "__main__":
    data = read_dataset_splits(reader=data_readers.read_question_only_data,
                               splits=["tiny", "train", "dev"])
    X_train, y_train = prepare_data(data.train)
    X_dev, y_dev = prepare_data(data.dev)

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,
                          oov_token="<UNK>",
                          split=' ',
                          lower=True)
    tokenizer.fit_on_texts(X_train)

    embeddings = getFastTextEmbeddings(tokenizer.word_index)

    model = simpleRNN(embeddings, hidden_dim=200)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_dev = tokenizer.texts_to_sequences(X_dev)
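
The snippet stops after converting the questions to integer sequences. A minimal hedged continuation, assuming the variables above and a Keras-style model returned by simpleRNN, would pad the sequences and fit the model; MAX_SEQUENCE_LENGTH, batch size, and epoch count below are illustrative assumptions, not values from the original script.

# Hedged continuation sketch (assumed settings, not from the original script).
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 100  # assumption: fixed cap on tokens per question

X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_dev = pad_sequences(X_dev, maxlen=MAX_SEQUENCE_LENGTH)

model.fit(X_train, y_train,
          validation_data=(X_dev, y_dev),
          batch_size=64,
          epochs=10)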
Example #2
from model_utils import (get_response_time_label, add_cosine_similarity,
                         add_question_length, add_jensen_shannon)
from scipy.stats import pearsonr

random.seed(Config.SEED)


def calc_correlation(data, col_name):
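    # pearsonr returns a (correlation coefficient, two-sided p-value) pair.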
    return pearsonr(data.train['response_time_sec'], data.train[col_name])


if __name__ == '__main__':
    results = {}
    data = read_dataset_splits(
        reader=data_readers.read_question_and_context_data,
        window_size=10,
        include_question_text=True,
        include_context_text=True,
        include_context_speaker=False,
        include_context_times=False)
    data = add_jensen_shannon(data)
    results['question_and_js'] = calc_correlation(data, 'jensen_shannon')

    df = read_corpus(split='train')
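    # Flatten the per-row token lists into a single list of training-set words.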
    all_words = [item for sublist in df.text for item in sublist]
    for N_words in [25, 50, 100]:
        top_words = [
            item[0] for item in Counter(all_words).most_common(N_words)
        ]
        data = read_dataset_splits(
            reader=data_readers.read_question_and_context_data,
            window_size=10,
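
The module also imports add_question_length and add_cosine_similarity; a minimal sketch of running the same correlation check with those helpers might look like the following (the feature column names are assumptions, not confirmed by the snippet).

# Hedged sketch: apply calc_correlation to the other imported feature helpers.
# The 'question_length' and 'cosine_similarity' column names are assumptions.
data = add_question_length(data)
results['question_length'] = calc_correlation(data, 'question_length')

data = add_cosine_similarity(data)
results['cosine_similarity'] = calc_correlation(data, 'cosine_similarity')

for name, (r, p) in results.items():
    print("%s: r=%.3f (p=%.3g)" % (name, r, p))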
Example #3
                     ('tfidf', TfidfTransformer()),
                     ('clf', DummyClassifier(random_state=SEED))])

    pipe.fit(train['question'], train['question_class'])
    preds = pipe.predict(dev['question'])
    print("Dummy Classifier: ")
    report = classification_report(dev['question_class'], preds)
    print(report)
    cm = confusion_matrix(dev['question_class'], preds)
    plot_cm(cm, title="Dummy Classifier")

    return models


if __name__ == '__main__':
    data = read_dataset_splits(reader=read_question_only_data)
    data = add_classes(data)
    models = run_baselines(data)

    train, dev, test = data['train'], data['dev'], data['test']
    dev_new = copy.deepcopy(dev)
    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=dummy_tokenizer, lowercase=False)),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(class_weight='balanced', random_state=SEED))
    ])

    # Generate results for Logistic Regression and output them to a CSV file:
    # true class, predicted class, and class probabilities.
#    g = models['Logistic Regression']
#    pipe.set_params(**g)
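
A hedged sketch of the CSV export described in the comments above, assuming dev_new is a pandas DataFrame and that any tuned parameters would be set on pipe first; the output filename and probability column names are illustrative, not from the original script.

# Hedged sketch of the CSV export: true class, predicted class, and per-class
# probabilities on the dev split. Filename and column names are assumptions.
pipe.fit(train['question'], train['question_class'])
probs = pipe.predict_proba(dev['question'])

# dev_new already carries the true 'question_class' column.
dev_new['predicted_class'] = pipe.predict(dev['question'])
for idx, cls in enumerate(pipe.classes_):
    dev_new['prob_%s' % cls] = probs[:, idx]

dev_new.to_csv('logistic_regression_dev_results.csv', index=False)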
Example #4
    report = classification_report(y_true, y_pred)
    with directory.joinpath("classification_report").open(
            mode='w') as report_file:
        print(report, file=report_file)

    cm = confusion_matrix(y_true, y_pred)
    with directory.joinpath("confusion_matrix").open(mode='w') as cm_file:
        np.savetxt(cm_file, cm, fmt="%d")


if __name__ == "__main__":
    data = read_dataset_splits(
        reader=data_readers.read_question_and_context_data,
        splits=["tiny"],
        window_size=WINDOW_SIZE,
        include_question_text=True,
        include_context_text=True,
        include_context_speaker=True,
        include_context_times=False)
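    # Only the tiny split is loaded here, so it stands in for both train and dev.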
    X_train, y_train = prepare_data(data.tiny)
    X_dev, y_dev = prepare_data(data.tiny)

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,
                          oov_token="<UNK>",
                          split=' ',
                          lower=True)
    tokenizer.fit_on_texts(X_train)

    embeddings = getFastTextEmbeddings(tokenizer.word_index)

    model = simpleRNN(embeddings, hidden_dim=200)
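
The snippet builds the model but stops before training, while the report-saving code at the top expects y_true and y_pred arrays. A hedged sketch of how those might be produced follows; the sequence length and training settings are assumptions, not values from the original script.

# Hedged sketch: train briefly and derive the y_true / y_pred pair used by the
# report-saving code above. MAX_SEQUENCE_LENGTH and epochs are assumptions.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 200  # assumption

X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),
                        maxlen=MAX_SEQUENCE_LENGTH)
X_dev = pad_sequences(tokenizer.texts_to_sequences(X_dev),
                      maxlen=MAX_SEQUENCE_LENGTH)

model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=5)

y_pred = np.argmax(model.predict(X_dev), axis=1)
y_true = y_dev  # assumes prepare_data already returns integer class labels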