Example #1
def get_or_create_features_file(dataset_feature_path,
                                datasets,
                                max_vec,
                                shuffle=True):
    # Each dataset TSV row has the format:
    # expression <TAB> paraphrase <TAB> [valid/invalid]
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            X, Y, T = pickle.load(f)

    else:
        dataset = read_paraphrased_tsv_files(datasets, processor=remove_marks)
        X, Y, T = to_vector(dataset, max_vec, save=dataset_feature_path)

    if shuffle:
        z = list(zip(X, Y, T))
        random.shuffle(z)
        X, Y, T = zip(*z)

    X = np.array(X)
    Y = np.array(Y)
    T = np.array(T)
    X = np.transpose(X, (0, 2, 1, 3))
    print("Input :", X.shape)
    print("Output :", Y.shape)
    return X, Y, T
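
A minimal usage sketch of the helper above. The paths and the max_vec value are hypothetical stand-ins; the real values come from the project's training scripts.

X, Y, T = get_or_create_features_file(
    "features/crowdsourced.pkl",          # pickle cache, created on first run
    "../paraphrasing-data/crowdsourced",  # directory of paraphrase TSV files
    max_vec=25,                           # assumed sequence-length cap
    shuffle=True)

# The data is already shuffled, so a plain tail split gives a held-out set.
split = int(0.9 * len(X))
X_train, X_test = X[:split], X[split:]
Y_train, Y_test = Y[:split], Y[split:]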
Example #2
def get_or_create_features_file(dataset_feature_path,
                                datasets_path,
                                max_length=25,
                                shuffle=True):
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T = pickle.load(f)
    else:
        dataset = read_paraphrased_tsv_files(datasets_path,
                                             processor=remove_marks)
        Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T = to_vector(
            dataset, max_length, save=dataset_feature_path)

    if shuffle:
        z = list(zip(Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T))
        random.shuffle(z)
        Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T = zip(*z)

    Xp = np.array(Xp)
    Xf1 = np.array(Xf1)
    Xf2 = np.array(Xf2)
    Xf3 = np.array(Xf3)
    Xs1 = np.array(Xs1)
    Xs2 = np.array(Xs2)
    Xs3 = np.array(Xs3)
    Y = np.array(Y)
    T = np.array(T)

    print("Input :", Xp.shape, Xs1.shape, Xf1.shape)
    print("Output :", Y.shape)
    return Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T
Example #3
File: hybrid.py  Project: mysilver/PhD
def get_or_create_features_file(dataset_feature_path,
                                datasets_path,
                                max_length=25,
                                shuffle=True):
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            X, X1, X2, Y, T = pickle.load(f)
    else:
        # `msrp` is a module-level flag in the source file; when true, the
        # MSRP paraphrase corpus is loaded instead of the crowdsourced TSVs.
        if not msrp:
            dataset = read_paraphrased_tsv_files(datasets_path,
                                                 processor=remove_marks)
        else:
            dataset = msrp_dataset(datasets_path, processor=remove_marks)
        X, X1, X2, Y, T = to_vector(dataset,
                                    max_length,
                                    save=dataset_feature_path)

    if shuffle:
        z = list(zip(X, X1, X2, Y, T))
        random.shuffle(z)
        X, X1, X2, Y, T = zip(*z)

    X = np.array(X)
    X1 = np.array(X1)
    X2 = np.array(X2)
    Y = np.array(Y)
    T = np.array(T)

    print("Input :", X1.shape, X2.shape)
    print("Output :", Y.shape)
    return X, X1, X2, Y, T
Example #4
def get_or_create_features_file(dataset_feature_path,
                                datasets,
                                max_vec,
                                shuffle=True):
    # Each dataset TSV row has the format:
    # expression <TAB> paraphrase <TAB> [valid/invalid]
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            X1, X2, X3, X4, Y, T = pickle.load(f)
    else:
        dataset = read_paraphrased_tsv_files(datasets,
                                             processor=remove_marks,
                                             by_user=True)
        X1, X2, X3, X4, Y, T = to_vector(dataset,
                                         max_vec,
                                         save=dataset_feature_path)

    if shuffle:
        z = list(zip(X1, X2, X3, X4, Y, T))
        random.shuffle(z)
        X1, X2, X3, X4, Y, T = zip(*z)

    X1 = np.array(X1)
    X2 = np.array(X2)
    X3 = np.array(X3)
    X4 = np.array(X4)
    Y = np.array(Y)
    T = np.array(T)

    print("Input : 4 * ", X1.shape)
    print("Output :", Y.shape)
    return X1, X2, X3, X4, Y, T
Example #5
def independent_paraphrases(datasets_path):
    datasets = read_paraphrased_tsv_files(datasets_path, processor=lambda x: x)
    attrs = []
    labels = set()
    for expr in datasets:
        paraphrases = datasets[expr]
        for i, para in enumerate(paraphrases):
            text = para[0]
            label = para[1]
            labels.add(label)
            # Paraphrases apparently come in triples per worker (compare the
            # by_user=True variant in Example #6), so i % 3 encodes the
            # position of a paraphrase within its triple.
            f = ff.extract(expr, text, i % 3)
            f.append(label)
            attrs.append(f)

    return attrs, labels
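
A hedged sketch of consuming the output: each row in attrs is a feature vector with its label appended last, so it maps directly onto a scikit-learn training set. The classifier choice is illustrative only, and the sketch assumes ff.extract yields numeric feature values.

from sklearn.ensemble import RandomForestClassifier

attrs, labels = independent_paraphrases("../paraphrasing-data/crowdsourced")
X = [row[:-1] for row in attrs]  # feature values
y = [row[-1] for row in attrs]   # the label appended by f.append(label)
clf = RandomForestClassifier().fit(X, y)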
Example #6
def dependent_paraphrases(datasets_path):
    datasets = read_paraphrased_tsv_files(datasets_path,
                                          processor=remove_marks,
                                          by_user=True)
    datasets = normalize(datasets)
    attrs = []
    labels = set()
    for index, expr in enumerate(datasets):
        paraphrases = datasets[expr]
        for i, para3 in enumerate(paraphrases):
            text_1 = para3[0][0]
            label_1 = para3[0][1]

            text_2 = para3[1][0]
            label_2 = para3[1][1]

            text_3 = para3[2][0]
            label_3 = para3[2][1]

            labels.add(label_1)
            labels.add(label_2)
            labels.add(label_3)

            f = ff.extract(expr, text_1, 1)
            f.extend(pff.extract(text_1, text_2, 0))
            f.extend(pff.extract(text_1, text_3, 0))
            f.append(label_1)
            attrs.append(f)

            f = ff.extract(expr, text_2, 2)
            f.extend(pff.extract(text_2, text_1, 0))
            f.extend(pff.extract(text_2, text_3, 0))
            f.append(label_2)
            attrs.append(f)

            f = ff.extract(expr, text_3, 3)
            f.extend(pff.extract(text_3, text_1, 0))
            f.extend(pff.extract(text_3, text_2, 0))
            f.append(label_3)
            attrs.append(f)
        print("Processed Expression", int(100 * (index + 1) / len(datasets)),
              "%")

    return attrs, labels
Example #7
        2   0   0   1   0   0   0  11\
        3   0   0   1   0   0   0  23\
        4   0   0   0   4   3   0  24\
        4   0   0   0   1   0   0  25\
        4   0   0   0   1   0   1  50\
        34   0   0   2   5  10  11 416"

        confusion_matrix = [
            int(a) for a in list(filter(None, confusion_matrix.split(" ")))
        ]
        print(confusion_matrix_interpretation(confusion_matrix))

    if dataset_statistics:
        datasets = "../../paraphrasing-data/crowdsourced"
        dataset = read_paraphrased_tsv_files(datasets,
                                             processor=remove_marks,
                                             by_user=True)

        valid_1st = 0
        valid_2nd = 0
        valid_3rd = 0
        all3_valid = 0
        all3_invalid = 0
        counter = 0
        for i, expression in enumerate(dataset):
            for instance in dataset[expression]:
                counter += 1
                paraphrase_1 = instance[0][1] == 'valid'
                paraphrase_2 = instance[1][1] == 'valid'
                paraphrase_3 = instance[2][1] == 'valid'
Example #8
load()


def ginger_error_count(text):
    text = remove_marks(text)
    if text in ginger_error_map:
        return ginger_error_map[text]

    return 0, None
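
# Hedged usage note: the (0, None) fallback suggests ginger_error_map holds
# (error_count, details) tuples keyed by the normalized text, e.g.:
#   count, details = ginger_error_count("how do i reset my password")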


if __name__ == "__main__":

    datasets_path = "../paraphrasing-data/crowdsourced"
    datasets = read_paraphrased_tsv_files(datasets_path, by_user=False)

    correction_map = {}
    with open(ginger_file, "wt") as f:
        for index, expr in enumerate(datasets):
            paraphrases = datasets[expr]

            corrected, is_question = correct(expr,
                                             remove_case=True,
                                             sleep=True)
            if not corrected:
                corrected = {}
            else:
                print(expr.strip(), "==>", corrected)

            correction_map[expr] = corrected
Example #9
from flask import Flask, render_template, request, redirect, url_for

from utils.dataset import read_paraphrased_tsv_files

app = Flask(__name__)

datasets_path = "../../paraphrasing-data/merged_datasets"
datasets = read_paraphrased_tsv_files(datasets_path, by_user=False, processor=str.strip)
attrs = {}
attrs['index'] = 0
expressions = list(datasets.keys())
expressions.sort()
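
# Hedged note on running the annotation UI: this snippet shows no entry
# point, so one would typically either use `flask run` or add the usual
# guard (assumed here, not shown in the source):
#   if __name__ == '__main__':
#       app.run(debug=True)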


@app.route('/', methods=['GET'])
def index():
    expr = expressions[attrs['index']]
    return render_template("annotate.html", expressions=expressions, datasets=datasets, expression=expr)


@app.route('/statistics', methods=['GET'])
def statistics():
    counter = {"valid": 0,
               "divergence": 0,
               "spelling": 0,
               "grammar": 0,
               "cheating": 0,
               "misuse": 0,
               "translate": 0,
               "answer": 0}
Example #10
File: analysis.py  Project: mysilver/PhD
from collections import Counter

from utils.dataset import read_paraphrased_tsv_files
from utils.preprocess import remove_marks, tokenize, pos_tag
from nltk.corpus import stopwords
from nltk import ngrams

stopwords = set(stopwords.words('english'))  # rebind the module name to the English stopword set

datasets = "../paraphrasing-data/crowdsourced"
dataset = read_paraphrased_tsv_files(datasets, processor=remove_marks)


def top_words():
    # Count paraphrase tokens that appear in neither the source expression
    # nor the stopword list, then print the 50 most common.
    dictionary = Counter()
    for i, expression in enumerate(dataset):
        expr = set(tokenize(expression))
        for instance in dataset[expression]:
            paraphrase = instance[0]
            dictionary.update([
                t for t in tokenize(paraphrase)
                if t not in expr and t not in stopwords
            ])

    for token in dictionary.most_common(50):
        print(token)


def top_ngrams(ngram):
    dictionary = Counter()
    for i, expression in enumerate(dataset):