def get_or_create_features_file(dataset_feature_path, datasets, max_vec, shuffle=True):
    # Read the file in the following format:
    # expression   paraphrase   [valid/invalid]
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            X, Y, T = pickle.load(f)
    else:
        dataset = read_paraphrased_tsv_files(datasets, processor=remove_marks)
        X, Y, T = to_vector(dataset, max_vec, save=dataset_feature_path)
    if shuffle:
        z = list(zip(X, Y, T))
        random.shuffle(z)
        X, Y, T = zip(*z)
    X = np.array(X)
    Y = np.array(Y)
    T = np.array(T)
    X = np.transpose(X, (0, 2, 1, 3))
    print("Input :", X.shape)
    print("Output :", Y.shape)
    return X, Y, T
def get_or_create_features_file(dataset_feature_path, datasets_path, max_length=25, shuffle=True):
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T = pickle.load(f)
    else:
        dataset = read_paraphrased_tsv_files(datasets_path, processor=remove_marks)
        Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T = to_vector(
            dataset, max_length, save=dataset_feature_path)
    if shuffle:
        z = list(zip(Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T))
        random.shuffle(z)
        Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T = zip(*z)
    Xp = np.array(Xp)
    Xf1 = np.array(Xf1)
    Xf2 = np.array(Xf2)
    Xf3 = np.array(Xf3)
    Xs1 = np.array(Xs1)
    Xs2 = np.array(Xs2)
    Xs3 = np.array(Xs3)
    Y = np.array(Y)
    T = np.array(T)
    print("Input :", Xp.shape, Xs1.shape, Xf1.shape)
    print("Output :", Y.shape)
    return Xp, Xf1, Xf2, Xf3, Xs1, Xs2, Xs3, Y, T
def get_or_create_features_file(dataset_feature_path, datasets_path, max_length=25, shuffle=True, msrp=False):
    # msrp: when True, load the MSRP corpus via msrp_dataset() instead of the
    # crowdsourced TSV files.
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            X, X1, X2, Y, T = pickle.load(f)
    else:
        if not msrp:
            dataset = read_paraphrased_tsv_files(datasets_path, processor=remove_marks)
        else:
            dataset = msrp_dataset(datasets_path, processor=remove_marks)
        X, X1, X2, Y, T = to_vector(dataset, max_length, save=dataset_feature_path)
    if shuffle:
        z = list(zip(X, X1, X2, Y, T))
        random.shuffle(z)
        X, X1, X2, Y, T = zip(*z)
    X = np.array(X)
    X1 = np.array(X1)
    X2 = np.array(X2)
    Y = np.array(Y)
    T = np.array(T)
    print("Input :", X1.shape, X2.shape)
    print("Output :", Y.shape)
    return X, X1, X2, Y, T
def get_or_create_features_file(dataset_feature_path, datasets, max_vec, shuffle=True):
    # Read the file in the following format:
    # expression   paraphrase   [valid/invalid]
    if os.path.isfile(dataset_feature_path):
        with open(dataset_feature_path, 'rb') as f:
            X1, X2, X3, X4, Y, T = pickle.load(f)
    else:
        dataset = read_paraphrased_tsv_files(datasets, processor=remove_marks, by_user=True)
        X1, X2, X3, X4, Y, T = to_vector(dataset, max_vec, save=dataset_feature_path)
    if shuffle:
        z = list(zip(X1, X2, X3, X4, Y, T))
        random.shuffle(z)
        X1, X2, X3, X4, Y, T = zip(*z)
    X1 = np.array(X1)
    X2 = np.array(X2)
    X3 = np.array(X3)
    X4 = np.array(X4)
    Y = np.array(Y)
    T = np.array(T)
    print("Input : 4 * ", X1.shape)
    print("Output :", Y.shape)
    return X1, X2, X3, X4, Y, T
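
# Hedged usage sketch (illustration only): the pickle path, dataset directory,
# and max_vec value below are placeholders, not names taken from the project.
def _example_load_split(feature_path="features.pkl",
                        datasets="../paraphrasing-data/crowdsourced",
                        max_vec=25):
    # Build (or load) the cached feature arrays, then take a simple 80/20 split.
    X1, X2, X3, X4, Y, T = get_or_create_features_file(feature_path, datasets, max_vec)
    split = int(0.8 * len(Y))
    train = (X1[:split], X2[:split], X3[:split], X4[:split], Y[:split])
    test = (X1[split:], X2[split:], X3[split:], X4[split:], Y[split:])
    return train, test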
def independent_paraphrases(datasets_path):
    datasets = read_paraphrased_tsv_files(datasets_path, processor=lambda x: x)
    attrs = []
    labels = set()
    for expr in datasets:
        paraphrases = datasets[expr]
        for i, para in enumerate(paraphrases):
            text = para[0]
            label = para[1]
            labels.add(label)
            f = ff.extract(expr, text, i % 3)
            f.append(label)
            attrs.append(f)
    return attrs, labels
def dependent_paraphrases(datasets_path):
    # Each instance holds one user's three paraphrases; every paraphrase yields
    # one feature row combining expression features (ff) and pairwise features
    # against the other two paraphrases (pff), with its label appended last.
    datasets = read_paraphrased_tsv_files(datasets_path, processor=remove_marks, by_user=True)
    datasets = normalize(datasets)
    attrs = []
    labels = set()
    for index, expr in enumerate(datasets):
        paraphrases = datasets[expr]
        for i, para3 in enumerate(paraphrases):
            text_1 = para3[0][0]
            label_1 = para3[0][1]
            text_2 = para3[1][0]
            label_2 = para3[1][1]
            text_3 = para3[2][0]
            label_3 = para3[2][1]
            labels.add(label_1)
            labels.add(label_2)
            labels.add(label_3)
            f = ff.extract(expr, text_1, 1)
            f.extend(pff.extract(text_1, text_2, 0))
            f.extend(pff.extract(text_1, text_3, 0))
            f.append(label_1)
            attrs.append(f)
            f = ff.extract(expr, text_2, 2)
            f.extend(pff.extract(text_2, text_1, 0))
            f.extend(pff.extract(text_2, text_3, 0))
            f.append(label_2)
            attrs.append(f)
            f = ff.extract(expr, text_3, 3)
            f.extend(pff.extract(text_3, text_1, 0))
            f.extend(pff.extract(text_3, text_2, 0))
            f.append(label_3)
            attrs.append(f)
            # break
        # break
        print("Processed Expression", int(100 * (index + 1) / len(datasets)), "%")
    return attrs, labels
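
# Hedged sketch (not part of the original module): each row in `attrs` ends with
# its label, so features and targets can be split apart and handed to a
# classifier. scikit-learn and the dataset path are assumptions for illustration.
def _example_train_classifier(datasets_path="../paraphrasing-data/crowdsourced"):
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    attrs, _ = dependent_paraphrases(datasets_path)
    X = [row[:-1] for row in attrs]   # feature values from ff/pff
    y = [row[-1] for row in attrs]    # 'valid' / 'invalid' labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = DecisionTreeClassifier().fit(X_train, y_train)
    print("held-out accuracy:", clf.score(X_test, y_test))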
                          2 0 0 1 0 0 0 11\
                          3 0 0 1 0 0 0 23\
                          4 0 0 0 4 3 0 24\
                          4 0 0 0 1 0 0 25\
                          4 0 0 0 1 0 1 50\
                          34 0 0 2 5 10 11 416"
    confusion_matrix = [
        int(a) for a in list(filter(None, confusion_matrix.split(" ")))
    ]
    print(confusion_matrix_interpretation(confusion_matrix))

if dataset_statistics:
    datasets = "../../paraphrasing-data/crowdsourced"
    dataset = read_paraphrased_tsv_files(datasets, processor=remove_marks, by_user=True)
    valid_1st = 0
    valid_2nd = 0
    valid_3rd = 0
    all3_valid = 0
    all3_invalid = 0
    counter = 0
    for i, expression in enumerate(dataset):
        for instance in dataset[expression]:
            counter += 1
            paraphrase_1 = instance[0][1] == 'valid'
            paraphrase_2 = instance[1][1] == 'valid'
            paraphrase_3 = instance[2][1] == 'valid'
load()


def ginger_error_count(text):
    text = remove_marks(text)
    if text in ginger_error_map:
        return ginger_error_map[text]
    return 0, None


if __name__ == "__main__":
    datasets_path = "../paraphrasing-data/crowdsourced"
    datasets = read_paraphrased_tsv_files(datasets_path, by_user=False)
    correction_map = {}
    with open(ginger_file, "wt") as f:
        for index, expr in enumerate(datasets):
            paraphrases = datasets[expr]
            corrected, is_question = correct(expr, remove_case=True, sleep=True)
            if not corrected:
                corrected = {}
            else:
                print(expr.strip(), "==>", corrected)
            correction_map[expr] = corrected
from flask import Flask, render_template, request, redirect, url_for

from utils.dataset import read_paraphrased_tsv_files

app = Flask(__name__)

datasets_path = "../../paraphrasing-data/merged_datasets"
datasets = read_paraphrased_tsv_files(datasets_path, by_user=False, processor=str.strip)

attrs = {}
attrs['index'] = 0
expressions = list(datasets.keys())
expressions.sort()


@app.route('/', methods=['GET'])
def index():
    expr = expressions[attrs['index']]
    return render_template("annotate.html",
                           expressions=expressions,
                           datasets=datasets,
                           expression=expr)


@app.route('/statistics', methods=['GET'])
def statistics():
    counter = {"valid": 0, "divergence": 0, "spelling": 0, "grammar": 0,
               "cheating": 0, "misuse": 0, "translate": 0, "answer": 0}
from collections import Counter

from utils.dataset import read_paraphrased_tsv_files
from utils.preprocess import remove_marks, tokenize, pos_tag
from nltk.corpus import stopwords
from nltk import ngrams

stopwords = set(stopwords.words('english'))

datasets = "../paraphrasing-data/crowdsourced"
dataset = read_paraphrased_tsv_files(datasets, processor=remove_marks)


def top_words():
    dictionary = Counter()
    for i, expression in enumerate(dataset):
        expr = set(tokenize(expression))
        for instance in dataset[expression]:
            paraphrase = instance[0]
            dictionary.update([
                t for t in tokenize(paraphrase)
                if t not in expr and t not in stopwords
            ])
    for token in dictionary.most_common(50):
        print(token)


def top_ngrams(ngram):
    dictionary = Counter()
    for i, expression in enumerate(dataset):