def main(sts_train_file, sts_dev_file):
    """Fits a logistic regression for paraphrase identification, using string
    similarity metrics as features. Prints accuracy on held-out data.
    Data is formatted as in the STS benchmark.
    """
    # STS scores at or above this value count as paraphrases...
    min_paraphrase = 4.0
    # ...and scores at or below this value count as non-paraphrases.
    max_nonparaphrase = 3.0

    # training partition
    train_texts_sts, train_y_sts = parse_sts(sts_train_file)
    # held-out (dev) partition
    dev_texts_sts, dev_y_sts = parse_sts(sts_dev_file)
def main(sts_train_file, sts_dev_file, w2v_file):
    """Fits a logistic regression for paraphrase identification, using string
    similarity metrics and vector similarity as features. Prints results on
    held-out data. Data is formatted as in the STS benchmark.
    """
    # paraphrase thresholds on the 0-5 STS similarity scale
    min_paraphrase = 4.0
    max_nonparaphrase = 3.0

    # TODO 1: Load data partitions and convert to paraphrase dataset as in the lab
    # You will train a logistic regression on the TRAIN partition
    train_texts_sts, train_y_sts = parse_sts(sts_train_file)
    # You will evaluate predictions on the VALIDATION partition
    dev_texts_sts, dev_y_sts = parse_sts(sts_dev_file)
def main(sts_data):
    """Calculate NIST metric for pairs of strings.

    Data is formatted as in the STS benchmark.
    """
    # TODO 1: define a function to read the data in util
    texts, labels = parse_sts(sts_data)
    print(f"Found {len(texts)} STS pairs")

    # take a sample of sentences so the code runs fast for faster debugging
    # when you're done debugging, you may want to run this on more!
    sample_text = texts[120:140]
    sample_labels = labels[120:140]
    # zip them together to make tuples of text associated with labels
    sample_data = zip(sample_labels, sample_text)

    scores = []
    # BUG FIX: the original looped over enumerate(sample_data), which bound
    # `label` to the integer index and `text` to the whole (label, pair)
    # tuple, so `t1, t2 = text` unpacked the wrong values. Iterate the
    # zipped (label, pair) tuples directly.
    for label, text in sample_data:
        t1, t2 = text
        print(f"Sentences: {t1}\t{t2}")

        # TODO 2: Calculate NIST for each pair of sentences
        # calculate NIST(a,b) and NIST(b,a) and
        # catch any exceptions and assign 0.0 for that part of the score
        nist_score = 0.0
        print(f"Label: {label}, NIST: {nist_score:0.02f}\n")
        # BUG FIX: the original appended the undefined name `score`
        # (NameError); the value computed above is `nist_score`.
        scores.append(nist_score)
def main(sts_data):
    """Compare TFIDF cosine similarities against STS labels, with and without
    text preprocessing. Data is formatted as in the STS benchmark.
    """
    texts, labels = parse_sts(sts_data)

    # TODO 1: get a single list of texts to determine vocabulary and document frequency
    # create a TfidfVectorizer
    # fit to the training data

    # TODO 2: Can normalization like removing stopwords remove differences that aren't meaningful?
    # fill in preprocess_text above
    preproc_train_texts = [preprocess_text(text) for text in texts]

    # TODO 3: Learn another TfidfVectorizer for preprocessed data
    # Use token_pattern "\S+" in the TfidfVectorizer to split on spaces

    # TODO 4: compute cosine similarity for each pair of sentences,
    # both with and without preprocessing
    cos_sims = []
    cos_sims_preproc = []
    for t1, t2 in texts:
        pass

    # TODO 5: measure the correlations
    pearson = 0.0
    preproc_pearson = 0.0
    print(f"default settings: r={pearson:.03}")
    print(f"preprocessed text: r={preproc_pearson:.03}")
def main(sts_dev, w2v_file):
    """Correlate word2vec-based sentence similarities with STS labels on dev.

    Sentence vectors are composed from word vectors two ways (mean and
    element-wise product); prints a Pearson correlation for each.
    """
    # load the texts
    dev_texts, dev_y = parse_sts(sts_dev)

    # load word2vec using gensim KeyedVectors object
    w2v_vecs = None

    # get cosine similarities of every pair in dev
    # if either sentence is completely out of vocabulary, record "0" as the similarity
    cos_sims_mean = []
    cos_sims_product = []

    # BUG FIX: the placeholders were plain ints, but the prints below index
    # [0] (scipy's pearsonr returns an (r, p-value) tuple), so the original
    # raised TypeError before printing anything. Use tuple placeholders with
    # the same shape as the real result.
    pearson_mean = (0.0, 0.0)
    print(f"word2vec mean pearsons: r={pearson_mean[0]:.03}")

    pearson_prod = (0.0, 0.0)
    print(f"word2vec product pearsons: r={pearson_prod[0]:.03}")
def main(sts_data):
    """Calculate NIST metric for pairs of strings.

    Data is formatted as in the STS benchmark.
    """
    # read the dataset
    texts, labels = parse_sts(sts_data)
    print(f"Found {len(texts)} STS pairs")

    # walk a 20-pair sample; start=120 keeps the index aligned with `labels`
    for i, pair in enumerate(texts[120:140], start=120):
        label = labels[i]
        t1, t2 = pair
        print(f"Sentences: {t1}\t{t2}")

        # TODO: Calculate for each pair of sentences
        # catch any exceptions and assign 0.0
        nist_score = 0.0
        print(f"Label: {label}, NIST: {nist_score:0.02f}\n")
def main(sts_data):
    """Transform a semantic textual similarity dataset into a paraphrase
    identification task. Data is formatted as in the STS benchmark.
    """
    # paraphrase thresholds on the 0-5 STS similarity scale
    max_nonparaphrase = 3.0
    min_paraphrase = 4.0

    # read the dataset
    texts, labels = parse_sts(sts_data)
    labels = np.asarray(labels)
    pi_texts, pi_labels = sts_to_pi(texts, labels)

    # calculate to check your split agrees with mine
    num_nonparaphrase = 0
    num_paraphrase = 0
    # 957 for dev
    print(f"{num_nonparaphrase} non-paraphrase")
    # 264 for dev
    print(f"{num_paraphrase} paraphrase")

    # Instantiate a TFIDFVectorizer to create representations for sentences
    # compute cosine similarity for each pair of sentences
    # use a threshold of 0.7 to convert each similarity score into a paraphrase prediction
    cos_sims_preproc = []
    predictions = np.asarray(cos_sims_preproc) > 0.7

    # calculate and print precision and recall statistics for your system
    num_pred = 0
    print(f"Number predicted paraphrase: {num_pred}")
    num_pos = 0
    print(f"Number positive: {num_pos}")
    num_true_pos = 0
    print(f"Number true positive: {num_true_pos}")

    # BUG FIX: the placeholders were ints, but the format spec `:0.03`
    # sets a precision, which is invalid for int operands and raises
    # "ValueError: Precision not allowed in integer format specifier".
    # Use float placeholders so the skeleton runs as-is.
    precision = 0.0
    recall = 0.0
    print(f"Scores: precision {precision:0.03}\trecall {recall:0.03}")
def main(sts_data):
    """Calculate pearson correlation between semantic similarity scores and
    string similarity metrics. Data is formatted as in the STS benchmark.
    """
    # TODO 1: read the dataset; implement in util.py
    texts, labels = parse_sts(sts_data)
    print(f"Found {len(texts)} STS pairs")

    # TODO 2: Calculate the metrics here
    score_types = [
        "NIST",
        "BLEU",
        "Word Error Rate",
        "Longest common substring",
        "Edit Distance",
    ]

    # Sample code to print results. You can alter the printing as you see fit.
    # It is most important to put the results in a table in the README.
    print(f"Semantic textual similarity for {sts_data}\n")
    for metric_name in score_types:
        score = 0.0
        print(f"{metric_name} correlation: {score:.03f}")
def main(sts_data):
    """Calculate pearson correlation between semantic similarity scores and
    string similarity metrics. Data is formatted as in the STS benchmark.
    """
    # read the dataset
    # TODO: implement in util.py
    texts, labels = parse_sts(sts_data)
    print(f"Found {len(texts)} STS pairs")

    score_types = [
        "NIST",
        "BLEU",
        "Word Error Rate",
        "Longest common substring",
        "Levenshtein distance",
    ]
    # one (initially empty) list of metric values per metric name
    scores = {}
    for score_type in score_types:
        scores[score_type] = []

    # TODO: Calculate the metrics here to fill the lists in scores

    # This can stay as-is to print similar output to the sample
    print(f"Semantic textual similarity for {sts_data}\n")
    for metric_name, dists in scores.items():
        score, sig = pearsonr(dists, labels)
        print(f"{metric_name} correlation: {score:.03}")
def main(sts_dev, w2v_file):
    """Correlate word2vec-based sentence similarities with STS labels on dev.

    Sentence vectors are composed from word vectors two ways (mean and
    element-wise product); prints a Pearson correlation for each.
    """
    # TODO 1: load the texts
    dev_texts, dev_y = parse_sts(sts_dev)

    # TODO 2: load word2vec using gensim KeyedVectors object
    # WARNING: you may need to downgrade gensim to version 3.4
    w2v_vecs = None

    # TODO 3: Define the functions above that compose word representations
    # into sentence representations

    # TODO 4: get cosine similarities of every sentence pair in dev
    # if either sentence is completely out of vocabulary, record "0" as the
    # similarity for the pair
    cos_sims_mean = []
    cos_sims_product = []

    # TODO 5: Measure correlation with STS labels for the two ways of
    # computing word2vec sentence representations
    # BUG FIX: the placeholders were plain ints, but the prints below index
    # [0] (scipy's pearsonr returns an (r, p-value) tuple), so the original
    # raised TypeError before printing anything. Use tuple placeholders with
    # the same shape as the real result.
    pearson_mean = (0.0, 0.0)
    print(f"word2vec mean pearsons: r={pearson_mean[0]:.03}")

    pearson_prod = (0.0, 0.0)
    print(f"word2vec product pearsons: r={pearson_prod[0]:.03}")