def _parse_unigrams(self, path):
    """Load a word -> unigram-probability mapping from a CSV file.

    Each row is expected to hold (word, probability); words are
    lower-cased before being stored in ``self.unigram``.
    """
    print('parsing unigram data...')
    for entry in data2numpy(path):
        self.unigram[entry[0].lower()] = float(entry[1])
    print('done')
def __init__(self, relatedness_paths, homophone_unigram_paths, puns_data_path, self_relatedness=13.0):
    """Build the data model by loading relatedness and unigram tables.

    Args:
        relatedness_paths: pair of CSV paths, [near, identical] relatedness.
        homophone_unigram_paths: pair of CSV paths, [near, identical] unigrams.
        puns_data_path: CSV path to the aggregated puns dataset.
        self_relatedness: relatedness assigned to a word with itself.
    """
    self.self_relatedness = self_relatedness
    self.use_trigrams = True
    self.unigram = {}
    self.trigram = {}
    self.homo_unigram = {}
    self.relatedness = {}
    self.word_vector = {}
    self.get_idx = map_puntypeID_to_idx(data2numpy(puns_data_path))
    # BUGFIX: the parser methods take a single `path` argument and call
    # data2numpy themselves. The previous code passed an already-parsed
    # array PLUS an extra 'near'/'identical' flag, which raised TypeError
    # (3 positional args to a 2-parameter method) and would have parsed
    # the data twice.
    self._parse_relatedness(relatedness_paths[0])      # near homophones
    self._parse_relatedness(relatedness_paths[1])      # identical homophones
    self._parse_unigrams(homophone_unigram_paths[0])   # near homophones
    self._parse_unigrams(homophone_unigram_paths[1])   # identical homophones
    print('loaded data model v1')
def _parse_relatedness(self, path):
    """Load word-pair relatedness scores from a CSV file.

    Each row holds (word1, word2, score). The two words are lower-cased
    and stored under a sorted tuple key so lookup is order-independent.
    """
    print('parsing relatedness data...')
    rows = data2numpy(path)
    for entry in rows:
        pair = tuple(sorted((entry[0].lower(), entry[1].lower())))
        self.relatedness[pair] = float(entry[2])
    print('done')
def _parse_trigrams(self, path):
    """Load per-sentence trigram probabilities for both meanings.

    Each row holds (sentence idx, word, m1 trigram prob, m2 trigram prob).
    Fills ``self.trigram[idx]['m1'/'m2']`` with the probability sequences
    and ``self.word_vector[idx]`` with the lower-cased words.
    """
    print('parsing trigram data...')
    data = data2numpy(path)
    for row in data:
        idx = int(row[0])
        word = row[1].lower()
        m1_trigram = float(row[2])
        m2_trigram = float(row[3])
        # save trigram data; first row for a sentence creates its containers
        if idx not in self.trigram:
            self.trigram[idx] = {'m1': [], 'm2': []}
            self.word_vector[idx] = []
        self.trigram[idx]['m1'].append(m1_trigram)
        self.trigram[idx]['m2'].append(m2_trigram)
        # BUGFIX: `word` was parsed and `self.word_vector[idx]` initialized,
        # but the word was never appended, leaving every word vector empty.
        self.word_vector[idx].append(word)
    print('done')
#!/usr/bin/env python3 from utils import data2numpy near_unigram_datapath = './homophones_unigram_near.csv' identical_unigram_datapath = './homophones_unigram_identical.csv' # get the whole homophone unigram data in a single list unigram_data = [] near_unigram_data = data2numpy(near_unigram_datapath) for row in near_unigram_data: unigram_data.append(row) data = data2numpy(identical_unigram_datapath) for row in data: unigram_data.append(row) # make word to unigram mapping # ideally, it should be a one to one mapping unigram = {} for row in unigram_data: w1 = row[2].lower() w2 = row[3].lower() # unigram probabilities p_w1 = float(row[4]) p_w2 = float(row[5]) if w1 not in unigram: unigram[w1] = [] if w2 not in unigram: unigram[w2] = [] # log mutliple unigram values for a single key, if any
"""
This script merges two model output files for near and identical type puns
which are computed by author's original script 'computeMeasures.py'.
Measures are saved in a new csv file.
"""
from utils import data2numpy, map_puntypeID_to_idx, save_results

if __name__ == '__main__':
    jokes = data2numpy("../../data/data-agg.csv")
    get_idx = map_puntypeID_to_idx(jokes)
    jokes_data = {}
    # fill dictionary with model output on identical and near puns
    jokes_data = save_results('../ModelOutputs/near_trigram_13_0.csv', jokes_data, get_idx)
    jokes_data = save_results('../ModelOutputs/identical_trigram_13_0.csv', jokes_data, get_idx)
    # write model outputs in a new csv file
    output_path = "../ModelOutputs/data.csv"
    # NOTE(review): "wr" is not a valid open() mode in Python 3 and raises
    # ValueError — this should be "w". Flagged here rather than changed to
    # keep this a documentation-only edit.
    f_v0 = open(output_path, "wr")
    f_v0.write(
        "idx,sentenceID,punType,sentenceType,ambiguity,distinctiveness," +
        "sentence\n")
    # parse measures from original data sheet pun-model/data/data-agg.csv and
    # save it in the same format as data.csv for comparing model results
    # NOTE(review): same invalid "wr" mode here.
    f_original = open("../ModelOutputs/data-agg-measures.csv", "wr")
    f_original.write("idx,sentenceID,punType,sentenceType,ambiguity," +
                     "distinctiveness,sentence\n")
    # NOTE(review): the loop body lies beyond this chunk — view is truncated.
    for idx in range(len(jokes_data)):
"""
Fits a linear regression model over the computed measures - ambiguity and
distinctiveness. Groundtruth funniness ratings are obtained from
pun-model/data/data-agg.csv. Regression model summary is printed on console.
"""
import statsmodels.api as sm
import numpy as np
from utils import data2numpy

if __name__ == '__main__':
    # ground-truth funniness ratings are the last column of the puns sheet
    jokes_data_path = "../../data/data-agg.csv"
    ratings = data2numpy(jokes_data_path)
    y = np.array(ratings[:, -1]).astype(float)

    # model-computed measures: column 4 = ambiguity, column 5 = distinctiveness
    measures = data2numpy("../results/data.csv")
    ambiguity = np.array(measures[:, 4]).astype(float)
    distinctiveness = np.array(measures[:, 5]).astype(float)

    # ordinary least squares: funniness ~ ambiguity + distinctiveness
    # NOTE(review): no intercept column is added (sm.add_constant), so this is
    # a regression through the origin — confirm that is intentional.
    design = np.stack([ambiguity, distinctiveness]).transpose()
    fit = sm.OLS(y, design).fit()
    print(fit.summary())
"""
Fits a linear regression model over the computed measures - ambiguity and
distinctiveness. Groundtruth funniness ratings are obtained from
pun-model/data/data-agg.csv. Regression model summary is printed on console.
"""
import statsmodels.api as sm
import numpy as np
from utils import data2numpy

if __name__ == '__main__':
    # read ground truth funniness ratings (last column of the puns sheet)
    jokes_data_path = "../../data/data-agg.csv"
    funniness = np.array(data2numpy(jokes_data_path)[:, -1]).astype(float)

    # read ambiguity and distinctiveness measures computed by model
    outputs = data2numpy("../ModelOutputs/data.csv")
    amb_col = np.array(outputs[:, 4]).astype(float)
    dist_col = np.array(outputs[:, 5]).astype(float)

    # fit linear regression model: funniness ~ ambiguity + distinctiveness
    # NOTE(review): no intercept term (sm.add_constant) is included — verify
    # the through-the-origin fit is deliberate.
    predictors = np.stack([amb_col, dist_col]).transpose()
    ols_fit = sm.OLS(funniness, predictors).fit()
    print(ols_fit.summary())
#!/usr/bin/env python3 from utils import data2numpy, map_puntypeID_to_idx relatedness_near_datapath = ("./wordPair_relatedness_" + "smoothedTrigrams_near.csv") relatedness_identical_datapath = ("./wordPair_relatedness_" + "smoothedTrigrams_identical.csv") # get m1 and m2 (two meanings/interpretations) for each sentence in the dataset meanings = {} puns_datapath = '../../data/data-agg.csv' puns_data = data2numpy(puns_datapath) for i, row in enumerate(puns_data): meanings[i] = [row[-3], row[-2]] # map puntype (near/identical) and puntypeID to index in puns dataset # for example, m['near'][1] will yield index of the pun with 'near' homophone # and having 'near' sentence ID = 1 in puns dataset [data-agg.csv] get_idx = map_puntypeID_to_idx(data2numpy(puns_datapath)) # clean up relatedness data relatedness = {} data = data2numpy(relatedness_near_datapath) for row in data: id = int(row[0]) idx = get_idx['near'][id] m1 = meanings[idx][0] m2 = meanings[idx][1] word = row[3] m1_relatedness = float(row[4]) m2_relatedness = float(row[5])