# Shared imports for this section. fio, ClusterWrapper, annotation,
# postProcess, phraseClusteringKmedoid, global_params, and stats_util are
# project-local modules; getNPs, Similarity2Distance, and the globals
# `course`, `maxWeek`, `method`, and `net_exe` are defined elsewhere.
import os
import math
import json
import numpy as np
from collections import defaultdict


def gather_performance(output):
    sim_extractor = Similarity()

    allfeatures = sorted(sim_extractor.features.keys())

    allhead = []
    allbody = []
    for k in range(len(allfeatures) + 1):
        if k == len(allfeatures):  # use all features together
            features = allfeatures
        else:  # ablation: one feature at a time
            features = [allfeatures[k]]

        name = '_'.join(features)

        resultfile = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)

        head, body = fio.ReadMatrix(resultfile, hasHead=True)

        # average each score column, skipping the first two id columns
        allhead = ['name'] + head[2:]
        average = [name]
        for i in range(2, len(head)):
            values = [float(row[i]) for row in body]
            average.append(np.mean(values))

        allbody.append(average)

    fio.WriteMatrix(output, allbody, allhead)
def get_X_Y(input, index):
    head, body = fio.ReadMatrix(input, hasHead=True)

    X = [int(row[0]) for row in body[:-1]]  # week
    Y = [float(row[index]) for row in body[:-1]]  # rouge score
    return X, Y
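# For reference, a hedged usage sketch of get_X_Y. The file path and column
# index below are hypothetical; it assumes the ROUGE table layout used
# throughout this section, where the last row holds the column averages.
def demo_get_X_Y():
    X, Y = get_X_Y('../data/IE256/rouge_np.txt', 3)  # e.g. the R1-F column
    # quick look at the week-vs-score trend
    print np.corrcoef(X, Y)[0][1]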
def getPhraseClusterAll(sennafile, weightfile, output, ratio=None,
                        MalformedFilter=False, source=None, np=None):
    # note: the `np` parameter shadows the module-level numpy alias here
    NPCandidates, sources = getNPs(sennafile, MalformedFilter,
                                   source=source, np=np)

    if len(NPCandidates) == 0:
        return

    NPs, matrix = fio.ReadMatrix(weightfile, hasHead=True)

    # change the similarity matrix to a distance matrix
    matrix = Similarity2Distance(matrix)

    index = {}
    for i, NP in enumerate(NPs):
        index[NP] = i

    newMatrix = []
    for NP1 in NPCandidates:
        assert (NP1 in index)
        i = index[NP1]

        row = []
        for NP2 in NPCandidates:
            if NP2 not in index:
                # skip unknown phrases; without this `continue` the lookup
                # below would raise a KeyError
                print NP2, weightfile, np
                continue
            j = index[NP2]
            row.append(matrix[i][j])
        newMatrix.append(row)

    # pick the number of clusters K from the ratio parameter
    V = len(NPCandidates)
    if ratio == "sqrt":
        K = int(math.sqrt(V))
    elif float(ratio) > 1:
        K = int(ratio)
    else:
        K = int(ratio * V)

    if K < 1:
        K = 1

    clusterid = ClusterWrapper.KMedoidCluster(newMatrix, K)

    body = []
    for NP, id in zip(NPCandidates, clusterid):
        body.append([NP, id])

    fio.WriteMatrix(output, body, header=None)
def getPhraseClusterPhrase(phrasefile, weightfile, output, ratio=None,
                           method=None):
    NPCandidates = fio.ReadFile(phrasefile)
    if len(NPCandidates) == 0:
        return

    NPs, matrix = fio.ReadMatrix(weightfile, hasHead=True)

    # change the similarity matrix to a distance matrix
    matrix = Similarity2Distance(matrix)

    index = {}
    for i, NP in enumerate(NPs):
        index[NP] = i

    # drop candidates that have no row in the weight matrix up front, so the
    # distance matrix stays square and cluster ids stay aligned with phrases
    known = []
    for NP in NPCandidates:
        if NP not in index:
            print NP, weightfile, method
            continue
        known.append(NP)
    NPCandidates = known

    if len(NPCandidates) == 0:
        return

    newMatrix = []
    for NP1 in NPCandidates:
        i = index[NP1]
        row = [matrix[i][index[NP2]] for NP2 in NPCandidates]
        newMatrix.append(row)

    # pick the number of clusters K from the ratio parameter
    V = len(NPCandidates)
    if ratio == "sqrt":
        K = int(math.sqrt(V))
    elif float(ratio) >= 1:
        K = int(ratio)
    else:
        K = int(ratio * V)

    if K < 1:
        K = 1
    K = min(K, V)

    clusterid = ClusterWrapper.KMedoidCluster(newMatrix, K)

    body = []
    for NP, id in zip(NPCandidates, clusterid):
        body.append([NP, id])

    fio.WriteMatrix(output, body, header=None)
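# Similarity2Distance is called above but not defined in this section. A
# minimal sketch under these assumptions: entries are similarity values
# (strings or floats) in [0, 1], with "NaN" markers for missing pairs.
# "NaN" is mapped to distance 0 only to mirror the inline conversion in
# getPhraseCluster below; mapping it to the maximum distance 1.0 may be
# preferable.
def Similarity2Distance_sketch(matrix):
    distance = []
    for row in matrix:
        newrow = []
        for x in row:
            if x == "NaN":
                newrow.append(0.0)
            else:
                newrow.append(1.0 - float(x))
        distance.append(newrow)
    return distance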
def getPhraseCluster(phrasedir, method='lexicalOverlapComparer', ratio=None):
    sheets = range(0, 12)

    for sheet in sheets:
        week = sheet + 1

        for type in ['POI', 'MP', 'LP']:
            weightfilename = phrasedir + str(week) + '/' + type + '.' + method
            print weightfilename

            NPs, matrix = fio.ReadMatrix(weightfilename, hasHead=True)

            # change the similarity matrix to a distance matrix in place
            for i, row in enumerate(matrix):
                for j, col in enumerate(row):
                    matrix[i][j] = 1 - float(matrix[i][j]) if matrix[i][j] != "NaN" else 0

            # pick the number of clusters K: sqrt(V) by default
            V = len(NPs)
            if ratio is None:
                K = int(math.sqrt(V))
            else:
                K = int(ratio * V)

            clusterid = ClusterWrapper.KMedoidCluster(matrix, K)

            body = []
            for NP, id in zip(NPs, clusterid):
                body.append([NP, id])

            if ratio is None:
                outfile = phrasedir + '/' + str(week) + '/' + type + ".cluster.kmedoids.sqrt." + method
            else:
                outfile = phrasedir + '/' + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + method
            fio.WriteMatrix(outfile, body, header=None)
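# ClusterWrapper.KMedoidCluster is project-local and not shown. A minimal
# PAM-style sketch of the assumed contract (take a square distance matrix
# and K, return one cluster id per row); the real implementation may differ.
import random

def kmedoid_cluster_sketch(distmatrix, K, maxiter=100, seed=0):
    n = len(distmatrix)
    rng = random.Random(seed)
    medoids = rng.sample(range(n), K)

    def assign(meds):
        # attach every point to its nearest medoid
        return [min(meds, key=lambda m: distmatrix[p][m]) for p in range(n)]

    for _ in range(maxiter):
        clusters = assign(medoids)
        new_medoids = []
        for m in medoids:
            members = [p for p in range(n) if clusters[p] == m]
            if not members:  # possible on distance ties; keep the old medoid
                new_medoids.append(m)
                continue
            # swap step: pick the member minimizing total in-cluster distance
            best = min(members, key=lambda c: sum(distmatrix[c][p] for p in members))
            new_medoids.append(best)
        if set(new_medoids) == set(medoids):
            break
        medoids = new_medoids

    clusters = assign(medoids)
    labels = dict((m, i) for i, m in enumerate(medoids))
    return [labels[c] for c in clusters]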
def writegraph_leave_one_lecture_out_lsa(model_dir, phrasedir, modelname='lsa'):
    lectures = annotation.Lectures

    for lec in lectures:
        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            phrasefile = os.path.join(path, "%s.%s.key" % (q, method))
            phrases = fio.LoadList(phrasefile)

            if modelname == 'lsa':
                similarity_results = os.path.join(path, "%s.%s.optimumComparerLSATasa" % (q, method))
            elif modelname == 'svm':
                similarity_results = os.path.join(path, "%s.%s.svm" % (q, method))
            else:
                raise ValueError("unknown modelname: %s" % modelname)

            simhead, simbody = fio.ReadMatrix(similarity_results, hasHead=True)
            assert (len(simhead) == len(phrases))

            body = []
            for i, p1 in enumerate(phrases):
                for j, p2 in enumerate(phrases):
                    if j <= i:
                        continue  # undirected graph: keep each pair once

                    score = simbody[i][j]
                    score = float(score) if score != 'NaN' else 0.0
                    if score == 0.0:
                        continue  # drop zero-similarity edges

                    body.append([i, j])

            output = os.path.join(path, "%s.%s.%s%s" % (q, method, modelname, net_exe))
            fio.WriteMatrix(output, body)
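# For reference, a hedged sketch of loading the graph written above. It
# assumes net_exe names a plain-text edge-list suffix and that fio.WriteMatrix
# wrote whitespace-separated "i j" rows (the exact delimiter depends on fio,
# which is not shown here).
import networkx as nx

def load_phrase_graph_sketch(path):
    G = nx.read_edgelist(path, nodetype=int)
    print G.number_of_nodes(), G.number_of_edges()
    return G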
def gather_rouge(output):
    datadir = '../data/%s/' % course

    models = [
        'QPS_NP',
        #'QPS_A1_N',
        'QPS_A2_N',
        'QPS_union',
        'QPS_intersect',
        'QPS_combine',
    ]

    methods = [
        'rouge_crf_optimumComparerLSATasa',
        'rouge_crf_ct.svm.default',
    ]

    Header = ['method', 'model',
              'R1-R', 'R1-P', 'R1-F',
              'R2-R', 'R2-P', 'R2-F',
              'RSU4-R', 'RSU4-P', 'RSU4-F']

    xbody = []
    for method in methods:
        for model in models:
            filename = os.path.join(datadir, model, "%s.txt" % method)
            if not fio.IsExist(filename):
                continue

            head, body = fio.ReadMatrix(filename, hasHead=True)

            # the last row holds the column averages; drop its id column
            row = [method, model]
            row += body[-1][1:]
            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
def split_rouge(filename, prefix, N=2):
    head, body = fio.ReadMatrix(filename, hasHead=True)

    # deal rows round-robin into N groups, skipping the trailing average row
    newbodies = [[] for i in range(N)]
    for i, row in enumerate(body[:-1]):
        newbodies[i % N].append(row)

    # compute a new average row for each group
    for k in range(len(newbodies)):
        row = ['ave']
        for i in range(1, len(head)):
            scores = [float(xx[i]) for xx in newbodies[k]]
            row.append(np.mean(scores))
        newbodies[k].append(row)

    for i, newbody in enumerate(newbodies):
        fio.WriteMatrix('%s_q%d.txt' % (prefix, i + 1), newbody, head)
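# A hedged usage sketch of split_rouge; the paths are hypothetical. With the
# default N=2, rows for q1 and q2 alternate in the combined file, so this
# writes <prefix>_q1.txt and <prefix>_q2.txt, each with a recomputed
# average row.
def demo_split_rouge():
    split_rouge('../data/IE256/rouge_crf_svm.txt',
                '../data/IE256/rouge_crf_svm', N=2)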
def __init__(self, prefix=""): self.features = { 'optimumComparerLSATasa': self.LSA, 'LexicalOverlap': self.LexicalOverlap, 'optimumComparerWNLin': self.LIN, 'BLEU': self.BLEU, 'ROUGE': self.ROUGE, 'Cosine': self.Cosine, 'WordEmbedding': self.WordEmbedding, #'WMD': self.WMD, } self.prefix = prefix self.Cache = {} self.cachefile = os.path.join(prefix + 'cache.json') print self.cachefile if fio.IsExist(self.cachefile): with open(self.cachefile, 'r') as fin: self.Cache = json.load(fin) if self.prefix != '': self.matrixdict = {} for sim in [ 'optimumComparerLSATasa', 'LexicalOverlap', 'optimumComparerWNLin', 'BLEU' ]: self.matrixdict[sim] = {} filename = self.prefix + sim phrases, matrix = fio.ReadMatrix(filename, hasHead=True) index = {} for i, p in enumerate(phrases): index[p] = i self.matrixdict[sim]['index'] = index self.matrixdict[sim]['matrix'] = matrix self.word2vec = fio.LoadDictJson(global_params.word2vec_model)
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None,
                      similarity=None, ratio=None, lex='lexrank'):
    # K caps the number of phrases selected per summary
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary' % ratio  # assumes an integer ratio

            # produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue

            print excelfile, sheet, type

            cluster_output = clusterdir + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + similarity + '.' + method
            print cluster_output

            weightfile = clusterdir + str(week) + '/' + type + '.' + method + '.' + similarity
            print weightfile

            if not fio.IsExist(cluster_output):
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)

            if not fio.IsExist(cluster_output):
                continue

            body = fio.ReadMatrix(cluster_output, False)

            NPCandidates = fio.ReadFile(phrasefile)

            lexfile = clusterdir + str(week) + '/' + str(type) + "." + method + "." + lex + ".dict"
            lexdict = fio.LoadDict(lexfile, 'float')

            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]

            if NPCandidates != NPs:  # clustering may have dropped some phrases
                print NPCandidates
                print NPs

            Summary = []

            # rank the clusters by the number of responses they contain
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)

            total_word = 0
            for key in keys:
                # take the top-ranked phrase of each cluster
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary:
                    continue

                # the word budget is tracked, but only the phrase count is capped
                total_word += len(phrase.split())
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)

            fio.SaveList(Summary, filename)
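# postProcess.RankClusterNoSource and postProcess.getTopRankPhraseNoSource
# are project-local and not shown. Hedged stand-ins for the assumed contract:
# rank clusters by how many responses they contain, then pick each cluster's
# highest-scoring phrase under the lexrank dictionary. The real ranking
# criteria may differ.
def rank_cluster_sketch(NPs, lexdict, clusterids):
    sizes = defaultdict(int)
    for cid in clusterids:
        sizes[cid] += 1
    # larger clusters (more responses) first
    return sorted(sizes.keys(), key=lambda cid: -sizes[cid])

def top_rank_phrase_sketch(NPs, clusterids, cid, lexdict):
    members = [p for p, c in zip(NPs, clusterids) if int(c) == int(cid)]
    # highest lexrank score wins; unknown phrases score 0
    return max(members, key=lambda p: lexdict.get(p, 0.0))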
def gather_rouge():
    from stats_util import get_ttest_pvalues

    Allbody = []

    for cid in ['IE256', 'IE256_2016', 'CS0445']:
        ilpdir = "../data/%s/" % cid

        baseline_rougefile = os.path.join(ilpdir, 'rouge_np.txt')
        if not fio.IsExist(baseline_rougefile):
            continue
        basehead, basebody = fio.ReadMatrix(baseline_rougefile, hasHead=True)

        row = [cid, '', 'PhraseSum'] + ['%.3f' % float(x) for x in basebody[-1][1:-3]]
        Allbody.append(row)

        for A in ['1', '2']:
            for model in ['optimumComparerLSATasa', 'oracle', 'oracle_selection']:
                modeldir = os.path.join(ilpdir, 'oracle_annotator_%s' % A)

                model_rouge_file = os.path.join(modeldir, 'rouge_annotator%s_%s.txt' % (A, model))
                head, body = fio.ReadMatrix(model_rouge_file, hasHead=True)

                # keep these two runs as extra baselines for the later models
                if model == 'optimumComparerLSATasa':
                    basehead1, basebody1 = fio.ReadMatrix(model_rouge_file, hasHead=True)
                elif model == 'oracle':
                    basehead2, basebody2 = fio.ReadMatrix(model_rouge_file, hasHead=True)

                row = [cid, 'A%s' % A, model] + ['%.3f' % float(x) for x in body[-1][1:-3]]

                print cid, model
                print model_rouge_file
                print baseline_rougefile

                # attach significance markers against the baselines
                pvalues = get_ttest_pvalues(basebody[1:-1], body[1:-1], range(1, len(head) - 3))

                if model == 'optimumComparerLSATasa':
                    k = 3
                    for p in pvalues:
                        if p < 0.05:
                            row[k] = row[k] + '$^*$'
                        k += 1
                elif model == 'oracle':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1], range(1, len(head) - 3))
                    k = 3
                    for p1, p2 in zip(pvalues, pvalues1):
                        if p1 < 0.05 and p2 < 0.05:
                            row[k] = row[k] + '$^{*\dag}$'
                        elif p1 < 0.05:
                            row[k] = row[k] + '$^*$'
                        elif p2 < 0.05:
                            row[k] = row[k] + '$^\dag$'
                        k += 1
                elif model == 'oracle_selection':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1], range(1, len(head) - 3))
                    pvalues2 = get_ttest_pvalues(basebody2[1:-1], body[1:-1], range(1, len(head) - 3))
                    k = 3
                    for p1, p2, p3 in zip(pvalues, pvalues1, pvalues2):
                        if p1 >= 0.05 and p2 >= 0.05 and p3 >= 0.05:
                            k += 1
                            continue
                        marks = ''
                        if p1 < 0.05:
                            marks += '*'
                        if p2 < 0.05:
                            marks += '\dag'
                        if p3 < 0.05:
                            marks += '\circ'
                        row[k] = row[k] + '$^{%s}$' % marks
                        k += 1

                Allbody.append(row)

    output = '../data/rouge_oracle_all_gather.txt'
    fio.Write2Latex(output, Allbody, [''] + head)
def gather_rouge(output):
    courses = ['IE256', 'IE256_2016', 'CS0445']

    rouges = [
        ('LexRank', 'QPS_NP', 'rouge_LexRank'),
        ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa'),
        ('SequenceSum', 'QPS_combine_coling', 'rouge_crf_optimumComparerLSATasa'),
        ('SimSum', 'QPS_combine_coling', 'rouge_crf_svm'),
        ('CDSum', 'QPS_combine_coling', 'rouge_crf_ct.svm.default'),
    ]

    baseline1 = ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa')
    baseline2 = ('SequenceSum', 'QPS_combine_coling', 'rouge_crf_optimumComparerLSATasa')

    Header = ['course', 'name',
              'R1-R', 'R1-P', 'R1-F',
              'R2-R', 'R2-P', 'R2-F',
              'RSU4-R', 'RSU4-P', 'RSU4-F']

    ROUGE_Head = ['id',
                  'R1-R', 'R1-P', 'R1-F',
                  'R2-R', 'R2-P', 'R2-F',
                  'RSU4-R', 'RSU4-P', 'RSU4-F']

    # indexes of the score columns (everything except 'id')
    ROUGE_index = [ROUGE_Head.index(name) for name in ROUGE_Head if name != 'id']

    xbody = []
    for course in courses:
        for name, model, method in rouges:
            datadir = '../data/%s/' % course

            filename = os.path.join(datadir, model, "%s.txt" % method)
            if not fio.IsExist(filename):
                continue

            baseline1_name = os.path.join(datadir, baseline1[1], "%s.txt" % baseline1[2])
            baseline2_name = os.path.join(datadir, baseline2[1], "%s.txt" % baseline2[2])

            # '*' marks significance against PhraseSum, '+' against SequenceSum
            if name in ['LexRank', 'SequenceSum', 'SimSum', 'CDSum']:
                pvalues1 = get_pvalues(filename, baseline1_name, ROUGE_index)
            else:
                pvalues1 = [1] * len(ROUGE_index)

            if name in ['SimSum', 'CDSum']:
                pvalues2 = get_pvalues(filename, baseline2_name, ROUGE_index)
            else:
                pvalues2 = [1] * len(ROUGE_index)

            head, body = fio.ReadMatrix(filename, hasHead=True)

            row = [course, name]
            row += ['%.3f%s%s' % (float(x),
                                  '*' if pvalues1[i] < 0.05 else '',
                                  '+' if pvalues2[i] < 0.05 else '')
                    for i, x in enumerate(body[-1][1:])]
            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
def get_pvalues(input1, input2, index):
    head, body1 = fio.ReadMatrix(input1, hasHead=True)
    head, body2 = fio.ReadMatrix(input2, hasHead=True)

    # compare the per-lecture rows, excluding the trailing average row
    p_values = get_ttest_pvalues(body1[:-1], body2[:-1], index)
    return p_values
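# stats_util.get_ttest_pvalues is project-local and not shown. A plausible
# sketch that mirrors how it is called above: run a paired t-test per score
# column over two aligned sets of rows. This is an assumption, not the
# repository's actual implementation.
from scipy import stats

def get_ttest_pvalues_sketch(body1, body2, indexes):
    pvalues = []
    for i in indexes:
        a = [float(row[i]) for row in body1]
        b = [float(row[i]) for row in body2]
        t, p = stats.ttest_rel(a, b)  # paired: same lectures in both systems
        pvalues.append(p)
    return pvalues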