def solvegraph_leave_one_lecture_out(phrasedir, modelname='svr'):
    """Run OSLOM community detection on each lecture/prompt similarity graph.

    phrasedir -- root directory with one sub-directory per lecture.
    modelname -- similarity model whose graph file is solved; 'svr' graphs
                 carry edge weights, 'svm' and 'lsa' graphs do not.

    Side effect: calls oslom.solve_graph on every
    "<q>.<method>.<modelname><net_exe>" file under each lecture directory.
    """
    lectures = annotation.Lectures
    oslom = OSLOM(oslom_parms)

    # Only the SVR model produces weighted graphs; the former if/elif/else
    # had two identical branches for 'svm' and 'lsa'.  All graphs are
    # treated as undirected.
    weighted = modelname not in ('svm', 'lsa')
    undirect = True

    for lec in lectures:
        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            # The phrase list was previously loaded here but never used;
            # the dead file read has been removed.
            netgraphfile = os.path.join(
                path, "%s.%s.%s%s" % (q, method, modelname, net_exe))
            oslom.solve_graph(netgraphfile, undirect, weighted)
def extractPhrasePaireFeature(phrasedir): for lec in annotation.Lectures: path = phrasedir + str(lec) + '/' fio.NewPath(path) for prompt in ['q1', 'q2']: prefix = os.path.join(path, '%s.%s.' % (prompt, method)) filename = path + prompt + sim_exe print filename featureset = [] feature_extractor = Similarity(prefix) phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method)) phrases = fio.LoadList(phrasefile) for p1 in phrases: for p2 in phrases: featureset.append( (feature_extractor.get_features(p1, p2), 0.0, { 'p1': p1, 'p2': p2 })) fio.SaveDict2Json(featureset, filename) feature_extractor.save()
def __init__(self, key_prefix, sum_prefix, N):
    '''
    Load the phrase/summary color state for one lecture-prompt.

    key_prefix -- path prefix for the phrase key and color files.
    sum_prefix -- path prefix for the summary, summary-count and
                  per-annotator reference files.
    N is number of annotators
    '''
    self.key_prefix = key_prefix
    self.sum_prefix = sum_prefix
    # number of human annotators; one reference color map is loaded per annotator
    self.N = N

    #load phrase color map
    phrasefile = key_prefix + phrase_exe
    phrases = fio.LoadList(phrasefile)

    colorfile = key_prefix + color_exe
    color_map = fio.LoadDictJson(colorfile)

    # map each phrase to its color(s); phrases and color_map are combined
    # positionally by the helper
    phrase_color_map = self.combine_phrase_color(phrases, color_map)

    #get phrase summary color map
    sumfile = sum_prefix + sum_exe
    summaries = fio.LoadList(sumfile)
    self.summary_color = self.get_summary_color(summaries, phrase_color_map)

    #get summary count
    sumcountfile = sum_prefix + sum_count_exe
    self.summary_no = [int(x) for x in fio.LoadList(sumcountfile)]

    # each summary must have a matching count entry
    # NOTE(review): assert is stripped under -O; consider raising instead
    assert(len(self.summary_color) == len(self.summary_no))

    #load human_summary color map
    # ref_color[i] maps color -> count for annotator i, read from the
    # paired "<ref_exe>.<i>.color" / "<ref_exe>.<i>.no" files
    self.ref_color = []
    for i in range(N):
        d = {}
        ref_sumcolor_file = '%s%s.%d.color'%(sum_prefix, ref_exe, i)
        ref_sumno_file = '%s%s.%d.no'%(sum_prefix, ref_exe, i)

        for color, no in zip(fio.LoadList(ref_sumcolor_file), fio.LoadList(ref_sumno_file)):
            d[int(color)] = int(no)
        self.ref_color.append(d)
def writegraph_leave_one_lecture_out_lsa(model_dir, phrasedir, modelname='lsa'):
    """Write an unweighted, undirected phrase-similarity graph per lecture/prompt.

    Reads the precomputed pairwise similarity matrix for modelname ('lsa'
    reads the optimumComparerLSATasa file, 'svm' the .svm file) and emits
    one edge row [i, j] for every phrase pair with non-zero similarity to
    "<q>.<method>.<modelname><net_exe>".

    model_dir is unused but kept for signature compatibility with the
    sibling writegraph_* functions.

    Raises ValueError for an unsupported modelname (previously this fell
    through and crashed later with a NameError).
    """
    lectures = annotation.Lectures

    for lec in lectures:
        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            phrasefile = os.path.join(path, "%s.%s.key" % (q, method))
            phrases = fio.LoadList(phrasefile)

            if modelname == 'lsa':
                similarties_results = os.path.join(
                    path, "%s.%s.optimumComparerLSATasa" % (q, method))
            elif modelname == 'svm':
                similarties_results = os.path.join(path, "%s.%s.svm" % (q, method))
            else:
                raise ValueError("unsupported modelname: %s" % modelname)

            simhead, simbody = fio.ReadMatrix(similarties_results, hasHead=True)
            # the matrix must be phrase-by-phrase
            assert (len(simhead) == len(phrases))

            body = []
            for i, p1 in enumerate(phrases):
                for j, p2 in enumerate(phrases):
                    if j <= i:
                        continue  # undirected graph: upper triangle only

                    # matrix cells are strings; the literal 'NaN' means no edge
                    score = simbody[i][j]
                    score = float(score) if score != 'NaN' else 0.0
                    if score == 0.0:
                        continue  # zero similarity -> no edge

                    body.append([i, j])

            output = os.path.join(
                path, "%s.%s.%s%s" % (q, method, modelname, net_exe))
            fio.WriteMatrix(output, body)
def predict_IE256(train_course, model_dir, phrasedir, modelname='svm'): sim_extractor = Similarity() allfeatures = sorted(sim_extractor.features.keys()) features = allfeatures name = '_'.join(features) lectures = annotation.Lectures for i, lec in enumerate(lectures): test = [lec] print test model_file = os.path.join(model_dir, '%s_%s.model' % (train_course, name)) with open(model_file, 'rb') as handle: clf = pickle.load(handle) path = os.path.join(phrasedir, str(lec)) for q in ['q1', 'q2']: test_X, test_Y = combine_files_test(phrasedir, test, features, prompts=[q]) predict_Y = clf.predict(test_X) #write the output phrasefile = os.path.join(path, "%s.%s.key" % (q, method)) phrases = fio.LoadList(phrasefile) assert (len(predict_Y) == len(phrases) * len(phrases)) k = 0 body = [] for p1 in phrases: row = [] for p2 in phrases: row.append(predict_Y[k]) k += 1 body.append(row) output = os.path.join(path, "%s.%s.%s" % (q, method, modelname)) fio.WriteMatrix(output, body, phrases)
def readgraph_leave_one_lecture_out(phrasedir, modelname='svr'): lectures = annotation.Lectures oslom = OSLOM() if modelname == 'svr': weighted = True undirect = True else: weighted = False undirect = True for i, lec in enumerate(lectures): path = os.path.join(phrasedir, str(lec)) for q in ['q1', 'q2']: #write the output phrasefile = os.path.join(path, "%s.%s.key" % (q, method)) phrases = fio.LoadList(phrasefile) netgraphfile = os.path.join( path, "%s.%s.%s%s_oslo_files" % (q, method, modelname, net_exe), 'tp') if not fio.IsExist(netgraphfile): #no communities print netgraphfile communites = [[x] for x in range(len(phrases))] else: communites = oslom.readgraph_partitions(netgraphfile) #if len(communites) == 1:#break it # communites = [[x] for x in range(len(phrases))] name = 'ct.%s.%s' % (modelname, 'default') output = os.path.join( path, "%s.cluster.kmedoids.sqrt.%s.%s" % (q, name, method)) write_communite_to_clusters(communites, phrases, output) print "%d\t%s\t%d" % (lec, q, len(communites))
def writegraph_leave_one_lecture_out(model_dir, phrasedir, modelname='svr', traincourse=None): from sklearn import svm from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, accuracy_score import QPS_simlearning sim_extractor = Similarity() allfeatures = sorted(sim_extractor.features.keys()) features = allfeatures name = '_'.join(features) lectures = annotation.Lectures for i, lec in enumerate(lectures): test = [lec] print test model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name)) #model_file = os.path.join(model_dir, '%s_%s.model'%('IE256_2016', name)) with open(model_file, 'rb') as handle: clf = pickle.load(handle) path = os.path.join(phrasedir, str(lec)) for q in ['q1', 'q2']: test_X, test_Y = QPS_simlearning.combine_files_test(phrasedir, test, features, prompts=[q]) predict_Y = clf.predict(test_X) #write the output phrasefile = os.path.join(path, "%s.%s.key" % (q, method)) phrases = fio.LoadList(phrasefile) assert (len(predict_Y) == len(phrases) * len(phrases)) k = 0 body = [] for i, p1 in enumerate(phrases): for j, p2 in enumerate(phrases): if j <= i: k += 1 continue #undirect graph if modelname == 'svm': if predict_Y[k] == 1.0: #row = [i,j, '%.1f'%predict_Y[k]] row = [i, j] body.append(row) else: row = [i, j, '%.2f' % predict_Y[k]] body.append(row) k += 1 output = os.path.join( path, "%s.%s.%s%s" % (q, method, modelname, net_exe)) fio.WriteMatrix(output, body)