def WriteCluster(excelfile, folder, np=None):
    sheets = range(0, maxWeek)

    for type in ['q1', 'q2', 'q3', 'q4']:
        for sheet in sheets:
            week = sheet + 1

            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=True)
            if len(student_summaryList) == 0:
                continue

            path = folder + str(week) + '/'
            fio.NewPath(path)

            path = path + type + '/'
            fio.NewPath(path)
            filename = path + type + '.cluster'

            # create an XML file
            root = ET.Element(tag='CLUSTER', attrib={'LANG': "ENG"})
            root.tail = '\n'
            tree = ET.ElementTree(root)

            DID = str(sheet + 1) + '_' + type

            node = ET.Element(tag='D', attrib={'DID': str(DID)})
            node.tail = '\n'
            root.append(node)

            tree.write(filename)
def WriteDocsent(excelfile, folder, phrasedir, np=None):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            phrasefile = os.path.join(phrasedir, str(week),
                                      type + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue

            print phrasefile

            DID = str(week) + '_' + type

            path = folder + str(week) + '/'
            fio.NewPath(path)
            path = path + type + '/'
            fio.NewPath(path)
            path = path + 'docsent/'
            fio.NewPath(path)
            filename = path + DID + '.docsent'

            # create an XML file
            root = ET.Element(tag='DOCSENT', attrib={'DID': DID, 'LANG': "ENG"})
            root.tail = '\n'
            tree = ET.ElementTree(root)

            phrases = fio.ReadFileUTF8(phrasefile)

            sno_id = 1
            for par, phrase in enumerate(phrases):
                phrase = phrase.rstrip()
                s = [phrase]

                for RSNT, value in enumerate(s):
                    node = ET.Element(tag='S',
                                      attrib={'PAR': str(par + 1),
                                              'RSNT': str(RSNT + 1),
                                              'SNO': str(sno_id)})
                    node.text = value
                    node.tail = '\n'
                    root.append(node)
                    sno_id = sno_id + 1

            tree.write(filename)
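# Usage sketch (hypothetical paths; assumes the module-level globals `course`,
# `maxWeek`, and `method` are set as in the __main__ blocks below): produce the
# MEAD-style inputs, one cluster file plus one docsent file per week and prompt.
#
#   excelfile = '../data/CourseMIRROR/reflections.json'
#   meaddir = '../data/IE256/QPS_combine/PhraseMead/'
#   phrasedir = '../data/IE256/QPS_combine/phrase/'
#   WriteCluster(excelfile, meaddir)
#   WriteDocsent(excelfile, meaddir, phrasedir)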
def test_cross_course(train, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        test = [lec]

        model_file = os.path.join(model_dir, '%s.model' % train)

        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            print "Model is not available"

        for q in ['q1', 'q2']:
            test_filename = os.path.join(feature_cv_dir,
                                         'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            if method == 'combine':
                test_filename_old = test_filename.replace('_combine', '_A1')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:
                combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug:
            break

    file_util.save_dict2json(dict, class_index_dict_file)
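# Usage sketch ('IE256' is a course name the code below already knows about):
# pair this with train_on_course() further below, which trains the single
# cross-course model this function loads for every lecture.
#
#   train_on_course('IE256', name='all')
#   test_cross_course('IE256', name='all')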
def train_leave_one_lecture_out_NP(name='cv'):
    feature_dir = '../data/%s/%s/extraction/' % (course, system)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        for q in ['q1', 'q2']:
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            combine_files(feature_dir, test, output_file, prompts=[q])

        if debug:
            break

    file_util.save_dict2json(dict, class_index_dict_file)
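# Note (a reading of the code above, not a documented contract): unlike
# train_leave_one_lecture_out() below, this NP baseline never calls wapiti;
# combine_files() writes the held-out lecture's feature rows straight to the
# .out files, so the tags already present in the feature files are what the
# downstream evaluation reads.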
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'

            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(
                    crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(
                    tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):
                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
def extractPhrase(excelfile, folder, sennadatadir, method):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            #for type in ['POI', 'MP']:
            print excelfile, sheet, type

            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=False)
            if len(student_summaryList) == 0:
                continue

            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.' + method + '.key'

            sennafile = sennadatadir + "senna." + str(week) + "." + type + '.output'
            if not fio.IsExist(sennafile):
                continue

            phrases = getKeyPhrases(student_summaryList, sennafile,
                                    method=method, MalformedFlilter=True)

            fio.SaveList(phrases, filename)
def extractPhrasePaireFeature(phrasedir):
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))
            phrases = fio.LoadList(phrasefile)

            for p1 in phrases:
                for p2 in phrases:
                    featureset.append((feature_extractor.get_features(p1, p2),
                                       0.0, {'p1': p1, 'p2': p2}))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
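# Side note (illustration only, no project dependencies): the nested loop
# above enumerates ordered pairs, so for n phrases it emits n * n feature
# rows, including self-pairs (p, p) and both orders (a, b) and (b, a):
#
#   >>> phrases = ['binary search', 'hash table']
#   >>> [(p1, p2) for p1 in phrases for p2 in phrases]
#   [('binary search', 'binary search'), ('binary search', 'hash table'),
#    ('hash table', 'binary search'), ('hash table', 'hash table')]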
def correlation_analysis(course):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []
    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):
        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []
                    for fname in features:
                        x = fdict[fname]
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)

                    predict_score = clf.predict([row])

                    row.append(score)
                    row.append(predict_score[0])

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')

    print out_correlation

    fio.WriteMatrix(out_correlation, body, head)
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        print doc

        # load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            # pairs within the same rank are positive examples (score 1.0);
            # pairs across different ranks are negative (score 0.0)
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()
                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1, p2),
                                 score, {'p1': p1, 'p2': p2}))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
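# Labeling illustration (hypothetical annotation data): with two ranks, only
# same-rank pairs score 1.0, so the positives sit on the diagonal blocks.
#
#   phrase_annotation = {'1': [{'phrase': 'Recursion'}],
#                        '2': [{'phrase': 'Stacks'}]}
#   ('recursion', 'recursion') -> 1.0
#   ('stacks', 'stacks')       -> 1.0
#   ('recursion', 'stacks')    -> 0.0   (and the reverse pair likewise)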
def PrepareIE256():
    cid = "IE256"
    maxWeek = 25

    excelfile = "../data/CourseMirror/Reflection.json"
    sennadir = "../../AbstractPhraseSummarization/data/IE256/senna/"
    #fio.NewPath(sennadir)
    #getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)

    outdirs = [
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Baseline_Sentence/',
        #'../../AbstractPhraseSummarization/data/IE256/MC/',
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_MC/',
        '../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_Supervised_FeatureWeightingAveragePerceptron/',
    ]

    sheets = range(1, maxWeek + 1)

    for outdir in outdirs:
        for sheet in sheets:
            week = sheet

            for type in ['q1', 'q2', 'q3', 'q4']:
                student_summaryList = getStudentResponseList(
                    excelfile, cid, week, type, True)
                if len(student_summaryList) == 0:
                    continue

                path = os.path.join(outdir, str(week))
                fio.NewPath(path)

                source = {}
                responses = []
                count = defaultdict(int)
                for response, student in student_summaryList:
                    responses.append(response)
                    count[response] += 1

                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
def train_on_course(traincourse, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (traincourse, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (traincourse, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    if traincourse == 'IE256':
        lectures = [x for x in range(14, 26) if x != 22]
    else:
        lectures = [x for x in range(3, 27)]

    dict = defaultdict(int)

    train = [x for x in lectures]

    train_filename = os.path.join(feature_cv_dir, 'train.feature.crf')
    model_file = os.path.join(model_dir, '%s.model' % traincourse)

    print train_filename
    print model_file

    crf = CRF(wapiti_home)
    if not fio.IsExist(model_file):
    #if True:
        combine_files(feature_dir, train, train_filename)
        crf.train(train_filename, pattern_file, model_file)
def correlation_analysis_noduplicate():
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score']
    body = []
    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        for q in ['q1', 'q2']:
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, pd in data:
                    # skip self-pairs
                    if pd['p1'] == pd['p2']:
                        print pd['p1']
                        continue

                    row = []
                    for name in features:
                        x = fdict[name]
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)
                    row.append(score)

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    fio.WriteMatrix(out_correlation, body, head)
def get_phrase_reference_summary_phrase(outputs=None):
    for output in outputs:
        fio.NewPath(output)

        counts = []
        for doc, lec, annotator in annotation.generate_all_files(
                annotation.datadir + 'json/', '.json',
                anotators=annotation.anotators, lectures=annotation.Lectures):
            print doc

            task = annotation.Task()
            task.loadjson(doc)

            sub_tasks = task.get_tasks()
            for sub_task in sub_tasks:
                if sub_task["task_name"] == "Phrase":
                    if sub_task['prompt'] == 0:  # POI
                        type = 'q1'
                    else:
                        type = 'q2'

                    summary_filename = os.path.join(
                        output, str(lec),
                        type + '.ref.' + str(annotation.anotator_dict[annotator]))
                    #summary_filename = os.path.join(output, str(lec), type+'.ref.summary')
                    print summary_filename

                    summaries = [row[1] for row in sub_task["summary"][1:]]
                    colors = [row[0].strip()[1] for row in sub_task["summary"][1:]]
                    student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]

                    count = 0
                    for summary in summaries:
                        count += len(NLTKWrapper.wordtokenizer(summary))
                    counts.append(count)

                    fio.SaveList(summaries, summary_filename)

                    color_filename = os.path.join(
                        output, str(lec),
                        '%s.ref.%s.color' % (type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(colors, color_filename)

                    no_filename = os.path.join(
                        output, str(lec),
                        '%s.ref.%s.no' % (type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(student_numbers, no_filename)

        print counts
        print numpy.mean(counts)
        print numpy.median(counts)
def extractPhraseFromCRF(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'

            phrases = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags in crf_reader.read_file_generator(crf_file):
                for phrase in aligner.get_phrase(tokens, tags):
                    phrases.append(phrase.lower())

            fio.SaveList(phrases, filename)
def extractPhraseFromAnnotationIntersect(phrasedir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        # load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            print filename

            extracted_phrases = []

            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()
            extracted_phrases = aligner.get_intersect()

            fio.SaveList(extracted_phrases, filename)
    fio.WriteMatrix(output, allbody, allhead)


def check_stopword():
    from CourseMirror_Survey import stopwords

    vocab = fio.LoadDictJson(global_params.vocab)
    for word, count in vocab.items():
        if count < 5:
            continue

        if word in stopwords:
            print word, '\t', count


if __name__ == '__main__':
    course = global_params.g_cid

    for system, method in [
        ('QPS_combine', 'crf'),
    ]:
        phrasedir = "../data/" + course + "/" + system + "/phrase/"

        # extractPhrasePaireFeature(phrasedir)

        model_dir = "../data/" + course + "/simlearning/svm"
        fio.NewPath(model_dir)

        train_leave_one_lecture_out_svm(model_dir)
        predict_leave_one_lecture_out(model_dir, phrasedir, modelname='svm')
                if response not in source:
                    source[response] = []
                source[response].append(student)

            output = os.path.join(path, type + ".sentence.key")
            fio.SaveList(set(responses), output)

            output = os.path.join(path, type + '.sentence.keys.source')
            fio.SaveDict2Json(source, output)

            output = os.path.join(path, type + '.sentence.dict')
            fio.SaveDict(count, output)

            # write human summary


if __name__ == '__main__':
    #PrepareIE256()
    #exit(0)

    cid = sys.argv[1]
    maxWeek = int(sys.argv[2])

    # cid = 'CS0445'
    # maxWeek = 28

    excelfile = "../data/CourseMirror/reflections.json"
    annotation_dir = "../data/Annotation/" + cid + '/'

    sennadir = "../data/" + cid + "/senna/"
    fio.NewPath(sennadir)

    getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)
    fio.WriteMatrix(datadir + "summary.txt", body, head)


if __name__ == '__main__':
    course = sys.argv[1]
    maxWeek = int(sys.argv[2])
    system = sys.argv[3]
    method = sys.argv[4]
    similarity = sys.argv[5]
    K = int(sys.argv[6])

    excelfile = "../data/CourseMIRROR/reflections.json"

    clusterdir = "../data/" + course + "/" + system + "/phrase/"
    fio.NewPath(clusterdir)

    datadir = "../data/" + course + "/" + system + "/PhraseMead/"
    GetLexRankScore(datadir, method, clusterdir)

    for ratio in [K]:
        for lex in ['lexrankmax']:
            datadir = "../data/" + course + "/" + system + '/ClusterARank/'
            #fio.DeleteFolder(datadir)

            ShallowSummary(excelfile, datadir, clusterdir, K=5, method=method,
                           similarity=similarity, ratio=ratio, lex=lex)
            #PrintClusterRankSummary(datadir)

    print "done"
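# Example invocation of the __main__ block above (hypothetical script name
# and argument values):
#
#   python phrase_mead_pipeline.py IE256 25 QPS_combine crf cosine 5
#
# where the positional arguments are course, maxWeek, system, method,
# similarity, and K.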
def GetLexRankScore(datadir, np, outputdir):
    sheets = range(0, maxWeek)

    for type in ['q1', 'q2', 'q3', 'q4']:
        for sheet in sheets:
            week = sheet + 1
            DID = str(week) + '_' + type

            phrases = []
            scores = []

            # read the docsent file to recover the phrases
            path = datadir + str(week) + '/'
            path = path + type + '/'
            path = path + 'docsent/'
            filename = path + DID + '.docsent'
            #print filename
            if not fio.IsExist(filename):
                continue

            tree = ET.parse(filename)
            root = tree.getroot()

            for child in root:
                phrases.append(child.text)

            # read the LexRank feature file to recover the scores
            path = datadir + str(week) + '/'
            path = path + type + '/'
            path = path + 'feature/'
            filename = path + type + '.LexRank.sentfeature'

            if fio.IsExist(filename):
                tree = ET.parse(filename)
                root = tree.getroot()

                for child in root:
                    feature = child[0]
                    #print feature.tag, feature.attrib, feature.attrib['V']
                    scores.append(feature.attrib['V'])
            else:
                for phrase in phrases:
                    scores.append("0")

            # write the scores out
            assert (len(phrases) == len(scores))

            dict = {}
            for phrase, score in zip(phrases, scores):
                dict[phrase.lower()] = score
            output = outputdir + str(week) + '/' + str(type) + "." + np + ".lexrank.dict"
            fio.NewPath(outputdir + str(week) + '/')
            fio.SaveDict(dict, output, SortbyValueflag=True)

            # keep the highest score per duplicated phrase; note the scores
            # are still strings here, so max() compares them lexicographically
            dict = {}
            for phrase, score in zip(phrases, scores):
                if phrase.lower() in dict:
                    dict[phrase.lower()] = max(score, dict[phrase.lower()])
                else:
                    dict[phrase.lower()] = score
            output = outputdir + str(week) + '/' + str(type) + "." + np + ".lexrankmax.dict"
            fio.SaveDict(dict, output, SortbyValueflag=True)
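# A minimal standalone variant of the "lexrankmax" aggregation above
# (illustration only; the helper name is hypothetical, and unlike the loop
# above it compares scores numerically rather than as strings):
def lexrankmax_scores(phrases, scores):
    # keep the highest score seen for each lowercased phrase
    best = {}
    for phrase, score in zip(phrases, scores):
        key = phrase.lower()
        value = float(score)
        if key not in best or value > best[key]:
            best[key] = value
    return best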
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None,
                      similarity=None, ratio=None, lex='lexrank'):
    # K caps the number of phrases per prompt (the original per-word budget
    # check is commented out below)
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary' % ratio

            # produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue

            print excelfile, sheet, type

            cluster_output = clusterdir + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + similarity + '.' + method
            print cluster_output

            weightfile = clusterdir + str(week) + '/' + type + '.' + method + '.' + similarity
            print weightfile

            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(
                    phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output):
                continue

            body = fio.ReadMatrix(cluster_output, False)

            NPCandidates = fio.ReadFile(phrasefile)

            lexfile = clusterdir + str(week) + '/' + str(type) + "." + method + "." + lex + ".dict"
            lexdict = fio.LoadDict(lexfile, 'float')

            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]

            #assert(NPCandidates == NPs)
            if NPCandidates != NPs:
                print NPCandidates
                print NPs

            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])

            Summary = []

            # sort the clusters according to the number of responses
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)

            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(
                    NPs, clusterids, int(key), lexdict)
                if phrase in Summary:
                    continue

                word_count = len(phrase.split())
                total_word = total_word + word_count

                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)

            fio.SaveList(Summary, filename)
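# Usage sketch (hypothetical paths and similarity name; mirrors the __main__
# block above, which goes through ShallowSummary): cap each week/prompt
# summary at K phrases.
#
#   getShallowSummary(excelfile,
#                     '../data/IE256/QPS_combine/ClusterARank/',
#                     '../data/IE256/QPS_combine/phrase/',
#                     K=5, method='crf', similarity='cosine',
#                     ratio=5, lex='lexrankmax')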
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotator, lectures=annotation.Lectures):
        print doc

        # load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        # add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec), '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)

            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(rank_summary, rank_phrases, Cache)
                    print max_summary

                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)

            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

            with open(cachefile, 'w') as outfile:
                json.dump(Cache, outfile, indent=2)
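# Usage sketch (hedged: the annotator argument is assumed to be a slice of
# annotation.anotators, matching how the truncated 'annotator1' branch of the
# __main__ block below appears to be set up):
#
#   extractPhraseFromAnnotation(phrasedir, annotation.anotators[:1],
#                               summarydir=summarydir)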
if __name__ == '__main__':
    # course = global_params.g_cid
    # output = "../data/"+course + '/length.txt'
    # compare_length(annotation.anotators[:1], output)
    # exit(-1)

    course = sys.argv[1]
    maxWeek = int(sys.argv[2])
    system = sys.argv[3]
    method = sys.argv[4]

    sennadir = "../data/" + course + "/senna/"

    excelfile = "../data/CourseMIRROR/reflections.json"

    systemdir = "../data/" + course + "/" + system + "/"
    fio.NewPath(systemdir)

    phrasedir = "../data/" + course + "/" + system + "/phrase/"
    fio.NewPath(phrasedir)

    summarydir = "../data/" + course + "/" + system + "/ClusterARank/"
    if summarydir:
        fio.NewPath(summarydir)

    # output = "../data/"+course + '/statistics.txt'
    # extractStatistics(annotation.anotators[:1], output)
    # exit(-1)

    if method == 'syntax':
        extractPhrase(excelfile, phrasedir, sennadir, method=method)
    elif method == 'annotator1':
def extractPhraseFromSyntax(extractiondir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        # if lec != 11: continue

        # load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                tags = d['tags'][0]
                colors = d['colors']

                # drop empty tokens
                n_tokens = []
                n_tags = []
                for token, tag in zip(tokens, tags):
                    if len(token) == 0:
                        continue
                    n_tokens.append(token)
                    n_tags.append(tag)

                if len(n_tokens) == 0:
                    continue

                tokens = n_tokens
                tags = n_tags

                body = []

                words = tokens
                N = len(tokens)

                # first column: the word token
                for word in words:
                    row = []
                    row.append(word)
                    body.append(row)

                # add the color columns
                for color in colors:
                    for i, tag in enumerate(tags):
                        body[i].append(str(color[i]))

                # then the annotation tags
                tags = [tag for tag in tags]
                for i, tag in enumerate(tags):
                    body[i].append(tag)

                # extract the NP tags
                psg_tags = getSennaPSGtags(tokens)
                for i, tag in enumerate(psg_tags):
                    body[i].append(tag)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()

        if debug:
            break
#exit(-1)

debug = False

course = sys.argv[1]
maxWeek = int(sys.argv[2])
system = sys.argv[3]
method = sys.argv[4]
empty = sys.argv[5]

excelfile = "../data/CourseMIRROR/reflections.json"

extractVocab(annotation.anotators, '../data/%s/vocab.json' % course)

extractiondir = "../data/" + course + "/" + system + "/extraction/"
fio.NewPath(extractiondir)

class_index_dict_file = '../data/%s/class_dict.json' % course

if method == 'NP':
    extractPhraseFromSyntax(extractiondir, annotation.anotators)
    train_leave_one_lecture_out_NP('all')
elif method == 'annotator1':
    extractPhraseFeatureFromAnnotation(extractiondir, annotation.anotators, 0, empty)
elif method == 'annotator2':
    extractPhraseFeatureFromAnnotation(extractiondir, annotation.anotators, 1, empty)
elif method == 'union':
    extractPhraseFeatureFromUnion(extractiondir, annotation.anotators,
def extractPhraseFeatureFromCombine(extractiondir, annotators, empty='N'):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #print lec
        #if lec != 17: continue

        # load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            #if prompt != 'q2': continue

            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            crf_feature_extractor = CRF_Extractor()

            # add sentences to the extractor for global feature extraction;
            # if the two annotators disagree, emit one copy per tag sequence
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                colors = d['colors']

                if d['tags'][0] == d['tags'][1]:
                    combinetags = [d['tags'][0]]
                else:
                    combinetags = [d['tags'][0], d['tags'][1]]

                for tags in combinetags:
                    n_tokens = []
                    n_tags = []
                    for token, tag in zip(tokens, tags):
                        if len(token) == 0:
                            continue
                        n_tokens.append(token)
                        n_tags.append(tag)

                    if len(n_tokens) == 0:
                        continue

                    tokens = n_tokens
                    tags = n_tags

                    crf_feature_extractor.add_sentence((tokens, tags, colors))

            for tokens, tags, colors in crf_feature_extractor.sentences:
                if empty == 'Y':
                    # skip sentences whose tags are all 'O'
                    flag = True
                    for tag in tags:
                        if tag != 'O':
                            flag = False
                    if flag:
                        continue

                body = crf_feature_extractor.extract_crf_features(tokens, tags, prompt, colors)
                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()
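# Illustration of the tag-combination rule above (hypothetical tag sequences):
# when the two annotators agree, a response contributes one training sequence;
# when they disagree, it contributes one sequence per annotator.
#
#   d['tags'] == [['O', 'B'], ['O', 'B']]  ->  combinetags = [['O', 'B']]
#   d['tags'] == [['O', 'B'], ['B', 'I']]  ->  combinetags = [['O', 'B'], ['B', 'I']]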
def train_leave_one_lecture_out(name='cv'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        train_filename = os.path.join(feature_cv_dir, 'train_%d.feature.crf' % i)
        model_file = os.path.join(model_dir, '%d.model' % i)

        print train_filename
        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
        #if True:
            combine_files(feature_dir, train, train_filename)
            crf.train(train_filename, pattern_file, model_file)

        for q in ['q1', 'q2']:
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            if empty == 'Y':
                test_filename_old = test_filename.replace('_Y', '_N')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:
                if method == 'combine':
                    test_filename_old = test_filename.replace('_combine', '_A1')
                    cmd = 'cp %s %s' % (test_filename_old, test_filename)
                    os.system(cmd)
                else:
                    combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug:
            break

    file_util.save_dict2json(dict, class_index_dict_file)
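# Usage sketch: name='all' makes the fold outputs land in all_output/, which
# is the directory extractPhraseFromCRF() earlier in this file reads from.
#
#   train_leave_one_lecture_out(name='all')
#   extractPhraseFromCRF(phrasedir, systemdir)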