def trainingAndTestingFloor1():
    training = readSession(
        TKLIB_HOME + "/nlp/data/Direction understanding subjects Floor 1 (Final).ods",
        'stefie10')
    testing = readSession(
        TKLIB_HOME + "/nlp/data/Direction understanding subjects Floor 8 (Final).ods",
        'stefie10')
    return training, testing
def main():
    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    #fname = "data/Direction understanding subjects Floor 1 (Final).ods"
    sessions = readSession(fname, "crf_chunker")
    sent_num = 0
    doc = Document()
    routeInstructionsXml = doc.createElement("routeInstructions")
    for session in sessions:
        sessionXml = doc.createElement("session")
        sessionXml.setAttribute("subject", session.subject)
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            instructionXml = doc.createElement("instruction")
            instructionXml.setAttribute("instruction_id", `sent_num`)

            textXml = doc.createElement("text")
            textXml.appendChild(doc.createTextNode(instruction))
            instructionXml.appendChild(textXml)

            sdcXml = doc.createElement("sdcs")
            for annotation in session.routeAnnotations[instructionIdx]:
                sdcXml.appendChild(annotation.toXml(doc))
            instructionXml.appendChild(sdcXml)

            sent_num += 1
            sessionXml.appendChild(instructionXml)
        routeInstructionsXml.appendChild(sessionXml)
    doc.appendChild(routeInstructionsXml)

    outfile = open("out.xml", "w")
    outfile.write(doc.toprettyxml())
    outfile.close()
def testChildrenMap(self):
    fname = "data/Direction understanding subjects Floor 1 (Final).ods"
    sessions = readSession(fname, "stefie10-d1-hierarchical")
    for session in sessions:
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            sdcs = session.routeAnnotations[instructionIdx]
            sdc_to_children = childrenMap(sdcs)
            for parent, children in sdc_to_children.iteritems():
                for child in children:
                    self.assertEqual(parent.contains(child), True)
            print sdc_to_children

            for sdc in sdcs:
                count = 0
                for parent, children in sdc_to_children.iteritems():
                    if sdc in children:
                        count += 1
                print sdc, count
                self.assertTrue(count <= 1)

            print
            print "printing map"
            sdc_to_children = childrenMap(sdcs)
            for parent in sdcs:
                children = sdc_to_children[parent]
                print parent, len(children)
                for child in children:
                    print "  ", child
def __init__(self, fname):
    QMainWindow.__init__(self)
    self.setupUi(self)

    self.sdcAnnotator = sdcAnnotator.MainWindow()
    self.sdcAnnotator.show()

    self.connect(self.actionSave, SIGNAL("triggered()"), self.save)
    self.activeData = None

    #self.sessions = readSession(fname, "dlaude")
    self.sessions = readSession(fname, "stefie10-d1-hierarchical")
    #self.sessions = readSession(fname, "tkollar")
    #self.sessions = readSession(fname, "regexp_chunker")
    #self.sessions = readSession(fname, "crf_chunker")
    self.setWindowTitle(self.sessions.annotationSource)

    self.instructionModel = instructionTableModel.Model(
        self.instructionTable, self.sessions)
    self.instructionTable.verticalHeader().hide()
    self.instructionTable.horizontalHeader().hide()
    self.instructionTable.horizontalHeader().setStretchLastSection(True)
    self.connect(
        self.instructionTable.selectionModel(),
        SIGNAL("selectionChanged ( QItemSelection, QItemSelection )"),
        self.selectInstruction)
    self.instructionTable.selectRow(0)
def chunkInstructions(fname):
    chunker = Chunker()
    sentenceTokenizer = SentenceTokenizer()
    sessions = readSession(fname, "regexp_chunker")
    for session in sessions:
        session.clearAnnotations()
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            for sentenceStandoff in sentenceTokenizer.tokenize(
                    session, instructionIdx):
                offset = sentenceStandoff.start
                print "instruction", instruction
                indexes, tokens, chunks = chunker.chunk(sentenceStandoff.text)
                print "# of chunks", len(chunks)
                for i, c in enumerate(chunks.leaves()):
                    str, tag = c
                    range = Standoff(
                        session, instructionIdx,
                        (indexes[i] + offset, indexes[i] + len(str) + offset))
                    chunks[chunks.leaf_treeposition(i)] = (str, tag, range)
                semantics = addChunks(chunks, session, instructionIdx)
        session.saveAnnotations()
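# A minimal usage sketch, not part of the original module: it assumes the
# corpus spreadsheets used elsewhere in this file, and that chunkInstructions
# overwrites the "regexp_chunker" annotation set in place as the code above
# suggests.
def exampleChunkAndReload():
    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    chunkInstructions(fname)
    # read the freshly saved annotations back in and report how many
    # chunks were produced for each instruction
    for session in readSession(fname, "regexp_chunker"):
        for instructionIdx, instruction in enumerate(session.routeInstructions):
            print instruction, len(session.routeAnnotations[instructionIdx])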
def destWriteTraining(self):
    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    sessions = readSession(fname, 'stefie10')
    chunker = crfEntityExtractor.CrfChunker("data/out.model")
    chunker.writeTraining(sessions, "training.txt")
    self.fail()
def main():
    floor8 = readSession(
        "data/Direction understanding subjects Floor 8 (Final).ods",
        "stefie10")
    floor1 = readSession(
        "data/Direction understanding subjects Floor 1 (Final).ods",
        "stefie10")
    #sessions = SessionGroup(floor8.sessions + floor1.sessions)
    sessions = floor8

    print "all words"
    orphanedWords(sessions, withStopwords=True)

    print "all non-stopword orphans"
    orphanedWords(sessions, withStopwords=False)

    print "floor 8"
    handledSdcs(floor8)

    print "all"
    sdcsWithoutLandmarks(sessions)
def trainingAndTestingSplit():
    fname = TKLIB_HOME + "/nlp/data/Direction understanding subjects Floor 8 (Final).ods"
    sessions = readSession(fname, 'stefie10')
    splitPoint = len(sessions) / 2
    training = sessions[0:splitPoint]
    testing = sessions[splitPoint:]
    return training, testing
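# A minimal usage sketch (not in the original source): it relies only on
# behavior shown above, namely that readSession results support len() and
# slicing and that each session exposes routeInstructions.
def exampleSplitSizes():
    training, testing = trainingAndTestingSplit()
    print "training sessions", len(training)
    print "testing sessions", len(testing)
    print "first training instruction", training[0].routeInstructions[0]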
def __init__(self, corpus_fn, annotations):
    self.dsession = readSession(corpus_fn, annotations)
    self.data = {}
    for session in self.dsession:
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            annotations = session.routeAnnotations[instructionIdx]
            self.data[str(instruction)] = annotations
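# A hedged lookup sketch: the constructor above indexes annotations by the raw
# instruction text, but its enclosing class is not shown here, so
# "AnnotationIndex" below is a hypothetical placeholder name.
def exampleAnnotationLookup(corpus_fn):
    index = AnnotationIndex(corpus_fn, "stefie10")  # hypothetical class name
    for instruction, annotations in index.data.iteritems():
        print instruction
        print "  # of annotations", len(annotations)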
def destOpen(self):
    session = None
    try:
        fname = "data/Direction understanding subjects Floor 1 (Final).ods"
        sessions = readSession(fname, "test")
        session = sessions[0]
        self.assertEqual(sessions[0].followed[0], "followed")
        self.assertEqual(sessions[6].followed[0], "questionable")
        self.assertEqual(sessions[3].followed[1], "not followed")
        self.assertEqual(sessions[14].followed[0], None)
        self.assertEqual(sessions[14].followed[9], None)

        instructionIdx = 0
        spatialRelation = Standoff(session, instructionIdx, (0, 1))
        session.addAnnotation(
            instructionIdx,
            Annotation(verb=None, landmark=None,
                       spatialRelation=spatialRelation))
        session.saveAnnotations()

        reloadedSessions = readSession(fname, "test")
        reloadedSession = reloadedSessions[0]
        reloadedInstruction = reloadedSession.routeAnnotations[
            instructionIdx][0]
        self.assertEqual(reloadedInstruction.__class__, Annotation)
        self.assertEqual(reloadedInstruction.spatialRelation.start, 0)
        self.assertEqual(reloadedInstruction.spatialRelation.end, 1)
        self.assertEqual(reloadedInstruction.verb.start, 0)
        self.assertEqual(reloadedInstruction.verb.end, 0)
    finally:
        if not (session is None):
            try:
                os.remove(session.annotationFname)
            except OSError:
                pass  # ignore if file doesn't exist
def destDotty(self):
    fname = "data/Direction understanding subjects Floor 1 (Final).ods"
    sessions = readSession(fname, "stefie10")
    session = sessions[0]
    instructionIdx = 0
    import dottyExporter
    dottyExporter.exportAsDotty(session, instructionIdx, "test.dot")
def chunkInstructions():
    training, testing = trainingAndTesting()
    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    sessions = readSession(fname, 'stefie10')
    # splitPoint = len(sessions) / 2
    # outputSessions = readSession(fname, 'crf_chunker')[splitPoint:]
    outputSessions = readSession(fname, 'crf_chunker')

    trainingFile = "training.txt"
    testingFile = "testing.txt"
    modelFile = "out.model"
    outputFile = "out.txt"

    chunker = CrfChunker()
    chunker.writeTraining(testing, testingFile)
    outputFile = "floor1.out.txt"
    chunker.runTesting(modelFile, testingFile, outputFile)
    cm, baselineCm = chunker.confusionMatrix(outputFile)
    print "system"
    print cm
    print "accuracy", cm.accuracy()
    print
    print "baseline (always pick landmark)"
    print baselineCm
    print "accuracy", baselineCm.accuracy()

    iChunker = SpatialDescriptionClauseExtractor()
    for session in outputSessions:
        session.clearAnnotations()
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            annotations = iChunker.chunk(
                session.routeInstructions[instructionIdx])
            for a in annotations:
                print "a", a, a.__class__
                session.addAnnotation(instructionIdx, a)
        session.saveAnnotations()
def run_dataset(self, prior_cache, corpus_fn):
    dsession = readSession(corpus_fn, "none")
    prior_cache = make_symmetric(prior_cache)
    words = set([])
    for elt_i, elt in enumerate(dsession):
        # report progress as a percentage of sessions processed
        print str(100.0 * elt_i / len(dsession)) + "% done"
        for i in range(len(elt.routeInstructions)):
            sentence = elt.routeInstructions[i]
            words.update(self.run_text(prior_cache, sentence))
    return words
def destOpenPartial(self):
    fname = "data/Direction understanding subjects Floor 1 (Final).ods"

    sessions = readSession(fname, "none", quadrant=0)
    self.assertEqual(len(sessions), 7)
    for s in sessions:
        self.assertEqual(len(s.routeInstructions), 5)

    sessions = readSession(fname, "none", quadrant=1)
    self.assertEqual(len(sessions), 7)
    for s in sessions:
        self.assertEqual(len(s.routeInstructions), 5)

    sessions = readSession(fname, "none", quadrant=2)
    self.assertEqual(len(sessions), 8)
    for s in sessions:
        self.assertEqual(len(s.routeInstructions), 5)

    sessions = readSession(fname, "none", quadrant=3)
    self.assertEqual(len(sessions), 8)
    for s in sessions:
        self.assertEqual(len(s.routeInstructions), 5)
def testAncestorMap(self):
    fname = "data/Direction understanding subjects Floor 1 (Final).ods"
    sessions = readSession(fname, "stefie10")
    for session in sessions:
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            sdcs = session.routeAnnotations[instructionIdx]
            sdc_to_ancestors = ancestorMap(sdcs)
            for child, parents in sdc_to_ancestors.iteritems():
                for parent in parents:
                    self.assertEqual(parent.contains(child), True)
def loadCorpus(corpus_fname):
    from routeDirectionCorpusReader import readSession
    tokens = corpus_fname.split("/")
    after_tokens = []
    for token in tokens:
        if len(after_tokens) > 0:
            after_tokens.append(token)
        else:
            if "slu" in token:
                after_tokens.append(token)
    corpus_fname = os.path.join(*[TKLIB_HOME] + after_tokens[1:])
    print corpus_fname
    return readSession(corpus_fname, "stefie10")
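# A minimal usage sketch (not in the original source), assuming an "slu"-rooted
# path of the kind loadCorpus rewrites against TKLIB_HOME; the argument below
# is only illustrative.
def exampleLoadCorpus():
    sessions = loadCorpus(
        "slu/nlp/data/Direction understanding subjects Floor 1 (Final).ods")
    for session in sessions:
        for instruction in session.routeInstructions:
            print session.subject, instruction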
def main():
    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    sessions = readSession(fname, "stefie10")
    session = sessions[0]
    instructionIdx = 0
    annotation = session.routeAnnotations[instructionIdx][4]
    print "range", annotation.range
    exportAsDotty(session, instructionIdx, "test.dot",
                  TextStandoff(annotation.entireText, (83, 189)))
def testSentenceTokenizer(self):
    fname = "data/Direction understanding subjects Floor 1 (Final).ods"
    sessions = readSession(fname, "stefie10")
    session = sessions[0]
    tokenizer = sentenceTokenizer.SentenceTokenizer()
    standoffs = tokenizer.tokenize(session.routeInstructions[0])
    self.assertEqual(len(standoffs), 4)
    self.assertEqual(standoffs[0].text,
                     "With your back to the glass entryways, walk toward the question mark sign.")
    # exercise the tokenizer on every instruction in the corpus
    for session in sessions:
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            standoffs = tokenizer.tokenize(instruction)
def testConverter(self):
    try:
        from routeDirectionCorpusReader import readSession
        import routeDirections
    except ImportError:
        print "skipping because no route directions module"
        return
    fname = (os.environ["SLU_HOME"] +
             "/nlp/data/Direction understanding subjects Floor 1 (Final).ods")
    sessions = readSession(fname, "stefie10-d1-hierarchical")
    for session in sessions:
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            print instruction
            sdcs = session.routeAnnotations[instructionIdx]
            esdcs = routeDirections.fromRouteDirectionSdc(sdcs)
            for esdc in esdcs:
                print esdc
def evaluateRouteInstructionCorpus(annotationName, title):
    from routeDirectionCorpusReader import readSession
    from environ_vars import SLU_HOME
    fname = ("%s/nlp/data/Direction understanding subjects Floor 1 (Final).ods"
             % SLU_HOME)
    sessions = readSession(fname, annotationName)
    i = 0
    corpus = []
    for session in sessions:
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            sdcs = session.routeAnnotations[instructionIdx]
            annotatedEsdcs = routeDirections.fromRouteDirectionSdc(sdcs)
            corpus.append((instruction, annotatedEsdcs))
            i += 1
    extractors = [("SDCs", routeDirections.FlatEsdcExtractor()),
                  ("ESDCs", stanfordParserExtractor.Extractor())]
    graphScores(extractors, corpus)
    mpl.title(title)
    mpl.savefig("esdcs.routeInstructions.%s.png" % annotationName)
def main():
    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    sessions = readSession(fname, "stefie10")
    doc = Document()
    examples = doc.createElement("examples")
    doc.appendChild(examples)
    exampleId = 0
    for session in sessions:
        for instructionIdx, sdcs in session.routeAnnotations.iteritems():
            for sdc in sdcs:
                if sdc.text.strip() != "":
                    example = doc.createElement("example")
                    example.setAttribute("id", "%d" % exampleId)
                    examples.appendChild(example)

                    nl = doc.createElement("nl")
                    nl.setAttribute("lang", "en")
                    nl.appendChild(doc.createTextNode(sdc.text))
                    example.appendChild(nl)

                    mrl = doc.createElement("mrl")
                    mrl.setAttribute(
                        "lang",
                        "edu.mit.csail.ar.language.waspParser.SdcGrammar")
                    mrl.appendChild(doc.createTextNode(exportSdcAsWasp(sdc)))
                    example.appendChild(mrl)

                    print sdc
                    exampleId += 1
    output_fname = "out.xml"
    output_file = open(output_fname, "w")
    output_file.write(doc.toprettyxml(indent="  "))
    output_file.close()
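# For reference, each <example> element written by main() above has this shape
# (the sentence and id below are placeholders, not real corpus output):
#
#   <example id="0">
#     <nl lang="en">Walk toward the question mark sign.</nl>
#     <mrl lang="edu.mit.csail.ar.language.waspParser.SdcGrammar">...</mrl>
#   </example>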
def main():
    floor8 = readSession(
        "data/Direction understanding subjects Floor 8 (Final).ods",
        "stefie10")
    floor1 = readSession(
        "data/Direction understanding subjects Floor 1 (Final).ods",
        "stefie10")
    quad = readSession(
        "data/Direction understanding subjects Floor 1 (Helicopter).ods",
        "stefie10")
    sessions = SessionGroup(floor8.sessions)
    sessions = SessionGroup(floor1.sessions + floor8.sessions)
    sessions = SessionGroup(floor1.sessions + floor8.sessions + quad.sessions)

    maxCols = 10
    leftAdjust = 0.45
    bottomAdjust = 0.24
    topAdjust = 0.85
    figsize = (5, 4)
    mpl.rcParams.update({
        'font.family': 'serif',
    })

    graphSession(sessions, "Landmarks", "landmark", maxCols=maxCols,
                 leftAdjust=leftAdjust, bottomAdjust=bottomAdjust,
                 topAdjust=topAdjust, figsize=figsize, showEntries=False)
    graphSession(sessions, "Spatial Relations", "spatialRelation",
                 maxCols=maxCols, leftAdjust=leftAdjust,
                 bottomAdjust=bottomAdjust, topAdjust=topAdjust,
                 figsize=figsize, showEntries=False)
    graphSession(sessions, "Verbs and Satellites", "verb", maxCols=maxCols,
                 leftAdjust=leftAdjust, bottomAdjust=bottomAdjust,
                 topAdjust=topAdjust, figsize=figsize, showEntries=False)
    graphSession(sessions, "Figures", "figure", maxCols=maxCols,
                 leftAdjust=leftAdjust, bottomAdjust=bottomAdjust,
                 topAdjust=topAdjust, figsize=figsize, showEntries=False)

    count_words(sessions)
    mpl.show()
from sys import argv
from routeDirectionCorpusReader import readSession
import cPickle
from tag_util import *
from du.eval_util import *
import math2d

if __name__ == "__main__":
    sentence_fn = argv[1]
    dg_cache_fn = argv[2]
    gtruth_tag_fn = argv[3]
    map_fn = argv[4]

    dsession = readSession(sentence_fn, "none")
    dg_model = cPickle.load(open(dg_cache_fn, 'r'))

    sent_num = 0
    tf = tag_file(gtruth_tag_fn, map_fn)
    topohash = get_region_to_topo_hash_containment(tf, dg_model)

    total_length = 0.0
    for elt in dsession:
        for i in range(len(elt.routeInstructions)):
            sentence = elt.routeInstructions[i]
            start_true, end_true = elt.columnLabels[i].split("to")
            start_true = start_true.strip()
            end_true = end_true.strip()
            sloc = topohash[start_true][0]
            eloc = topohash[end_true][0]
            print "sloc", sloc
            print "eloc", eloc
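# Hedged invocation sketch (not from the original source): the script expects,
# in order, the corpus spreadsheet, a pickled dg_model cache, a ground-truth
# tag file, and a map file.  The script and file names below are placeholders.
#
#   python run_eval.py \
#       "data/Direction understanding subjects Floor 8 (Final).ods" \
#       dg_model.pck ground_truth.tag floor8.cmf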
def runNWay(sessionNames):
    for groundTruth in sessionNames:
        for test in sessionNames:
            fname = "data/Direction understanding subjects Floor 8 (Final).ods"
            groundTruthSessions = readSession(fname, groundTruth)
            testSessions = readSession(fname, test)
            score = compareAnnotations(groundTruthSessions, testSessions)
            print "score", groundTruth, test, score


if __name__ == "__main__":
    runNWay(["dlaude"])
    runNWay(["stefie10", "dlaude", "crf_chunker"])

    fname = "data/Direction understanding subjects Floor 8 (Final).ods"
    groundTruthSessions = readSession(fname, "stefie10")
    testSessions = readSession(fname, "crf_chunker")
    score = compareAnnotations(groundTruthSessions, testSessions)
    print "score", score

    data = [compareAnnotations(groundTruthSessions, testSessions, [key])
            for key in Annotation.keys]
    labels = list(Annotation.abbrvKeys)

    data.append(compareAnnotations(groundTruthSessions, testSessions,
                                    ["spatialRelation", "landmark"]))
    labels.append("SR and L")

    data.append(compareAnnotations(groundTruthSessions, testSessions))
    labels.append("All")
def testHelicopter(self):
    fname = "data/Direction understanding subjects Floor 1 (Helicopter).ods"
    sessions = readSession(fname, "none")
    self.assertTrue(sessions != None)
def plot_distance_curve_random(model, corpus_fn, gtruth_tag_fn, map_fn,
                               color, marker, label='', linestyle="-",
                               region_to_topology=None):
    """
    Needs the viewpoints and stuff from the model.
    """
    print "starting random"
    dsession = readSession(corpus_fn, "none")
    if gtruth_tag_fn != None:
        tf = tag_file(gtruth_tag_fn, map_fn)
        topohash = get_region_to_topo_hash_containment(tf, model)
    else:
        topohash = region_to_topology

    Dists = []
    for elt in dsession:
        for i in range(len(elt.routeInstructions)):
            if elt.columnLabels[i] is None:
                print "sentence", i, "was", elt.columnLabels[i]
                continue
            start_true, end_true = elt.columnLabels[i].split("to")
            start_true = str(start_true.strip())
            end_true = str(end_true.strip())
            iSlocTopo = topohash[start_true][0]
            iElocTopo = topohash[end_true][0]
            eloc = model.tmap_locs[iElocTopo]

            total_dist = 0.0
            for vp in model.viewpoints:
                topo, orient = vp.split("_")
                vp_loc = model.tmap_locs[float(topo)]
                total_dist += math2d_dist(vp_loc, eloc)
            expected_dist = total_dist / len(model.viewpoints)
            Dists.append(expected_dist)

    Y = []
    X = []
    for threshold in Dists:
        #get the ones above the threshold
        #print nonzero(array(Dists) > threshold)
        #print array(Dists) > threshold
        Itrue, = nonzero(array(Dists) <= threshold)
        Y.append(len(Itrue) / (1.0 * len(Dists)))
        X.append(threshold)

    num_correct_at_threshold = len(nonzero(array(Dists) <= 10)[0])
    print "random less than 10 meters", num_correct_at_threshold,
    print "%.3f%%" % (100.0 * num_correct_at_threshold / len(Dists))

    print "sorting"
    X, I = quicksort(X)
    print "taking"
    Y = array(Y).take(I)
    print "plotting"
    if X[0] > 0.0:
        Xf = [X[0]]
        Xf.extend(X)
        Yf = [0]
        Yf.extend(Y)
        X = Xf
        Y = Yf
    p = plot_markers_evenly(X, Y, label, marker, color, linewidth=2.5,
                            linestyle=linestyle)
    xlabel('distance from destination (m)')
    ylabel('proportion correct')
    return p
def __init__(self, corpus_fn, model_fn, gtruth_tag_fn, map_fn, output_dir,
             options, evaluation_mode="specialized", num_to_run=None,
             is_sum_product=False, num_align=None, no_spatial_relations=False,
             do_exploration=False, quadrant_number=None,
             wizard_of_oz_sdcs=None, run_description=None, inference="global",
             topN_num_paths=None, num_explorations=None,
             exploration_heuristics_name=None, parameters=None):
    print "num_to_run", num_to_run
    print "options", options
    options["model_fn"] = model_fn
    options["corpus_fn"] = corpus_fn
    options["gtruth_tag_fn"] = gtruth_tag_fn

    if inference == "":
        inference = "global"
    options["inference"] = inference

    self.range_to_run = None
    if num_to_run == "":
        num_to_run = None
    elif type(num_to_run) == type("abc") and num_to_run.find(":") != -1:
        range_from = int(num_to_run.split(":")[0])
        range_to = int(num_to_run.split(":")[1])
        self.range_to_run = range(range_from, range_to)
        num_to_run = range_to
    elif num_to_run != None:
        num_to_run = int(num_to_run)
    if type(num_to_run) == type(1) and self.range_to_run == None:
        self.range_to_run = range(num_to_run)

    if self.range_to_run == None:
        #running all of them.
        if quadrant_number == None:
            self.dsession = readSession(corpus_fn, "none")
        else:
            self.dsession = readSession(corpus_fn, "none",
                                        quadrant=int(quadrant_number))
        self.range_to_run = []
        sent_num_i = 0
        for elt in self.dsession:
            for i in range(len(elt.routeInstructions)):
                self.range_to_run.append(sent_num_i)
                sent_num_i += 1

    if num_explorations in [None, ""]:
        num_explorations = 50
    else:
        num_explorations = int(num_explorations)

    self.options = options
    self.output_dir = output_dir
    self.inference = inference
    self.num_align = num_align
    self.num_to_run = num_to_run
    self.is_sum_product = is_sum_product

    if run_description == None:
        run_description = model_fn
        if inference != None:
            run_description += " " + inference
        if no_spatial_relations:
            run_description += " -sr"
        else:
            run_description += " +sr"
    self.run_description = run_description

    if quadrant_number == None:
        self.dsession = readSession(corpus_fn, "none")
        #res = raw_input("running all examples! Continue?")
        #if(res.lower() == 'n' or res.lower() == "no"):
        #    sys.exit(0);
    else:
        self.dsession = readSession(corpus_fn, "none",
                                    quadrant=int(quadrant_number))

    self.dg_model = cPickle.load(open(model_fn, 'r'))
    self.dg_model.use_spatial_relations = not no_spatial_relations

    if inference == "greedy":
        self.dg_model = greedy.model(self.dg_model)
    elif inference == "last_sdc":
        self.dg_model = last_sdc.model(self.dg_model)
    elif inference == "topN":
        if topN_num_paths == None or topN_num_paths == "":
            self.topN_num_paths = 10
        else:
            self.topN_num_paths = int(topN_num_paths)
        self.dg_model = topN.model(self.dg_model, self.topN_num_paths)
    elif inference == "global":
        pass
    else:
        raise ValueError("Bad inference value: " + inference)

    #self.do_exploration = eval(str(do_exploration))
    self.do_exploration = do_exploration

    if evaluation_mode == "best_path":
        self.orient = get_orientations_each
    elif evaluation_mode == "max_prob":
        self.orient = get_orientations_all
    elif evaluation_mode == "specialized":
        self.orient = get_orientations_annotated
    else:
        raise ValueError("Unexpected mode: " + `evaluation_mode`)

    #this will load the srel_mat
    #if(isinstance(self.dg_model, model4_du.model4_du)):
    print "loading srel_mat"
    self.dg_model.initialize()

    #open the ground truth file
    self.tf = tag_file(gtruth_tag_fn, map_fn)
    self.gtruth_tag_fn = gtruth_tag_fn

    #map the topological regions to the ground truth regions
    self.topohash = get_region_to_topo_hash_containment(self.tf, self.dg_model)

    print "getting topological paths"
    self.topo_graph_D = get_topological_paths_hash(self.dg_model.clusters)
    #cPickle.dump(self.topo_graph_D, open("topo_graph_D", "wb"), 2)
    #self.topo_graph_D = cPickle.load(open("topo_graph_D", "r"))

    if wizard_of_oz_sdcs != None:
        print "using wizard", wizard_of_oz_sdcs
        self.sdc_parser = direction_parser_wizard_of_oz(corpus_fn,
                                                        wizard_of_oz_sdcs)
    else:
        print "using crfs"
        self.sdc_parser = direction_parser_sdc()

    if num_explorations in [None, ""]:
        #TODO replace 2 by the branching factor or something else.
        self.num_explorations = len(self.dg_model.tmap_locs.keys()) / 2
    else:
        self.num_explorations = int(num_explorations)

    if exploration_heuristics_name in [None, ""]:
        self.exploration_heuristics_name = "lifted_stairs"
    else:
        self.exploration_heuristics_name = exploration_heuristics_name

    if self.exploration_heuristics_name == "slope_offset_delay":
        if parameters not in [None, ""]:
            params_str = parameters.split(":")
            if len(params_str) == 3:
                self.params_num = map(float, params_str)
            else:
                self.params_num = None
        else:
            self.params_num = None