def parse(tree, classifier, tagger=False, tagsize=5, singlestep=False, silent=True, stackwindow=2, qwindow=3, keepscore=True, preclassified=False, justParse=False):
    """
    Run the classifier-driven transition parser over ``tree``.

    NOTE(review): the nesting of this function was reconstructed from a
    listing whose indentation had been lost.  In particular, confirm that
    the ``else: break`` in the exception handler belongs to the
    ``s1.queue`` test (as written here) and not to the ``agenda`` test.

    Parameters:
      tree          either an object for which ``type(tree) == "SENTENCE"``
                    holds (its ``dtree``/``goldstandard``/``leaves`` are then
                    used), or input that ``tagger.tag`` can turn into
                    (form, tag) pairs; in the latter case a fresh
                    ``tb.SENTENCE`` is built and ``goldstandard`` is False.
      classifier    must provide ``features`` (a mapping), ``qwindow``,
                    ``stackwindow`` and ``classify(instance, printing=False)``.
      tagger        object with a ``tag`` method; only used for the second
                    kind of input.
      tagsize       every word's tag is truncated to this many characters.
      singlestep    if true, print the state and call ``ss`` before each
                    action is applied.
      silent        stored in ``malt.SILENT``.
      stackwindow,
      qwindow       accepted for backward compatibility but not read: the
                    classifier's own ``stackwindow``/``qwindow`` are used.
      keepscore     if false, return the built tree instead of a score.
      preclassified if true (and scoring), passed to ``usePreclassifiedHDs``
                    before the state is scored a second time.
      justParse     if true, return ``tree`` (with ``tree.parsed`` set) as
                    soon as parsing is finished.

    Returns:
      ``tree`` if ``justParse``; otherwise
      ``malt.buildtree(s.relations, s.words)`` if scores are not kept;
      otherwise the triple ``(score, len(goldstandard), len(s.stack))``.
    """
    # cmp_to_key exists in Python 2.7 and 3.x, so the agenda sort below
    # no longer relies on the Python-2-only ``cmp=`` keyword.
    import functools

    malt.SILENT = silent
    # NOTE(review): with the builtin ``type`` this comparison against a
    # string is always False; it only works if ``type`` is redefined
    # elsewhere in the file to return a class name -- confirm.
    if type(tree) == "SENTENCE":
        if tree.dtree:
            s, goldstandard = malt.dtree2state(tree)
        else:
            goldstandard = tree.goldstandard
            s = malt.STATE(text=tree.leaves)
        words = s.words
    else:
        words = [malt.WORD(w[0], w[1]) for w in tagger.tag(tree)]
        tree = tb.SENTENCE(False, False, False, False)
        tree.leaves = words
        s = malt.STATE(text=words)
        goldstandard = False
    for w in s.words:
        w.tag = w.tag[:tagsize]
    features = sorted(classifier.features.keys())
    WARNINGS = True
    agenda = [s]
    s.score = 0.0
    while agenda:
        s = agenda.pop()
        if s.queue == []:
            # What we'd like to do (pace Sardar) is to insist that the
            # stack should have only one item on it, and if not then we go
            # on to the next task on the agenda. That works nicely if we
            # use the head percolation table that has CC as the head
            # whenever possible; but overall the one that has CC as worst
            # choice for the head does better (as with Maytham), and in
            # that case trying to get to a nice terminal state by choosing
            # other options from the agenda gets stuck.
            #
            # So we have to do something more simple-minded: attaching the
            # things that haven't been attached to anything to their
            # neighbours seems to work quite nicely. It's not exactly
            # systematic, but it gets a surprising number of things right.
            for i in range(len(s.stack)-1):
                hd = s.stack[i+1]
                dtr = s.stack[i]
                s.relations[dtr.position] = malt.RELATION(hd.position, dtr.position, 'mod')
        d = s.stateDescriptor(False, qwindow=classifier.qwindow, stackwindow=classifier.stackwindow)
        t = id3.INSTANCE(features, d)
        # The classifier can return several options: it would be nice to
        # weigh them up, using influences from the grammar, and then order
        # the agenda to pay due attention to the confidence of the
        # classifier and the constraints from the grammar, but I can't
        # find any useful influences.
        #
        # So I'm just choosing the best and working with that. I've left
        # it as a loop, even though it's currently a pointless loop
        # because the list only ever has one item on it, so that I can
        # revisit it some time if I have the energy.
        actions = sortTable(classifier.classify(t, printing=False))
        if actions == []:
            print("no action found")
        for action in actions:
            s1 = s.copy()
            s1.score = s.score+10
            if singlestep:
                print('***********************************')
                s1.showState()
                ss("action %s\ns.relations %s\nd %s\n"%(action, s.relations, d))
            # Look the transition up by name instead of eval()-ing a
            # string built from classifier output.
            transition = getattr(malt.STATE, str(action[0]))
            try:
                if transition(s1, warnings=WARNINGS, throwException=True):
                    agenda.append(s1)
                    break
            except Exception:
                # Deliberately broad: the transition was asked to raise
                # (throwException=True) when it cannot be applied.  If
                # nothing else is waiting on the agenda, fall back to a
                # shift while there is still input; with an empty queue
                # there is nothing left to try for this state.
                if not agenda:
                    if not s1.queue == []:
                        malt.STATE.shift(s1, warnings=WARNINGS)
                        agenda.append(s1)
                    else:
                        break
        agenda.sort(key=functools.cmp_to_key(compstates))
    tree.parsed = s.relations
    if justParse:
        return tree
    if type(tree) == "str" or not keepscore:
        return malt.buildtree(s.relations, s.words)
    # NOTE(review): goldstandard is False when the input went through the
    # tagger, so len(goldstandard) below would raise TypeError on that
    # path -- confirm callers pass keepscore=False or justParse=True then.
    right1 = scoreState(goldstandard, s, tree)
    if preclassified:
        usePreclassifiedHDs(preclassified, s.relations)
        right2 = scoreState(goldstandard, s, tree)
        return right2, len(goldstandard), len(s.stack)
    return right1, len(goldstandard), len(s.stack)
def forceparse(s, goldstandard=False, irelns=False, top=False):
    """
    Search, by backtracking, for a sequence of leftArc/rightArc/shift
    transitions whose arcs are all licensed by ``goldstandard``.

    NOTE(review): the nesting of this function was reconstructed from a
    listing whose indentation had been lost.  In particular, confirm that
    the label-assigning ``else`` branch pairs with the
    ``goldstandard == False`` test (as written here) and not with
    ``if top``.

    Parameters:
      s             on the outermost call (``goldstandard`` is False) an
                    object with ``dtree``, ``goldstandard`` and ``leaves``,
                    from which the initial state is built; on recursive
                    calls the state itself, with ``queue``, ``stack``,
                    ``words``, ``relations`` and the methods ``leftArc``,
                    ``rightArc``, ``shift`` and ``undo``.
      goldstandard  mapping from a daughter's position to a relation
                    object with ``hd``, ``dtr`` and ``rel``; False means
                    "take it from ``s``".
      irelns        inverse index ``{head: {daughter: True}}`` over
                    ``goldstandard``; False means "build it now".
      top           if true, set ``malt.SILENT`` and reset ``s.actions``.

    Returns:
      the state once its queue is empty, or False if no transition from
      the current state leads there (every attempted transition is undone
      before returning False).
    """
    if top:
        malt.SILENT = True
        s.actions = []
    if goldstandard == False:
        # Outermost call: derive the gold standard and the initial state
        # from the sentence object.
        if s.dtree:
            x, goldstandard = malt.dtree2state(s)
            s.goldstandard = goldstandard
            s = x
        else:
            goldstandard = s.goldstandard
            s = malt.STATE(queue=s.leaves)
            s.words = list(s.queue)
    else:
        # Gold standard supplied: copy each relation's label onto the
        # daughter word so the arc calls below can read ``d.label``.
        for key in goldstandard:
            reln = goldstandard[key]
            (s.words[reln.dtr]).label = reln.rel
    if irelns == False:
        irelns = {}
        for d in goldstandard:
            h = goldstandard[d].hd
            irelns.setdefault(h, {})[d] = True
    malt.REPLAY = False
    queue = s.queue
    stack = s.stack
    if queue == []:
        # The original returned unconditionally here and left an
        # unreachable stricter test behind it (fail unless
        # len(stack) == 1, then succeed only if satisfied(goldstandard, s)).
        # That dead code has been dropped; behaviour is unchanged.
        return s
    if not stack == [] and not queue == []:
        q0 = queue[0]
        s0 = stack[0]
        # leftArc: hd of the queue is hd, top of the stack is dtr. We want
        # to check that this is indeed a targeted relation, i.e. that
        # goldstandard[s[0].position] = q[0].position > s[0].position
        # AND that all q[0] daughters have already been found
        for i in range(len(stack)):
            # Comment this out for LDD (currently broken)
            if i > 0:
                break
            d = stack[i]
            if d.position in goldstandard and goldstandard[d.position].hd == q0.position and alldtrsfound(d.position, irelns, s.relations):
                s.leftArc(d.label, i=i)
                r = forceparse(s, goldstandard=goldstandard, irelns=irelns)
                if r:
                    return r
                s.undo()
        # rightArc: top of the stack is hd, hd of the queue is dtr. We want
        # to check that this is indeed a targeted relation, i.e. that
        # goldstandard[q[0].position] = s[0].position > q[0].position
        # AND that all q[0] daughters have already been found
        for i in range(len(queue)):
            # Comment this out for LDD (currently broken)
            if i > 0:
                break
            d = queue[i]
            if d.position in goldstandard and goldstandard[d.position].hd == s0.position and alldtrsfound(d.position, irelns, s.relations):
                s.rightArc(d.label, i=i)
                r = forceparse(s, goldstandard=goldstandard, irelns=irelns)
                if r:
                    return r
                s.undo()
    # No licensed arc worked (or the stack was empty): try shifting.
    s.shift()
    r = forceparse(s, goldstandard=goldstandard, irelns=irelns)
    if r:
        return r
    s.undo()
    return False