def __init__(self, weightstr):
    '''Initialize model-wide symbols and weight storage, then load weights.

    weightstr: serialized weights passed straight to self.read_weights().
    Note: mutates class-level state on Model / FLAGS / DepTree / WVector,
    so constructing a second Model re-runs that global setup.
    '''
    if FLAGS.integerize:
        # integerized vocab implies tuple-valued features
        FLAGS.tuplefeats = True
    from deptree import DepTree
    DepTree.setup()
    # sentence-boundary / null symbols; integer ids when integerizing
    Model.start_sym = Vocab.str2id("<s>") if FLAGS.integerize else "<s>"
    Model.stop_sym = Vocab.str2id("</s>") if FLAGS.integerize else "</s>"
    Model.none_sym = Vocab.str2id("NONE") if FLAGS.integerize else "NONE"
    WVector.init(Model.names) # for doublehash 1 (and trim, value_class)
    self.templates = {} # mapping from "s0t-q0t" to the eval expression
    self.list_templates = [] # ordered list of template keys "s0t-q0t"
    self.freq_templates = defaultdict(int)
    Model.doublehash = FLAGS.doublehash
    # earlier weight-layout experiments, kept for reference:
##     if Model.doublehash == 1:
##         self.weights = \
##             dict((action, new_vector()) for action in Model.names) # faster than defaultdict!
##             #WVector() if FLAGS.wvector else \
##         self.weights = [new_vector() for _ in Model.names] # faster than defaultdict!
##     elif Model.doublehash == 2:
##         self.weights = [[new_vector() for _ in range(100)] for _ in Model.names] # one dict per template
##     else:
    self.weights = Model.new_weights() #Vector()
    self.read_weights(weightstr)
##     self.featurenames = set(self.weights.iterkeys())
    if FLAGS.featstat:
        self.print_templates()
def tree(self):
    '''Rebuild this state's dependency tree from its first backpointer.

    (alternatively: simulate actions)
    '''
    children, action, _ = self.backptrs[0]
    if children is not None:
        # REDUCE: recursively rebuild both subtrees, then combine them.
        lchild, rchild = children
        return DepTree.combine(lchild.tree(), rchild.tree(), action)
    # SHIFT: leaf node covering word i.
    return DepTree(self.i)
def one_pass_on_train(self, data):
    '''Run one perceptron pass over the training data.

    data: iterable of raw training lines, each parsed into a DepTree.
    Returns (num_updates, early_updates): number of examples that
    triggered a weight update, and how many of those were early updates.
    Fix: the original used a bare `None` expression as the no-op in the
    "passed" branch; that statement-expression is dead code and is removed.
    '''
    num_updates = 0
    early_updates = 0
## for i, example in enumerate(self.decoder.load(self.trainfile), 1):
    for i, line in enumerate(data, 1):
        example = DepTree.parse(line)
        #print >> logs, "... example %d (len %d)..." % (i, len(example)),
        self.c += 1  # global example counter (used for parameter averaging)
        similarity, deltafeats = self.decoder.decode(example, early_stop=True)
        if similarity < 1 - 1e-8: #update
            num_updates += 1
            #print >> logs, "sim={0}, |delta|={1}".format(similarity, len(deltafeats))
            if FLAGS.debuglevel >= 2:
                print >> logs, "deltafv=", deltafeats
            self.weights += deltafeats
            if FLAGS.avg:
                # weighted sum needed to recover averaged weights later
                self.allweights += deltafeats * self.c
            if similarity < 1e-8: # early-update happened
                early_updates += 1
        # else: example parsed correctly ("PASSED"), nothing to update
    return num_updates, early_updates
def tree(self):
    '''Rebuild this state's dependency tree from its first backpointer.

    (alternatively: simulate actions)
    '''
    children, action, _ = self.backptrs[0]
    if children is not None:
        # REDUCE: combine the two recursively rebuilt subtrees.
        lchild, rchild = children
        return DepTree.combine(lchild.tree(), rchild.tree(), action)
    # SHIFT: which slot of the action tuple holds the tag depends on mode.
    if (FLAGS.pretag and action[0] == -2) or (FLAGS.shifttag and action[0] == 0):
        tag = action[2]
    else:
        tag = action[1] # yang: SHIFT tag (changes tag in deptree)
    return DepTree(self.i, tag)
def besteval(self, reflinks, cache=None):
    ''' sub-oracle

    Best achievable evaluation reachable from this state, against the gold
    head links in reflinks (mapping modifier index -> head index).
    Returns (best DepVal, corresponding DepTree); memoized in cache keyed
    by (step, rank).
    '''
    if cache is None:
        cache = {}
    sig = (self.step, self.rank)
    if sig in cache:
        return cache[sig]
    if self.action == 0:
        # SHIFT state: nothing scored yet, tree is just the leaf
        s = DepVal()
        t = self.tree()
    else:
        h = self.headidx
        s = -1  # sentinel: any DepVal compares greater
        t = None
        for ((left, right), action, _) in self.backptrs:
            # the modifier is whichever side loses its head under this action
            m = left.headidx if action == 1 else right.headidx
            this = 1 if (m in reflinks and reflinks[m] == h) else 0
            thistot = 1 if (m in reflinks) else 0
            lefteval, lefttree = left.besteval(reflinks, cache)
            righteval, righttree = right.besteval(reflinks, cache)
            thiseval = DepVal(yes=this, tot=thistot) + lefteval + righteval
            if thiseval > s:
                s = thiseval
                t = DepTree.combine(lefttree, righttree, action)
    cache[sig] = s, t
    return s, t
def load(self, lines, shuffle=False):
    '''Yield one parsed DepTree per input line, optionally shuffling first.

    Note: shuffling is done in place on the caller's list.
    '''
    if shuffle:
        print >> logs, "Shuffling training set..."
        random.shuffle(lines)
## for i, line in enumerate(open(filename), 1):
    for lineno, raw in enumerate(lines, 1):
        yield DepTree.parse(raw)
def extract_dependency_rules(self, relation):
    """
    extract dependency rules

    Collect dependency rules for each argument span, then emit, for every
    rule in self.dep_dict, markers for its presence in arg1 (":1"),
    arg2 (":2"), and both (":12").
    :param relation: relation object exposing arg1_leaves / arg2_leaves
    :return: list of feature strings
    """
    arg1_rules = defaultdict(int)
    arg2_rules = defaultdict(int)
    DepTree.get_dependency_rules(arg1_rules, relation.arg1_leaves, False, True)
    DepTree.get_dependency_rules(arg2_rules, relation.arg2_leaves, False, True)
    features = []
    for rule in self.dep_dict:
        in_arg1 = rule in arg1_rules
        in_arg2 = rule in arg2_rules
        if in_arg1:
            features.append(rule + ":1")
        if in_arg2:
            features.append(rule + ":2")
        if in_arg1 and in_arg2:
            features.append(rule + ":12")
    return features
def eval_worker(self, sentences):
    '''Evaluate the (global) trainer's decoder over a chunk of sentences.

    Returns the accumulated evaluation (decoder.evalclass instance) over
    ALL sentences in the chunk.
    Fix: the original returned `similarity` — the score of only the LAST
    sentence — leaving the accumulator `sub` dead; return `sub` instead.
    '''
    sub = decoder.evalclass() # global variable trainer
    for i, example in enumerate(sentences): # trainer.decoder.load(sentences), 1):
        # have to parse here, not outside, because Tree.words is static
        tree = DepTree.parse(example)
        similarity, _ = trainer.decoder.decode(tree)
        sub += similarity # do it inside instead of outside
    return sub
def __init__(self, sent_id, parse_tree, dep_tree, words):
    '''One sentence: constituency tree, dependency tree, tokens, clauses.

    sent_id: sentence index/id.
    parse_tree: raw constituency parse, wrapped in Tree.
    dep_tree: raw dependency parse, wrapped in DepTree.
    words: token list; each entry looks like (token, attr_dict) where
        attr_dict carries 'CharacterOffsetBegin'/'CharacterOffsetEnd'.
    NOTE(review): statement order matters — get_leaves() runs after
    self.tree is set and break_clauses() runs last; presumably they
    populate self.leaves and self.clauses respectively (verify in class).
    '''
    self.leaves = []
    self.id = sent_id
    self.tree = Tree(parse_tree, sent_id)
    self.get_leaves()
    self.words = words
    # character span of the whole sentence, from first/last token
    self.begin_offset = words[0][1]['CharacterOffsetBegin']
    self.end_offset = words[-1][1]['CharacterOffsetEnd']
    self.word_ids = []
    self.true_connectives = []
    self.checked_connectives = []
    self.depTree = DepTree(self, dep_tree)
    self.clauses = []
    self.break_clauses()
def take(self, action, action_gold=False):
    '''returns a list (iterator) of resulting states.

    Applies one transition (0 = SHIFT, otherwise REDUCE) to this state.
    SHIFT yields exactly one successor; REDUCE yields one successor per
    left-pointer (dynamic-programming combination). action_gold marks
    whether this transition is on the gold path.
    '''
    if self.i == self.j == 0: ## don't count start
        actioncost = 0
    else:
        ## applying the model weights
        actioncost = self.feats(action).dot(self.model.weights) if self.model is not None else 0
    if action == 0: # SHIFT
        new = State(self.j, self.j+1, action, \
                    self.stack + [DepTree(self.j)])
        new.inside = 0
        # shiftcost is stored on SELF (the shifting state), not on `new`:
        # it is paid later when `new` eventually reduces back with self
        self.shiftcost = actioncost # N.B.: self!
        new.score = self.score + actioncost # forward cost
        new.step = self.step + 1
        new.leftptrs = [self]
        new.backptrs = [(None, action)] # shift has no children
        new.gold = self.gold and action_gold # gold is sequentially incremented
        yield new # shift always returns one unique offspring
    else: # REDUCE
        for leftstate in self.leftptrs: # i'm combining with it
            newtree = leftstate.stack[-1].combine(self.stack[-1], action) # N.B.:theory! NOT self.stack[-2] with -1
            ## N.B.: theory! NOT self.stack[:-2]
            new = State(leftstate.i, self.j, action, \
                        leftstate.stack[:-1] + [newtree])
            # inside cost accumulates both halves plus the deferred
            # shiftcost of the left state and this reduce's own cost
            new.inside = leftstate.inside + self.inside + \
                         leftstate.shiftcost + actioncost # N.B.
            new.score = leftstate.score + self.inside + leftstate.shiftcost + actioncost #n.b.
            ## WRONG: new.score = self.score + actioncost # forward cost, only true for non-DP
            new.step = self.step + 1
            new.leftptrs = leftstate.leftptrs
            new.backptrs = [((leftstate, self), action)]
            # meaning of x.gold: first of all, viterbi-inside derivation is gold
            # and also, there is a gold path predicting x (same as earley item: bottom-up + top-down filtering)
            new.gold = leftstate.gold and self.gold and action_gold # gold is binary
            yield new
def load(self, filename):
    '''Open filename and yield one parsed DepTree per line.'''
    infile = open(filename)
    for lineno, raw in enumerate(infile, 1):
        yield DepTree.parse(raw)
def this_tree(self):
    '''Build a single-node canonical DepTree for this item's label.'''
    ## very careful: sym=False! do not symbolize again
    ## return Tree(self.label, self.span, wrd=self.word, sym=False)
    node = DepTree(int(self.label))
    node.canonical = True
    return node
def load(self, lines):
    '''Yield one parsed DepTree per input line.'''
    for lineno, raw in enumerate(lines, 1):
        yield DepTree.parse(raw)
def main():
    '''Driver: parse (or simulate) each input sentence, print per-sentence
    and aggregate statistics to logs.

    Input lines come from shell_input(); a line starting with "(" is a gold
    tree (enables evaluation), otherwise it is a word/tag list.
    '''
    if FLAGS.sim is not None:
        # simulation mode: read action sequences from this file instead of parsing
        sequencefile = open(FLAGS.sim)
    parser = Parser(model, b=FLAGS.beam)
    print >> logs, "memory usage before parsing: ", human(memory(start_mem))
    # aggregate counters over all sentences
    totalscore = 0
    totalstates = 0
    totaluniq = 0
    totaledges = 0
    totaltime = 0
    totalprec = DepVal()
    totaloracle = DepVal()
    print >> logs, "gc.collect unreachable: %d" % gc.collect()
    if FLAGS.manual_gc:
        gc.disable()
    i = 0
    gctime = 0
    for i, line in enumerate(shell_input(), 1):
        if FLAGS.manual_gc and i % FLAGS.gc == 0:
            # periodic manual collection (gc was disabled above)
            print >> logs, "garbage collection...",
            tt = time.time()
            print >> logs, "gc.collect unreachable: %d" % gc.collect()
            tt = time.time() - tt
            print >> logs, "took %.1f seconds" % tt
            gctime += tt
        line = line.strip()
        if line[0]=="(": # input is a gold tree (so that we can evaluate)
            reftree = DepTree.parse(line)
            sentence = DepTree.sent # assigned in DepTree.parse()
        else: # input is word/tag list
            reftree = None
            sentence = [tuple(x.rsplit("/", 1)) for x in line.split()] # split by default returns list
            DepTree.sent = sentence
        if FLAGS.debuglevel >= 1:
            print >> logs, sentence
            print >> logs, reftree
        mytime.zero()
        if FLAGS.sim is not None: # simulation, not parsing
            actions = map(int, sequencefile.readline().split())
            goal, feats = parser.simulate(actions, sentence) #if model is None score=0
            print >> logs, feats
            score, tree = goal.score, goal.top()
            (nstates, nedges, nuniq) = (0, 0, 0)
        else: # real parsing
            if True: #FLAGS.earlystop:
                refseq = reftree.seq() if reftree is not None else None
                tree, myseq, score, _ = parser.try_parse(sentence, refseq, update=False)
                if FLAGS.early:
                    # show where model sequence diverged from reference
                    print >> logs, "ref=", refseq
                    print >> logs, "myt=", myseq
                    refseq = refseq[:len(myseq)] # truncate
                    _, reffeats = parser.simulate(refseq, sentence)
                    _, myfeats = parser.simulate(myseq, sentence)
                    print >> logs, "+feats", reffeats
                    print >> logs, "-feats", myfeats
                nstates, nedges, nuniq = parser.stats()
            else:
                goal = parser.parse(sentence)
                nstates, nedges, nuniq = parser.stats()
            ## score, tree = goal.score, goal.top() # score, tree = mytree
        dtime = mytime.period()
        if not FLAGS.early and not FLAGS.profile:
            if FLAGS.forest:
                parser.dumpforest(i)
            elif FLAGS.output:
                if not FLAGS.kbest:
                    print tree
                else:
                    # k-best output: header line then one line per candidate
                    stuff = parser.beams[-1][:FLAGS.kbest]
                    print "sent.%d\t%d" % (i, len(stuff))
                    for state in stuff:
                        print "%.2f\t%s" % (state.score, state.tree())
                    print
        if FLAGS.oracle:
            oracle, oracletree = parser.forestoracle(reftree)
            totaloracle += oracle
        prec = DepTree.compare(tree, reftree) # OK if either is None
        searched = sum(x.derivation_count() for x in parser.beams[-1]) if FLAGS.forest else 0
        print >> logs, "sent {i:-4} (len {l}):\tmodelcost= {c:.2f}\tprec= {p:.2%}"\
              "\tstates= {ns} (uniq {uq})\tedges= {ne}\ttime= {t:.3f}\tsearched= {sp}" \
              .format(i=i, l=len(sentence), c=score, p=prec.prec(), \
                      ns=nstates, uq=nuniq, ne=nedges, t=dtime, sp=searched)
        if FLAGS.seq:
            # sanity check: re-simulate the winning action sequence and
            # verify tree/score/precision all agree
            actions = goal.all_actions()
            print >> logs, " ".join(actions)
            check = simulate(actions, sentence, model) #if model is None score=0
            checkscore = check.score
            checktree = check.top()
            print >> logs, checktree
            checkprec = checktree.evaluate(reftree)
            print >> logs, "verify: tree:%s\tscore:%s\tprec:%s" % (tree == checktree, score == checkscore, prec == checkprec)
            print >> logs, "sentence %-4d (len %d): modelcost= %.2lf\tprec= %.2lf\tstates= %d (uniq %d)\tedges= %d\ttime= %.3lf" % \
                  (i, len(sentence), checkscore, checkprec.prec100(), nstates, nuniq, nedges, dtime)
        totalscore += score
        totalstates += nstates
        totaledges += nedges
        totaluniq += nuniq
        totaltime += dtime
        totalprec += prec
    if i == 0:
        print >> logs, "Error: empty input."
        sys.exit(1)
    if FLAGS.featscache:
        print >> logs, "feature constructions: tot= %d shared= %d (%.2f%%)" % (State.tot, State.shared, State.shared / State.tot * 100)
    # aggregate (per-sentence averaged) summary line
    print >> logs, "beam= {b}, avg {a} sents,\tmodelcost= {c:.2f}\tprec= {p:.2%}" \
          "\tstates= {ns:.1f} (uniq {uq:.1f})\tedges= {ne:.1f}\ttime= {t:.4f}\n{d:s}" \
          .format(b=FLAGS.b, a=i, c=totalscore/i, p=totalprec.prec(), ns=totalstates/i, uq=totaluniq/i, ne=totaledges/i, t=totaltime/i, d=totalprec.details())
    if FLAGS.uniqstat:
        for i in sorted(uniqstats):
            print >> logs, "%d\t%.1lf\t%d\t%d" % \
                  (i, sum(uniqstats[i]) / len(uniqstats[i]), \
                   min(uniqstats[i]), max(uniqstats[i]))
    if FLAGS.oracle:
        print >> logs, "oracle= ", totaloracle
    if FLAGS.manual_gc:
        print >> logs, "garbage collection took %.1f seconds" % gctime
    print >> logs, "memory usage after parsing: ", human(memory(start_mem))
    if FLAGS.mydouble:
        from mydouble import counts
        print >> logs, "mydouble usage and freed: %d %d" % counts()
FLAGS = flags.FLAGS
if __name__ == "__main__":
    # k-best oracle script: gold trees come from argv[1], k-best lists
    # from stdin (header "sentid k" then k lines of "score\ttree").
    # NOTE(review): this chunk appears truncated — `tot` and `besttree`
    # are computed but never consumed within the visible span.
    # TODO: automatically figure out maxk
    flags.DEFINE_integer("maxk", 128, "maxk")
    try:
        file = open(sys.argv[1])
    except:
        print >> logs, "Usage: cat <kbest-lists> | ./kbest_oracles.py <goldtrees>"
        sys.exit(1)
    tot = defaultdict(lambda: DepVal())
    for sid, reftree in enumerate(DepTree.load(sys.argv[1]), 1):
        sentid, k = sys.stdin.readline().split()
        k = int(k)
        best = -1  # sentinel: any DepVal compares greater
        besttree = None
        for i in range(1, k + 1):
            score, tree = sys.stdin.readline().split("\t")
            score = float(score)
            tree = DepTree.parse(tree)
            ev = reftree.evaluate(tree)
            if ev > best:
                best = ev
                besttree = tree
def get_dependency_rule(self, rule_dict, with_leaf=False, with_label=True):
    '''Accumulate dependency rules from both argument spans into rule_dict.'''
    for leaves in (self.arg1_leaves, self.arg2_leaves):
        DepTree.get_dependency_rules(rule_dict, leaves, with_leaf, with_label)
def work(line, i):
    '''Parse (or simulate) one input line, update global totals, and return
    the printable output for this sentence.

    line: gold tree (starts with "(") or word/tag list; i: 1-based index.
    Fixes: (1) `toprint` was unbound on the early-stop/profile path, so
    `return toprint` raised NameError; initialize it. (2) the k-best branch
    did `str += list` (TypeError); join the candidate lines instead.
    (3) `totaloracle += oracle` needs the global declaration like the other
    totals, otherwise it raises UnboundLocalError when FLAGS.oracle is set.
    '''
    global totalscore, totalstates, totaledges, totaluniq, totaltime, totalprec, totaloracle
    toprint = ""  # default when no printable output is produced
    line = line.strip()
    if line[0] == "(": # input is a gold tree (so that we can evaluate)
        reftree = DepTree.parse(line)
        sentence = DepTree.sent # assigned in DepTree.parse()
    else: # input is word/tag list
        reftree = None
        sentence = [tuple(x.rsplit("/", 1)) for x in line.split()] # split by default returns list
        DepTree.sent = sentence
    if FLAGS.debuglevel >= 1:
        print >> logs, sentence
        print >> logs, reftree
    mytime.zero()
    if FLAGS.sim is not None: # simulation, not parsing
        actions = map(int, sequencefile.readline().split())
        goal, feats = parser.simulate(actions, sentence) #if model is None score=0
        print >> logs, feats
        score, tree = goal.score, goal.top()
        (nstates, nedges, nuniq) = (0, 0, 0)
    else: # real parsing
        if True: #FLAGS.earlystop:
            refseq = reftree.seq() if reftree is not None else None
            tree, myseq, score = parser.try_parse(sentence, refseq, early_stop=FLAGS.earlystop)
            if FLAGS.earlystop:
                print >> logs, "ref=", refseq
                print >> logs, "myt=", myseq
                refseq = refseq[:len(myseq)] # truncate
                _, reffeats = parser.simulate(refseq, sentence)
                _, myfeats = parser.simulate(myseq, sentence)
                print >> logs, "feat diff=", Model.trim(reffeats - myfeats)
            nstates, nedges, nuniq = parser.stats()
        else:
            goal = parser.parse(sentence)
            nstates, nedges, nuniq = parser.stats()
    dtime = mytime.period()
    if not FLAGS.earlystop and not FLAGS.profile:
        if FLAGS.forest:
            parser.dumpforest(i)
        else:
            if not FLAGS.kbest:
                toprint = str(tree)
            else:
                stuff = parser.beams[-1][:FLAGS.kbest]
                # header line followed by one "score\ttree" line per candidate
                outlines = ["sent.%d\t%d" % (i, len(stuff))]
                outlines += ["%.2f\t%s" % (state.score, state.tree()) for state in stuff]
                toprint = "\n".join(outlines)
    if FLAGS.oracle:
        oracle, oracletree = parser.forestoracle(reftree)
        totaloracle += oracle
    prec = DepTree.compare(tree, reftree) # OK if either is None
    searched = sum(x.derivation_count() for x in parser.beams[-1]) if FLAGS.forest else 0
    print >> logs, "sent {i:-4} (len {l}):\tmodelcost= {c:.2f}\tprec= {p:.2%}"\
          "\tstates= {ns} (uniq {uq})\tedges= {ne}\ttime= {t:.3f}\tsearched= {sp}" \
          .format(i=i, l=len(sentence), c=score, p=prec.prec(), \
                  ns=nstates, uq=nuniq, ne=nedges, t=dtime, sp=searched)
    if FLAGS.seq:
        # sanity check: re-simulate the winning sequence and verify agreement
        actions = goal.all_actions()
        print >> logs, " ".join(actions)
        check = simulate(actions, sentence, model) #if model is None score=0
        checkscore = check.score
        checktree = check.top()
        print >> logs, checktree
        checkprec = checktree.evaluate(reftree)
        print >> logs, "verify: tree:%s\tscore:%s\tprec:%s" % \
              (tree == checktree, score == checkscore, prec == checkprec)
        print >> logs, "sentence %-4d (len %d): modelcost= %.2lf\tprec= %.2lf\tstates= %d (uniq %d)\tedges= %d\ttime= %.3lf" % \
              (i, len(sentence), checkscore, checkprec.prec100(), nstates, nuniq, nedges, dtime)
    totalscore += score
    totalstates += nstates
    totaledges += nedges
    totaluniq += nuniq
    totaltime += dtime
    totalprec += prec
    return toprint
if __name__ == "__main__":
    # Vocabulary/tag statistics script: read trees from stdin, count each
    # word's frequency and its tag distribution, then print words above the
    # cutoff and pool low-frequency words' tags into `unktags`.
    flags.DEFINE_integer("cutoff", 1, "cut off freq")
    flags.DEFINE_integer("cutoff_char", 1, "cut off freq for chars")
    argv = FLAGS(sys.argv)
    # word -> [total count, {tag -> count}]
    word_dict = defaultdict(lambda: [0, defaultdict(int)])
    unktags = defaultdict(int)
    for line in sys.stdin:
        line = line.strip()
        if line == "":
            continue
        reftree = DepTree.parse(line)
        words = DepTree.words
        tags = reftree.tagseq()
        for w, t in zip(words, tags):
            word_dict[w][0] += 1
            word_dict[w][1][t] += 1
    unkcnt = 0
    for w in sorted(word_dict):
        # NOTE: `tags` here rebinds the name to the word's tag-count dict
        freq, tags = word_dict[w]
        if freq <= FLAGS.cutoff:
            # rare word: fold its tag counts into the unknown-word model
            for t in tags:
                unktags[t] += tags[t]
            unkcnt += freq
        else:
            print "%s\t%d\t%s" % (w, freq, sorttags(tags))