Exemplo n.º 1
0
    def __init__(self, weightstr):
        """Initialise the shared Model symbols and load this model's weights.

        ``weightstr`` is handed unchanged to ``self.read_weights``.
        """
        if FLAGS.integerize:
            # integerize switches the tuplefeats flag on as well
            FLAGS.tuplefeats = True
            from deptree import DepTree
            DepTree.setup()

        def _symbol(text):
            # Vocabulary id when integerizing, the literal string otherwise.
            return Vocab.str2id(text) if FLAGS.integerize else text

        Model.start_sym = _symbol("<s>")
        Model.stop_sym = _symbol("</s>")
        Model.none_sym = _symbol("NONE")

        WVector.init(Model.names)  # for doublehash 1 (and trim, value_class)

        # Template bookkeeping:
        #   templates      -- mapping from a key like "s0t-q0t" to the eval expression
        #   list_templates -- ordered list of those template keys
        #   freq_templates -- integer counter per template
        self.templates = {}
        self.list_templates = []
        self.freq_templates = defaultdict(int)

        # doublehash must be set before the weight container is created.
        Model.doublehash = FLAGS.doublehash
        self.weights = Model.new_weights()

        self.read_weights(weightstr)

        if FLAGS.featstat:
            self.print_templates()
Exemplo n.º 2
0
    def tree(self):
        """Rebuild the DepTree for this state from its first back-pointer.

        (Alternatively one could simulate the actions.)
        """
        kids, act, _ = self.backptrs[0]
        if kids is not None:
            # REDUCE: join the trees of the two sub-states
            lhs, rhs = kids
            return DepTree.combine(lhs.tree(), rhs.tree(), act)
        # SHIFT: no children, the tree is built from self.i alone
        return DepTree(self.i)
Exemplo n.º 3
0
    def one_pass_on_train(self, data):
        """Make one pass over ``data``, updating the weights after each miss.

        Every line is parsed with ``DepTree.parse`` and decoded with
        ``early_stop=True``.  Returns ``(num_updates, early_updates)``.
        """
        updates = 0
        early = 0

        for line in data:
            gold = DepTree.parse(line)
            self.c += 1

            sim, delta = self.decoder.decode(gold, early_stop=True)

            if not (sim < 1 - 1e-8):
                continue  # passed, nothing to update

            updates += 1
            if FLAGS.debuglevel >= 2:
                print >> logs, "deltafv=", delta

            self.weights += delta
            if FLAGS.avg:
                # the update is scaled by the running example counter
                self.allweights += delta * self.c

            if sim < 1e-8:  # early-update happened
                early += 1

        return updates, early
Exemplo n.º 4
0
    def tree(self):
        """Rebuild the DepTree for this state from its first back-pointer.

        (Alternatively one could simulate the actions.)
        """
        kids, act, _ = self.backptrs[0]
        if kids is not None:
            lhs, rhs = kids  # REDUCE
            return DepTree.combine(lhs.tree(), rhs.tree(), act)

        # Leaf state: decide which slot of the action tuple is passed on as
        # the second DepTree argument.
        pretagged = FLAGS.pretag and act[0] == -2
        if pretagged or (FLAGS.shifttag and act[0] == 0):
            payload = act[2]
        else:
            payload = act[1]  # yang: SHIFT tag (changes tag in deptree)
        return DepTree(self.i, payload)
Exemplo n.º 5
0
    def besteval(self, reflinks, cache=None):
        """Sub-oracle: best evaluation (and its tree) reachable from this state.

        ``reflinks`` is looked up by a child's head index and compared with
        this state's head index.  Results are memoised in ``cache`` under the
        key ``(self.step, self.rank)``.  Returns a ``(score, tree)`` pair.
        """
        memo = {} if cache is None else cache

        key = (self.step, self.rank)
        if key in memo:
            return memo[key]

        if self.action == 0:
            best = DepVal()
            besttree = self.tree()
        else:
            head = self.headidx
            best, besttree = -1, None
            for (lstate, rstate), act, _ in self.backptrs:
                # action 1 takes the left child's head as the modifier,
                # any other action takes the right child's head
                mod = lstate.headidx if act == 1 else rstate.headidx
                known = mod in reflinks
                hit = 1 if (known and reflinks[mod] == head) else 0
                counted = 1 if known else 0

                lval, ltree = lstate.besteval(reflinks, memo)
                rval, rtree = rstate.besteval(reflinks, memo)

                cand = DepVal(yes=hit, tot=counted) + lval + rval
                if cand > best:
                    best = cand
                    besttree = DepTree.combine(ltree, rtree, act)

        memo[key] = best, besttree
        return best, besttree
Exemplo n.º 6
0
    def load(self, lines, shuffle=False):
        """Lazily yield ``DepTree.parse(entry)`` for every entry of ``lines``.

        When ``shuffle`` is true, ``lines`` is shuffled in place before the
        first tree is produced.
        """
        if shuffle:
            print >> logs, "Shuffling training set..."
            random.shuffle(lines)

        for text in lines:
            yield DepTree.parse(text)
Exemplo n.º 7
0
    def extract_dependency_rules(self, relation):
        """Build the dependency-rule feature list for ``relation``.

        For each key of ``self.dep_dict`` the result contains ``key + ":1"``
        if the rule was collected from arg1, ``key + ":2"`` if it was
        collected from arg2, and ``key + ":12"`` if it was collected from both.

        :param relation: object providing ``arg1_leaves`` and ``arg2_leaves``
        :return: list of feature strings
        """
        arg1_rules = defaultdict(int)
        arg2_rules = defaultdict(int)
        DepTree.get_dependency_rules(arg1_rules, relation.arg1_leaves, False, True)
        DepTree.get_dependency_rules(arg2_rules, relation.arg2_leaves, False, True)

        features = []
        for rule in self.dep_dict:
            in_arg1 = rule in arg1_rules
            in_arg2 = rule in arg2_rules
            checks = ((in_arg1, ":1"), (in_arg2, ":2"), (in_arg1 and in_arg2, ":12"))
            for present, suffix in checks:
                if present:
                    features.append(rule + suffix)

        return features
Exemplo n.º 8
0
    def extract_dependency_rules(self, relation):
        """Return the dependency-rule features that fire for ``relation``.

        Rules are collected separately for the arg1 and arg2 leaves; every key
        of ``self.dep_dict`` then contributes ``":1"``, ``":2"`` and/or
        ``":12"`` suffixed features depending on where it was found.

        :param relation: object providing ``arg1_leaves`` and ``arg2_leaves``
        :return: list of feature strings
        """
        from_arg1 = defaultdict(int)
        from_arg2 = defaultdict(int)
        collect = DepTree.get_dependency_rules
        collect(from_arg1, relation.arg1_leaves, False, True)
        collect(from_arg2, relation.arg2_leaves, False, True)

        feat_vec = []
        emit = feat_vec.append
        for key in self.dep_dict:
            if key in from_arg1:
                emit(key + ":1")
                if key in from_arg2:
                    emit(key + ":2")
                    emit(key + ":12")
            elif key in from_arg2:
                emit(key + ":2")

        return feat_vec
Exemplo n.º 9
0
    def eval_worker(self, sentences):
        """Decode every sentence and return the accumulated evaluation.

        Each entry of ``sentences`` is parsed with ``DepTree.parse`` and
        decoded with the global ``trainer``'s decoder; the per-sentence
        similarity is added to a fresh ``decoder.evalclass()`` accumulator.

        Returns the accumulator.  Previously only the last sentence's
        similarity was returned (and an empty ``sentences`` raised NameError).
        """
        # NOTE(review): the accumulator comes from the global ``decoder`` while
        # decoding goes through ``trainer.decoder`` -- confirm both are intended.
        sub = decoder.evalclass()  # global variable trainer

        for example in sentences:
            # have to parse here, not outside, because Tree.words is static
            tree = DepTree.parse(example)
            similarity, _ = trainer.decoder.decode(tree)
            sub += similarity  # do it inside instead of outside

        return sub
Exemplo n.º 10
0
 def __init__(self, sent_id, parse_tree, dep_tree, words):
     """Build a sentence object from its parse tree, dependency tree and words.

     :param sent_id: stored as ``self.id`` and also passed to ``Tree``
     :param parse_tree: passed to ``Tree`` to build ``self.tree``
     :param dep_tree: passed, together with this object, to ``DepTree``
     :param words: non-empty sequence; each entry's element ``[1]`` is a
         mapping with 'CharacterOffsetBegin' / 'CharacterOffsetEnd' keys
     """
     self.leaves = []
     self.id = sent_id
     self.tree = Tree(parse_tree, sent_id)
     # presumably fills self.leaves from self.tree -- helper not visible here
     self.get_leaves()
     self.words = words
     # character span of the sentence: start of the first word, end of the last
     self.begin_offset = words[0][1]['CharacterOffsetBegin']
     self.end_offset = words[-1][1]['CharacterOffsetEnd']
     self.word_ids = []
     self.true_connectives = []
     self.checked_connectives = []
     # DepTree receives this (partially initialised) object itself
     self.depTree = DepTree(self, dep_tree)
     self.clauses = []
     # presumably fills self.clauses -- helper not visible here
     self.break_clauses()
Exemplo n.º 11
0
    def take(self, action, action_gold=False):
        '''Apply ``action`` to this state and yield every resulting state.

        This is a generator.  Action 0 (SHIFT) yields exactly one new state;
        any other action (REDUCE) yields one new state per entry of
        ``self.leftptrs``.  ``action_gold`` is and-ed into the ``gold`` flag of
        each new state.
        '''

        if self.i == self.j == 0:  ## don't count start
            actioncost = 0
        else:
            ## applying the model weights
            actioncost = self.feats(action).dot(
                self.model.weights) if self.model is not None else 0

        if action == 0:  # SHIFT
            # the new state spans [j, j+1) and gets a fresh DepTree(j) on top of the stack
            new = State(self.j, self.j+1, action, \
                        self.stack + [DepTree(self.j)])
            new.inside = 0
            # the shift cost is stored on the predecessor (self), not on the
            # new state; the REDUCE branch reads it back via leftstate.shiftcost
            self.shiftcost = actioncost  # N.B.: self!
            new.score = self.score + actioncost  # forward cost
            new.step = self.step + 1
            new.leftptrs = [self]
            new.backptrs = [(None, action)]  # shift has no children

            new.gold = self.gold and action_gold  # gold is sequentially incremented

            yield new  # shift always returns one unique offspring

        else:  # REDUCE
            for leftstate in self.leftptrs:  # i'm combining with it
                # combine the top tree of the left state with the top tree of this state
                newtree = leftstate.stack[-1].combine(
                    self.stack[-1],
                    action)  # N.B.:theory! NOT self.stack[-2] with -1
                ## N.B.: theory! NOT self.stack[:-2]
                new = State(leftstate.i, self.j, action, \
                            leftstate.stack[:-1] + [newtree])

                # inside cost: both sub-derivations plus the shift that joined
                # them plus this reduce
                new.inside = leftstate.inside + self.inside + \
                             leftstate.shiftcost + actioncost # N.B.

                new.score = leftstate.score + self.inside + leftstate.shiftcost + actioncost  #n.b.
                ## WRONG: new.score = self.score + actioncost # forward cost, only true for non-DP
                new.step = self.step + 1
                new.leftptrs = leftstate.leftptrs
                new.backptrs = [((leftstate, self), action)]

                # meaning of x.gold: first of all, viterbi-inside derivation is gold
                # and also, there is a gold path predicting x (same as earley item: bottom-up + top-down filtering)
                new.gold = leftstate.gold and self.gold and action_gold  # gold is binary

                yield new
Exemplo n.º 12
0
    def load(self, filename):
        """Lazily yield ``DepTree.parse(line)`` for every line of ``filename``.

        The file is opened in a ``with`` block, so it is closed once the
        generator is exhausted or closed rather than being left open until
        garbage collection.
        """
        with open(filename) as infile:
            for line in infile:
                yield DepTree.parse(line)
Exemplo n.º 13
0
 def this_tree(self):
     """Build a DepTree from ``int(self.label)`` and flag it as canonical."""
     ## very careful: sym=False! do not symbolize again
     node = DepTree(int(self.label))
     node.canonical = True
     return node
Exemplo n.º 14
0
    def load(self, lines):
        """Lazily yield ``DepTree.parse(entry)`` for every entry of ``lines``."""
        for text in lines:
            yield DepTree.parse(text)
Exemplo n.º 15
0
def main():
    """Parse (or simulate) every input line and log statistics.

    Each line from ``shell_input()`` is either a gold tree (it starts with
    "(") or a word/tag list.  Per-sentence results and running totals are
    written to ``logs``; trees, k-best lists or forests go to stdout depending
    on FLAGS.  Exits with status 1 when the input is empty.
    """

    if FLAGS.sim is not None:
        # NOTE(review): this handle is never closed inside this function.
        sequencefile = open(FLAGS.sim)

    parser = Parser(model, b=FLAGS.beam)

    print >> logs, "memory usage before parsing: ", human(memory(start_mem))

    # running totals over all sentences
    totalscore = 0
    totalstates = 0
    totaluniq = 0
    totaledges = 0
    totaltime = 0

    totalprec = DepVal()    
    totaloracle = DepVal()

    print >> logs, "gc.collect unreachable: %d" % gc.collect()

    if FLAGS.manual_gc:
        gc.disable()
    
    # i stays 0 when the input is empty; that is checked after the loop
    i = 0
    gctime = 0
    for i, line in enumerate(shell_input(), 1):

        # with manual gc, collect every FLAGS.gc sentences and time it
        if FLAGS.manual_gc and i % FLAGS.gc == 0:
            print >> logs, "garbage collection...",
            tt = time.time()
            print >> logs, "gc.collect unreachable: %d" % gc.collect()
            tt = time.time() - tt
            print >> logs, "took %.1f seconds" % tt
            gctime += tt

        line = line.strip()
        # NOTE(review): a blank input line makes line[0] raise IndexError.
        if line[0]=="(":
            # input is a gold tree (so that we can evaluate)
            reftree = DepTree.parse(line)
            sentence = DepTree.sent # assigned in DepTree.parse()            
        else:
            # input is word/tag list
            reftree = None
            sentence = [tuple(x.rsplit("/", 1)) for x in line.split()]   # split by default returns list            
            DepTree.sent = sentence

        if FLAGS.debuglevel >= 1:
            print >> logs, sentence
            print >> logs, reftree

        mytime.zero()
        
        if FLAGS.sim is not None: # simulation, not parsing
            actions = map(int, sequencefile.readline().split())
            goal, feats = parser.simulate(actions, sentence) #if model is None score=0
            print >> logs, feats
            score, tree = goal.score, goal.top()
            (nstates, nedges, nuniq) = (0, 0, 0)
        else:
            # real parsing
            # NOTE(review): this branch never assigns ``goal``, which the
            # FLAGS.seq block below reads -- confirm FLAGS.seq is only used
            # together with FLAGS.sim.
            if True: #FLAGS.earlystop:
                refseq = reftree.seq() if reftree is not None else None
                tree, myseq, score, _ = parser.try_parse(sentence, refseq, update=False)
                if FLAGS.early:
                    print >> logs, "ref=", refseq
                    print >> logs, "myt=", myseq

                    # NOTE(review): refseq is None for word/tag input, so this
                    # slice would fail -- presumably FLAGS.early needs gold trees.
                    refseq = refseq[:len(myseq)] # truncate
                    _, reffeats = parser.simulate(refseq, sentence) 
                    _, myfeats = parser.simulate(myseq, sentence)
                    print >> logs, "+feats", reffeats
                    print >> logs, "-feats", myfeats
                    
                nstates, nedges, nuniq = parser.stats()
            else:
                # unreachable while the condition above is the literal True
                goal = parser.parse(sentence)
                nstates, nedges, nuniq = parser.stats()

##        score, tree = goal.score, goal.top()
#        score, tree = mytree
            
        dtime = mytime.period()

        # output to stdout: forest dump, 1-best tree, or k-best list
        if not FLAGS.early and not FLAGS.profile:
            if FLAGS.forest:
                parser.dumpforest(i)
            elif FLAGS.output:
                if not FLAGS.kbest:
                    print tree
                else:
                    # k-best format: "sent.<i>\t<n>", n "<score>\t<tree>" lines, blank line
                    stuff = parser.beams[-1][:FLAGS.kbest]
                    print "sent.%d\t%d" % (i, len(stuff))
                    for state in stuff:
                        print "%.2f\t%s" % (state.score, state.tree())
                    print
                    
            if FLAGS.oracle:
                oracle, oracletree = parser.forestoracle(reftree)
                totaloracle += oracle

        prec = DepTree.compare(tree, reftree) # OK if either is None

        searched = sum(x.derivation_count() for x in parser.beams[-1]) if FLAGS.forest else 0
        print >> logs, "sent {i:-4} (len {l}):\tmodelcost= {c:.2f}\tprec= {p:.2%}"\
              "\tstates= {ns} (uniq {uq})\tedges= {ne}\ttime= {t:.3f}\tsearched= {sp}" \
              .format(i=i, l=len(sentence), c=score, p=prec.prec(), \
                      ns=nstates, uq=nuniq, ne=nedges, t=dtime, sp=searched)
        if FLAGS.seq:
            # replay the action sequence and verify it reproduces tree/score/precision
            actions = goal.all_actions()
            print >> logs, " ".join(actions)
            check = simulate(actions, sentence, model) #if model is None score=0
            checkscore = check.score
            checktree = check.top()
            print >> logs, checktree
            checkprec = checktree.evaluate(reftree)
            print >> logs, "verify: tree:%s\tscore:%s\tprec:%s" % (tree == checktree, score == checkscore, prec == checkprec)
            print >> logs, "sentence %-4d (len %d): modelcost= %.2lf\tprec= %.2lf\tstates= %d (uniq %d)\tedges= %d\ttime= %.3lf" % \
                  (i, len(sentence), checkscore, checkprec.prec100(), nstates, nuniq, nedges, dtime)

        totalscore += score
        totalstates += nstates
        totaledges += nedges
        totaluniq += nuniq
        totaltime += dtime

        totalprec += prec

    if i == 0:
        print >> logs, "Error: empty input."
        sys.exit(1)

    # NOTE(review): the divisions below (here and in the summary line) floor
    # for integer operands under Python 2 unless the module enables true
    # division -- confirm.
    if FLAGS.featscache:
        print >> logs, "feature constructions: tot= %d shared= %d (%.2f%%)" % (State.tot, State.shared, State.shared / State.tot * 100)

    # NOTE(review): the parser was built with FLAGS.beam but the summary reads
    # FLAGS.b -- presumably aliases; verify.
    print >> logs, "beam= {b}, avg {a} sents,\tmodelcost= {c:.2f}\tprec= {p:.2%}" \
          "\tstates= {ns:.1f} (uniq {uq:.1f})\tedges= {ne:.1f}\ttime= {t:.4f}\n{d:s}" \
          .format(b=FLAGS.b, a=i, c=totalscore/i, p=totalprec.prec(), 
                  ns=totalstates/i, uq=totaluniq/i, ne=totaledges/i, t=totaltime/i, 
                  d=totalprec.details())
    
    if FLAGS.uniqstat:
        # per key: mean, min and max of the collected values (i is reused as the key)
        for i in sorted(uniqstats):
            print >> logs, "%d\t%.1lf\t%d\t%d" % \
                  (i, sum(uniqstats[i]) / len(uniqstats[i]), \
                   min(uniqstats[i]), max(uniqstats[i]))

    if FLAGS.oracle:
        print >> logs, "oracle= ", totaloracle

    if FLAGS.manual_gc:
        print >> logs, "garbage collection took %.1f seconds" % gctime

    print >> logs, "memory usage after parsing: ", human(memory(start_mem))
    if FLAGS.mydouble:
        from mydouble import counts
        print >> logs, "mydouble usage and freed: %d %d" % counts()
Exemplo n.º 16
0
FLAGS = flags.FLAGS

if __name__ == "__main__":
    # Script entry point: for every gold tree, read its k-best list from stdin
    # and keep the candidate that evaluates best against the gold tree.

    # TODO: automatically figure out maxk
    flags.DEFINE_integer("maxk", 128, "maxk")

    # Show the usage message when the gold-tree argument is missing or the
    # file cannot be opened.  Only these two failures are caught, so Ctrl-C,
    # SystemExit and unrelated errors are no longer swallowed.
    try:
        file = open(sys.argv[1])
    except (IndexError, IOError):
        print >> logs, "Usage: cat <kbest-lists> | ./kbest_oracles.py <goldtrees>"
        sys.exit(1)

    tot = defaultdict(lambda: DepVal())

    for sid, reftree in enumerate(DepTree.load(sys.argv[1]), 1):

        # header line of a k-best list: "<sentence id> <k>"
        sentid, k = sys.stdin.readline().split()
        k = int(k)

        best = -1
        besttree = None
        for i in range(1, k + 1):
            # candidate line: "<score>\t<tree>"
            score, tree = sys.stdin.readline().split("\t")
            score = float(score)
            tree = DepTree.parse(tree)

            ev = reftree.evaluate(tree)
            if ev > best:
                best = ev
                besttree = tree
Exemplo n.º 17
0
 def get_dependency_rule(self, rule_dict, with_leaf=False, with_label=True):
     """Collect dependency rules of both arguments into ``rule_dict``.

     ``with_leaf`` and ``with_label`` are forwarded unchanged to
     ``DepTree.get_dependency_rules``; arg1 is processed before arg2.
     """
     for attr in ("arg1_leaves", "arg2_leaves"):
         DepTree.get_dependency_rules(rule_dict, getattr(self, attr),
                                      with_leaf, with_label)
Exemplo n.º 18
0
 def get_dependency_rule(self, rule_dict, with_leaf=False, with_label=True):
     """Add the dependency rules of arg1, then arg2, to ``rule_dict``.

     Both flags are passed straight through to
     ``DepTree.get_dependency_rules``.
     """
     collect = DepTree.get_dependency_rules
     collect(rule_dict, self.arg1_leaves, with_leaf, with_label)
     collect(rule_dict, self.arg2_leaves, with_leaf, with_label)
Exemplo n.º 19
0
def work(line, i):
    """Parse (or simulate) one input line and return its printable output.

    ``line`` is either a gold tree (it starts with "(") or a word/tag list;
    ``i`` is the sentence number used in log lines and k-best headers.
    Per-sentence statistics are logged to ``logs`` and added to the
    module-level totals.

    Returns the text to print for this sentence: the 1-best tree, or the
    k-best list (header line, one "<score>\\t<tree>" line per state, and a
    terminating empty line).  Returns "" when nothing is to be printed
    (early-stop, profile or forest mode).
    """
    global totalscore, totalstates, totaledges, totaluniq, totaltime, totalprec
    # totaloracle is augmented below; without this declaration the += would
    # raise UnboundLocalError.
    global totaloracle

    # Default output, so the final return is defined on every path.
    toprint = ""

    line = line.strip()
    if line[0] == "(":
        # input is a gold tree (so that we can evaluate)
        reftree = DepTree.parse(line)
        sentence = DepTree.sent  # assigned in DepTree.parse()
    else:
        # input is word/tag list
        reftree = None
        sentence = [tuple(x.rsplit("/", 1))
                    for x in line.split()]  # split by default returns list
        DepTree.sent = sentence

    if FLAGS.debuglevel >= 1:
        print >> logs, sentence
        print >> logs, reftree

    mytime.zero()

    if FLAGS.sim is not None:  # simulation, not parsing
        actions = map(int, sequencefile.readline().split())
        goal, feats = parser.simulate(actions,
                                      sentence)  #if model is None score=0
        print >> logs, feats
        score, tree = goal.score, goal.top()
        (nstates, nedges, nuniq) = (0, 0, 0)
    else:
        # real parsing
        # NOTE(review): this branch never assigns ``goal``, which the
        # FLAGS.seq block below reads -- confirm FLAGS.seq is only used
        # together with FLAGS.sim.
        if True:  #FLAGS.earlystop:
            refseq = reftree.seq() if reftree is not None else None
            tree, myseq, score = parser.try_parse(sentence,
                                                  refseq,
                                                  early_stop=FLAGS.earlystop)
            if FLAGS.earlystop:
                print >> logs, "ref=", refseq
                print >> logs, "myt=", myseq

                refseq = refseq[:len(myseq)]  # truncate
                _, reffeats = parser.simulate(refseq, sentence)
                _, myfeats = parser.simulate(myseq, sentence)
                print >> logs, "feat diff=", Model.trim(reffeats - myfeats)

            nstates, nedges, nuniq = parser.stats()
        else:
            goal = parser.parse(sentence)
            nstates, nedges, nuniq = parser.stats()

    dtime = mytime.period()

    if not FLAGS.earlystop and not FLAGS.profile:
        if FLAGS.forest:
            parser.dumpforest(i)
        else:
            if not FLAGS.kbest:
                toprint = str(tree)
            else:
                stuff = parser.beams[-1][:FLAGS.kbest]
                # Build the k-best list as text (the old code tried to add a
                # list to a string, which raises TypeError).
                out = ["sent.%d\t%d" % (i, len(stuff))]
                out.extend("%.2f\t%s" % (state.score, state.tree())
                           for state in stuff)
                out.append("")  # empty line terminates the list
                toprint = "\n".join(out)

        if FLAGS.oracle:
            oracle, oracletree = parser.forestoracle(reftree)
            totaloracle += oracle

    prec = DepTree.compare(tree, reftree)  # OK if either is None

    searched = sum(x.derivation_count()
                   for x in parser.beams[-1]) if FLAGS.forest else 0
    print >> logs, "sent {i:-4} (len {l}):\tmodelcost= {c:.2f}\tprec= {p:.2%}"\
          "\tstates= {ns} (uniq {uq})\tedges= {ne}\ttime= {t:.3f}\tsearched= {sp}" \
          .format(i=i, l=len(sentence), c=score, p=prec.prec(), \
                  ns=nstates, uq=nuniq, ne=nedges, t=dtime, sp=searched)
    if FLAGS.seq:
        # replay the action sequence and verify it reproduces tree/score/precision
        actions = goal.all_actions()
        print >> logs, " ".join(actions)
        check = simulate(actions, sentence, model)  #if model is None score=0
        checkscore = check.score
        checktree = check.top()
        print >> logs, checktree
        checkprec = checktree.evaluate(reftree)
        print >> logs, "verify: tree:%s\tscore:%s\tprec:%s" % \
              (tree == checktree, score == checkscore, prec == checkprec)
        print >> logs, "sentence %-4d (len %d): modelcost= %.2lf\tprec= %.2lf\tstates= %d (uniq %d)\tedges= %d\ttime= %.3lf" % \
              (i, len(sentence), checkscore, checkprec.prec100(), nstates, nuniq, nedges, dtime)

    totalscore += score
    totalstates += nstates
    totaledges += nedges
    totaluniq += nuniq
    totaltime += dtime

    totalprec += prec

    return toprint
Exemplo n.º 20
0
if __name__ == "__main__":

    flags.DEFINE_integer("cutoff", 1, "cut off freq")
    flags.DEFINE_integer("cutoff_char", 1, "cut off freq for chars")

    argv = FLAGS(sys.argv)

    word_dict = defaultdict(lambda: [0, defaultdict(int)])
    unktags = defaultdict(int)

    for line in sys.stdin:
        line = line.strip()
        if line == "":
            continue

        reftree = DepTree.parse(line)
        words = DepTree.words
        tags = reftree.tagseq()
        for w, t in zip(words, tags):
            word_dict[w][0] += 1
            word_dict[w][1][t] += 1

    unkcnt = 0
    for w in sorted(word_dict):
        freq, tags = word_dict[w]
        if freq <= FLAGS.cutoff:
            for t in tags:
                unktags[t] += tags[t]
                unkcnt += freq
        else:
            print "%s\t%d\t%s" % (w, freq, sorttags(tags))