Пример #1
0
    def rescore_deduction(self, ded, models, weights, memo, add=False):
        """Rescore a single deduction under ``models`` and ``weights``.

        Every antecedent of ``ded`` is rescored first (with
        ``check_states=True``) and the vectors they return are summed.
        Unless ``add`` is true, ``ded.dcost`` is reset to an empty vector;
        each model's transition cost is then accumulated into it.
        ``ded.viterbi`` is set to ``weights.dot`` of the antecedent sum plus
        ``ded.dcost``.

        Returns:
            A pair ``(vector, states)``: the summed vector and a list with
            one state per model.
        """
        total = svector.Vector()
        for antecedent in ded.ants:
            total += antecedent.rescore(
                models, weights, memo, add=add, check_states=True)

        if not add:
            ded.dcost = svector.Vector()

        states = []
        for index, mdl in enumerate(models):
            antstates = [antecedent.states[index] for antecedent in ded.ants]
            if ded.rule is not None:
                # The first antecedent's j is only passed for deductions
                # with exactly two antecedents.
                if len(ded.ants) == 2:
                    j1 = ded.ants[0].j
                else:
                    j1 = None
                state, mdcost = mdl.transition(ded.rule, antstates,
                                               self.i, self.j, j1)
            elif len(antstates) == 1:  # goal item
                mdcost = mdl.finaltransition(antstates[0])
                state = None
            states.append(state)
            ded.dcost += mdcost

        total += ded.dcost
        ded.viterbi = weights.dot(total)
        return total, states
Пример #2
0
    def send_weights(self):
        """Push the current weights to the server.

        On the first call (``self.prev_weights is None``) the full weight
        vector is sent with ``setWeights``; on later calls only the
        difference from the previously sent weights is sent with
        ``addWeights``.  Every value is negated before sending.  Features
        named ``_core<i>`` fill slot ``i`` of a dense list of length
        ``self.n_core_features``; all other features go into a sparse vector.
        A copy of the current weights is then kept in ``self.prev_weights``.
        """
        first_send = self.prev_weights is None
        if first_send:
            outgoing = self.weights
        else:
            outgoing = self.weights - self.prev_weights
            outgoing.compact()

        dense = [0.] * self.n_core_features
        sparse = svector.Vector()
        for name in outgoing:
            if name.startswith('_core'):
                # The digits after the "_core" prefix select the dense slot.
                slot = int(name[5:])
                dense[slot] = -outgoing[name]
            else:
                sparse[name] = -outgoing[name]

        request = {'core-weights' : ','.join(str(x) for x in dense),
                   'sparse-weights': str(sparse)}

        if first_send:
            log.write("setWeights(%s)\n" % request)
            self.server.setWeights(request)
        else:
            log.write("addWeights(%s)\n" % request)
            self.server.addWeights(request)
        self.prev_weights = svector.Vector(self.weights)
Пример #3
0
def make_decoder():
    """Build a Decoder configured from the module-level ``opts``.

    ``opts.feature_weights`` is either an inline weight string (recognised
    by containing ``'='``) or the path of a file whose contents are such a
    string.  The parsed vector is negated before being stored on the
    decoder.  When the option is empty the decoder gets an empty vector.

    Returns:
        The newly created Decoder.
    """
    thedecoder = Decoder(opts.decoder, opts.n_core_features)
    spec = opts.feature_weights
    if not spec:
        thedecoder.weights = svector.Vector()
    elif '=' in spec:
        thedecoder.weights = -svector.Vector(spec)
    else:
        # Close the weights file deterministically instead of leaving the
        # handle for the garbage collector.
        with open(spec) as weights_file:
            thedecoder.weights = -svector.Vector(weights_file.read())
    return thedecoder
Пример #4
0
 def __init__(self, flen=0, elen=0):
     """Create an empty alignment object.

     ``flen`` and ``elen`` are accepted but not used by this constructor.
     """
     # No links yet; every scalar score starts at zero.
     self.links = []
     self.score = self.fscore = 0
     self.hope = self.fear = 0
     # Local feature vector, plus a separate one for non-local features.
     self.scoreVector = svector.Vector()
     self.scoreVector_nonlocal = svector.Vector()
     # Not known at construction time.
     self.position = self.boundingBox = None
Пример #5
0
    def seed(self, flattice, grammars, models, weights):
        """Seed the chart with its initial items for a lattice input.

        Stores ``models`` and ``weights`` on the chart, then:

        1. pairs every ``Grammar`` with a fresh DotChart and adds an empty
           dotted item at each position accepted by ``filterspan``;
        2. feeds the lattice to every ``NewGrammar`` and adds each rule it
           returns for a span (i, j) as an axiom;
        3. adds a pass-through rule (scored with the ``unknown`` feature)
           for every lattice edge under each default nonterminal.

        NOTE(review): the loops read ``self.flattice`` rather than the
        ``flattice`` argument; presumably it is assigned elsewhere -- confirm.
        """
        self.models = models
        self.weights = weights

        # Seed the dotchart. This will give the extracted rules.
        dotcharts = []
        for grammar in grammars:
            if isinstance(grammar, Grammar):
                dotcharts.append((grammar, DotChart(self, flattice)))
        self.grammars = dotcharts

        for grammar, chart in self.grammars:
            for start in xrange(self.flattice.n - 1):
                if not grammar.filterspan(self.flattice, start, start):
                    continue
                chart.add(grammar.root, start, start, ())
                self.dot_added += 1

        for grammar in grammars:
            if not isinstance(grammar, NewGrammar):
                continue
            grammar.input(flattice)
            for start in xrange(self.flattice.n - 1):
                for stop in xrange(start + 1, self.flattice.n):
                    for (axiom, ) in grammar.get_rules(start, stop):
                        estimate_rule(axiom, models, weights)
                        self.add_axiom(start, stop, axiom)

        # Last resort for unknown French word: pass it through
        for edge in flattice.edges:
            for nonterminal in self.default_nonterminals:
                passthrough = rule.Rule(nonterminal, [edge.w], [edge.w],
                                        scores=svector.Vector('unknown', 1.))
                estimate_rule(passthrough, models, weights)
                self.add_axiom(edge.i, edge.j, passthrough)
Пример #6
0
    def seed(self, input, grammars, models, weights):
        """Seed the chart with its initial items for a sentence input.

        The words in ``input.fwords`` are converted with ``sym.fromstring``.
        ``models`` and ``weights`` are stored on the chart, then:

        1. every ``Grammar`` is paired with a fresh DotChart and an empty
           dotted item is added at each position accepted by ``filterspan``;
        2. every ``NewGrammar`` is given the input and each rule it returns
           for a span (i, j) is added as an axiom;
        3. each single word gets a pass-through rule (scored with the
           ``unknown`` feature) under every default nonterminal.

        NOTE(review): ``self.n`` is read but not set here; presumably it is
        the sentence length assigned elsewhere -- confirm.
        """
        words = [sym.fromstring(f) for f in input.fwords]
        self.models = models
        self.weights = weights

        # Seed the dotchart. This will give the extracted rules.
        dotcharts = []
        for grammar in grammars:
            if isinstance(grammar, Grammar):
                dotcharts.append((grammar, DotChart(self, words)))
        self.grammars = dotcharts

        for grammar, chart in self.grammars:
            for start in xrange(self.n):
                if not grammar.filterspan(start, start, self.n):
                    continue
                chart.add(grammar.root, start, start, ())
                self.dot_added += 1

        for grammar in grammars:
            if not isinstance(grammar, NewGrammar):
                continue
            grammar.input(input)
            for start in xrange(self.n):
                for stop in xrange(start + 1, self.n + 1):
                    for (axiom,) in grammar.get_rules(start, stop):
                        estimate_rule(axiom, models, weights)
                        self.add_axiom(start, stop, axiom)

        # Last resort for unknown French word: pass it through
        for pos in xrange(len(words)):
            for nonterminal in self.default_nonterminals:
                passthrough = rule.Rule(nonterminal,
                                        rule.Phrase(words[pos:pos + 1]),
                                        rule.Phrase(words[pos:pos + 1]),
                                        scores=svector.Vector('unknown', 1.))
                estimate_rule(passthrough, models, weights)
                self.add_axiom(pos, pos + 1, passthrough)
Пример #7
0
    def reweight(self, weights, memo=None):
        """Recompute ``self.viterbi`` under ``weights`` and return its vector.

        Unlike the decoder, ``weights.dot`` is only ever applied to the
        vector of a whole subderivation, which is convenient when
        ``weights.dot`` is overridden.  Results are cached in ``memo`` keyed
        by ``id(self)``, so an item reached several times is computed once.

        Returns:
            The vector of the lowest-scoring deduction, or ``None`` when the
            item has no deductions.
        """
        if memo is None:
            memo = {}
        key = id(self)
        if key in memo:
            return memo[key]

        best = None
        for ded in self.deds:
            candidate = svector.Vector()
            for ant in ded.ants:
                candidate += ant.reweight(weights, memo)
            candidate += ded.dcost

            ded.viterbi = weights.dot(candidate)
            # Keep the first deduction unconditionally, later ones only when
            # they score strictly lower than the best so far.
            if best is not None and not (ded.viterbi < self.viterbi):
                continue
            best = candidate
            self.viterbi = ded.viterbi

        memo[key] = best
        return best
Пример #8
0
 def delta_mweights(self):
     """Return the negated sum of the per-instance contributions.

     Each instance adds its hope vector scaled by the learning rate and
     subtracts every hypothesis vector scaled by that hypothesis's alpha
     and the learning rate.  The negation of the total is returned.
     """
     total = svector.Vector()
     for inst in self.instances:
         total += self.learning_rate * inst.hope.mvector
         for hypothesis in inst.hyps:
             scale = hypothesis.alpha * self.learning_rate
             total -= scale * hypothesis.mvector
     return -total
Пример #9
0
 def __init__(self, i, j, x, f, e, v=None):
     """Store the fields of this object.

     ``i``, ``j``, ``x``, ``f`` and ``e`` are stored unchanged.  ``v`` is
     copied into a fresh ``svector.Vector``; when it is omitted (or
     ``None``) the vector is built from an empty dict, exactly as the
     former ``v={}`` default did.
     """
     self.i = i
     self.j = j
     self.x = x
     self.f = f
     self.e = e
     # None stands in for "no scores" so that no mutable object is shared
     # between calls through the default argument.
     self.v = svector.Vector({} if v is None else v)
Пример #10
0
 def clean(self, v):
     """Copy ``v`` and delete every feature listed in ``self.feats``.

     The argument itself is left untouched; the stripped copy is returned.
     """
     stripped = svector.Vector(v)
     for name in self.feats:
         del stripped[name]
     return stripped
Пример #11
0
def sbmt_vector(s):
    """Parse a ``feat:value,feat:value`` string into an svector.Vector.

    Each comma-separated field is split at its first colon and the value
    part is converted with ``float``.  A falsy ``s`` (empty string or
    ``None``) yields an empty vector.
    """
    vec = svector.Vector()
    if not s:
        return vec
    for field in s.split(","):
        name, raw = field.split(":", 1)
        vec[name] = float(raw)
    return vec
Пример #12
0
def update_weights(weights, updates, alphas):
    """Run optimisation passes over all sentences until nothing changes.

    Objective (sequential minimum optimization, per the original notes):
    minimise 1/2 ||sum(updates)||**2 + C*sum(xis), with one xi shared by
    all candidates of a sentence, subject to each margin >= loss - xi and
    each xi >= 0.

    Each pass optionally applies the L1 step to every feature in
    ``weights`` and then calls ``update_sentence_weights`` for every
    sentence with at least two candidates.  Passes repeat until a pass in
    which no call completed (each was skipped or raised
    ``StopOptimization``), or until more than 1000 passes have run.

    Returns:
        The pair ``(weights, alphas)``.
    """
    # These are not sensitive to feature_scales, but maybe they should be.
    # This is not right -- gammas should be preserved across calls.
    if l1_regularization:
        gammas = svector.Vector()

    passes = 0
    while True:
        if l1_regularization:
            # Largest magnitude gammas[f] may reach.
            bound = l1_regularization * max_learning_rate * len(updates)
            for f in weights:
                delta = max(-bound - gammas[f],
                            min(weights[f], bound - gammas[f]))
                gammas[f] += delta
                weights[f] -= delta
            if log.level >= 4:
                log.write("  gammas: %s\n" % gammas)

        progressed = False
        for sentid in updates.keys():
            if len(updates[sentid]) < 2:
                continue
            if log.level >= 4:
                log.write("  sentence %s\n" % sentid)
            try:
                weights, alphas[sentid] = update_sentence_weights(
                    weights, updates[sentid], alphas[sentid])
            except StopOptimization:
                pass
            else:
                progressed = True

            if log.level >= 4:
                log.write("    alphas: %s\n" %
                          (" ".join(str(alpha) for alpha in alphas[sentid])))

        passes += 1
        if passes > 1000:
            log.write("  SMO: 1000 passes through data, stopping\n")
            break
        if not progressed:
            break

    return weights, alphas
Пример #13
0
 def clear(self):
     """Reset links, scores, score vector, position and bounding box."""
     self.links = []
     # All scalar scores go back to zero.
     self.score = self.fscore = 0
     self.hope = self.fear = 0
     self.scoreVector = svector.Vector()
     self.position = self.boundingBox = None
Пример #14
0
 def estimate(self, r):
     """Return the oracle length features for rule ``r``.

     ``oracle.srclen`` and ``oracle.candlen`` are the lengths of ``r.f``
     and ``r.e`` minus their arities.  ``oracle.reflen`` is the reference
     length pro-rated by the share of the source covered; when
     ``self.srclen`` is zero the full reference length is used instead.
     """
     srclen = len(r.f) - r.f.arity()
     candlen = len(r.e) - r.e.arity()

     v = svector.Vector()
     v["oracle.srclen"] = srclen
     v["oracle.candlen"] = candlen

     # pro-rate reference length
     try:
         reflen = float(srclen) / self.srclen * self.reflen
     except ZeroDivisionError:
         reflen = self.reflen
     v["oracle.reflen"] = reflen
     return v
Пример #15
0
def estimate_rule(r, models, weights):
    """Put a lower-bound estimate inside the rule and return the full one.

    Every model's ``estimate(r)`` is collected.  Estimates of stateless
    models are summed into ``r.statelesscost`` (reset at the start);
    estimates of the other models are summed separately.  The returned
    value is ``weights.dot`` applied to the sum of both parts.
    """
    r.statelesscost = svector.Vector()
    stateful_total = svector.Vector()

    for mdl in models:
        estimate = mdl.estimate(r)
        if not mdl.stateless:
            stateful_total += estimate
            continue
        r.statelesscost += estimate

    stateful_total += r.statelesscost
    return weights.dot(stateful_total)
Пример #16
0
 def input():
     """Yield ``(key, rule, scores)`` for each tab-separated line on stdin.

     The third field is parsed into an ``svector.Vector``.  A line that
     cannot be split into exactly three fields is reported on stderr and
     the error is re-raised.  An exception is also raised when the vector
     already contains the ``feat_prob`` feature.
     """
     for line in sys.stdin:
         try:
             key, rule_text, raw_scores = line.split("\t")
         except Exception:
             sys.stderr.write("bad line: %s\n" % line.rstrip())
             raise
         parsed = svector.Vector(raw_scores)
         if feat_prob in parsed:
             raise Exception("feature %s already present" % feat_prob)
         yield key, rule_text, parsed
Пример #17
0
 def from_str(s):
     """Parse a ``" ||| "``-separated rule string into a Rule.

     The first three fields are the left-hand side, the French right-hand
     side and the English right-hand side.  An optional fourth field is
     parsed as the score vector and an optional fifth field is stored
     (stripped) as the ``align`` attribute.
     """
     fields = s.split(" ||| ")
     field_count = len(fields)

     r = Rule(Nonterminal.from_str(fields[0].strip()),
              [Nonterminal.from_str(tok) for tok in fields[1].split()],
              [Nonterminal.from_str(tok) for tok in fields[2].split()])
     if field_count >= 4:
         r.scores = svector.Vector(fields[3])
     if field_count >= 5:
         r.attrs = Attributes()
         r.attrs['align'] = fields[4].strip()
     return r
Пример #18
0
def make_forest(fieldss):
    """Build a forest from a sequence of hypothesis records.

    Each record is a mapping with at least a ``hyp`` key.  A record whose
    ``hyp`` is 0 gets a deduction with an empty rule and no antecedent.
    Any other record gets a deduction whose single antecedent is the node
    named by ``back``, whose rule prepends an indexed PHRASE symbol to the
    source and target phrases, and whose cost vector is parsed from
    ``scores`` (core values are stored as ``_core<i>``).  Records with a
    negative ``forward`` mark their node as a goal.

    Returns:
        A new goal item with one rule-less deduction per goal node.
    """
    nodes = {}
    goal_ids = set()
    for record in fieldss:
        node_id = record['hyp']
        if node_id not in nodes:
            nodes[node_id] = forest.Item(sym.fromtag('PHRASE'), 0, 0, [])
        node = nodes[node_id]

        if node_id == 0:
            empty_rule = rule.Rule(sym.fromtag('PHRASE'),
                                   rule.Phrase([]), rule.Phrase([]))
            node.deds.append(
                forest.Deduction((), empty_rule, svector.Vector()))
            continue

        match = scores_re.match(record['scores'])
        core_values = [float(x) for x in match.group(1).split(',')]
        dcost = svector.Vector(match.group(2).encode('utf8'))
        for index, value in enumerate(core_values):
            dcost["_core%d" % index] = value

        antecedent = nodes[int(record['back'])]
        src_words = record['src-phrase'].encode('utf8').split()
        tgt_words = record['tgt-phrase'].encode('utf8').split()
        covered = int(record['cover-end']) - int(record['cover-start']) + 1
        if len(src_words) != covered:
            sys.stderr.write("warning: French phrase length didn't match covered length\n")

        # Both sides start with the PHRASE nonterminal indexed 1.
        f_side = rule.Phrase(
            [sym.setindex(sym.fromtag('PHRASE'), 1)] + src_words)
        e_side = rule.Phrase(
            [sym.setindex(sym.fromtag('PHRASE'), 1)] + tgt_words)
        phrase_rule = rule.Rule(sym.fromtag('PHRASE'), f_side, e_side)

        node.deds.append(forest.Deduction((antecedent,), phrase_rule, dcost))

        if int(record['forward']) < 0:  # goal
            goal_ids.add(node_id)

    goal = forest.Item(None, 0, 0, [])
    for goal_id in goal_ids:
        goal.deds.append(
            forest.Deduction((nodes[goal_id],), None, svector.Vector()))
    return goal
Пример #19
0
   def createEdge(self, childEdges, currentNode, span):
     """
     Create a new edge from the edges in 'childEdges'.

     Steps:
     (1) initialize a PartialGridAlignment with empty score vectors
     (2) concatenate the links and sum the score vectors of all children
     (3) fill in any missing child bounding box
     (4) score the new edge with scoreEdge(newEdge, ...)

     Returns the pair (newEdge, boundingBox), where boundingBox is the one
     reported by scoreEdge.
     """
     newEdge = PartialGridAlignment()
     newEdge.scoreVector_local = svector.Vector()
     newEdge.scoreVector = svector.Vector()

     for child in childEdges:
         newEdge.links += child.links
         newEdge.scoreVector_local += child.scoreVector_local
         newEdge.scoreVector += child.scoreVector

         if child.boundingBox is None:
             child.boundingBox = self.boundingBox(child.links)

     # Only the bounding box is needed from the scoring result.
     _, box = self.scoreEdge(newEdge, currentNode, span, childEdges)
     return newEdge, box
Пример #20
0
    def estimate(self, r):
        """Count, per n-gram order, the n-grams of ``r.e`` found in the refs.

        The English side is flattened with every nonterminal replaced by
        ``HOLE``.  For each order from 1 to ``self.order`` the number of
        windows whose tuple is in ``self.refngrams`` is stored under
        ``self.feat[order - 1]``.  Rules whose English side holds nothing
        but nonterminals return ``model.zero``.
        """
        if len(r.e) == r.e.arity():
            return model.zero  # a hack to avoid having estimate the glue rule

        match = svector.Vector()
        state = r.e.subst((), ((HOLE, ), ) * r.e.arity())
        for order in xrange(1, self.order + 1):
            hits = sum(1 for start in xrange(len(state) - order + 1)
                       if tuple(state[start:start + order]) in self.refngrams)
            match[self.feat[order - 1]] = hits

        return match
Пример #21
0
    def compute_item(self, r, ants, i, j):
        """Compute the pieces of information that go into an Item.

        Returns ``(heuristic, (cost, dcost, newstates))`` where

        * heuristic (float) is used for comparing Items,
        * cost (float) is the cost of the resulting Item,
        * dcost (Vector) is to be stored in the Deduction,
        * newstates holds one state per model (``None`` for stateless ones).

        This is separate from Item.__init__() so that creation of an Item
        object can be aborted as early as possible.  It did not really need
        to be a method of Chart.
        """
        models = self.models
        weights = self.weights

        cost = sum(ant.viterbi for ant in ants)
        dcost = svector.Vector(r.statelesscost)
        bonus = svector.Vector()
        newstates = [None] * len(models)

        # The first antecedent's j is only passed for binary rules.
        j1 = ants[0].j if r.arity() == 2 else None

        for index, mdl in enumerate(models):
            if mdl.stateless:
                continue
            antstates = [ant.states[index] for ant in ants]
            state, mdcost = mdl.transition(r, antstates, i, j, j1)
            bonus += mdl.bonus(r.lhs, state)
            newstates[index] = state
            dcost += mdcost

        cost += weights.dot(dcost)
        heuristic = cost + weights.dot(bonus)
        return (heuristic, (cost, dcost, newstates))
Пример #22
0
 def __init__(self, x, order=4, add=None, scale=True):
     """Initialise the vector from ``x`` and cache the additive terms.

     ``add`` is a vector of oracle counts read at construction time; when
     it is ``None`` an empty vector is used.  The feature names
     ``oracle.match<o>`` / ``oracle.guess<o>`` for ``o`` below ``order``
     are precomputed, together with the matching values from ``add`` and
     its candidate, reference and source lengths.
     """
     svector.Vector.__init__(self, x)
     self.order = order
     self.add = svector.Vector() if add is None else add  # add zero
     self.matchfeat = ["oracle.match%d" % o for o in xrange(order)]
     self.guessfeat = ["oracle.guess%d" % o for o in xrange(order)]
     # Same names as above, looked up in the additive vector.
     self.addmatch = [self.add[name] for name in self.matchfeat]
     self.addguess = [self.add[name] for name in self.guessfeat]
     self.addcandlen = self.add["oracle.candlen"]
     self.addreflen = self.add["oracle.reflen"]
     self.addsrclen = self.add["oracle.srclen"]
     self.scale = scale
Пример #23
0
    def __init__(self, filename, feat, mapdigits=False, p_unk=None):
        """Load an n-gram language model from ``filename``.

        When ``p_unk`` is given, its negation is passed to ``Ngram`` as
        ``override_unk``.  ``feat`` is the feature name used for the unit
        vector, and ``mapdigits`` is stored as given.  The word ids of
        ``<s>`` and ``</s>`` are looked up once and cached.
        """
        model.Model.__init__(self)

        log.write("Reading language model from %s...\n" % filename)
        if p_unk is None:
            self.ngram = Ngram(filename)
        else:
            self.ngram = Ngram(filename, override_unk=-p_unk)

        self.mapdigits = mapdigits
        self.unit = svector.Vector(feat, 1.)
        self.order = self.ngram.order

        # Sentence boundary symbols.
        self.START = self.ngram.lookup_word("<s>")
        self.STOP = self.ngram.lookup_word("</s>")
Пример #24
0
    def scoreEdge(self, edge, currentNode, srcSpan, childEdges):
      """
      Score an edge.
      (1) edge: new hyperedge in the alignment forest, tail of this hyperedge are the edges in childEdges
      (2) currentNode: the currentNode in the tree
      (3) srcSpan: span (i, j) of currentNode; i = index of first terminal node in span, j = index of last terminal node in span
      (4) childEdges: the two (or more in case of general trees) nodes we are combining with a new hyperedge

      When self.COMPUTE_ORACLE is set, edge.fscore is updated.  When
      self.DO_RESCORE is set, edge.boundingBox, edge.scoreVector and
      edge.score are recomputed; otherwise they are left as they are.

      Returns the pair (edge.score, boundingBox); boundingBox is None when
      rescoring is off or the edge has no links.
      """
      if self.COMPUTE_ORACLE:
          edge.fscore = self.ff_fscore(edge, srcSpan)

      boundingBox = None
      if self.DO_RESCORE:
          ##################################################################
          # Compute data needed for certain feature functions
          ##################################################################
          tgtSpan = None
          if len(edge.links) > 0:
              boundingBox = self.boundingBox(edge.links)
              tgtSpan = (boundingBox[0][0], boundingBox[1][0])
          edge.boundingBox = boundingBox

          # TODO: This is an awful O(l) patch of code
          # Map each f index to the list of e indices it is linked to.
          linkedIndices = defaultdict(list)
          for link in edge.links:
              linkedIndices[link[0]].append(link[1])

          # Work on a copy so the incoming vector is not modified in place.
          scoreVector = svector.Vector(edge.scoreVector)

          # Compare the node label by value: "is not" against a string
          # literal tests object identity, which only works by accident of
          # string interning.
          if currentNode.data is not None and currentNode.data != '_XXX_':
              for func in self.featureTemplates_nonlocal:
                  value_dict = func(self.info, currentNode, edge, edge.links, srcSpan, tgtSpan, linkedIndices, childEdges, self.diagValues, self.treeDistValues)
                  for name, value in value_dict.iteritems():
                      # Zero-valued features are not stored.
                      if value != 0:
                          scoreVector[name] = value
          edge.scoreVector = scoreVector

          ##################################################
          # Compute final score for this partial alignment
          ##################################################
          edge.score = edge.scoreVector.dot(self.weights)

      return edge.score, boundingBox
Пример #25
0
 def expand_goal(self, bin1):
     """Add a goal item for every item in ``bin1`` rooted in the start symbol.

     ``bin1`` yields ``(cost, item)`` pairs.  For each item whose ``x``
     equals ``self.start_nonterminal`` the final-transition costs of all
     models are summed, a rule-less deduction over that item is created,
     and a new goal item ending at ``self.flattice.n - 1`` is added to
     ``self.goal`` with the combined cost.
     """
     for (cost1, item1) in bin1:
         if not (item1.x == self.start_nonterminal):
             continue
         if log.level >= 3:
             log.write("Considering: %s\n" % str(item1))

         final_costs = (m.finaltransition(item1.states[m_i])
                        for (m_i, m) in enumerate(self.models))
         dcost = sum(final_costs, svector.Vector())
         cost = item1.viterbi + self.weights.dot(dcost)

         ded = forest.Deduction((item1, ), None, dcost, viterbi=cost)
         goal_item = forest.Item(None,
                                 0,
                                 self.flattice.n - 1,
                                 deds=[ded],
                                 states=(),
                                 viterbi=cost)
         self.goal.add(cost, goal_item)
Пример #26
0
 def send_weights(self, delta=None, input=''):
     """Send the current weights as a ``weights`` instruction.

     With ``delta`` true, only the non-zero differences from the last sent
     weights are sent (``weights diff``, values formatted with an explicit
     sign); otherwise the full vector is sent.  ``delta`` defaults to the
     module-level ``default_delta`` and is forced off while
     ``self.oldweights`` is empty.  Afterwards a copy of the current
     weights is stored in ``self.oldweights``.
     """
     if delta is None:
         delta = default_delta
     if len(self.oldweights) == 0:
         # looking to avoid any weird bug from decoder's default weight
         # vector (note: excluding 0 items is risky too)
         delta = False

     if delta:
         fmt = "%s:%+g"
         cmd = "weights diff"
         diff = self.weights - self.oldweights
         pairs = ((k, v) for (k, v) in diff.iteritems() if abs(v) != 0.)
     else:
         fmt = "%s:%g"
         cmd = "weights"
         pairs = self.weights.iteritems()

     #FIXME: should non-delta weights omit 0? should be ok, except crazy lm (unk?) weight from feature semantics
     weightstr = ",".join(fmt % (cstr_escape_nows(k), v) for (k, v) in pairs)
     self.send_instruction('%s "%s";' % (cmd, weightstr), input)
     self.oldweights = svector.Vector(self.weights)
Пример #27
0
    def __init__(self, order=4, variant="nist", oracledoc_size=10):
        """Set up the oracle models and the initial oracle document.

        ``variant`` is lower-cased and must be one of ``ibm``, ``nist`` or
        ``average``; anything else raises an Exception.  The oracle document
        starts with every length, match and guess count set to 1, and
        ``self.feats`` lists the features of that document.
        """
        self.order = order

        self.variant = variant.lower()
        if self.variant not in ('ibm', 'nist', 'average'):
            raise Exception("unknown BLEU variant %s" % self.variant)

        self.oraclemodel = OracleModel(order=order)
        self.wordcounter = WordCounter(variant=self.variant)
        self.models = [self.oraclemodel, self.wordcounter]

        self.oracledoc = svector.Vector(
            "oracle.candlen=1 oracle.reflen=1 oracle.srclen=1")
        for o in xrange(order):
            for template in ("oracle.match%d", "oracle.guess%d"):
                self.oracledoc[template % o] = 1
        self.oracledoc_size = oracledoc_size

        self.feats = list(self.oracledoc)
Пример #28
0
    def input(self, input):
        """Collect rules from the markup attached to ``input``.

        ``input.fmeta`` yields ``(tag, attrs, i, j)`` entries.  Entries
        whose attributes include ``english`` contribute one rule per
        translation option (separated by ``|``) and per label, covering
        ``input.fwords[i:j]``.  Recognised attributes:

        * ``cost`` / ``prob``: ``|``-separated costs, or probabilities that
          are converted with ``-log10``; with neither, a uniform
          distribution over the options is used.
        * ``features`` / ``feature``: per-option feature names, or one name
          for all options; the default name is ``sgml``.
        * ``label``: ``|``-separated nonterminal tags; the default is the
          upper-cased markup tag.

        The rules are stored in ``self.rules[i, j]`` as 1-tuples.

        Raises:
            ValueError: the number of costs or feature names differs from
                the number of translation options.
        """
        self.rules = collections.defaultdict(list)
        for tag, attrs, i, j in input.fmeta:
            attrs = sgml.attrs_to_dict(attrs)
            if 'english' not in attrs:
                continue
            ephrases = attrs['english'].split('|')

            if 'cost' in attrs:
                costs = [float(x) for x in attrs['cost'].split('|')]
            elif 'prob' in attrs:
                costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
            else:
                costs = [-math.log10(1.0/len(ephrases)) for _ in ephrases] # uniform
            if len(costs) != len(ephrases):
                sys.stderr.write("wrong number of probabilities/costs")
                raise ValueError

            if 'features' in attrs:
                features = attrs['features'].split('|')
                if len(features) != len(ephrases):
                    sys.stderr.write("wrong number of feature names")
                    raise ValueError
            elif 'feature' in attrs:
                features = [attrs['feature'] for _ in ephrases]
            else:
                features = ['sgml' for _ in ephrases]

            if 'label' in attrs:
                labels = attrs['label'].split('|')
            else:
                labels = [tag.upper()]

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            for (ephrase, cost, feature) in zip(ephrases, costs, features):
                # A separate name keeps the outer loop's `tag` intact.
                for label in labels:
                    r = rule.Rule(sym.fromtag(label),
                                  rule.Phrase(input.fwords[i:j]),
                                  rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                  scores=svector.Vector('%s' % feature, cost))
                    self.rules[i,j].append((r,))
0
    def input(self, lat):
        """Collect one rule per span of the lattice ``lat``.

        For every span a rule is built from its ``x`` (left-hand side),
        ``f`` and ``e`` (right-hand sides).  The rule's scores are a copy of
        ``span.v`` when the span has that attribute and ``model.zero``
        otherwise.  Rules are stored in ``self.rules[i, j]`` as 1-tuples.
        """
        self.rules = collections.defaultdict(list)
        for span in lat.spans:
            start, stop = span.i, span.j
            scores = svector.Vector(span.v) if hasattr(span, 'v') else model.zero

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            lattice_rule = rule.Rule(
                rule.Nonterminal(span.x),
                [rule.Nonterminal.from_str(tok) for tok in span.f],
                [rule.Nonterminal.from_str(tok) for tok in span.e],
                scores=scores)
            self.rules[start, stop].append((lattice_rule, ))
            if log.level >= 2:
                log.write("added lattice rule at (%d,%d): %s\n" %
                          (start, stop, lattice_rule))
Пример #30
0
def cutting_plane(weights, updates, alphas, oracles=None, epsilon=0.01):
    """Alternate between adding violated constraints and re-optimising.

    When ``l2_regularization`` is set, every weight is first shrunk by a
    factor that depends on its feature scale.  Then, until a round adds
    no new constraint:

    1. each separation oracle in ``oracles`` (a mapping from sentence id
       to a callable taking the weights) returns ``(vector, score)``
       candidates; a candidate is appended to ``updates[sentid]`` (with a
       zero appended to ``alphas[sentid]``) when its violation exceeds that
       of every constraint already stored for the sentence by more than
       ``epsilon``;
    2. ``update_weights`` is run on the current constraint set.

    ``updates`` and ``alphas`` are modified in place.

    Returns:
        The pair ``(weights, alphas)``.
    """
    # None instead of a shared mutable default dict.
    if oracles is None:
        oracles = {}

    if l2_regularization:
        # using feature scales
        for f in weights:
            weights[f] *= 1. / (1. + feature_scales[f] * len(updates) *
                                l2_regularization * max_learning_rate)

    done = False
    while not done:
        # call separation oracles
        done = True
        for sentid, oracle in oracles.iteritems():
            for v, score in oracle(weights):
                violation = weights.dot(v) + score

                for v1, score1 in updates[sentid]:
                    violation1 = weights.dot(v1) + score1
                    if violation <= violation1 + epsilon:
                        # An existing constraint already covers this one.
                        break
                else:
                    updates[sentid].append((v, score))
                    alphas[sentid].append(0.)
                    done = False

        weights, alphas = update_weights(weights, updates, alphas)

        if log.level >= 4:
            log.write("alphas: %s\n" % alphas)

    return weights, alphas