def score_rule(self, a, r):
    """Return the lexical scores of rule ``r`` for the tables that are set.

    Every non-variable symbol on the f side contributes the entry of
    ``self.fweights`` (multiplied in) and ``self.fratios`` (added up) found
    at that symbol's ``r.fpos`` position; every non-variable symbol on the
    e side contributes the ``self.eweights`` entry at its ``r.epos``
    position (multiplied in).  A table that is ``None`` is ignored.

    ``a`` is part of the scorer interface and is not used here.

    Returns a list with, in this order and only for the tables that are not
    ``None``: the f weight, the e weight, the f ratio.
    """
    use_fweights = self.fweights is not None
    use_eweights = self.eweights is not None
    use_fratios = self.fratios is not None

    fweight = 1.0
    eweight = 1.0
    fratio = 0.0

    # enumerate() keeps the position needed for the fpos/epos lookup
    # without indexing the phrase by hand.
    for i, fsym in enumerate(r.f):
        if sym.isvar(fsym):
            continue
        if use_fweights:
            fweight *= self.fweights[r.fpos[i]]
        if use_fratios:
            fratio += self.fratios[r.fpos[i]]

    for i, esym in enumerate(r.e):
        if sym.isvar(esym):
            continue
        if use_eweights:
            eweight *= self.eweights[r.epos[i]]

    scores = []
    if use_fweights:
        scores.append(fweight)
    if use_eweights:
        scores.append(eweight)
    if use_fratios:
        scores.append(fratio)
    return scores
def feature(fphrase, ephrase, paircount, fcount, fsample_count):
    """Sum, over the f-side terminals, the cost of their best e-side match.

    For each non-variable symbol of ``fphrase`` the highest
    ``ttable.get_score(f, e, 1)`` over the non-variable symbols of
    ``ephrase`` plus the literal word ``'NULL'`` is taken.  A positive best
    score contributes ``-log10(best)``; otherwise ``MAXSCORE`` is added.

    ``ttable`` and ``MAXSCORE`` come from the enclosing scope.  The count
    arguments are accepted but not used.
    """
    # The f side stays lazy; the e side is needed once per f word, so it
    # is materialised up front.
    source_words = (sym.tostring(w) for w in fphrase if not sym.isvar(w))
    target_words = [sym.tostring(w) for w in ephrase if not sym.isvar(w)]
    target_words = target_words + ['NULL']

    total = 0
    for f in source_words:
        best = max(ttable.get_score(f, e, 1) for e in target_words)
        if best > 0:
            total += -math.log10(best)
        else:
            total += MAXSCORE
    return total
def score_rule(self, a, r):
    """Count the terminals of rule ``r`` that are flagged as unaligned.

    A non-variable symbol of ``r.f`` is counted when
    ``a.faligned[r.fpos[i]]`` is false for its position ``i``; a
    non-variable symbol of ``r.e`` is counted when ``a.ealigned[r.epos[i]]``
    is false.

    Returns ``[f_unaligned, e_unaligned]``.
    """
    # enumerate() supplies the position for the fpos/epos lookup, replacing
    # the manual index loop.
    funaligned = 0
    for i, fsym in enumerate(r.f):
        if not sym.isvar(fsym) and not a.faligned[r.fpos[i]]:
            funaligned += 1

    eunaligned = 0
    for i, esym in enumerate(r.e):
        if not sym.isvar(esym) and not a.ealigned[r.epos[i]]:
            eunaligned += 1

    return [funaligned, eunaligned]
def _tree_helper(t, antvalues):
    """Parse tree string ``t`` and plug antecedent values into its variables.

    Each frontier node whose label parses (``sym.fromstring``) to a variable
    gets ``antvalues[index - 1]`` inserted as its first child, where
    ``index`` is the variable's ``sym.getindex``.  Returns the parsed tree.
    """
    parsed = tree.str_to_tree(t)
    for leaf in parsed.frontier():
        label = sym.fromstring(leaf.label)
        if not sym.isvar(label):
            continue
        # Variable indices are 1-based; antvalues is 0-based.
        leaf.insert_child(0, antvalues[sym.getindex(label) - 1])
    return parsed
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a bracketed text rendering of deduction ``node`` to ``result``.

    The deduction is written as ``(`` + ``<id(node)><<value>>`` followed by
    one space-separated entry per child and a closing ``)``.  ``<value>``
    is ``cost:<weights.dot(node.dcost)>`` when ``weights`` is given and the
    placeholder ``_`` otherwise (previously the default ``weights=None``
    raised ``AttributeError``).

    Children are taken from ``node.rule.f`` (``mode == 'french'``) or
    ``node.rule.e`` (``mode == 'english'``) when the deduction has a rule,
    and from ``node.ants`` in every other case.  ``Item`` children and
    variables are rendered recursively through ``_item_to_text``; any other
    symbol is written as a quoted attribute string.

    ``memo`` is only passed through to ``_item_to_text``.  Nothing is
    returned; ``result`` is extended in place.
    """
    # Convert rule and features into single tokens.
    if weights is not None:
        vstr = "cost:%s" % weights.dot(node.dcost)
    else:
        vstr = "_"
    s = "%s<%s>" % (id(node), vstr)

    # The format would allow a bare token for a childless deduction, but
    # only for untagged tokens; every token carries an id, so the
    # parenthesised form is always used.
    result.append('(')
    result.append(s)

    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        result.append(' ')
        if isinstance(child, Item):
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            # Variable indices are 1-based positions into node.ants.
            _item_to_text(node.ants[sym.getindex(child) - 1], result, memo,
                          mode=mode, weights=weights)
        else:
            result.append(quoteattr(sym.tostring(child)))

    result.append(')')
def visit(item):
    """Collect f-side positions for the e-side terminals below ``item``.

    Looks up the deduction stored for ``item`` in ``self.ded`` (``self``
    comes from the enclosing scope).  With a rule, the result holds one
    list per non-variable symbol of ``ded.rule.e``, in order, containing
    ``ded.rule.f.stringpos(...)`` for every f index aligned to that symbol
    in ``ded.rule.attrs['align']``; variables are expanded in place by
    visiting the matching antecedent.  Without a rule the first antecedent
    is visited instead.
    """
    ded = self.ded[id(item)]
    if ded.rule:
        # Invert the (fi, ei) alignment pairs into ei -> [fi, ...].
        align = collections.defaultdict(list)
        if 'align' in ded.rule.attrs:
            for fi, ei in ded.rule.attrs['align']:
                align[ei].append(fi)
        result = []
        j1 = None
        for ei, e in enumerate(ded.rule.e):
            if sym.isvar(e):
                # Variable indices are 1-based positions into ded.ants.
                result.extend(visit(ded.ants[sym.getindex(e) - 1]))
            else:
                # With exactly two antecedents the end of the first one is
                # handed to stringpos as the split point.
                if len(ded.ants) == 2:
                    j1 = ded.ants[0].j
                else:
                    j1 = None
                result.append([
                    ded.rule.f.stringpos(fi, item.i, item.j, j1)
                    for fi in align[ei]
                ])
        # NOTE(review): debug output written to stdout on every visit --
        # confirm it is still wanted.
        print ded.rule, item.i, item.j, j1, result
        return result
    else:
        return visit(ded.ants[0])
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append an ``<and>`` element describing deduction ``node`` to ``result``.

    The element carries a ``label`` attribute (``id(node.rule)`` plus a
    ``cost`` attribute when ``weights`` is truthy, ``id(node)`` otherwise),
    a ``<features>`` child listing every entry of ``node.dcost``, and one
    child per symbol: ``Item`` children and variables are rendered through
    ``_item_to_xml``, other symbols become ``<leaf>`` elements.

    Children come from ``node.rule.f`` / ``node.rule.e`` for the modes
    ``'french'`` / ``'english'`` when the deduction has a rule, and from
    ``node.ants`` otherwise.  ``result`` is extended in place.
    """
    quote = xml.sax.saxutils.quoteattr

    if weights:
        label = quote(str(id(node.rule)))
        cost = quote(str(weights.dot(node.dcost)))
        result.append('<and label=%s cost=%s>' % (label, cost))
    else:
        result.append('<and label=%s>' % quote(str(id(node))))

    result.append('<features>')
    for name, value in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>' % (quote(name), quote(str(value))))
    result.append('</features>')

    if mode == 'french' and node.rule:
        children = node.rule.f
    elif mode == 'english' and node.rule:
        children = node.rule.e
    else:
        children = node.ants

    for child in children:
        if isinstance(child, Item):
            _item_to_xml(child, result, memo, mode=mode, models=models, weights=weights)
        elif sym.isvar(child):
            # Variable indices are 1-based positions into node.ants.
            antecedent = node.ants[sym.getindex(child) - 1]
            _item_to_xml(antecedent, result, memo, mode=mode, models=models, weights=weights)
        else:
            result.append('<leaf label=%s/>' % quote(sym.tostring(child)))

    result.append('</and>')
def estimate(self, r):
    """Return a rule-local cost for ``r`` from the table ``self.ttable``.

    For each non-variable symbol ``e`` of ``r.e`` that has an entry in
    ``self.ttable``, the entry's values for every non-variable symbol of
    ``r.f`` and for the key ``None`` are added up (``self.epsilon`` stands
    in for missing keys), divided by the number of f terminals plus one,
    and ``-log10`` of that quotient is added to the result.  e symbols
    without a table entry contribute nothing.
    """
    total = 0.0
    for e in r.e:
        if sym.isvar(e):
            continue
        te = self.ttable.get(e, None)
        if te is None:
            continue

        mass = 0.0
        n_terminals = 0
        for f in r.f:
            if sym.isvar(f):
                continue
            mass += te.get(f, self.epsilon)
            n_terminals += 1
        # The None key is always included, hence the "+ 1" below.
        mass += te.get(None, self.epsilon)
        total += -math.log10(mass / (n_terminals + 1))
    return total
def estimate(self, r):
    """Estimate the cost of rule ``r`` using only the rule's own words.

    Every e-side terminal with an entry in ``self.ttable`` adds
    ``-log10(s / (l + 1))``, where ``s`` sums the entry's values for the
    ``l`` f-side terminals of the rule and for the key ``None``, using
    ``self.epsilon`` for keys the entry lacks.  Variables and e terminals
    missing from the table are skipped.
    """
    cost = 0.0
    for e in r.e:
        if sym.isvar(e):
            continue
        entry = self.ttable.get(e, None)
        if entry is None:
            continue

        f_terminals = [f for f in r.f if not sym.isvar(f)]
        subtotal = 0.0
        for f in f_terminals:
            subtotal += entry.get(f, self.epsilon)
        subtotal += entry.get(None, self.epsilon)

        cost += -math.log10(subtotal / (len(f_terminals) + 1))
    return cost
def _tree_helper(t, antvalues):
    """Build a tree from string ``t`` with variables filled from ``antvalues``.

    The string is parsed with ``tree.str_to_tree``.  For every frontier node
    whose label is a variable, the antecedent value selected by the
    variable's (1-based) index is inserted as that node's first child.
    The parsed tree is returned.
    """
    root = tree.str_to_tree(t)
    for frontier_node in root.frontier():
        x = sym.fromstring(frontier_node.label)
        if sym.isvar(x):
            antecedent = antvalues[sym.getindex(x) - 1]
            frontier_node.insert_child(0, antecedent)
    return root
def _fake_tree_helper(lhs, rhs, antvalues):
    """Build a one-level tree node for a rule ``lhs -> rhs``.

    Each variable of ``rhs`` is replaced by the antecedent value at its
    (1-based) index; every other symbol becomes a childless ``tree.Node``
    labelled with the symbol's string form.  The children are attached to
    a new node labelled ``sym.totag(lhs)``, which is returned.
    """
    def as_child(x):
        if sym.isvar(x):
            return antvalues[sym.getindex(x) - 1]
        return tree.Node(sym.tostring(x), [])

    children = [as_child(x) for x in rhs]
    return tree.Node(sym.totag(lhs), children)
def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
    """Return ``-log10`` of the product of best per-word table scores.

    For every non-variable symbol of ``fphrase`` the largest
    ``self.ttable.get_score(f, e, self.col)`` over the non-variable symbols
    of ``ephrase`` plus the literal word ``"NULL"`` is taken (never less
    than 0.0), and these maxima are multiplied together.

    Returns 999 when the product is zero, otherwise ``-log10(product)``.
    The count arguments are accepted but not used.
    """
    # Comprehensions instead of map/filter/lambda: the result is a real
    # list, so the append below does not rely on map() returning one.
    fwords = [sym.tostring(w) for w in fphrase if not sym.isvar(w)]
    ewords = [sym.tostring(w) for w in ephrase if not sym.isvar(w)]
    ewords.append("NULL")

    totalscore = 1.0
    for f in fwords:
        best = 0.0
        for e in ewords:
            score = self.ttable.get_score(f, e, self.col)
            if score > best:
                best = score
        totalscore *= best

    if totalscore == 0.0:
        return 999
    return -math.log10(totalscore)
def _fake_tree_helper(lhs, rhs, antvalues):
    """Return a ``tree.Node`` tagged with ``lhs`` whose children mirror ``rhs``.

    Variables in ``rhs`` are substituted by ``antvalues[index - 1]``
    (``index`` from ``sym.getindex``); all other symbols are turned into
    leaf nodes carrying the symbol's string form.
    """
    kids = []
    for symbol in rhs:
        if not sym.isvar(symbol):
            kids.append(tree.Node(sym.tostring(symbol), []))
            continue
        # Variable indices are 1-based; antvalues is 0-based.
        kids.append(antvalues[sym.getindex(symbol) - 1])
    return tree.Node(sym.totag(lhs), kids)
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialise the forest reachable from ``root`` as a JSON-formatted string.

    The output object has an optional ``"source"`` array (only when
    ``fwords`` is non-empty; integer entries are converted with
    ``sym.tostring``), a ``"nodes"`` array with one object per item yielded
    by iterating ``root`` (labelled with ``sym.totag(item.x)`` unless
    ``item.x`` is ``None``), the ``"root"`` index, and an ``"edges"`` array
    with one object per deduction giving its head index, its tails and its
    ``dcost`` entries as ``"features"``.

    Tails come from ``ded.rule.f`` / ``ded.rule.e`` for the modes
    ``'french'`` / ``'english'`` when the deduction has a rule, and from
    ``ded.ants`` otherwise.  ``models`` and ``weights`` are not used.
    """
    pieces = ['{\n']

    if fwords:
        fwords = [(sym.tostring(w) if type(w) is int else w) for w in fwords]
        pieces.append(' "source": [%s],\n' % ",".join(quotejson(w) for w in fwords))

    items = list(root)

    # First pass: number the nodes and remember where the root landed.
    nodeindex = {}
    nodestrs = []
    for ni, item in enumerate(items):
        nodeindex[item] = ni
        if item is root:
            ri = ni
        if item.x is None:
            nodestrs.append(' {}')
        else:
            nodestrs.append(' {"label": %s}' % quotejson(sym.totag(item.x)))
    pieces.append(' "nodes": [\n%s\n ],\n' % ",\n".join(nodestrs))
    pieces.append(' "root": %d,\n' % ri)

    # Second pass: one edge per deduction, tails referring to node numbers.
    edgestrs = []
    for ni, item in enumerate(items):
        for ded in item.deds:
            if mode == 'french' and ded.rule:
                children = ded.rule.f
            elif mode == 'english' and ded.rule:
                children = ded.rule.e
            else:
                children = ded.ants

            tailstrs = []
            for child in children:
                if isinstance(child, Item):
                    tail = str(nodeindex[child])
                elif sym.isvar(child):
                    # Variable indices are 1-based positions into ded.ants.
                    tail = str(nodeindex[ded.ants[sym.getindex(child) - 1]])
                else:
                    tail = quotejson(sym.tostring(child))
                tailstrs.append(tail)

            features = ",".join("%s:%s" % (quotejson(name), value)
                                for (name, value) in ded.dcost.iteritems())
            dcoststr = "{%s}" % features
            edgestrs.append(' {"head": %s, "tails": [%s], "features": %s}\n'
                            % (ni, ",".join(tailstrs), dcoststr))
    pieces.append(' "edges": [\n%s\n ]\n' % ",\n".join(edgestrs))

    pieces.append('}')
    return "".join(pieces)
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Render a forest as a JSON-formatted string.

    Iterating ``root`` yields the items; each becomes an entry of
    ``"nodes"`` (with a ``"label"`` from ``sym.totag(item.x)`` when
    ``item.x`` is not ``None``) and each of its deductions becomes an entry
    of ``"edges"`` with ``"head"``, ``"tails"`` and ``"features"``.  The
    position of ``root`` among the items is written as ``"root"``.  When
    ``fwords`` is non-empty a ``"source"`` array is written first, with
    integer entries converted through ``sym.tostring``.

    ``mode`` selects the tails of an edge: the rule's f side (``'french'``)
    or e side (``'english'``) if the deduction has a rule, the antecedents
    otherwise.  ``models`` and ``weights`` are accepted but unused.
    """
    def children_of(ded):
        # Rule sides are only available when the deduction carries a rule.
        if mode == 'french':
            return ded.rule.f if ded.rule else ded.ants
        if mode == 'english':
            return ded.rule.e if ded.rule else ded.ants
        return ded.ants

    out = []
    out.append('{\n')

    if fwords:
        fwords = [(sym.tostring(fw) if type(fw) is int else fw) for fw in fwords]
        quoted = ",".join(quotejson(fw) for fw in fwords)
        out.append(' "source": [%s],\n' % quoted)

    items = list(root)
    nodeindex = {}
    nodestrs = []
    for position, item in enumerate(items):
        nodeindex[item] = position
        if item is root:
            ri = position
        if item.x is not None:
            nodestrs.append(' {"label": %s}' % quotejson(sym.totag(item.x)))
        else:
            nodestrs.append(' {}')
    out.append(' "nodes": [\n%s\n ],\n' % ",\n".join(nodestrs))
    out.append(' "root": %d,\n' % ri)

    edgestrs = []
    for position, item in enumerate(items):
        for ded in item.deds:
            tailstrs = []
            for child in children_of(ded):
                if isinstance(child, Item):
                    tailstrs.append(str(nodeindex[child]))
                elif sym.isvar(child):
                    # Variable indices are 1-based positions into ded.ants.
                    ant = ded.ants[sym.getindex(child) - 1]
                    tailstrs.append(str(nodeindex[ant]))
                else:
                    tailstrs.append(quotejson(sym.tostring(child)))
            pairs = ["%s:%s" % (quotejson(f), v) for (f, v) in ded.dcost.iteritems()]
            dcoststr = "{%s}" % ",".join(pairs)
            edgestrs.append(' {"head": %s, "tails": [%s], "features": %s}\n' % (
                position, ",".join(tailstrs), dcoststr))
    out.append(' "edges": [\n%s\n ]\n' % ",\n".join(edgestrs))

    out.append('}')
    return "".join(out)
def transition(self, r, antstates, i, j, j1=None):
    """Score rule ``r`` against all words in ``self.fwords``.

    For each non-variable symbol ``e`` of ``r.e`` with an entry in
    ``self.ttable``, the entry's values for every word of ``self.fwords``
    and for the key ``None`` are summed (``self.epsilon`` for missing keys)
    and ``-log10(sum / (len(self.fwords) + 1))`` is added to the cost.

    ``antstates``, ``i``, ``j`` and ``j1`` are not used.  Returns the pair
    ``(None, cost)``.
    """
    cost = 0.0
    for e in r.e:
        if sym.isvar(e):
            continue
        te = self.ttable.get(e, None)
        if te is None:
            continue

        mass = 0.0
        for f in self.fwords:
            mass += te.get(f, self.epsilon)
        # The None key is always included, hence the "+ 1" below.
        mass += te.get(None, self.epsilon)
        cost += -math.log10(mass / (len(self.fwords) + 1))
    return (None, cost)
def transition(self, r, antstates, i, j, j1=None):
    """Return ``(None, cost)`` for applying rule ``r``.

    The cost accumulates, for every e-side terminal found in
    ``self.ttable``, ``-log10`` of the average of that terminal's table
    values over the words in ``self.fwords`` and the key ``None``
    (``self.epsilon`` replaces missing values).  Variables and terminals
    without a table entry are skipped.  The remaining arguments are
    accepted for the interface and not read.
    """
    total = 0.0
    for target in r.e:
        if not sym.isvar(target):
            entry = self.ttable.get(target, None)
            if entry is not None:
                subtotal = sum((entry.get(f, self.epsilon) for f in self.fwords), 0.0)
                subtotal += entry.get(None, self.epsilon)
                total += -math.log10(subtotal / (len(self.fwords) + 1))
    return (None, total)
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Write deduction ``node`` as an ``<and>`` element into ``result``.

    With truthy ``weights`` the opening tag is labelled with
    ``id(node.rule)`` and carries ``cost = weights.dot(node.dcost)``;
    otherwise it is labelled with ``id(node)`` and has no cost.  It is
    followed by a ``<features>`` block with one ``<feature>`` per
    ``node.dcost`` entry and then by the children: items and variables via
    ``_item_to_xml``, other symbols as ``<leaf>`` elements.

    The children are ``node.rule.f`` (mode ``'french'``) or ``node.rule.e``
    (mode ``'english'``) if the deduction has a rule, else ``node.ants``.
    """
    quoteattr = xml.sax.saxutils.quoteattr

    if not weights:
        opening = '<and label=%s>' % quoteattr(str(id(node)))
    else:
        opening = '<and label=%s cost=%s>' % (
            quoteattr(str(id(node.rule))),
            quoteattr(str(weights.dot(node.dcost))))
    result.append(opening)

    result.append('<features>')
    for f, v in node.dcost.iteritems():
        name_attr = quoteattr(f)
        value_attr = quoteattr(str(v))
        result.append('<feature name=%s value=%s/>' % (name_attr, value_attr))
    result.append('</features>')

    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        if isinstance(child, Item):
            target = child
        elif sym.isvar(child):
            # Variable indices are 1-based positions into node.ants.
            target = node.ants[sym.getindex(child) - 1]
        else:
            result.append('<leaf label=%s/>' % quoteattr(sym.tostring(child)))
            continue
        _item_to_xml(target, result, memo, mode=mode, models=models, weights=weights)

    result.append('</and>')
def add(self, r, estcost=0.):
    """Store rule ``r`` with estimated cost ``estcost``.

    A rule whose f side has arity 1 and length 1 is logged and added to the
    ``RuleBin`` kept in ``self.unary_rules`` under its single f symbol
    (index cleared); the pair (that symbol, ``r.lhs``) is recorded in
    ``self.unary_less_than``.

    Any other rule is filed in the trie rooted at ``self.root``.  Each trie
    node is a two-element list ``[rule_bin_or_None, children_dict]``; the
    path is spelled by the symbols of ``r.f`` with variable indices
    cleared.  The bin at the end of the path is created on first use and
    pruned after every insertion.
    """
    if r.f.arity() == 1 and len(r.f) == 1:
        log.write("unary rule: %s\n" % r)
        # NOTE(review): the RuleBin default is constructed on every call,
        # even when a bin already exists for this key.
        self.unary_rules.setdefault(sym.clearindex(r.f[0]), RuleBin(self.threshold, self.limit)).add(estcost, r)
        self.unary_less_than.add((sym.clearindex(r.f[0]), r.lhs))
    else:
        cur = self.root
        for f in r.f:
            if sym.isvar(f):
                # Variables are keyed without their index.
                f = sym.clearindex(f)
            cur[1].setdefault(f, [None, {}])
            cur = cur[1][f]
        if cur[0] is None:
            cur[0] = RuleBin(self.threshold, self.limit)
            self.rulebin_count += 1
        # NOTE(review): ``bin`` shadows the builtin of the same name.
        bin = cur[0]
        bin.add(estcost, r)
        bin.prune()
    # NOTE(review): presumably counts every added rule, unary or not --
    # confirm against the intended meaning of ``self.count``.
    self.count += 1
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a bracketed, tagged text rendering of ``node`` to ``result``.

    The deduction is written as ``(`` + ``ruleid=<id(node.rule)><value=...>``
    followed by its children and ``)``.  Each child is preceded by a tag
    string: `` it `` for ``Item`` children, `` var `` for variables (both
    rendered through ``_item_to_text``) and `` word `` for other symbols,
    which are written as quoted attribute strings.

    Children come from ``node.rule.f`` / ``node.rule.e`` for the modes
    ``'french'`` / ``'english'`` when the deduction has a rule, and from
    ``node.ants`` otherwise.

    Besides extending ``result``, the function prints the rule id and every
    word to stdout, ending the line once the deduction is complete.
    """
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    # lhuang: in case no weights
    # (the conditional covers the whole formatted string, so the value is
    # just "_" when weights is None)
    vstr = "cost:%s" % weights.dot(node.dcost) if weights is not None \
           else "_"
    rstr = id(node.rule)
    #rstr = id(node)
    s = "ruleid=%s<value=%s>" % (rstr,vstr)
    # Trailing comma: the words printed below continue on the same line.
    print "\truleid=%s" % rstr,
    # The condition is hard-wired to False, so the parenthesised form in
    # the else branch is always taken.
    if False and len(node.ants) == 0: # the format allows this but only if we don't tag with an id. but we tag everything with an id
        result.append(s)
    else:
        result.append('(')
        result.append(s)
        if mode == 'french':
            children = node.rule.f if node.rule else node.ants
        elif mode == 'english': # lhuang: default mode: english side
            children = node.rule.e if node.rule else node.ants
        else:
            children = node.ants

        for child in children:
            if isinstance(child, Item):
                result.append(' it ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child): # lhuang: variable, do recursion
                result.append(' var ')
                # Variable indices are 1-based positions into node.ants.
                _item_to_text(node.ants[sym.getindex(child)-1], result, memo, mode=mode, weights=weights)
            else: # lhuang: english word
                result.append(' word ')
                w = quoteattr(sym.tostring(child))
                result.append(w)
                print w,
        result.append(')')
        print # end of a hyperedge
def visit(item):
    """Return the f-side positions aligned to each e-side terminal of ``item``.

    The deduction for ``item`` is read from ``self.ded`` (``self`` is a
    variable of the enclosing scope).  If it has a rule, one list is
    produced per non-variable symbol of ``ded.rule.e``, holding
    ``ded.rule.f.stringpos(...)`` for each f index that
    ``ded.rule.attrs['align']`` pairs with that symbol; a variable is
    replaced by the lists obtained from its antecedent.  If it has no
    rule, the result of visiting the first antecedent is returned.
    """
    ded = self.ded[id(item)]
    if ded.rule:
        # Group the aligned f indices by e index.
        align = collections.defaultdict(list)
        if 'align' in ded.rule.attrs:
            for fi, ei in ded.rule.attrs['align']:
                align[ei].append(fi)
        result = []
        j1 = None
        for ei, e in enumerate(ded.rule.e):
            if sym.isvar(e):
                # Variable indices are 1-based positions into ded.ants.
                result.extend(visit(ded.ants[sym.getindex(e)-1]))
            else:
                # Only with exactly two antecedents is the end of the first
                # one passed on to stringpos.
                if len(ded.ants) == 2:
                    j1 = ded.ants[0].j
                else:
                    j1 = None
                result.append([ded.rule.f.stringpos(fi, item.i, item.j, j1) for fi in align[ei]])
        # NOTE(review): debug output written to stdout on every visit --
        # confirm it is still wanted.
        print ded.rule, item.i, item.j, j1, result
        return result
    else:
        return visit(ded.ants[0])
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a bracketed text rendering of deduction ``node`` to ``result``.

    Output shape: ``(``, then the token ``<id(node)><<value>>``, then one
    entry per child (each preceded by a single space), then ``)``.
    ``<value>`` is ``cost:<weights.dot(node.dcost)>`` when ``weights`` is
    supplied; with the default ``weights=None`` the placeholder ``_`` is
    written instead of failing with ``AttributeError``.

    The children are ``node.rule.f`` for ``mode == 'french'`` and
    ``node.rule.e`` for ``mode == 'english'`` when the deduction has a
    rule; in all other cases they are ``node.ants``.  ``Item`` children and
    variables recurse through ``_item_to_text``; other symbols are appended
    as quoted attribute strings.

    ``memo`` is forwarded unchanged.  The function returns nothing and
    extends ``result`` in place.
    """
    # Convert rule and features into single tokens.
    vstr = "_" if weights is None else "cost:%s" % weights.dot(node.dcost)
    s = "%s<%s>" % (id(node), vstr)

    # A childless deduction could be written as a bare token, but only if
    # tokens were not tagged with an id; they always are, so every
    # deduction is parenthesised.
    result.append('(')
    result.append(s)

    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        result.append(' ')
        if isinstance(child, Item):
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            # Variable indices are 1-based positions into node.ants.
            antecedent = node.ants[sym.getindex(child) - 1]
            _item_to_text(antecedent, result, memo, mode=mode, weights=weights)
        else:
            result.append(quoteattr(sym.tostring(child)))

    result.append(')')
def traverse(self, right_idx=0, right_widx=0, fsent=None, rules=None, nodememo=None):
    ''' helper called by dump(); prunes the deductions of this node and
    figures out its span.

    Keeps only the ``max_edges_per_node`` deductions with the smallest
    ``ded.dcost.dot(weights)``, sets ``self.i`` / ``self.wi`` from the
    arguments, derives ``self.j`` / ``self.wj`` from each deduction's rule
    string, recurses into antecedents not yet in ``nodememo`` and finally
    records ``(self.id, nedges)`` in ``nodememo`` under ``id(self)``.

    Nothing is returned; a node already present in ``nodememo`` is left
    untouched.  ``weights``, ``max_edges_per_node``, ``foreign_sentence_tag``,
    ``words_to_chars`` and ``opts`` are not parameters and must be
    available from an enclosing scope.
    '''
    if nodememo is None:
        nodememo = {}

    # Already processed: nothing to do.
    if id(self) in nodememo:
        return

    # Sort deductions by weighted cost and keep the cheapest ones.
    deds = [(ded.dcost.dot(weights), ded) for ded in self.deds]
    deds.sort()

    deds = [x for _, x in deds[:max_edges_per_node]]
    self.deds = deds # prune!

    nedges = len(deds) # accumulating number of edges, recursively

    self.i = right_idx
    self.wi = right_widx

    for dedid, ded in enumerate(deds):
        try:
            rule = rules[ded.ruleid]
        # NOTE(review): bare except -- it also swallows errors unrelated to
        # a missing rule id; consider narrowing it.
        except:
            print >> sys.stderr, "WARNING: rule %d not found" % ded.ruleid
            ## assuming it's a one-word UNKNOWN rule
            ## TODO: check with lattice
            unkword = fsent[self.wi]
            rule = 'UNKNOWN("@UNKNOWN@") -> "%s"' % unkword # in reverse order
            # Cache the synthesised rule so later lookups succeed.
            rules[ded.ruleid] = rule
            print >> sys.stderr, " covering " + unkword

        self.x = rule.split("(", 1)[0] # non-terminal label

        # analyse RHS (chinese side)
        lhs, rhs = rule.split(" -> ", 1) ## -> might be a word

        # deal with lhs; convert to ded.lhsstr = ["...", "...", Item(...), "..."]
        varid = 0
        lhsstr = []
        for child in ded.rule.e:
            if sym.isvar(child):
                lhsstr.append(ded.ants[varid])
                varid += 1
            else:
                lhsstr.append(quoteattr(sym.tostring(child)))

        # will be used in _dump()
        ded.lhsstr = lhsstr

        # Each entry: (antecedent index, chars seen since the previous
        # variable, words seen since the previous variable).
        # NOTE(review): ``vars`` shadows the builtin of the same name.
        vars = []
        chars_in_gap = 0
        words_in_gap = 0
        for it in reversed(rhs.split()): ## from RIGHT to LEFT!! N.B. can't split(" ")
            if it[0] == "x":
                #variable:
                var = int(it[1:])
                vars.append((var, chars_in_gap, words_in_gap))
                chars_in_gap = 0
                words_in_gap = 0
            else:
                # strip off quotes "..."
                it = it[1:-1]
                # calculate char-length
                if it == foreign_sentence_tag: # <foreign-sentence>:
                    # glue symbol is not counted!
                    chars_in_gap += 0
                    words_in_gap += 0
                else:
                    # 1 for word, len(...) for char
                    chars_in_gap += len(words_to_chars(it, encode_back=True))
                    words_in_gap += 1

        accumu = self.i ## left boundary
        waccumu = self.wi
        for i, c_gap, w_gap in vars:
            ##for sub in ded.ants:
            sub = ded.ants[i]
            if id(sub) not in nodememo:
                sub.traverse(accumu + c_gap, waccumu + w_gap, fsent, rules, nodememo)
                # accumulating # of edges (if first seen)
                nedges += nodememo[id(sub)][1]

            ## don't accumulate subs now; will do in another visit
            ## s += subs
            accumu = sub.j
            waccumu = sub.wj

        # End of this node: end of the last collected variable (or the
        # start of the node when there is none) plus the remaining gap.
        tmp_j = (ded.ants[vars[-1][0]].j if vars != [] else self.i) + chars_in_gap
        # Every deduction of a node must agree on the node's end position.
        if self.j is not None and self.j != tmp_j:
            assert False, "@sentence %d, node #%s, %d %d != %d %s rule %d" % \
                   (opts.sentid, self.nodeid, self.i, self.j, tmp_j, self.x, ded.ruleid)
        self.j = tmp_j

        tmp_wj = (ded.ants[vars[-1][0]].wj if vars != [] else self.wi) + words_in_gap ##
        self.wj = tmp_wj

    # Ids are handed out in completion order, starting at 1.
    self.id = len(nodememo) + 1
    nodememo[id(self)] = (self.id, nedges)
for line in sys.stdin: r = rule.rule_from_line(line) if r.word_alignments is None: scores = r.scores scores.extend([scores[0], scores[0]]) r.scores = scores sys.stdout.write("%s\n" % r.to_line()) progress += 1 continue align = set(r.word_alignments) fweight = eweight = 1.0 for fi in xrange(len(r.f)): if not sym.isvar(r.f[fi]): fwordweight = 0. n = 0 for ei in xrange(len(r.e)): if (fi, ei) in align: try: fwordweight += fweighttable[r.f[fi]][r.e[ei]] except KeyError: fwordweight += threshold n += 1 if n > 0: fweight *= fwordweight / n else: try: fweight = fweighttable[r.f[fi]][None] except KeyError:
for line in sys.stdin: r = rule.rule_from_line(line) if r.word_alignments is None: scores = r.scores scores.extend([scores[0],scores[0]]) r.scores = scores sys.stdout.write("%s\n" % r.to_line()) progress += 1 continue align = set(r.word_alignments) fweight = eweight = 1.0 for fi in xrange(len(r.f)): if not sym.isvar(r.f[fi]): fwordweight = 0. n = 0 for ei in xrange(len(r.e)): if (fi, ei) in align: try: fwordweight += fweighttable[r.f[fi]][r.e[ei]] except KeyError: fwordweight += threshold n += 1 if n > 0: fweight *= fwordweight / n else: try: fweight = fweighttable[r.f[fi]][None] except KeyError: