def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label

    For each English phrase span (ei, ej), find the shortest constituent
    (ei, ej1) with ej1 > ej that starts at the same left edge, and add the
    constituent's label suffixed with "*" to a.espans for the phrase span.
    The starred labels are also recorded in the module-level `prefix_labels`.
    """
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x))
                       for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")
    # index constituents by left edge: ei -> list of (ej, label), sorted by ej
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort()  # stable
    for (ei, ej) in ephrase_index:
        # NOTE: an `if True or not (a.espans.has_key(...) ...)` guard stood
        # here; the `True or` made the already-labeled check dead code, so the
        # body always ran.  The dead guard is removed; behavior is unchanged.
        for (ej1, x) in ei_index.get(ei, []):
            if ej1 > ej:
                # (ei,ej) is a proper prefix of constituent (ei,ej1):
                # give it the starred version of the constituent label
                x1 = sym.fromtag(sym.totag(x) + "*")
                a.espans.setdefault((ei, ej), []).append(x1)
                prefix_labels.add(x1)
                break
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x))
                       for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
def feature(fphrase, ephrase, paircount, fcount, fsample_count):
    """Sum, over non-variable source words, of the negated log10 of the best
    lexical translation score against the target words (plus 'NULL');
    words with no positive score contribute MAXSCORE.
    `paircount`, `fcount`, `fsample_count` are accepted but unused here."""
    src_terms = [sym.tostring(w) for w in fphrase if not sym.isvar(w)]
    tgt_terms = [sym.tostring(w) for w in ephrase if not sym.isvar(w)] + ['NULL']
    total = 0
    for f in src_terms:
        best = max(ttable.get_score(f, e, 1) for e in tgt_terms)
        if best > 0:
            total += -math.log10(best)
        else:
            total += MAXSCORE
    return total
def write(self, file):
    '''Write in GIZA++ format'''
    # comment line, then the English sentence
    file.write("%s\n" % self.comment)
    file.write("%s\n" % " ".join([sym.tostring(word) for word in self.ewords]))
    # NULL token collects the 1-based indices of unaligned English words
    tokens = ['NULL', '({']
    for j, used in enumerate(self.ealigned):
        if not used:
            tokens.append(str(j + 1))
    tokens.append('})')
    # each French word lists the 1-based English indices it aligns to
    for i, fword in enumerate(self.fwords):
        tokens.append(sym.tostring(fword))
        tokens.append('({')
        for j, flag in enumerate(self.aligned[i]):
            if flag:
                tokens.append(str(j + 1))
        tokens.append('})')
    file.write("%s\n" % " ".join(tokens))
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest rooted at `root` as a JSON string.

    `mode` selects which side of each rule supplies the edge tails:
    'french' (rule.f), 'english' (rule.e), or anything else (the antecedent
    items).  `models` and `weights` are accepted but unused here.
    """
    parts = ['{\n']
    if fwords:
        words = [(sym.tostring(w) if type(w) is int else w) for w in fwords]
        parts.append(' "source": [%s],\n' % ",".join(quotejson(w) for w in words))
    items = list(root)
    index_of = {}
    node_lines = []
    for ni, item in enumerate(items):
        index_of[item] = ni
        if item is root:
            ri = ni  # remember the root's node index
        if item.x is None:
            node_lines.append(' {}')
        else:
            node_lines.append(' {"label": %s}' % quotejson(sym.totag(item.x)))
    parts.append(' "nodes": [\n%s\n ],\n' % ",\n".join(node_lines))
    parts.append(' "root": %d,\n' % ri)
    edge_lines = []
    for ni, item in enumerate(items):
        for ded in item.deds:
            if mode == 'french':
                children = ded.rule.f if ded.rule else ded.ants
            elif mode == 'english':
                children = ded.rule.e if ded.rule else ded.ants
            else:
                children = ded.ants
            tails = []
            for child in children:
                if isinstance(child, Item):
                    tails.append(str(index_of[child]))
                elif sym.isvar(child):
                    # variable: resolve to the corresponding antecedent item
                    tails.append(str(index_of[ded.ants[sym.getindex(child) - 1]]))
                else:
                    tails.append(quotejson(sym.tostring(child)))
            feat_str = "{%s}" % ",".join("%s:%s" % (quotejson(f), v)
                                         for (f, v) in ded.dcost.iteritems())
            edge_lines.append(' {"head": %s, "tails": [%s], "features": %s}\n'
                              % (ni, ",".join(tails), feat_str))
    parts.append(' "edges": [\n%s\n ]\n' % ",\n".join(edge_lines))
    parts.append('}')
    return "".join(parts)
def add_bounded_prefixes(a, ephrases, etree):
    """Delegate to add_bounded_prefixes_helper, dumping a.espans before
    and after when logging is verbose (log.level >= 3)."""
    verbose = log.level >= 3
    if verbose:
        snapshot = [(i, j, sym.tostring(x))
                    for ((i, j), l) in a.espans.iteritems() for x in l]
        log.write(str(snapshot))
        log.write("\n")
    add_bounded_prefixes_helper(a, ephrases, etree, 0, [])
    if verbose:
        snapshot = [(i, j, sym.tostring(x))
                    for ((i, j), l) in a.espans.iteritems() for x in l]
        log.write(str(snapshot))
        log.write("\n---\n")
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest rooted at `root` to a JSON string.

    `mode` selects which side of each rule supplies the edge tails:
    'french' (rule.f), 'english' (rule.e), or anything else (the
    antecedent items).  `models` and `weights` are accepted but unused.
    """
    result = []
    result.append('{\n')
    if fwords:
        # source tokens may be symbol ids (ints) or plain strings
        fwords = [(sym.tostring(fword) if type(fword) is int else fword) for fword in fwords]
        result.append(' "source": [%s],\n' % ",".join(quotejson(fword) for fword in fwords))
    items = list(root)  # iterating the root yields all forest items
    nodeindex = {}
    nodestrs = []
    for ni, item in enumerate(items):
        nodeindex[item] = ni
        if item is root:
            ri = ni  # remember the root's node index
        if item.x is None:
            nodestrs.append(' {}')
        else:
            nodestrs.append(' {"label": %s}' % quotejson(sym.totag(item.x)))
    result.append(' "nodes": [\n%s\n ],\n' % ",\n".join(nodestrs))
    result.append(' "root": %d,\n' % ri)
    edgestrs = []
    for ni, item in enumerate(items):
        for ded in item.deds:
            tailstrs = []
            if mode == 'french':
                children = ded.rule.f if ded.rule else ded.ants
            elif mode == 'english':
                children = ded.rule.e if ded.rule else ded.ants
            else:
                children = ded.ants
            for child in children:
                if isinstance(child, Item):
                    tailstrs.append(str(nodeindex[child]))
                elif sym.isvar(child):
                    # variable: resolve to the corresponding antecedent item
                    ant = ded.ants[sym.getindex(child) - 1]
                    tailstrs.append(str(nodeindex[ant]))
                else:
                    tailstrs.append(quotejson(sym.tostring(child)))
            dcoststr = "{%s}" % ",".join("%s:%s" % (quotejson(f), v) for (f, v) in ded.dcost.iteritems())
            edgestrs.append(' {"head": %s, "tails": [%s], "features": %s}\n' % (ni, ",".join(tailstrs), dcoststr))
    result.append(' "edges": [\n%s\n ]\n' % ",\n".join(edgestrs))
    result.append('}')
    return "".join(result)
def add_bounded_prefixes(a, ephrases, etree):
    """Run add_bounded_prefixes_helper over `etree`, dumping a.espans
    before and after when logging is verbose (log.level >= 3)."""
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x)) for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")
    add_bounded_prefixes_helper(a, ephrases, etree, 0, [])
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x)) for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append an XML <and> element for hyperedge `node` (its features and
    its children, recursively) to the `result` list."""
    quote = xml.sax.saxutils.quoteattr
    if weights:
        # label with the rule's id and the weighted edge cost
        result.append('<and label=%s cost=%s>'
                      % (quote(str(id(node.rule))),
                         quote(str(weights.dot(node.dcost)))))
    else:
        result.append('<and label=%s>' % (quote(str(id(node)))))
    result.append('<features>')
    for fname, fval in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>'
                      % (quote(fname), quote(str(fval))))
    result.append('</features>')
    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants
    for child in children:
        if isinstance(child, Item):
            _item_to_xml(child, result, memo, mode=mode, models=models,
                         weights=weights)
        elif sym.isvar(child):
            # variable: recurse into the corresponding antecedent item
            _item_to_xml(node.ants[sym.getindex(child) - 1], result, memo,
                         mode=mode, models=models, weights=weights)
        else:
            result.append('<leaf label=%s/>' % quote(sym.tostring(child)))
    result.append('</and>')
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesized text rendering of hyperedge `node` to `result`.

    The edge is rendered as "(<id><cost> child child ...)" where children
    are recursively rendered items or quoted terminal words.
    """
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    # guard weights=None (the declared default) instead of crashing on .dot;
    # this matches the other _ded_to_text variant in this file
    vstr = "cost:%s" % weights.dot(node.dcost) if weights is not None else "_"
    #rstr = id(node.rule)
    rstr = id(node)
    s = "%s<%s>" % (rstr, vstr)
    # A leaf-only short form (`if False and len(node.ants) == 0`) was dead
    # code: everything is tagged with an id, so the parenthesized form is
    # always emitted.  The dead branch has been removed.
    result.append('(')
    result.append(s)
    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants
    for child in children:
        if isinstance(child, Item):
            result.append(' ')
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            # variable: recurse into the corresponding antecedent item
            result.append(' ')
            _item_to_text(node.ants[sym.getindex(child) - 1], result, memo,
                          mode=mode, weights=weights)
        else:
            result.append(' ')
            result.append(quoteattr(sym.tostring(child)))
    result.append(')')
def get_hyps(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    # worst violators: bias the weights by fear_weight in the oracle direction
    oracleweights = theoracle.make_weights(additive=True)
    # in-place operations: oracleweights might be a subclass of Vector
    oracleweights *= fear_weight
    oracleweights += weights
    goal.reweight(oracleweights)
    result = []
    for hypv, hyp in decoder.get_nbest(goal, 1, 1):
        hypscore = get_score(hypv, hyp)
        hypstr = " ".join(sym.tostring(e) for e in hyp)
        log.write("added new hyp: %s %s cost=%s score=%s\n"
                  % (hypstr, hypv, weights.dot(hypv), hypscore))
        # the learner MUST not see the oracle features
        result.append((theoracle.clean(hypv), hyp, hypscore))
    return result
def add_sister_prefixes(a, ephrases, etree):
    """Run add_sister_prefixes_helper over `etree`, logging the phrase list
    and a.espans before and after when verbose (log.level >= 3)."""
    verbose = log.level >= 3
    if verbose:
        log.write("phrases before filtering:\n")
        for (i, j) in ephrases:
            log.write("%s" % ((i, j),))
        log.write("constituents before adding:\n")
        for ((i, j), l) in a.espans.iteritems():
            log.write("%s %s\n" % ((i, j), [sym.tostring(x) for x in l]))
    add_sister_prefixes_helper(a, ephrases, etree, 0)
    if verbose:
        log.write("constituents after adding:\n")
        for ((i, j), l) in a.espans.iteritems():
            log.write("%s %s\n" % ((i, j), [sym.tostring(x) for x in l]))
        log.write("\n---\n")
def get_hyps(sent, goal, weights): """Assumes that oraclemodel.input() has been called""" # worst violators oracleweights = theoracle.make_weights(additive=True) # we use the in-place operations because oracleweights might be # a subclass of Vector oracleweights *= fear_weight oracleweights += weights goal.reweight(oracleweights) hyps = decoder.get_nbest(goal, 1, 1) result = [] for hypv, hyp in hyps: hypscore = get_score(hypv, hyp) log.write("added new hyp: %s %s cost=%s score=%s\n" % (" ".join(sym.tostring(e) for e in hyp), hypv, weights.dot(hypv), hypscore)) # the learner MUST not see the oracle features hypv = theoracle.clean(hypv) result.append((hypv, hyp, hypscore)) return result
def add_sister_prefixes(a, ephrases, etree):
    """Run add_sister_prefixes_helper over `etree`, logging the phrase list
    and a.espans before and after when verbose (log.level >= 3)."""
    if log.level >= 3:
        log.write("phrases before filtering:\n")
        for (i, j) in ephrases:
            log.write("%s" % ((i, j), ))
        log.write("constituents before adding:\n")
        for ((i, j), l) in a.espans.iteritems():
            log.write("%s %s\n" % ((i, j), [sym.tostring(x) for x in l]))
    add_sister_prefixes_helper(a, ephrases, etree, 0)
    if log.level >= 3:
        log.write("constituents after adding:\n")
        for ((i, j), l) in a.espans.iteritems():
            log.write("%s %s\n" % ((i, j), [sym.tostring(x) for x in l]))
        log.write("\n---\n")
def _fake_tree_helper(lhs, rhs, antvalues):
    """Build a tree.Node labeled with `lhs`; children of `rhs` that are
    variables are replaced by the matching antecedent value (1-based
    index), terminals become leaf Nodes."""
    children = [antvalues[sym.getindex(x) - 1] if sym.isvar(x)
                else tree.Node(sym.tostring(x), [])
                for x in rhs]
    return tree.Node(sym.totag(lhs), children)
def _fake_tree_helper(lhs, rhs, antvalues):
    """Build a tree.Node labeled with `lhs`; children of `rhs` that are
    variables are replaced by the matching antecedent value, terminals
    become leaf Nodes."""
    children = []
    for x in rhs:
        if sym.isvar(x):
            # variable indices are 1-based
            children.append(antvalues[sym.getindex(x) - 1])
        else:
            children.append(tree.Node(sym.tostring(x), []))
    return tree.Node(sym.totag(lhs), children)
def __str__(self):
    """Render the confusion network: one line per column listing its
    (word, ...) alternatives."""
    pieces = ["conf net: %d\n" % (len(self.columns),)]
    for idx, column in enumerate(self.columns):
        pieces.append("%d -- " % idx)
        for alternative in column:
            pieces.append("(%s, %s, %s) " % (sym.tostring(alternative[0]),
                                             alternative[1],
                                             alternative[2]))
        pieces.append("\n")
    return "".join(pieces)
def forest_to_xml(node, fwords=None, mode=None, models=None, weights=None):
    """Render the forest rooted at `node` as an XML string, optionally
    preceded by a <source> element holding the source words."""
    pieces = ['<forest>']
    if fwords:
        words = [(sym.tostring(w) if type(w) is int else w) for w in fwords]
        pieces.append('<source>%s</source>' % " ".join(words))
    _item_to_xml(node, pieces, {}, mode=mode, models=models, weights=weights)
    pieces.append('</forest>')
    return "".join(pieces)
def process_heldout(sent):
    """Decode one heldout sentence, rescore with the oracle models, and
    attach the best hypothesis to `sent` (.score_comps, .ewords).

    Returns `sent` on success, None after a tolerated decoder failure;
    re-raises after 100 consecutive failures.
    """
    theoracle.input(sent)
    log.write("done preparing\n")
    global decoder_errors
    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        decoder_errors = 0
        if goal is None:
            # route parse failures through the same error-counting path
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.writeln("decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info()))))
        decoder_errors += 1
        if decoder_errors >= 100:
            log.write("decoder failed too many times, passing exception through!\n")
            raise
        else:
            return
    # add oracle features to the forest so get_score can evaluate hypotheses
    goal.rescore(theoracle.models, thedecoder.weights, add=True)
    bestv, best = decoder.get_nbest(goal, 1)[0]
    log.write("done decoding\n")
    bestscore = get_score(bestv, best)
    log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore))
    bestv = theoracle.finish(bestv, best)
    sent.score_comps = bestv
    sent.ewords = [sym.tostring(e) for e in best]
    return sent
def output(node, prompt, gbleu, gscore):
    """Print the Viterbi hypothesis of `node` with its model score and
    BLEU+1, accumulating into `gbleu`/`gscore` in place."""
    deriv = node.viterbi_deriv()
    words = [sym.tostring(e) for e in deriv.english()]
    hyp = " ".join(words)
    bleu = fbleu.rescore(hyp)
    score = weights.dot(deriv.vector())
    # in place!! (callers rely on gbleu/gscore being mutated)
    gbleu += fbleu.copy()
    gscore += [score]
    print >> logs, "%s \tscore=%.4lf\tbleu+1=%.4lf\tlenratio=%.2lf\n%s" % \
        (prompt, score, bleu, fbleu.ratio(), hyp)
def process_heldout(sent):
    """Decode one heldout sentence, rescore with the oracle models, and
    attach the best hypothesis to `sent` (.score_comps, .ewords).

    Returns `sent` on success, None after a tolerated decoder failure;
    re-raises after 100 consecutive failures.
    """
    theoracle.input(sent)
    log.write("done preparing\n")
    global decoder_errors
    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        decoder_errors = 0
        if goal is None:
            # route parse failures through the same error-counting path
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.writeln("decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info()))))
        decoder_errors += 1
        if decoder_errors >= 100:
            log.write("decoder failed too many times, passing exception through!\n")
            raise
        else:
            return
    # add oracle features to the forest so get_score can evaluate hypotheses
    goal.rescore(theoracle.models, thedecoder.weights, add=True)
    bestv, best = decoder.get_nbest(goal, 1)[0]
    log.write("done decoding\n")
    bestscore = get_score(bestv, best)
    log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore))
    bestv = theoracle.finish(bestv, best)
    sent.score_comps = bestv
    sent.ewords = [sym.tostring(e) for e in best]
    return sent
def topological_sort(self):
    """Order self.nonterminals so that every nonterminal follows its unary
    children, breaking cycles arbitrarily; then rebuild the index and
    discard self.unary_children."""
    # now we do a topological sort on the unary immediate domination relation
    if log.level >= 3:
        log.write("Doing topological sort on nonterminals\n")
    self.nonterminals = []
    # make unary_children into graph: ensure every mentioned symbol has a node
    for (x, s) in self.unary_children.items():
        for y in s:
            self.unary_children.setdefault(y, set())
    if log.level >= 3:
        for (x, s) in self.unary_children.items():
            log.write("%s -> %s\n" % (sym.tostring(x), " | ".join(sym.tostring(y) for y in s)))
    # symbols that never appear in the unary relation come first
    for x in sym.nonterminals():
        if not self.unary_children.has_key(x):
            self.nonterminals.append(x)
    # repeatedly remove a childless node; if none exists there is a cycle
    while len(self.unary_children) > 0:
        childless = None
        for (x, s) in self.unary_children.iteritems():
            if len(s) == 0:
                childless = x
                break
        if childless is None:
            sys.stderr.write("cycle of unary productions detected: ")
            childless = self.unary_children.keys()[0]  # arbitrary
            sys.stderr.write("breaking all unary children of %s\n" % sym.tostring(childless))
        del self.unary_children[childless]
        for (x, s) in self.unary_children.iteritems():
            s.discard(childless)
        self.nonterminals.append(childless)
    if len(self.nonterminals) < 1000 and log.level >= 3:
        log.write("Nonterminals: %s\n" % " ".join("%s=%s" % (x, sym.tostring(x)) for x in self.nonterminals))
    self.make_index()
    self.unary_children = None
def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label"""
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x)) for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")
    # index constituents by left edge: ei -> list of (ej, label), sorted by ej
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort()  # stable
    for (ei, ej) in ephrase_index:
        # NOTE(review): `True or` disables the already-labeled check below,
        # so every phrase span is considered — presumably deliberate; confirm.
        if True or not (a.espans.has_key((ei, ej)) and len(a.espans[ei, ej]) > 0):
            for (ej1, x) in ei_index.get(ei, []):
                if ej1 > ej:
                    # (ei,ej) is a proper prefix of constituent (ei,ej1):
                    # label it with the starred constituent label
                    x1 = sym.fromtag(sym.totag(x) + "*")
                    a.espans.setdefault((ei, ej), []).append(x1)
                    prefix_labels.add(x1)
                    break
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x)) for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
def get_nbest(goal, n_best, ambiguity_limit=None):
    """Extract up to `n_best` derivations from the forest rooted at `goal`
    and wrap each as a Hypothesis (words, feature vector, derivation)."""
    if log.level >= 1:
        log.write(" Extracting derivation(s)...\n")
    nbest = forest.NBest(goal, ambiguity_limit=ambiguity_limit)
    hypotheses = []
    for deriv in itertools.islice(nbest, n_best):
        hyp = Hypothesis()
        hyp.words = [sym.tostring(e) for e in deriv.english()]
        hyp.vector = deriv.vector()
        hyp.deriv = str(deriv)
        hypotheses.append(hyp)
    return hypotheses
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append an XML <and> element for hyperedge `node` (features plus
    recursively rendered children) to the `result` list."""
    if weights:
        # label with the rule's id and the weighted edge cost
        result.append(
            '<and label=%s cost=%s>' %
            (xml.sax.saxutils.quoteattr(str(id(node.rule))),
             xml.sax.saxutils.quoteattr(str(weights.dot(node.dcost)))))
    else:
        result.append('<and label=%s>' %
                      (xml.sax.saxutils.quoteattr(str(id(node)))))
    result.append('<features>')
    for f, v in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>' %
                      (xml.sax.saxutils.quoteattr(f),
                       xml.sax.saxutils.quoteattr(str(v))))
    result.append('</features>')
    # mode selects which side of the rule supplies the children
    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants
    for child in children:
        if isinstance(child, Item):
            _item_to_xml(child, result, memo, mode=mode, models=models, weights=weights)
        elif sym.isvar(child):
            # variable: recurse into the corresponding antecedent item
            _item_to_xml(node.ants[sym.getindex(child) - 1], result, memo, mode=mode, models=models, weights=weights)
        else:
            result.append('<leaf label=%s/>' % xml.sax.saxutils.quoteattr(sym.tostring(child)))
    result.append('</and>')
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesized text rendering of hyperedge `node` to
    `result`, also echoing rule ids / words to stdout for debugging."""
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    # lhuang: in case no weights
    vstr = "cost:%s" % weights.dot(node.dcost) if weights is not None \
        else "_"
    rstr = id(node.rule)
    #rstr = id(node)
    s = "ruleid=%s<value=%s>" % (rstr, vstr)
    print "\truleid=%s" % rstr,
    if False and len(node.ants) == 0:
        # the format allows this but only if we don't tag with an id. but we tag everything with an id
        result.append(s)
    else:
        result.append('(')
        result.append(s)
        # mode selects which side of the rule supplies the children
        if mode == 'french':
            children = node.rule.f if node.rule else node.ants
        elif mode == 'english':
            # lhuang: default mode: english side
            children = node.rule.e if node.rule else node.ants
        else:
            children = node.ants
        for child in children:
            if isinstance(child, Item):
                result.append(' it ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child):
                # lhuang: variable, do recursion
                result.append(' var ')
                _item_to_text(node.ants[sym.getindex(child) - 1], result, memo, mode=mode, weights=weights)
            else:
                # lhuang: english word
                result.append(' word ')
                w = quoteattr(sym.tostring(child))
                result.append(w)
                print w,
        result.append(')')
        print  # end of a hyperedge
def get_gold(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    oracleweights = theoracle.make_weights(additive=True)
    # in-place operations: oracleweights might be a subclass of Vector
    oracleweights *= -hope_weight
    oracleweights += weights
    goal.reweight(oracleweights)
    goldv, gold = decoder.get_nbest(goal, 1, 1)[0]
    goldscore = get_score(goldv, gold)
    goldstr = " ".join(sym.tostring(e) for e in gold)
    log.write("gold hyp: %s %s cost=%s score=%s\n"
              % (goldstr, goldv, weights.dot(goldv), goldscore))
    # the learner MUST not see the oracle features
    return theoracle.clean(goldv), gold, goldscore
def finish(self, v, words):
    """Return a copy of v that contains only the features relevant to
    computing a score.  We can also perform any necessary corrections
    to v that are possible knowing the whole output."""
    # Actually, for BLEU we just recompute from scratch
    # postprocessing: delete non-ASCII chars and @UNKNOWN@
    text = " ".join(sym.tostring(w) for w in words)
    text = "".join(c for c in text if ord(c) < 128)
    words = [sym.fromstring(tok) for tok in text.split()]
    v = svector.Vector()
    # count candidate n-grams up to self.order
    cand = collections.defaultdict(int)
    for o in xrange(self.order):
        for i in xrange(len(words) - o):
            cand[tuple(words[i:i + o + 1])] += 1
    # clip matches against the reference n-gram counts
    match = collections.defaultdict(int)
    for ngram in cand:
        match[len(ngram) - 1] += min(cand[ngram], self.oraclemodel.refngrams[ngram])
    for o in xrange(self.order):
        v["oracle.match%d" % o] = match[o]
        v["oracle.guess%d" % o] = max(0, len(words) - o)
    v["oracle.srclen"] = self.wordcounter.srclen
    v["oracle.candlen"] = len(words)
    if self.variant == "ibm":
        # closest reference length (ties broken toward the shorter one)
        v["oracle.reflen"] = min((abs(l - len(words)), l)
                                 for l in self.wordcounter.reflens)[1]
    else:
        v["oracle.reflen"] = self.wordcounter.reflen
    return v
def get_gold(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= -hope_weight
    oracleweights += weights
    goal.reweight(oracleweights)
    goldv, gold = decoder.get_nbest(goal, 1, 1)[0]
    goldscore = get_score(goldv, gold)
    log.write("gold hyp: %s %s cost=%s score=%s\n" % (" ".join(sym.tostring(e) for e in gold), goldv, weights.dot(goldv), goldscore))
    # the learner MUST not see the oracle features
    goldv = theoracle.clean(goldv)
    return goldv, gold, goldscore
def finish(self, v, words): """Return a copy of v that contains only the features relevant to computing a score. We can also perform any necessary corrections to v that are possible knowing the whole output.""" # Actually, for BLEU we just recompute from scratch # postprocessing: delete non-ASCII chars and @UNKNOWN@ words = [sym.tostring(w) for w in words] words = " ".join(words) words = "".join(c for c in words if ord(c) < 128) words = [sym.fromstring(word) for word in words.split()] v = svector.Vector() cand = collections.defaultdict(int) for o in xrange(self.order): for i in xrange(len(words)-o): cand[tuple(words[i:i+o+1])] += 1 match = collections.defaultdict(int) for ngram in cand: match[len(ngram)-1] += min(cand[ngram], self.oraclemodel.refngrams[ngram]) for o in xrange(self.order): v["oracle.match%d" % o] = match[o] v["oracle.guess%d" % o] = max(0,len(words)-o) v["oracle.srclen"] = self.wordcounter.srclen v["oracle.candlen"] = len(words) if self.variant == "ibm": v["oracle.reflen"] = min((abs(l-len(words)), l) for l in self.wordcounter.reflens)[1] else: v["oracle.reflen"] = self.wordcounter.reflen return v
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesized text rendering of hyperedge `node` to
    `result`.

    NOTE(review): `weights` defaults to None but `.dot` is called
    unconditionally — calling with the default will raise; confirm all
    callers pass weights.
    """
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    vstr = "cost:%s" % weights.dot(node.dcost)
    #rstr = id(node.rule)
    rstr = id(node)
    s = "%s<%s>" % (rstr, vstr)
    if False and len(
            node.ants
    ) == 0:  # the format allows this but only if we don't tag with an id. but we tag everything with an id
        result.append(s)
    else:
        result.append('(')
        result.append(s)
        # mode selects which side of the rule supplies the children
        if mode == 'french':
            children = node.rule.f if node.rule else node.ants
        elif mode == 'english':
            children = node.rule.e if node.rule else node.ants
        else:
            children = node.ants
        for child in children:
            if isinstance(child, Item):
                result.append(' ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child):
                # variable: recurse into the corresponding antecedent item
                result.append(' ')
                _item_to_text(node.ants[sym.getindex(child) - 1], result, memo, mode=mode, weights=weights)
            else:
                result.append(' ')
                result.append(quoteattr(sym.tostring(child)))
        result.append(')')
def __str__(self):
    """Human-readable summary: lhs ::= f-side (rule count)."""
    lhs_str = sym.tostring(self.lhs)
    return "%s ::= %s (%d rules)" % (lhs_str, str(self.f), len(self.rules))
def strstate(self, state):
    """Render a state tuple as space-separated symbol strings."""
    parts = [sym.tostring(s) for s in state]
    return " ".join(parts)
% (len(a.etags), len(a.ewords))) a.espans = None if opts.trees: if ebfile is not None: etree = tree.str_to_tree(ebfile.readline()) if etree is None: sys.stderr.write("warning, line %d: null tree" % a.lineno) a.espans = {} elif etree.length != len(a.ewords): sys.stderr.write( "warning, line %d: length mismatch between English words and trees (%d != %d)\n" % (a.lineno, len(a.ewords), etree.length)) sys.stderr.write( " start of English sentence: %s\n" % " ".join([sym.tostring(x) for x in a.ewords[:5]])) a.espans = {} else: remove_req(etree) a.espans = etree.spans() for (span, labels) in a.espans.iteritems(): a.espans[span] = [sym.fromtag(x) for x in labels] # done reading all input lines if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen: continue realcount += 1 if opts.parallel is not None: if realcount % opts.parallel[1] != opts.parallel[ 0] % opts.parallel[1]:
fcount[a.fwords[i]] = fcount.get(a.fwords[i],0)+1 ecount[null] = ecount.get(null,0)+1 fecount[(a.fwords[i],null)] = fecount.get((a.fwords[i],null),0)+1 for j in xrange(len(a.ewords)): if not a.ealigned[j]: count += 1 fcount[null] = fcount.get(null,0)+1 ecount[a.ewords[j]] = ecount.get(a.ewords[j],0)+1 fecount[(null,a.ewords[j])] = fecount.get((null,a.ewords[j]),0)+1 progress += 1 if progress % 10000 == 0: sys.stderr.write(".") # Dump lexical weights for (fword,eword) in fecount.keys(): if opts.ratiofile: # f|e c12 = fecount[fword,eword] c1 = ecount[eword] c2 = fcount[fword] p = float(c2)/count p1 = float(c12)/c1 p2 = float(c2-c12)/(count-c1) ratiofile.write("%s %s %f\n" % (sym.tostring(eword), sym.tostring(fword), -2*llr(count,ecount[eword],fcount[fword],fecount[fword,eword]))) if opts.weightfiles: fweightfile.write("%s %s %f\n" % (sym.tostring(fword), sym.tostring(eword), float(fecount[(fword,eword)])/ecount[eword])) eweightfile.write("%s %s %f\n" % (sym.tostring(eword), sym.tostring(fword), float(fecount[(fword,eword)])/fcount[fword])) sys.stderr.write("\n")
def __str__(self):
    """Chart item as [label,i,j,states,cost=viterbi]; the goal item has
    no label and prints as [Goal]."""
    if self.x is None:
        return "[Goal]"
    label = sym.tostring(self.x)
    return "[%s,%d,%d,%s,cost=%s]" % (label, self.i, self.j,
                                      str(self.states), self.viterbi)
def traverse(self, right_idx=0, right_widx=0, fsent=None, rules=None, nodememo=None):
    ''' helper called by dump(); returns a string; figure out span

    Walks the forest depth-first, pruning each node to the best
    max_edges_per_node edges, recovering each node's char span (i, j) and
    word span (wi, wj) from the rule's RHS, and assigning preorder ids in
    nodememo: id(node) -> (node id, accumulated edge count).
    '''
    if nodememo is None:
        nodememo = {}
    if id(self) in nodememo:
        return
    # sort edges by weighted cost and keep only the cheapest few
    deds = [(ded.dcost.dot(weights), ded) for ded in self.deds]
    deds.sort()
    deds = [x for _, x in deds[:max_edges_per_node]]
    self.deds = deds  # prune!
    nedges = len(deds)  # accumulating number of edges, recursively
    self.i = right_idx
    self.wi = right_widx
    for dedid, ded in enumerate(deds):
        try:
            rule = rules[ded.ruleid]
        except:
            print >> sys.stderr, "WARNING: rule %d not found" % ded.ruleid
            ## assuming it's a one-word UNKNOWN rule
            ## TODO: check with lattice
            unkword = fsent[self.wi]
            rule = 'UNKNOWN("@UNKNOWN@") -> "%s"' % unkword  # in reverse order
            rules[ded.ruleid] = rule
            print >> sys.stderr, "  covering " + unkword
        self.x = rule.split("(", 1)[0]  # non-terminal label
        # analyse RHS (chinese side)
        lhs, rhs = rule.split(" -> ", 1)  ## -> might be a word
        # deal with lhs; convert to ded.lhsstr = ["...", "...", Item(...), "..."]
        varid = 0
        lhsstr = []
        for child in ded.rule.e:
            if sym.isvar(child):
                lhsstr.append(ded.ants[varid])
                varid += 1
            else:
                lhsstr.append(quoteattr(sym.tostring(child)))
        # will be used in _dump()
        ded.lhsstr = lhsstr
        # scan the RHS right-to-left, recording for each variable the
        # char/word widths of the terminal gap to its right
        vars = []
        chars_in_gap = 0
        words_in_gap = 0
        for it in reversed(rhs.split()):  ## from RIGHT to LEFT!! N.B. can't split(" ")
            if it[0] == "x":
                # variable:
                var = int(it[1:])
                vars.append((var, chars_in_gap, words_in_gap))
                chars_in_gap = 0
                words_in_gap = 0
            else:
                # strip off quotes "..."
                it = it[1:-1]
                # calculate char-length
                if it == foreign_sentence_tag:
                    # <foreign-sentence>: glue symbol is not counted!
                    chars_in_gap += 0
                    words_in_gap += 0
                else:
                    # 1 for word, len(...) for char
                    chars_in_gap += len(words_to_chars(it, encode_back=True))
                    words_in_gap += 1
        accumu = self.i  ## left boundary
        waccumu = self.wi
        for i, c_gap, w_gap in vars:
            ##for sub in ded.ants:
            sub = ded.ants[i]
            if id(sub) not in nodememo:
                sub.traverse(accumu + c_gap, waccumu + w_gap, fsent, rules, nodememo)
                # accumulating # of edges (if first seen)
                nedges += nodememo[id(sub)][1]
            ## don't accumulate subs now; will do in another visit
            ## s += subs
            accumu = sub.j
            waccumu = sub.wj
        # right boundary: last variable's end plus trailing terminal gap
        tmp_j = (ded.ants[vars[-1][0]].j if vars != [] else self.i) + chars_in_gap
        if self.j is not None and self.j != tmp_j:
            assert False, "@sentence %d, node #%s, %d %d != %d %s rule %d" % \
                (opts.sentid, self.nodeid, self.i, self.j, tmp_j, self.x, ded.ruleid)
        self.j = tmp_j
        tmp_wj = (ded.ants[vars[-1][0]].wj if vars != [] else self.wi) + words_in_gap
        ##
        self.wj = tmp_wj
    # preorder id; nodememo also carries the accumulated edge count
    self.id = len(nodememo) + 1
    nodememo[id(self)] = (self.id, nedges)
def __reduce__(self):
    """Pickle support: rebuild this Item from its label string, span, and
    edges.

    Bug fix: the original returned `(..., i, j, ...)` where `i` and `j`
    were unbound names, raising NameError at pickle time; the item's own
    span attributes are intended.
    """
    return (Item, (sym.tostring(self.x), self.i, self.j, self.deds))
def __str__(self):
    """Chart item as [label,i,j,states,cost=viterbi]; the goal item has
    no label and prints as [Goal]."""
    if self.x is None:
        return "[Goal]"
    else:
        return "[%s,%d,%d,%s,cost=%s]" % (sym.tostring(self.x), self.i, self.j, str(self.states), self.viterbi)
def process(sent):
    """Decode one sentence and perform a MIRA-style cutting-plane weight
    update (hope/fear hypotheses), optionally exchanging updates with
    other MPI nodes.

    Returns `sent` with .score_comps/.ewords set, or None after a
    tolerated decoder failure; re-raises after 100 consecutive failures.
    """
    global alphas
    if online_learning:
        # each sentence starts a fresh working set in the online setting
        updates.clear()
        alphas.clear()
    theoracle.input(sent)
    log.write("done preparing\n")
    global decoder_errors
    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        decoder_errors = 0
        if goal is None:
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.writeln("decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info()))))
        decoder_errors += 1
        if decoder_errors >= 100:
            log.write("decoder failed too many times, passing exception through!\n")
            raise
        else:
            return
    goal.rescore(theoracle.models, thedecoder.weights, add=True)
    bestv, best = decoder.get_nbest(goal, 1)[0]
    log.write("done decoding\n")
    bestscore = get_score(bestv, best)
    log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore))
    goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)
    assert (sent.id not in updates)  # in batch learning, this can happen, and we would have to undo the update associated with this sentence
    # seed this sentence's working set with a zero update
    updates[sent.id] = [(svector.Vector(), 0.0)]
    alphas[sent.id] = [max_learning_rate]
    if opts.parallel:
        # drain updates broadcast by other nodes
        while True:
            if mpi.world.iprobe(tag=1):
                (sentid, vscores) = mpi.world.recv(tag=1)
                log.write("received update for %s\n" % (sentid,))
                if sentid in updates:
                    # see comment above
                    log.write("ignoring update for %s\n" % (sentid,))
                    continue  # drop this update on the floor
                updates[sentid] = vscores
                alphas[sentid] = [max_learning_rate] + [0.0] * (len(vscores) - 1)
                # since the first update is zero, the alphas & updates
                # are still consistent with weights
            else:
                break
    def oracle(weights):
        # loss-augmented hypotheses relative to the gold derivation
        hyps = get_hyps(sent, goal, weights)
        return [(goldv - hypv, goldscore - hypscore) for (hypv, hyp, hypscore) in hyps]
    thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates, alphas, {sent.id: oracle})
    remove_zeros(thedecoder.weights)
    log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))
    log.write("weight norm: %s\n" % (math.sqrt(thedecoder.weights.normsquared())))
    # update weight sum for averaging
    global nweights, sumweights_helper
    # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
    for sentid in updates:
        for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
            apply_update(sumweights_helper, nweights * alpha * v)
    nweights += 1
    # update feature scales
    if update_feature_scales:
        global sum_updates2, n_updates, feature_scales
        for sentid in updates:
            u = svector.Vector()
            for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                u += alpha / max_learning_rate * v
            sum_updates2 += u * u
            n_updates += 1
        try:
            default_feature_scale = 1.0 / compute_variance(0, n_updates)
        except ZeroDivisionError:
            default_feature_scale = 0.0  # pseudoinverse
        feature_scales = collections.defaultdict(lambda: default_feature_scale)
        for feat in sum_updates2:
            try:
                feature_scales[feat] = 1.0 / compute_variance(sum_updates2[feat], n_updates)
            except ZeroDivisionError:
                feature_scales[feat] = 0.0  # pseudoinverse
        log.write("feature scales: %s\n" % (" ".join("%s=%s" % (f, feature_scales[f]) for f in watch_features if f in feature_scales)))
    if opts.parallel:
        # flush out filled requests
        global requests
        requests = [request for request in requests if not request.test()]
        # transmit updates to other nodes
        for node in parallel.slaves:
            if node != parallel.rank:
                requests.append(mpi.world.isend(value=(sent.id, updates[sent.id]), dest=node, tag=1))
    bestv = theoracle.finish(bestv, best)
    theoracle.update(bestv)
    sent.score_comps = bestv
    if log.level >= 1:
        gc.collect()
        log.write("done updating, memory = %s\n" % monitor.memory())
    sent.ewords = [sym.tostring(e) for e in best]
    return sent
def process(sent):
    """Decode sentence `sent` and emit all requested outputs.

    Optionally dumps the translation forest as JSON (opts.forest_dir) and
    per-rule posteriors (opts.rule_posterior_dir), then writes the n-best
    list, French/English parse trees, and attaches the best translation to
    `sent`.  Returns `sent`, or None if the decoder produced no forest.

    NOTE(review): source was collapsed onto one physical line; indentation
    is reconstructed.  Also note process_output() is called before the
    `goal is None` check — confirm it tolerates a None forest.
    """
    goal = thedecoder.translate(sent)
    thedecoder.process_output(sent, goal)
    if goal is None:
        return None

    if opts.forest_dir:
        # One gzipped JSON forest per sentence.
        forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
        forest_file.write(forest.forest_to_json(goal, fwords=sent.fwords,
                                                mode='english',
                                                models=thedecoder.models,
                                                weights=thedecoder.weights))
        forest_file.close()

    if opts.rule_posterior_dir:
        rule_posterior_file = open(os.path.join(opts.rule_posterior_dir, "rule_posterior.%s" % sent.id), "w")
        # Inside/outside with inverse temperature beta=1 (true posteriors).
        beta = 1.
        insides = goal.compute_inside(thedecoder.weights, beta=beta)
        outsides = goal.compute_outside(thedecoder.weights, insides, beta=beta)
        z = insides[id(goal)]  # partition function (inside score of the root)
        for item in goal.bottomup():
            for ded in item.deds:
                # c = -log posterior of this deduction:
                # outside(head) + rule cost + sum of antecedent insides - Z.
                c = outsides[id(item)]
                c += thedecoder.weights.dot(ded.dcost)
                c += sum(insides[id(ant)] for ant in ded.ants)
                c -= z
                rule_posterior_file.write("%s ||| span=%s posterior=%s\n" % (ded.rule, (item.i, item.j), cost.prob(c)))
                # Stash the posterior on the deduction for the reweight below.
                ded.dcost['posterior'] = c
        rule_posterior_file.close()

        # Also write the maximum-posterior derivation.
        max_posterior_file = open(os.path.join(opts.rule_posterior_dir, "max_posterior.%s" % sent.id), "w")
        goal.reweight(svector.Vector('posterior=1'))
        max_posterior = goal.viterbi_deriv()

        def show(ded, antvalues):
            # Render each deduction as "[<posterior> <english yield> ]".
            if ded.rule:
                value = ded.rule.e.subst((), antvalues)
            else:
                value = antvalues[0]
            return ("[%.3f" % cost.prob(ded.dcost['posterior']),) + value + ("]",)

        value = max_posterior.value(show)
        s = " ".join((sym.tostring(e) if type(e) is int else e) for e in value)
        max_posterior_file.write("%s\n" % s)
        max_posterior_file.close()

    outputs = get_nbest(goal, n_best, ambiguity_limit)

    if n_best_file:
        for (v, e) in outputs:
            e = " ".join(sym.tostring(w) for w in e)
            #n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, -thedecoder.weights.dot(v)))
            n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, v))
        n_best_file.flush()

    (bestv, best) = outputs[0]

    if french_parse_file:
        french_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().french_tree()))
        french_parse_file.flush()
    if english_parse_file:
        english_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().english_tree()))
        english_parse_file.flush()

    if log.level >= 1:
        gc.collect()
        log.write(" done decoding, memory=%s\n" % monitor.memory())

    log.write(" features: %s; %s\n" % (bestv, thedecoder.weights.dot(bestv)))

    sent.ewords = [sym.tostring(e) for e in best]
    return sent
# NOTE(review): fragment — the enclosing sentence loop (over alignments `a`
# and English positions `j`) starts outside this view; indentation of the
# first statements is reconstructed and must be confirmed against the
# original file.
# Count co-occurrences of NULL with unaligned English word j.
fecount[(null, a.ewords[j])] = fecount.get(
    (null, a.ewords[j]), 0) + 1
progress += 1
if progress % 10000 == 0:
    sys.stderr.write(".")  # progress tick every 10k sentences

# Dump lexical weights
for (fword, eword) in fecount.keys():
    if opts.ratiofile:
        # f|e
        c12 = fecount[fword, eword]
        c1 = ecount[eword]
        c2 = fcount[fword]
        # NOTE(review): p, p1, p2 are computed but never used below —
        # the llr() call is passed the raw counts instead.
        p = float(c2) / count
        p1 = float(c12) / c1
        p2 = float(c2 - c12) / (count - c1)
        # -2*log-likelihood-ratio association score for the (e, f) pair.
        ratiofile.write("%s %s %f\n" % (sym.tostring(eword), sym.tostring(fword),
                                        -2 * llr(count, ecount[eword], fcount[fword],
                                                 fecount[fword, eword])))
    if opts.weightfiles:
        # Relative-frequency lexical weights: P(f|e) and P(e|f).
        fweightfile.write("%s %s %f\n" % (sym.tostring(fword), sym.tostring(eword),
                                          float(fecount[(fword, eword)]) / ecount[eword]))
        eweightfile.write("%s %s %f\n" % (sym.tostring(eword), sym.tostring(fword),
                                          float(fecount[(fword, eword)]) / fcount[fword]))
sys.stderr.write("\n")
def output(f):
    """Return a one-line summary of forest *f*'s Viterbi derivation.

    The line shows the English hypothesis, the derivation itself, and its
    feature vector, in the format ``hyp={{{...}}} derivation={{{...}}} <vec>``.
    """
    best_deriv = f.viterbi_deriv()
    feats = best_deriv.vector()
    words = best_deriv.english()
    hyp_text = " ".join(sym.tostring(token) for token in words)
    return "hyp={{{%s}}} derivation={{{%s}}} %s" % (hyp_text, best_deriv, feats)
def process(sent):
    """Run one online-learning step on sentence `sent`.

    Decodes the sentence, computes the model-best and oracle ("gold")
    hypotheses, records a perceptron/MIRA-style update, re-optimizes the
    weights via cutting_plane, maintains weight-averaging and feature-scale
    statistics, and (under MPI) exchanges updates with other workers.
    Returns `sent` annotated with the best translation, or None when the
    decoder fails (fewer than 100 consecutive times).

    NOTE(review): source was collapsed onto a few physical lines;
    indentation below is reconstructed — confirm nesting (esp.
    `nweights += 1` and `n_updates += 1`, assumed per-sentence) against
    the original file.
    """
    global alphas
    if online_learning:
        # Each step works on a fresh set of updates/learning rates.
        updates.clear()
        alphas.clear()

    theoracle.input(sent)

    log.write("done preparing\n")

    # Tolerate intermittent decoder failures, but give up after 100 in a row.
    global decoder_errors
    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        decoder_errors = 0
        if goal is None:
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.writeln(
            "decoder raised exception: %s %s" %
            (sent, "".join(traceback.format_exception(*sys.exc_info()))))
        decoder_errors += 1
        if decoder_errors >= 100:
            log.write(
                "decoder failed too many times, passing exception through!\n"
            )
            raise
        else:
            return

    # Add the oracle models' scores into the forest, then pull the 1-best.
    goal.rescore(theoracle.models, thedecoder.weights, add=True)

    bestv, best = decoder.get_nbest(goal, 1)[0]
    log.write("done decoding\n")

    bestscore = get_score(bestv, best)
    log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(
        sym.tostring(e) for e in best), bestv,
        thedecoder.weights.dot(bestv), bestscore))

    goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

    assert (
        sent.id not in updates
    )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence
    # Seed this sentence's update list with a zero vector so the alphas
    # and updates stay consistent with the current weights.
    updates[sent.id] = [(svector.Vector(), 0.)]
    alphas[sent.id] = [max_learning_rate]

    if opts.parallel:
        # Drain any updates other workers have broadcast (tag=1).
        while True:
            if mpi.world.iprobe(tag=1):
                (sentid, vscores) = mpi.world.recv(tag=1)
                log.write("received update for %s\n" % (sentid, ))
                if sentid in updates:  # see comment above
                    log.write("ignoring update for %s\n" % (sentid, ))
                    continue  # drop this update on the floor
                updates[sentid] = vscores
                alphas[sentid] = [max_learning_rate] + [0.] * (len(vscores) - 1)
                # since the first update is zero, the alphas & updates
                # are still consistent with weights
            else:
                break

    def oracle(weights):
        # For cutting_plane: margin-violating hypotheses relative to the gold.
        hyps = get_hyps(sent, goal, weights)
        return [(goldv - hypv, goldscore - hypscore)
                for (hypv, hyp, hypscore) in hyps]

    thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates,
                                               alphas, {sent.id: oracle})
    remove_zeros(thedecoder.weights)

    log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))
    log.write("weight norm: %s\n" % (math.sqrt(thedecoder.weights.normsquared())))

    # update weight sum for averaging
    # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
    global nweights, sumweights_helper
    for sentid in updates:
        for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
            apply_update(sumweights_helper, nweights * alpha * v)
        nweights += 1

    # update feature scales
    if update_feature_scales:
        global sum_updates2, n_updates, feature_scales
        for sentid in updates:
            # u = total (alpha-weighted) update applied for this sentence.
            u = svector.Vector()
            for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                u += alpha / max_learning_rate * v
            sum_updates2 += u * u
            n_updates += 1
        try:
            default_feature_scale = 1. / compute_variance(0, n_updates)
        except ZeroDivisionError:
            default_feature_scale = 0.  # pseudoinverse
        feature_scales = collections.defaultdict(
            lambda: default_feature_scale)
        for feat in sum_updates2:
            try:
                feature_scales[feat] = 1. / compute_variance(
                    sum_updates2[feat], n_updates)
            except ZeroDivisionError:
                feature_scales[feat] = 0.  # pseudoinverse

        log.write(
            "feature scales: %s\n" %
            (" ".join("%s=%s" % (f, feature_scales[f])
                      for f in watch_features if f in feature_scales)))

    if opts.parallel:
        # flush out filled requests
        global requests
        requests = [request for request in requests if not request.test()]

        # transmit updates to other nodes
        for node in parallel.slaves:
            if node != parallel.rank:
                requests.append(
                    mpi.world.isend(value=(sent.id, updates[sent.id]),
                                    dest=node,
                                    tag=1))

    # Let the oracle fold the best hypothesis back into its running state.
    bestv = theoracle.finish(bestv, best)
    theoracle.update(bestv)
    sent.score_comps = bestv

    if log.level >= 1:
        gc.collect()
        log.write("done updating, memory = %s\n" % monitor.memory())

    sent.ewords = [sym.tostring(e) for e in best]
    return sent
# NOTE(review): fragment — this code sits inside a sentence-reading loop
# whose header starts outside this view (`continue` below targets it);
# indentation is reconstructed and must be confirmed against the original.
# Read English POS tags for this sentence and sanity-check lengths.
a.etags = etfile.readline().split()
if len(a.ftags) != len(a.fwords):
    sys.stderr.write("warning: length mismatch between French words and tags (%d != %d)\n" % (len(a.ftags), len(a.fwords)))
if len(a.etags) != len(a.ewords):
    sys.stderr.write("warning: length mismatch between English words and tags (%d != %d)\n" % (len(a.etags), len(a.ewords)))

a.espans = None
if opts.trees:
    if ebfile is not None:
        # Parse the English bracketed tree and extract labeled spans.
        etree = tree.str_to_tree(ebfile.readline())
        if etree is None:
            sys.stderr.write("warning, line %d: null tree" % a.lineno)
            a.espans = {}
        elif etree.length != len(a.ewords):
            sys.stderr.write("warning, line %d: length mismatch between English words and trees (%d != %d)\n" % (a.lineno, len(a.ewords), etree.length))
            sys.stderr.write(" start of English sentence: %s\n" % " ".join([sym.tostring(x) for x in a.ewords[:5]]))
            a.espans = {}
        else:
            remove_req(etree)
            a.espans = etree.spans()
            # Intern the span labels as symbols.
            for (span, labels) in a.espans.iteritems():
                a.espans[span] = [sym.fromtag(x) for x in labels]

# done reading all input lines

if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen:
    continue
realcount += 1
# In parallel mode, only handle sentences assigned to this worker
# (round-robin by realcount modulo the number of workers).
if opts.parallel is not None:
    if realcount % opts.parallel[1] != opts.parallel[0] % opts.parallel[1]:
        continue