def _str_helper(self, item, accum):
    """Append a bracketed rendering of the derivation under `item` to `accum`.

    The deduction for `item` is looked up in `self.ded` by `id(item)`.  Its
    label is the left-hand side of the deduction's rule, or the symbol for
    the tag "-" when the deduction has no rule.  A deduction without
    antecedents contributes just its tag; otherwise the output has the form
    "(" tag " " child " " child ... ")", with each child rendered
    recursively.

    `accum` is a list of string pieces that is extended in place; nothing
    is returned.
    """
    deduction = self.ded[id(item)]
    head = deduction.rule.lhs if deduction.rule else sym.fromtag("-")

    # Leaf deduction: emit the bare tag and stop.
    if not len(deduction.ants) > 0:
        accum.append(sym.totag(head))
        return

    accum.extend(["(", sym.totag(head)])
    for antecedent in deduction.ants:
        accum.append(" ")
        self._str_helper(antecedent, accum)
    accum.append(")")
def add_constituent_prefixes(a, ephrase_index):
    """If a phrase is a prefix of a constituent, give it a fake label.

    For every span (ei, ej) in `ephrase_index`, find the labelled span in
    `a.espans` that starts at the same `ei` and has the smallest end
    position strictly greater than `ej`.  When one exists, a new symbol
    whose tag is that span's tag followed by "*" is appended to
    `a.espans[ei, ej]` (the entry is created if missing) and also added to
    the `prefix_labels` collection defined outside this function.

    `a.espans` is modified in place; nothing is returned.  When
    `log.level >= 3`, the contents of `a.espans` are written to the log
    before and after the update.
    """

    def dump_espans():
        # One (start, end, label-string) triple per label in a.espans.
        log.write(str([(i, j, sym.tostring(x))
                       for ((i, j), l) in a.espans.iteritems() for x in l]))

    if log.level >= 3:
        dump_espans()
        log.write("\n")

    # Group the labelled spans by start position: ei -> [(ej, label), ...].
    # Labels of one span are inserted in reverse order.
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])

    # Order each group by end position ONLY.  Sorting the whole (ej, label)
    # tuples would break ties on ej by the label's symbol value, discarding
    # the reversed-label order set up above; a key on ej lets the sort's
    # stability preserve that order.
    for candidates in ei_index.itervalues():
        candidates.sort(key=lambda entry: entry[0])

    # NOTE: a check that skipped spans which already carry a label used to
    # guard this loop but was disabled (`if True or ...`), so every phrase
    # span is considered.
    for (ei, ej) in ephrase_index:
        for (ej1, x) in ei_index.get(ei, []):
            if ej1 > ej:
                x1 = sym.fromtag(sym.totag(x) + "*")
                a.espans.setdefault((ei, ej), []).append(x1)
                prefix_labels.add(x1)
                # Only the first (shortest) enclosing constituent is used.
                break

    if log.level >= 3:
        dump_espans()
        log.write("\n---\n")
def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    """Label unlabelled phrase spans with a concatenation of constituent labels.

    A chart over positions 0..len(a.ewords) is seeded with the first label
    of every span in `a.espans`.  For each span of width 2..`maxabslen`
    that has no chart entry, the split point whose two halves together use
    the fewest labels is chosen (the first such split wins ties); the span
    is filled with the concatenated label lists if that count does not
    exceed `consts`.

    Finally, each (ei, ej) in `ephrase_index` that is not already a key of
    `a.espans` but has a chart entry receives a single new label whose tag
    is the chart labels' tags joined with "_".  `a.espans` is modified in
    place; nothing is returned.
    """
    elen = len(a.ewords)
    size = elen + 1
    chart = [[None] * size for _ in xrange(size)]

    for ((start, end), labels) in a.espans.iteritems():
        chart[start][end] = [labels[0]]  # take the highest label

    for width in xrange(2, maxabslen + 1):
        for start in xrange(elen - width + 1):
            end = start + width
            if chart[start][end] is not None:
                # must be a singleton
                continue

            bestsplit = None
            bestlen = None
            for mid in xrange(start + 1, end):
                left = chart[start][mid]
                right = chart[mid][end]
                if left is None or right is None:
                    continue
                total = len(left) + len(right)
                if bestlen is None or total < bestlen:
                    bestsplit = mid
                    bestlen = total

            if bestlen is not None and bestlen <= consts:
                chart[start][end] = chart[start][bestsplit] + chart[bestsplit][end]

    for (ei, ej) in ephrase_index:
        if (ei, ej) in a.espans:
            continue
        pieces = chart[ei][ej]
        if pieces is None:
            continue
        joined_tag = "_".join(sym.totag(x) for x in pieces)
        a.espans[ei, ej] = [sym.fromtag(joined_tag)]
def _item_to_xml(node, result, memo, mode, models, weights):
    """Append the XML rendering of forest node `node` to `result`.

    The first time a node is seen it is written as an
    `<or id=... label=... states=... fspan=...>` element containing its
    deductions (rendered by `_ded_to_xml`), and a back-reference string
    `<or ref=.../>` is stored in `memo` under `id(node)`.  Any later visit
    to the same node appends only that back-reference.

    Parameters:
        node: forest item; reads `x`, `i`, `j`, `states` and `deds`.
        result: list of output string pieces, extended in place.
        memo: dict from `id(node)` to the node's back-reference string;
            its size also provides the next node id.
        mode, models, weights: passed through to `_ded_to_xml`; `models`
            is also used to render the node's states via `strstate`.

    Returns nothing.
    """
    quoteattr = xml.sax.saxutils.quoteattr

    key = id(node)
    if key in memo:
        result.append(memo[key])
        return

    nodeid = str(len(memo) + 1)
    # Register before recursing so that descendants referring back to this
    # node emit a reference instead of expanding it again.
    memo[key] = "<or ref=%s/>" % quoteattr(nodeid)

    states = []
    for mi, m in enumerate(models):
        try:
            s = m.strstate(node.states[mi])
        except IndexError:
            # No state available for this model; leave it out.
            continue
        if s:
            states.append(s)
    states = ",".join(states)

    result.append('<or id=%s label=%s states=%s fspan=%s>' % (
        quoteattr(nodeid),
        quoteattr(sym.totag(node.x) if node.x else "None"),
        quoteattr(states),
        quoteattr("%s,%s" % (node.i, node.j))))

    # Keep only the top k deductions to slim the forest.  Sort on `viterbi`
    # alone: sorting (viterbi, ded) tuples compares the deduction objects
    # themselves whenever two costs tie, which gives an arbitrary order (and
    # is a TypeError on interpreters without default object ordering).  With
    # a key, ties keep their order in node.deds.
    deds = sorted(node.deds, key=lambda ded: ded.viterbi)
    if max_deds:
        deds = deds[:max_deds]

    for ded in deds:
        _ded_to_xml(ded, result, memo, mode=mode, models=models, weights=weights)

    result.append('</or>')
def _fake_tree_helper(lhs, rhs, antvalues):
    """Build a `tree.Node` for one rule application.

    Each symbol of `rhs` yields one child: a variable symbol is replaced by
    the entry of `antvalues` selected by its index (`sym.getindex`, taken as
    1-based), and any other symbol becomes a childless node labelled with
    its string form.  The returned node is labelled with the tag of `lhs`.
    """

    def child_for(symbol):
        if sym.isvar(symbol):
            return antvalues[sym.getindex(symbol) - 1]
        return tree.Node(sym.tostring(symbol), [])

    children = [child_for(symbol) for symbol in rhs]
    return tree.Node(sym.totag(lhs), children)
def _fake_tree_helper(lhs, rhs, antvalues):
    """Build a `tree.Node` for one rule application.

    Each symbol of `rhs` yields one child: a variable symbol is replaced by
    the entry of `antvalues` selected by its index (`sym.getindex`, taken as
    1-based), and any other symbol becomes a childless node labelled with
    its string form.  The returned node is labelled with the tag of `lhs`.
    """

    def child_for(symbol):
        if sym.isvar(symbol):
            return antvalues[sym.getindex(symbol) - 1]
        return tree.Node(sym.tostring(symbol), [])

    children = [child_for(symbol) for symbol in rhs]
    return tree.Node(sym.totag(lhs), children)
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Render the forest obtained by iterating `root` as a JSON string.

    The output object has these members, in order:
      "source": the words of `fwords` (only when `fwords` is non-empty);
                integer entries are converted with `sym.tostring`.
      "nodes":  one object per item yielded by iterating `root`, carrying a
                "label" (the tag of `item.x`) unless `item.x` is None.
      "root":   the position of `root` itself within "nodes".
      "edges":  one object per deduction, with the "head" node index, the
                "tails" (node indices and/or quoted terminal strings) and
                the "features" taken from `ded.dcost`.

    `mode` selects what the tails are read from: 'french' uses the rule's
    `f` side, 'english' its `e` side (both fall back to the antecedents
    when the deduction has no rule); anything else uses the antecedents.
    `models` and `weights` are accepted but not used here.
    """
    pieces = ['{\n']

    if fwords:
        fwords = [(sym.tostring(word) if type(word) is int else word)
                  for word in fwords]
        source = ",".join(quotejson(word) for word in fwords)
        pieces.append(' "source": [%s],\n' % source)

    items = list(root)
    nodeindex = {}
    nodestrs = []
    for position, item in enumerate(items):
        nodeindex[item] = position
        if item is root:
            ri = position
        if item.x is None:
            nodestrs.append(' {}')
        else:
            nodestrs.append(' {"label": %s}' % quotejson(sym.totag(item.x)))
    pieces.append(' "nodes": [\n%s\n ],\n' % ",\n".join(nodestrs))
    pieces.append(' "root": %d,\n' % ri)

    def render_tail(ded, child):
        # A child is a forest item, a variable standing for one of the
        # deduction's antecedents, or a terminal symbol.
        if isinstance(child, Item):
            return str(nodeindex[child])
        if sym.isvar(child):
            antecedent = ded.ants[sym.getindex(child) - 1]
            return str(nodeindex[antecedent])
        return quotejson(sym.tostring(child))

    edgestrs = []
    for head, item in enumerate(items):
        for ded in item.deds:
            if mode in ('french', 'english') and ded.rule:
                children = ded.rule.f if mode == 'french' else ded.rule.e
            else:
                children = ded.ants

            tailstrs = [render_tail(ded, child) for child in children]
            features = ",".join("%s:%s" % (quotejson(name), value)
                                for (name, value) in ded.dcost.iteritems())
            dcoststr = "{%s}" % features
            edgestrs.append(
                ' {"head": %s, "tails": [%s], "features": %s}\n'
                % (head, ",".join(tailstrs), dcoststr))

    pieces.append(' "edges": [\n%s\n ]\n' % ",\n".join(edgestrs))
    pieces.append('}')
    return "".join(pieces)
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Render the forest obtained by iterating `root` as a JSON string.

    The output object has these members, in order:
      "source": the words of `fwords` (only when `fwords` is non-empty);
                integer entries are converted with `sym.tostring`.
      "nodes":  one object per item yielded by iterating `root`, carrying a
                "label" (the tag of `item.x`) unless `item.x` is None.
      "root":   the position of `root` itself within "nodes".
      "edges":  one object per deduction, with the "head" node index, the
                "tails" (node indices and/or quoted terminal strings) and
                the "features" taken from `ded.dcost`.

    `mode` selects what the tails are read from: 'french' uses the rule's
    `f` side, 'english' its `e` side (both fall back to the antecedents
    when the deduction has no rule); anything else uses the antecedents.
    `models` and `weights` are accepted but not used here.
    """
    pieces = ['{\n']

    if fwords:
        fwords = [(sym.tostring(word) if type(word) is int else word)
                  for word in fwords]
        source = ",".join(quotejson(word) for word in fwords)
        pieces.append(' "source": [%s],\n' % source)

    items = list(root)
    nodeindex = {}
    nodestrs = []
    for position, item in enumerate(items):
        nodeindex[item] = position
        if item is root:
            ri = position
        if item.x is None:
            nodestrs.append(' {}')
        else:
            nodestrs.append(' {"label": %s}' % quotejson(sym.totag(item.x)))
    pieces.append(' "nodes": [\n%s\n ],\n' % ",\n".join(nodestrs))
    pieces.append(' "root": %d,\n' % ri)

    def render_tail(ded, child):
        # A child is a forest item, a variable standing for one of the
        # deduction's antecedents, or a terminal symbol.
        if isinstance(child, Item):
            return str(nodeindex[child])
        if sym.isvar(child):
            antecedent = ded.ants[sym.getindex(child) - 1]
            return str(nodeindex[antecedent])
        return quotejson(sym.tostring(child))

    edgestrs = []
    for head, item in enumerate(items):
        for ded in item.deds:
            if mode in ('french', 'english') and ded.rule:
                children = ded.rule.f if mode == 'french' else ded.rule.e
            else:
                children = ded.ants

            tailstrs = [render_tail(ded, child) for child in children]
            features = ",".join("%s:%s" % (quotejson(name), value)
                                for (name, value) in ded.dcost.iteritems())
            dcoststr = "{%s}" % features
            edgestrs.append(
                ' {"head": %s, "tails": [%s], "features": %s}\n'
                % (head, ",".join(tailstrs), dcoststr))

    pieces.append(' "edges": [\n%s\n ]\n' % ",\n".join(edgestrs))
    pieces.append('}')
    return "".join(pieces)
def _item_to_xml(node, result, memo, mode, models, weights):
    """Append the XML rendering of forest node `node` to `result`.

    The first time a node is seen it is written as an
    `<or id=... label=... states=... fspan=...>` element containing its
    deductions (rendered by `_ded_to_xml`), and a back-reference string
    `<or ref=.../>` is stored in `memo` under `id(node)`.  Any later visit
    to the same node appends only that back-reference.

    Parameters:
        node: forest item; reads `x`, `i`, `j`, `states` and `deds`.
        result: list of output string pieces, extended in place.
        memo: dict from `id(node)` to the node's back-reference string;
            its size also provides the next node id.
        mode, models, weights: passed through to `_ded_to_xml`; `models`
            is also used to render the node's states via `strstate`.

    Returns nothing.
    """
    quoteattr = xml.sax.saxutils.quoteattr

    key = id(node)
    if key in memo:
        result.append(memo[key])
        return

    nodeid = str(len(memo) + 1)
    # Register before recursing so that descendants referring back to this
    # node emit a reference instead of expanding it again.
    memo[key] = "<or ref=%s/>" % quoteattr(nodeid)

    states = []
    for mi, m in enumerate(models):
        try:
            s = m.strstate(node.states[mi])
        except IndexError:
            # No state available for this model; leave it out.
            continue
        if s:
            states.append(s)
    states = ",".join(states)

    result.append('<or id=%s label=%s states=%s fspan=%s>' % (
        quoteattr(nodeid),
        quoteattr(sym.totag(node.x) if node.x else "None"),
        quoteattr(states),
        quoteattr("%s,%s" % (node.i, node.j))))

    # Keep only the top k deductions to slim the forest.  Sort on `viterbi`
    # alone: sorting (viterbi, ded) tuples compares the deduction objects
    # themselves whenever two costs tie, which gives an arbitrary order (and
    # is a TypeError on interpreters without default object ordering).  With
    # a key, ties keep their order in node.deds.
    deds = sorted(node.deds, key=lambda ded: ded.viterbi)
    if max_deds:
        deds = deds[:max_deds]

    for ded in deds:
        _ded_to_xml(ded, result, memo, mode=mode, models=models, weights=weights)

    result.append('</or>')
def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    """Label unlabelled phrase spans with a concatenation of constituent labels.

    A chart over positions 0..len(a.ewords) is seeded with the first label
    of every span in `a.espans`.  For each span of width 2..`maxabslen`
    that has no chart entry, the split point whose two halves together use
    the fewest labels is chosen (the first such split wins ties); the span
    is filled with the concatenated label lists if that count does not
    exceed `consts`.

    Finally, each (ei, ej) in `ephrase_index` that is not already a key of
    `a.espans` but has a chart entry receives a single new label whose tag
    is the chart labels' tags joined with "_".  `a.espans` is modified in
    place; nothing is returned.
    """
    elen = len(a.ewords)
    size = elen + 1
    chart = [[None] * size for _ in xrange(size)]

    for ((start, end), labels) in a.espans.iteritems():
        chart[start][end] = [labels[0]]  # take the highest label

    for width in xrange(2, maxabslen + 1):
        for start in xrange(elen - width + 1):
            end = start + width
            if chart[start][end] is not None:
                # must be a singleton
                continue

            bestsplit = None
            bestlen = None
            for mid in xrange(start + 1, end):
                left = chart[start][mid]
                right = chart[mid][end]
                if left is None or right is None:
                    continue
                total = len(left) + len(right)
                if bestlen is None or total < bestlen:
                    bestsplit = mid
                    bestlen = total

            if bestlen is not None and bestlen <= consts:
                chart[start][end] = chart[start][bestsplit] + chart[bestsplit][end]

    for (ei, ej) in ephrase_index:
        if (ei, ej) in a.espans:
            continue
        pieces = chart[ei][ej]
        if pieces is None:
            continue
        joined_tag = "_".join(sym.totag(x) for x in pieces)
        a.espans[ei, ej] = [sym.fromtag(joined_tag)]
def add_constituent_prefixes(a, ephrase_index):
    """If a phrase is a prefix of a constituent, give it a fake label.

    For every span (ei, ej) in `ephrase_index`, find the labelled span in
    `a.espans` that starts at the same `ei` and has the smallest end
    position strictly greater than `ej`.  When one exists, a new symbol
    whose tag is that span's tag followed by "*" is appended to
    `a.espans[ei, ej]` (the entry is created if missing) and also added to
    the `prefix_labels` collection defined outside this function.

    `a.espans` is modified in place; nothing is returned.  When
    `log.level >= 3`, the contents of `a.espans` are written to the log
    before and after the update.
    """

    def dump_espans():
        # One (start, end, label-string) triple per label in a.espans.
        log.write(str([(i, j, sym.tostring(x))
                       for ((i, j), l) in a.espans.iteritems() for x in l]))

    if log.level >= 3:
        dump_espans()
        log.write("\n")

    # Group the labelled spans by start position: ei -> [(ej, label), ...].
    # Labels of one span are inserted in reverse order.
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])

    # Order each group by end position ONLY.  Sorting the whole (ej, label)
    # tuples would break ties on ej by the label's symbol value, discarding
    # the reversed-label order set up above; a key on ej lets the sort's
    # stability preserve that order.
    for candidates in ei_index.itervalues():
        candidates.sort(key=lambda entry: entry[0])

    # NOTE: a check that skipped spans which already carry a label used to
    # guard this loop but was disabled (`if True or ...`), so every phrase
    # span is considered.
    for (ei, ej) in ephrase_index:
        for (ej1, x) in ei_index.get(ei, []):
            if ej1 > ej:
                x1 = sym.fromtag(sym.totag(x) + "*")
                a.espans.setdefault((ei, ej), []).append(x1)
                prefix_labels.add(x1)
                # Only the first (shortest) enclosing constituent is used.
                break

    if log.level >= 3:
        dump_espans()
        log.write("\n---\n")