示例#1
0
 def _str_helper(self, item, accum):
     """Append a bracketed rendering of the derivation under ``item`` to ``accum``.

     The deduction stored in ``self.ded`` under ``id(item)`` supplies the
     label: the left-hand side of its rule, or the symbol for ``"-"`` when
     the deduction has no rule.  A deduction without antecedents contributes
     only its tag; otherwise the output is ``(`` + tag, then a space and the
     recursive rendering of each antecedent, then ``)``.

     ``accum`` is extended in place; nothing is returned.
     """
     deduction = self.ded[id(item)]
     label = deduction.rule.lhs if deduction.rule else sym.fromtag("-")
     tag = sym.totag(label)

     if len(deduction.ants) == 0:
         # Leaf: the bare tag, no brackets.
         accum.append(tag)
         return

     accum.append("(")
     accum.append(tag)
     for child in deduction.ants:
         accum.append(" ")
         self._str_helper(child, accum)
     accum.append(")")
示例#2
0
文件: forest.py 项目: jungikim/sbmt
 def _str_helper(self, item, accum):
     """Render the derivation rooted at ``item`` into ``accum`` as bracketed text.

     Looks up the deduction for ``item`` in ``self.ded`` (keyed by
     ``id(item)``).  Its tag comes from the rule's left-hand side, or from
     the ``"-"`` symbol when there is no rule.  With antecedents the result
     is ``(TAG`` followed by `` child`` for each antecedent and a closing
     ``)``; without antecedents it is just ``TAG``.

     The fragments are added to ``accum`` in place; the method returns None.
     """
     entry = self.ded[id(item)]
     if entry.rule:
         head = entry.rule.lhs
     else:
         head = sym.fromtag("-")

     if len(entry.ants) == 0:
         accum.append(sym.totag(head))
         return

     accum.extend(["(", sym.totag(head)])
     for antecedent in entry.ants:
         accum.append(" ")
         # Recurse so nested derivations are bracketed the same way.
         self._str_helper(antecedent, accum)
     accum.append(")")
示例#3
0
def add_constituent_prefixes(a, ephrase_index):
    """If a phrase is a proper prefix of a constituent, give it a fake label.

    For every span ``(ei, ej)`` in ``ephrase_index`` this looks for the
    labelled span in ``a.espans`` that starts at ``ei`` and has the smallest
    end position greater than ``ej``.  When one exists, a new symbol made
    from that constituent's tag plus ``"*"`` is appended to
    ``a.espans[ei, ej]`` (creating the entry if needed) and added to the
    module-level ``prefix_labels``.  At most one label is added per phrase.

    a             -- object whose ``espans`` maps ``(start, end)`` to a list
                     of label symbols; modified in place
    ephrase_index -- iterable of ``(start, end)`` phrase spans

    When ``log.level >= 3`` the span labels are written to ``log`` before
    and after the update.
    """
    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), labels) in a.espans.items() for x in labels]))
        log.write("\n")

    # Snapshot of the labelled spans grouped by start position.  It is built
    # before any starred label is added, so the new labels never act as
    # candidates themselves.
    ei_index = {}
    for ((ei, ej), labels) in a.espans.items():
        ei_index.setdefault(ei, []).extend((ej, x) for x in reversed(labels))
    for candidates in ei_index.values():
        # Order by end position only.  list.sort is stable, so labels that
        # share a span keep their reversed order; sorting the whole
        # (end, label) tuple would instead reorder them by symbol value and
        # make the reversal above pointless.
        candidates.sort(key=lambda pair: pair[0])

    # NOTE: an earlier guard restricting this to phrases without any label
    # was disabled with ``True or ...``; every phrase is considered.
    for (ei, ej) in ephrase_index:
        for (ej1, x) in ei_index.get(ei, ()):
            if ej1 > ej:
                x1 = sym.fromtag(sym.totag(x) + "*")
                a.espans.setdefault((ei, ej), []).append(x1)
                prefix_labels.add(x1)
                break

    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), labels) in a.espans.items() for x in labels]))
        log.write("\n---\n")
示例#4
0
def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    """Label unlabelled phrases with a concatenation of adjacent constituent labels.

    A chart over word positions is seeded with the first label of every
    span in ``a.espans``.  Each unlabelled span of length 2..``maxabslen``
    is then covered, if possible, by the two adjacent chart entries whose
    combined label sequence is shortest (the leftmost split wins ties); the
    combination is kept only when it has at most ``consts`` labels.
    Finally every span in ``ephrase_index`` that has no entry in
    ``a.espans`` but does have a chart entry receives a single new symbol
    whose tag is the chart labels' tags joined with ``"_"``.

    a             -- object with ``ewords`` (its length sizes the chart) and
                     ``espans`` mapping ``(start, end)`` to a list of label
                     symbols; ``espans`` is modified in place
    maxabslen     -- longest span length to try to cover
    ephrase_index -- iterable of ``(start, end)`` phrase spans
    consts        -- maximum number of labels in a combined sequence

    Spans whose label list is empty are ignored when seeding the chart
    rather than raising ``IndexError``.
    """
    elen = len(a.ewords)

    # chart[ei][ej] is the label sequence covering words ei..ej, or None.
    chart = [[None] * (elen + 1) for _ in range(elen + 1)]
    for ((ei, ej), labels) in a.espans.items():
        if labels:
            chart[ei][ej] = [labels[0]]  # take the highest label

    for el in range(2, maxabslen + 1):
        for ei in range(elen - el + 1):
            ej = ei + el
            if chart[ei][ej] is not None:
                # Already covered (by a real constituent); leave it alone.
                continue
            bestsplit = None
            bestlen = None
            for ek in range(ei + 1, ej):
                left = chart[ei][ek]
                right = chart[ek][ej]
                if left is None or right is None:
                    continue
                total = len(left) + len(right)
                # Strict comparison keeps the leftmost of equally short splits.
                if bestlen is None or total < bestlen:
                    bestsplit = ek
                    bestlen = total
            if bestlen is not None and bestlen <= consts:
                chart[ei][ej] = chart[ei][bestsplit] + chart[bestsplit][ej]

    for (ei, ej) in ephrase_index:
        if (ei, ej) not in a.espans and chart[ei][ej] is not None:
            a.espans[ei, ej] = [
                sym.fromtag("_".join(sym.totag(x) for x in chart[ei][ej]))
            ]
示例#5
0
def _item_to_xml(node, result, memo, mode, models, weights):
    if id(node) in memo:
        result.append(memo[id(node)])
        return

    nodeid = str(len(memo)+1)
    memo[id(node)] = "<or ref=%s/>" % xml.sax.saxutils.quoteattr(nodeid)

    states = []
    for mi,m in enumerate(models):
        try:
            s = m.strstate(node.states[mi])
        except IndexError:
            continue
        if s:
            states.append(s)
    states = ",".join(states)

    result.append('<or id=%s label=%s states=%s fspan=%s>' % (
            xml.sax.saxutils.quoteattr(nodeid),
            xml.sax.saxutils.quoteattr(sym.totag(node.x) if node.x else "None"),
            xml.sax.saxutils.quoteattr(states),
            xml.sax.saxutils.quoteattr("%s,%s" % (node.i,node.j))))

    # keep only the top k deductions to slim the forest
    deds = [(ded.viterbi, ded) for ded in node.deds]
    deds.sort()
    if max_deds:
        deds = deds[:max_deds]
    for _, ded in deds:
        _ded_to_xml(ded, result, memo, mode=mode, models=models, weights=weights)
    result.append('</or>')
示例#6
0
文件: forest.py 项目: jungikim/sbmt
 def _fake_tree_helper(lhs, rhs, antvalues):
     """Build a ``tree.Node`` for one rule application.

     The node is labelled with the tag of ``lhs``.  Its children follow
     ``rhs`` in order: a variable symbol is replaced by the entry of
     ``antvalues`` it refers to (``sym.getindex`` is 1-based), and any other
     symbol becomes a childless node labelled with its string form.
     """
     def child_for(symbol):
         if sym.isvar(symbol):
             return antvalues[sym.getindex(symbol) - 1]
         return tree.Node(sym.tostring(symbol), [])

     children = [child_for(symbol) for symbol in rhs]
     return tree.Node(sym.totag(lhs), children)
示例#7
0
 def _fake_tree_helper(lhs, rhs, antvalues):
     """Return a ``tree.Node`` tagged with ``lhs`` whose children mirror ``rhs``.

     Each variable in ``rhs`` is substituted by ``antvalues[index - 1]``,
     where ``index`` comes from ``sym.getindex``; every other symbol is
     wrapped in a leaf node carrying ``sym.tostring`` of that symbol.
     """
     subtrees = [
         antvalues[sym.getindex(symbol) - 1]
         if sym.isvar(symbol)
         else tree.Node(sym.tostring(symbol), [])
         for symbol in rhs
     ]
     return tree.Node(sym.totag(lhs), subtrees)
示例#8
0
文件: forest.py 项目: jungikim/sbmt
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest under ``root`` to a JSON-formatted string.

    The output object has an optional ``"source"`` array (only when
    ``fwords`` is non-empty), a ``"nodes"`` array with one entry per item
    obtained by iterating ``root``, the ``"root"`` index, and an ``"edges"``
    array with one entry per deduction.

    root    -- item whose iteration yields the forest's items; it must be
               among them, since its position becomes the ``"root"`` index
    fwords  -- source words; entries of type ``int`` are converted with
               ``sym.tostring``
    mode    -- ``'french'`` or ``'english'`` takes an edge's tails from the
               rule's ``f`` or ``e`` side when the deduction has a rule; any
               other value (or a rule-less deduction) uses the antecedents
    models, weights -- accepted but not used here
    """
    chunks = ['{\n']

    if fwords:
        fwords = [(sym.tostring(word) if type(word) is int else word)
                  for word in fwords]
        quoted_words = ",".join(quotejson(word) for word in fwords)
        chunks.append('  "source": [%s],\n' % quoted_words)

    nodes = list(root)
    index_of = {}
    node_lines = []
    for position, node in enumerate(nodes):
        index_of[node] = position
        if node is root:
            root_position = position
        if node.x is None:
            line = '    {}'
        else:
            line = '    {"label": %s}' % quotejson(sym.totag(node.x))
        node_lines.append(line)
    chunks.append('  "nodes": [\n%s\n  ],\n' % ",\n".join(node_lines))

    chunks.append('  "root": %d,\n' % root_position)

    edge_lines = []
    for head, node in enumerate(nodes):
        for ded in node.deds:
            if mode == 'french' and ded.rule:
                children = ded.rule.f
            elif mode == 'english' and ded.rule:
                children = ded.rule.e
            else:
                children = ded.ants

            tails = []
            for child in children:
                if isinstance(child, Item):
                    tail = str(index_of[child])
                elif sym.isvar(child):
                    # A variable stands for one of the deduction's antecedents.
                    tail = str(index_of[ded.ants[sym.getindex(child) - 1]])
                else:
                    tail = quotejson(sym.tostring(child))
                tails.append(tail)

            feature_pairs = ["%s:%s" % (quotejson(name), value)
                             for (name, value) in ded.dcost.iteritems()]
            features = "{%s}" % ",".join(feature_pairs)
            edge_lines.append(
                '    {"head": %s, "tails": [%s], "features": %s}\n' %
                (head, ",".join(tails), features))

    chunks.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edge_lines))

    chunks.append('}')
    return "".join(chunks)
示例#9
0
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest under ``root`` to a JSON-formatted string.

    The output object has an optional ``"source"`` array (only when
    ``fwords`` is non-empty), a ``"nodes"`` array with one entry per item
    obtained by iterating ``root``, the ``"root"`` index, and an ``"edges"``
    array with one entry per deduction.

    root    -- item whose iteration yields the forest's items; it must be
               among them, since its position becomes the ``"root"`` index
    fwords  -- source words; entries of type ``int`` are converted with
               ``sym.tostring``
    mode    -- ``'french'`` or ``'english'`` takes an edge's tails from the
               rule's ``f`` or ``e`` side when the deduction has a rule; any
               other value (or a rule-less deduction) uses the antecedents
    models, weights -- accepted but not used here
    """
    chunks = ['{\n']

    if fwords:
        fwords = [(sym.tostring(word) if type(word) is int else word) for word in fwords]
        quoted_words = ",".join(quotejson(word) for word in fwords)
        chunks.append('  "source": [%s],\n' % quoted_words)

    nodes = list(root)
    index_of = {}
    node_lines = []
    for position, node in enumerate(nodes):
        index_of[node] = position
        if node is root:
            root_position = position
        if node.x is None:
            line = '    {}'
        else:
            line = '    {"label": %s}' % quotejson(sym.totag(node.x))
        node_lines.append(line)
    chunks.append('  "nodes": [\n%s\n  ],\n' % ",\n".join(node_lines))

    chunks.append('  "root": %d,\n' % root_position)

    edge_lines = []
    for head, node in enumerate(nodes):
        for ded in node.deds:
            if mode == 'french' and ded.rule:
                children = ded.rule.f
            elif mode == 'english' and ded.rule:
                children = ded.rule.e
            else:
                children = ded.ants

            tails = []
            for child in children:
                if isinstance(child, Item):
                    tail = str(index_of[child])
                elif sym.isvar(child):
                    # A variable stands for one of the deduction's antecedents.
                    tail = str(index_of[ded.ants[sym.getindex(child)-1]])
                else:
                    tail = quotejson(sym.tostring(child))
                tails.append(tail)

            feature_pairs = ["%s:%s" % (quotejson(name), value) for (name, value) in ded.dcost.iteritems()]
            features = "{%s}" % ",".join(feature_pairs)
            edge_lines.append('    {"head": %s, "tails": [%s], "features": %s}\n' % (
                    head,
                    ",".join(tails),
                    features))

    chunks.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edge_lines))

    chunks.append('}')
    return "".join(chunks)
示例#10
0
文件: forest.py 项目: jungikim/sbmt
def _item_to_xml(node, result, memo, mode, models, weights):
    if id(node) in memo:
        result.append(memo[id(node)])
        return

    nodeid = str(len(memo) + 1)
    memo[id(node)] = "<or ref=%s/>" % xml.sax.saxutils.quoteattr(nodeid)

    states = []
    for mi, m in enumerate(models):
        try:
            s = m.strstate(node.states[mi])
        except IndexError:
            continue
        if s:
            states.append(s)
    states = ",".join(states)

    result.append(
        '<or id=%s label=%s states=%s fspan=%s>' %
        (xml.sax.saxutils.quoteattr(nodeid),
         xml.sax.saxutils.quoteattr(sym.totag(node.x) if node.x else "None"),
         xml.sax.saxutils.quoteattr(states),
         xml.sax.saxutils.quoteattr("%s,%s" % (node.i, node.j))))

    # keep only the top k deductions to slim the forest
    deds = [(ded.viterbi, ded) for ded in node.deds]
    deds.sort()
    if max_deds:
        deds = deds[:max_deds]
    for _, ded in deds:
        _ded_to_xml(ded,
                    result,
                    memo,
                    mode=mode,
                    models=models,
                    weights=weights)
    result.append('</or>')
示例#11
0
def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    """Label unlabelled phrases with a concatenation of adjacent constituent labels.

    A chart over word positions is seeded with the first label of every
    span in ``a.espans``.  Each unlabelled span of length 2..``maxabslen``
    is then covered, if possible, by the two adjacent chart entries whose
    combined label sequence is shortest (the leftmost split wins ties); the
    combination is kept only when it has at most ``consts`` labels.
    Finally every span in ``ephrase_index`` that has no entry in
    ``a.espans`` but does have a chart entry receives a single new symbol
    whose tag is the chart labels' tags joined with ``"_"``.

    a             -- object with ``ewords`` (its length sizes the chart) and
                     ``espans`` mapping ``(start, end)`` to a list of label
                     symbols; ``espans`` is modified in place
    maxabslen     -- longest span length to try to cover
    ephrase_index -- iterable of ``(start, end)`` phrase spans
    consts        -- maximum number of labels in a combined sequence

    Spans whose label list is empty are ignored when seeding the chart
    rather than raising ``IndexError``.
    """
    elen = len(a.ewords)

    # chart[ei][ej] is the label sequence covering words ei..ej, or None.
    chart = [[None] * (elen+1) for _ in range(elen+1)]
    for ((ei,ej),labels) in a.espans.items():
        if labels:
            chart[ei][ej] = [labels[0]] # take the highest label

    for el in range(2,maxabslen+1):
        for ei in range(elen-el+1):
            ej = ei+el
            if chart[ei][ej] is not None:
                # Already covered (by a real constituent); leave it alone.
                continue
            bestsplit = None
            bestlen = None
            for ek in range(ei+1,ej):
                left = chart[ei][ek]
                right = chart[ek][ej]
                if left is None or right is None:
                    continue
                total = len(left)+len(right)
                # Strict comparison keeps the leftmost of equally short splits.
                if bestlen is None or total < bestlen:
                    bestsplit = ek
                    bestlen = total
            if bestlen is not None and bestlen <= consts:
                chart[ei][ej] = chart[ei][bestsplit]+chart[bestsplit][ej]

    for (ei,ej) in ephrase_index:
        if (ei,ej) not in a.espans and chart[ei][ej] is not None:
            a.espans[ei,ej] = [sym.fromtag("_".join(sym.totag(x) for x in chart[ei][ej]))]
示例#12
0
def add_constituent_prefixes(a, ephrase_index):
    """If a phrase is a proper prefix of a constituent, give it a fake label.

    For every span ``(ei, ej)`` in ``ephrase_index`` this looks for the
    labelled span in ``a.espans`` that starts at ``ei`` and has the smallest
    end position greater than ``ej``.  When one exists, a new symbol made
    from that constituent's tag plus ``"*"`` is appended to
    ``a.espans[ei, ej]`` (creating the entry if needed) and added to the
    module-level ``prefix_labels``.  At most one label is added per phrase.

    a             -- object whose ``espans`` maps ``(start, end)`` to a list
                     of label symbols; modified in place
    ephrase_index -- iterable of ``(start, end)`` phrase spans

    When ``log.level >= 3`` the span labels are written to ``log`` before
    and after the update.
    """
    if log.level >= 3:
        log.write(str([(i,j,sym.tostring(x)) for ((i,j),labels) in a.espans.items() for x in labels]))
        log.write("\n")

    # Snapshot of the labelled spans grouped by start position.  It is built
    # before any starred label is added, so the new labels never act as
    # candidates themselves.
    ei_index = {}
    for ((ei,ej),labels) in a.espans.items():
        ei_index.setdefault(ei, []).extend((ej,x) for x in reversed(labels))
    for candidates in ei_index.values():
        # Order by end position only.  list.sort is stable, so labels that
        # share a span keep their reversed order; sorting the whole
        # (end, label) tuple would instead reorder them by symbol value and
        # make the reversal above pointless.
        candidates.sort(key=lambda pair: pair[0])

    # NOTE: an earlier guard restricting this to phrases without any label
    # was disabled with ``True or ...``; every phrase is considered.
    for (ei,ej) in ephrase_index:
        for (ej1,x) in ei_index.get(ei,()):
            if ej1 > ej:
                x1 = sym.fromtag(sym.totag(x)+"*")
                a.espans.setdefault((ei,ej),[]).append(x1)
                prefix_labels.add(x1)
                break

    if log.level >= 3:
        log.write(str([(i,j,sym.tostring(x)) for ((i,j),labels) in a.espans.items() for x in labels]))
        log.write("\n---\n")