예제 #1
0
def make_rule(parent,
              children,
              fwords,
              ewords,
              parent_nt=None,
              children_nts=None):
    """Build a rule from a parent phrase with its child phrases cut out.

    Every phrase (box) unpacks as ``fi, fj, ei, ej``: ``fwords[fi:fj]`` is
    its f side and ``ewords[ei:ej]`` is its e side.  On both sides each
    child span is collapsed into a single nonterminal symbol.

    Args:
        parent: the phrase the rule is built from.
        children: phrases to replace by nonterminals.  Their coordinates are
            sentence positions; they are shifted by the parent's start here.
        fwords: words of the f sentence.
        ewords: words of the e sentence.
        parent_nt: left-hand-side symbol; ``PHRASE_NT`` when None.
        children_nts: one symbol per child; ``PHRASE_NT`` for all when None.

    Returns:
        A ``Rule`` initialised with ``(parent_nt, f, e, e2f)``.  ``e2f``
        lists, in e-side order, the index in ``children`` of each
        nonterminal.  ``rule.fpos`` and ``rule.epos`` hold one entry per
        remaining symbol: a sentence position for a word, or the child's
        ``(start, end)`` span on that side for a nonterminal.  They are kept
        to track word alignment for lexical weighting.
    """
    if parent_nt is None:
        parent_nt = PHRASE_NT
    if children_nts is None:
        children_nts = [PHRASE_NT] * len(children)

    f_start, f_end, e_start, e_end = parent
    f_syms = fwords[f_start:f_end]
    e_syms = ewords[e_start:e_end]
    # phrase index -> sentence index, one entry per symbol
    f_origin = list(range(f_start, f_end))
    e_origin = list(range(e_start, e_end))

    # The first cell of every gap receives the nonterminal; the remaining
    # cells of the gap are blanked with None and filtered out afterwards.
    for child_no, box in enumerate(children):
        c_fi, c_fj, c_ei, c_ej = box
        label = children_nts[child_no]

        # f side, in phrase coordinates
        head, tail = c_fi - f_start, c_fj - f_start
        f_syms[head] = label
        f_origin[head] = (c_fi, c_fj)
        for k in range(head + 1, tail):
            f_syms[k] = None
            f_origin[k] = None

        # e side, in phrase coordinates; the child number travels with the
        # symbol so the nonterminal permutation can be read back later
        head, tail = c_ei - e_start, c_ej - e_start
        e_syms[head] = (label, child_no)
        e_origin[head] = (c_ei, c_ej)
        for k in range(head + 1, tail):
            e_syms[k] = None
            e_origin[k] = None

    # drop the blanked gap cells
    f_side = [sym for sym in f_syms if sym is not None]
    f_origin = [pos for pos in f_origin if pos is not None]
    e_origin = [pos for pos in e_origin if pos is not None]

    # split the e side into plain symbols and the nonterminal permutation
    e_side = []
    nt_order = []
    for sym in e_syms:
        if sym is None:
            continue
        if type(sym) is tuple:
            e_side.append(sym[0])
            nt_order.append(sym[1])
        else:
            e_side.append(sym)

    rule = Rule()
    rule.init(parent_nt, f_side, e_side, nt_order)
    rule.fpos = f_origin
    rule.epos = e_origin
    return rule
예제 #2
0
    def make_rule(self, a, source_phrase, fwords):
        """Assemble a Rule from an initial phrase and an f-side rule layout.

        The layout is only known to be plausible on the f side; this method
        checks the e side of each subphrase and builds both sides.

        Args:
            a: the sentence pair; this method reads ``a.fwords``,
                ``a.ewords``, ``a.faligned`` and (when word alignments are
                kept) ``a.aligned``.
            source_phrase: ``(x, i1, j1, i2, j2)``.  ``a.ewords[j1:j2]`` is
                taken as the e side and ``(i1, i2, j1, j2)`` becomes the
                rule's span; ``x`` is not used here.
            fwords: the proposed f side.  An int item is a terminal, given
                as an index into the French sentence; any other item is a
                subphrase ``(sub_x, sub_i1, sub_j1, sub_i2, sub_j2)`` that
                becomes a nonterminal labelled ``sub_x``.  The list is
                copied, the caller's object is not modified.

        Returns:
            None when the layout is rejected: it consists of a single
            nonterminal; ``tight_phrases`` is false and a subphrase's e span
            falls outside the phrase or overlaps an earlier subphrase; or
            ``self.require_aligned_terminal`` is set and no terminal has a
            true ``a.faligned`` entry.  Otherwise a ``Rule`` carrying
            ``lhs``, ``f``, ``e``, ``e2f``, ``fpos``, ``epos``, ``span`` and,
            if ``self.keep_word_alignments`` is set, ``word_alignments``:
            ``(f index, e index)`` pairs of rule terminals that
            ``a.aligned`` marks as linked.
        """
        _, i1, j1, i2, j2 = source_phrase

        # A lone nonterminal on the f side is a trivial rule: drop it.
        if len(fwords) == 1 and type(fwords[0]) is not int:
            return None

        f_syms = fwords[:]
        # index in phrase -> index in sentence (or subphrase f span)
        f_origin = [None] * len(f_syms)
        e_syms = a.ewords[j1:j2]
        e_width = j2 - j1
        nt_count = 0  # running nonterminal index
        has_aligned_terminal = False

        for pos, item in enumerate(f_syms):
            if type(item) is int:
                # terminal: swap the sentence index for the word itself
                if a.faligned[item]:
                    has_aligned_terminal = True
                f_syms[pos] = a.fwords[item]
                f_origin[pos] = item
                continue

            # nonterminal: move its e span into phrase coordinates
            sub_x, sub_i1, sub_j1, sub_i2, sub_j2 = item
            hole_lo = sub_j1 - j1
            hole_hi = sub_j2 - j1

            # NOTE(review): with tight_phrases set the hole is used without
            # these checks -- presumably the caller guarantees it; confirm.
            if not tight_phrases:
                # the hole must lie inside the phrase ...
                if hole_lo < 0 or hole_hi > e_width:
                    return None
                # ... and must not touch a cell taken by an earlier hole
                for k in range(hole_lo, hole_hi):
                    cell = e_syms[k]
                    if type(cell) is tuple or cell is None:
                        return None

            # The first e cell of the hole holds the variable together with
            # its sentence span; the rest are blanked and skipped below.
            f_syms[pos] = sub_x
            f_origin[pos] = (sub_i1, sub_i2)
            e_syms[hole_lo] = (sub_x, nt_count, hole_lo + j1, hole_hi + j1)
            for k in range(hole_lo + 1, hole_hi):
                e_syms[k] = None
            nt_count += 1

        # Require an aligned French word
        if self.require_aligned_terminal and not has_aligned_terminal:
            return None

        # Walk the e side once: collect symbols, their sentence positions
        # and the order in which the f-side nonterminals appear.
        e_side = []
        e_origin = []
        nt_order = []
        for offset in range(e_width):
            cell = e_syms[offset]
            if cell is None:
                continue
            if type(cell) is tuple:
                sym, f_rank, span_lo, span_hi = cell
                nt_order.append(f_rank)
                e_side.append(sym)
                e_origin.append((span_lo, span_hi))
            else:
                e_side.append(cell)
                e_origin.append(offset + j1)

        r = Rule()
        r.lhs = PHRASE
        r.f = f_syms
        r.e = e_side
        r.e2f = nt_order
        r.fpos = f_origin
        r.epos = e_origin
        r.span = (i1, i2, j1, j2)

        if self.keep_word_alignments:
            # only terminals (int positions) can take part in a link
            r.word_alignments = []
            for f_idx, f_at in enumerate(f_origin):
                if type(f_at) is not int:
                    continue
                for e_idx, e_at in enumerate(e_origin):
                    if type(e_at) is int and a.aligned[f_at][e_at]:
                        r.word_alignments.append((f_idx, e_idx))
        return r