示例#1
0
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Estimate an unlexicalized PCFG from the training trees and build a
        CKY parser on top of it.

        parsed_sents -- list of training trees.
        start -- start symbol.
        horzMarkov -- None for default. A number n >= 0 for horizontal markov.
        """
        self.start = start

        # rhs_counts[lhs][rhs] == how many times the rule lhs -> rhs was seen.
        rhs_counts = defaultdict(lambda: defaultdict(int))
        for tree in parsed_sents:
            # Transform a deep copy so the caller's trees are left untouched.
            work = unlexicalize(tree.copy(deep=True))
            # Binarize (with optional horizontal markovization) first, then
            # fold away the single-child chains.
            work.chomsky_normal_form(horzMarkov=horzMarkov)
            work.collapse_unary(collapsePOS=True)
            for rule in work.productions():
                rhs_counts[rule.lhs()][rule.rhs()] += 1

        # Relative-frequency estimate: count(lhs -> rhs) / count(lhs), where
        # count(lhs) is the sum over every right-hand side seen for that lhs.
        productions = []
        for lhs, by_rhs in rhs_counts.items():
            lhs_total = float(sum(by_rhs.values()))
            for rhs, seen in by_rhs.items():
                rule = ProbabilisticProduction(lhs, rhs, prob=seen / lhs_total)
                productions.append(rule)

        self.production = productions

        self.parser = CKYParser(PCFG(Nonterminal(start), productions))
示例#2
0
    def test_unlexicalize_does_change_tree(self):
        """unlexicalize must rewrite the tree it receives, in place."""
        # What the tree should look like once every word has been replaced
        # by its own POS tag.
        expected = Tree.fromstring(
            """
                (S
                    (NP (Det Det) (Noun Noun))
                    (VP (Verb Verb) (NP (Noun Noun) (Adj Adj)))
                )
            """)

        tree = Tree.fromstring(
            """
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)

        # The return value is ignored on purpose: the assertion below checks
        # the argument itself, i.e. the in-place modification.
        unlexicalize(tree)

        self.assertEqual(tree, expected)
示例#3
0
文件: upcfg.py 项目: famaf/PLN_2017
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Estimate an unlexicalized PCFG by relative frequency and build a CKY
        parser for it.

        parsed_sents -- list of training trees.
        start -- start symbol given to the grammar.
        horzMarkov -- forwarded unchanged to chomsky_normal_form.
        """
        self.start = start  # start symbol for the CKY parser's grammar

        rule_freq = defaultdict(int)  # { A -> B : count(A -> B) }
        head_freq = defaultdict(int)  # { A : count(A) }

        for original in parsed_sents:
            # unlexicalize modifies the tree it is given, so hand it a deep
            # copy and keep the training tree intact.
            tree = unlexicalize(original.copy(deep=True))
            tree.chomsky_normal_form(horzMarkov=horzMarkov)
            tree.collapse_unary(collapsePOS=True, collapseRoot=True)
            # Each item is an nltk.grammar.Production: lhs() is a Nonterminal
            # and rhs() is a tuple, so the production is usable as a dict key.
            for rule in tree.productions():
                rule_freq[rule] += 1
                head_freq[rule.lhs()] += 1

        # Maximum-likelihood estimate: q(A -> B) = count(A -> B) / count(A).
        # Every lhs counted in rule_freq was also counted in head_freq, so
        # the denominator is never zero.
        self.prods = [
            ProbabilisticProduction(rule.lhs(),
                                    rule.rhs(),
                                    prob=float(seen) / head_freq[rule.lhs()])
            for rule, seen in rule_freq.items()
        ]

        # PCFG(start, productions) takes a Nonterminal and a list of
        # probabilistic productions.
        self.my_parser = CKYParser(PCFG(Nonterminal(start), self.prods))
示例#4
0
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Induce an unlexicalized PCFG from the training trees and build a CKY
        parser for it.

        parsed_sents -- list of training trees.
        start -- label used as the grammar's start symbol; every training
                 tree's root is relabelled with it.
        horzMarkov -- forwarded unchanged to chomsky_normal_form.
        """
        self.horzMarkov = horzMarkov
        # The start symbol comes from the 'start' parameter, not from the
        # root label found in the trees of parsed_sents.
        self.start = N(start)

        # Productions are collected with repetitions; the repetitions are
        # what the probabilities are induced from.
        observed = []
        for tree in parsed_sents:
            work = unlexicalize(tree.copy(deep=True))
            # Force the root label to the requested start symbol.
            work.set_label(start)
            work.chomsky_normal_form(horzMarkov=horzMarkov)
            # NOTE(review): an earlier comment here said the root is not
            # collapsed (collapseRoot=False), but the call passes
            # collapseRoot=True -- confirm which one is intended.
            work.collapse_unary(collapsePOS=True, collapseRoot=True)
            observed.extend(work.productions())

        grammar = induce_pcfg(self.start, observed)
        self.pcfg = grammar
        self._probabilistic_productions = grammar.productions()
        self._parser = CKYParser(grammar)