Example #1
    def __init__(self, pcfg_filename):
        self.probability_of_production = {}
        start = None
        with open(pcfg_filename, "r") as f:
            lines = f.readlines()
        productions = []
        for line in lines:
            # Each rule line is either "A -> B C [p]" (two nonterminals)
            # or "A -> 'token' [p]" (a single quoted terminal).
            matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]", line)
            if matches is None:
                continue  # Skip blank or malformed lines.
            groups = matches.groups()
            assert len(groups) == 4
            lhs = nltk.Nonterminal(groups[0].strip())
            if groups[2] is None:
                # Unary rule: the RHS is a single quoted terminal.
                production = nltk.Production(lhs, [groups[1].strip("'")])
            else:
                # Binary rule: the RHS is two nonterminals.
                production = nltk.Production(
                    lhs,
                    [nltk.Nonterminal(groups[1].strip()),
                     nltk.Nonterminal(groups[2].strip())])
            probability = float(groups[3].strip())

            # The first LHS in the file is taken to be the start symbol.
            if start is None:
                start = lhs

            productions.append(production)
            self.probability_of_production[production] = probability
        self.grammar = nltk.grammar.CFG(start, productions, False)
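A sketch of the grammar file this constructor expects: Chomsky normal form, one rule per line, with either two nonterminals or one quoted terminal on the right-hand side and the probability in brackets. The class name PcfgGrammar below is a placeholder, since the snippet does not show the real one.

# Contents of toy.pcfg:
#   S  -> NP VP   [1.0]
#   NP -> 'dogs'  [0.4]
#   NP -> 'cats'  [0.6]
#   VP -> 'sleep' [1.0]

g = PcfgGrammar("toy.pcfg")               # placeholder class name
print(g.grammar.start())                  # S
print(len(g.probability_of_production))   # 4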
Example #2
    def __count_productions_recursively(self,
                                        node: nltk.Tree) -> nltk.Production:
        """Recursively counts the productions in a binary (CNF) parse tree."""
        label = node.label()
        # Traverse the tree:
        if len(node) == 2:
            # Non-leaf node: recurse into both children, then record the
            # binary production label -> left_label right_label.
            left = self.__count_productions_recursively(node[0])
            right = self.__count_productions_recursively(node[1])
            # lhs() already returns an nltk.Nonterminal, so use it directly;
            # wrapping it in another Nonterminal would break equality checks.
            production = nltk.Production(nltk.Nonterminal(label),
                                         [left.lhs(), right.lhs()])
        else:
            # Leaf node: record the unary production label -> token.
            token = node[0]
            self.token_count += 1
            if token not in self.count_per_token:
                self.count_per_token[token] = 1
            else:
                self.count_per_token[token] += 1
            production = nltk.Production(nltk.Nonterminal(label), [token])

        # Update our count of this particular production.
        if production not in self.count_per_production:
            self.count_per_production[production] = 1
        else:
            self.count_per_production[production] += 1
        # Update our count of all productions with a particular LHS.
        lhs = production.lhs()
        if lhs not in self.lhs_count:
            self.lhs_count[lhs] = 1
        else:
            self.lhs_count[lhs] += 1
        return production
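These counts support relative-frequency rule probabilities, P(A -> beta) = count(A -> beta) / count(A). A minimal sketch of the same computation using nltk.Tree.productions() directly (the tree string is illustrative):

import collections
import nltk

tree = nltk.Tree.fromstring("(S (NP (D the) (N dog)) (VP barks))")
count_per_production = collections.Counter(tree.productions())
lhs_count = collections.Counter(p.lhs() for p in tree.productions())

# Relative-frequency estimate for every rule seen in the tree:
for production, count in count_per_production.items():
    print(production, count / lhs_count[production.lhs()])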
Example #3
    def __init__(self, pcfg_filename):
        self.log_probability_of_production = {}
        self.min_log_probability = None
        start = None
        # Read the file:
        with open(pcfg_filename, "r") as f:
            lines = f.readlines()
        # Parse the file's contents:
        productions = []
        for line in lines:
            matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]", line)
            if matches is None:
                continue  # Skip blank or malformed lines.
            groups = matches.groups()
            assert len(groups) == 4
            lhs = nltk.Nonterminal(groups[0].strip())
            if groups[2] is None:
                # Unary rule: the RHS is a single quoted terminal.
                production = nltk.Production(lhs, [groups[1].strip("'")])
            else:
                # Binary rule: the RHS is two nonterminals.
                production = nltk.Production(
                    lhs,
                    [nltk.Nonterminal(groups[1].strip()),
                     nltk.Nonterminal(groups[2].strip())])
            log_probability = math.log(float(groups[3].strip()))

            # The first LHS in the file is taken to be the start symbol.
            if start is None:
                start = lhs

            productions.append(production)
            self.log_probability_of_production[production] = log_probability
            # Log probabilities are <= 0, so the most negative one is the minimum.
            if self.min_log_probability is None or log_probability < self.min_log_probability:
                self.min_log_probability = log_probability
        self.grammar = nltk.grammar.CFG(start, productions, False)
        # Doubling a negative log probability makes it much less probable than
        # the actual minimum while keeping it finite (dividing by 2 would make
        # it more probable instead).
        self.min_log_probability = self.min_log_probability * 2
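Working in log space pays off during parsing: the probability of a derivation is a product of rule probabilities, which underflows quickly, while the corresponding sum of logs stays representable. A quick illustration:

import math

p = 1e-5                     # a typical small rule probability
print(p ** 200)              # 0.0, the product underflows to zero
print(200 * math.log(p))     # about -2302.6, the log-space sum is fine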
Example #4
from copy import copy

import nltk


def convert2_nltk_CFG(G):
    # rule_to_tuple is a helper defined elsewhere in the source module.
    terminals, NTs, P, S = G
    Prod = copy(P)
    # A DUMMY nonterminal that yields every terminal ensures full coverage
    # of the vocabulary when the grammar is parsed for testing.
    Prod["DUMMY"] = [list(map(lambda x: (x, ), terminals))]
    assert len(S) > 0  # We need at least one start symbol.
    if len(S) > 1:
        # Introduce NT0 as a fresh start symbol that expands to each of
        # the original start symbols.
        if "NT0" not in Prod:
            Prod["NT0"] = []
        for Si in S:
            Prod["NT0"].append([(Si, )])
    assert "NT0" in S
    start = nltk.Nonterminal("NT0")
    productions = []
    # Only look at nonterminals that actually have productions.
    for NT in Prod:
        for rule in Prod[NT]:
            rhs = rule_to_tuple(rule, NTs)
            productions.append(nltk.Production(nltk.Nonterminal(NT), rhs))
    return nltk.grammar.CFG(start, productions)
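The DUMMY production exists because an nltk grammar refuses to parse tokens it cannot generate; CFG.check_coverage makes that failure mode explicit. A small illustration with a hand-written grammar (not the converted one):

import nltk

grammar = nltk.CFG.fromstring("""
    NT0 -> NT0 'a'
    NT0 -> 'a'
""")
grammar.check_coverage(["a", "a"])       # silent: every token is covered
try:
    grammar.check_coverage(["a", "b"])   # 'b' has no production
except ValueError as error:
    print(error)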
Example #5
    def hide_some_tokens(self):
        # Compute the relative-frequency probability of every production:
        for production in self.count_per_production:
            if self.count_per_production[production] == 0 or self.lhs_count[
                    production.lhs()] == 0:
                self.probability_per_production[production] = 0
            else:
                self.probability_per_production[
                    production] = self.count_per_production[
                        production] / self.lhs_count[production.lhs()]

        # Sort the productions from least to most probable:
        sorted_productions = sorted(self.probability_per_production.items(),
                                    key=operator.itemgetter(1))
        hide_target = int(round(self.hide_proportion * self.token_count, 0))
        count_per_unk_production = {}
        # Look for the least probable productions and "delete" them by reducing
        # their count to 0. Their weight is transferred to a corresponding <UNK>
        # production, which may pool over several productions that share the same LHS.
        i = 0
        while hide_target > 0 and i < len(sorted_productions):
            production = sorted_productions[i][0]
            if len(production.rhs()) == 1:
                # We have a terminal:
                lhs = production.lhs()
                count = self.count_per_production[production]
                hide_target -= count
                # Substitute this token production with an <UNK> production.
                # Note that the RHS must be a sequence, not a set.
                unk_production = nltk.Production(lhs, ['<UNK>'])
                # Transfer the weight of this production to the UNK production:
                self.count_per_production[production] = 0
                self.probability_per_production[production] = 0
                if unk_production in count_per_unk_production:
                    count_per_unk_production[unk_production] += count
                else:
                    count_per_unk_production[unk_production] = count
            i += 1
        # The <UNK> productions were accumulated separately in the loop above;
        # insert them into the main tables now.
        for unk_production in count_per_unk_production:
            self.count_per_production[
                unk_production] = count_per_unk_production[unk_production]
            if self.count_per_production[unk_production] == 0 or self.lhs_count[
                    unk_production.lhs()] == 0:
                self.probability_per_production[unk_production] = 0
            else:
                self.probability_per_production[
                    unk_production] = self.count_per_production[
                        unk_production] / self.lhs_count[unk_production.lhs()]

        # Validation: the total token count must be unchanged by "hiding" tokens.
        count = 0
        for production in self.count_per_production:
            if len(production.rhs()) == 1:
                count += self.count_per_production[production]
        assert count == self.token_count
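A worked example of the weight transfer, with illustrative numbers (not from the source):

# Suppose lhs_count[N] == 10 and two rare rules share the LHS N:
#   before:  count(N -> 'aardvark') = 1   P = 0.1
#            count(N -> 'gnu')      = 1   P = 0.1
#   after:   count(N -> 'aardvark') = 0   P = 0.0
#            count(N -> 'gnu')      = 0   P = 0.0
#            count(N -> '<UNK>')    = 2   P = 0.2
# The terminal-token total is unchanged, which is exactly what the
# final assertion verifies.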
Example #6
def noleft_immediate(rules, lhs):
    # Eliminate immediate left recursion for `lhs` using the standard
    # transformation: A -> A a | b  becomes  A -> b A', A' -> a A' | eps.
    # group_rhs and new_nonterminal are helpers defined elsewhere;
    # group_rhs splits the rules for `lhs` into those whose RHS starts
    # with `lhs` (left recursive) and the rest.
    (left, not_left) = group_rhs(rules, lhs, lhs)
    new_rules = []
    if len(left) > 0:
        new_lhs = new_nonterminal(lhs)
        # A -> b  becomes  A -> b A'
        for r in not_left:
            new_rules.append(nltk.Production(lhs, r.rhs() + (new_lhs, )))
        # A -> A a  becomes  A' -> a A'
        for r in left:
            old_rhs = r.rhs()
            if len(old_rhs) > 0:
                new_rules.append(
                    nltk.Production(new_lhs,
                                    old_rhs[1:] + (new_lhs, )))
            else:
                new_rules.append(nltk.Production(lhs, old_rhs))
        # A' -> eps terminates the recursion.
        new_rules.append(nltk.Production(new_lhs, ()))
    else:
        # No left-recursive rules; nothing to do.
        new_rules = rules
    return new_rules
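For a concrete before/after picture of the transformation (an illustrative grammar, not from the source):

# Before: E is immediately left recursive, so a top-down parser loops.
#   E -> E '+' T
#   E -> T
# After noleft_immediate(rules, E):
#   E  -> T E'
#   E' -> '+' T E'
#   E' ->
# The new grammar generates the same language, but every derivation now
# consumes input before revisiting E'.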
Example #7
def expand_match(rules, expand_rules, lhs, match):
    # Expansion step of left-recursion removal: for every rule of the form
    # lhs -> match gamma, substitute each rule match -> delta, yielding
    # lhs -> delta gamma. group_rhs and lhs_rules are helpers defined
    # elsewhere; group_rhs splits the rules for `lhs` into those whose RHS
    # starts with `match` and the rest.
    (matched, not_matched) = group_rhs(expand_rules, lhs, match)
    new_rules = []
    for r in matched:
        for match_rule in lhs_rules(rules, match):
            new_rules.append(
                nltk.Production(lhs,
                                match_rule.rhs() + r.rhs()[1:]))
    # Rules that do not start with `match` are kept unchanged.
    new_rules.extend(not_matched)
    return new_rules
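Concretely, the substitution behaves like this (an inferred example; group_rhs and lhs_rules are not shown in the source):

# rules:         B -> 'b1'        B -> 'b2'
# expand_rules:  A -> B 'c'       A -> 'a'
# expand_match(rules, expand_rules, A, B) returns:
#   A -> 'b1' 'c'
#   A -> 'b2' 'c'
#   A -> 'a'
# Substituting B's rules removes the indirect path through B that could
# otherwise hide left recursion from noleft_immediate.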
Example #8
import nltk


def pcfg_train(trees, vocab):
    """Takes a collection of nltk.tree.Tree objects (for example, some
    portion of nltk.corpus.treebank.parsed_sents()) and returns an
    nltk.PCFG induced from their productions. The vocab argument is
    accepted but currently unused."""
    all_productions = []
    for t in trees:
        # Tree.productions() already yields nltk.Production objects.
        all_productions.extend(t.productions())
    return nltk.induce_pcfg(nltk.Nonterminal('S'), all_productions)
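A usage sketch; it assumes the Penn Treebank sample has been fetched with nltk.download('treebank'):

import nltk

nltk.download("treebank")                      # one-time corpus download
trees = nltk.corpus.treebank.parsed_sents()[:200]
pcfg = pcfg_train(trees, vocab=None)           # vocab is ignored, see above
print(pcfg.start())                            # S
print(len(pcfg.productions()))                 # number of distinct rules
parser = nltk.ViterbiParser(pcfg)              # best-parse PCFG parser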