def generate_grammar_and_parsers(parsed_sents):
    # From each parsed sentence, extract the parse tree and turn it into a
    # list of CFG productions; then build a set of all productions (no repetitions)
    tbank_productions_with_repet = [
        production for parsed_sent in parsed_sents
        for production in parsed_sent.productions()
    ]
    tbank_productions = set(
        tbank_productions_with_repet)  # exclude repetitions
    print("Num. of unique productions read:", len(tbank_productions))

    # Build a CFG from the productions
    print("\nBuinding a CFG...")
    cfg_grammar = CFG(Nonterminal('S'), tbank_productions)  # a CFG
    print(cfg_grammar, end="\n\n")

    # CFG - An Earley parser
    cfg_earley_parser = EarleyChartParser(cfg_grammar, trace=3)
    # Build a PCFG from the productions

    print("Building a PCFG...")
    pcfg_grammar = induce_pcfg(
        Nonterminal('S'),
        tbank_productions_with_repet)  # a PCFG, here repetitions are needed!
    print(pcfg_grammar, end="\n\n")

    # Allocate a bottom-up chart parser for PCFG; see: http://www.nltk.org/_modules/nltk/parse/pchart.html
    pcfg_pchart_parser = InsideChartParser(pcfg_grammar)

    return cfg_earley_parser, pcfg_pchart_parser  # return both parsers
Example #2
File: tiger.py Project: ooz/Confopy
    def pcfg(self, include_edgelabels=True):
        sents = self.parsed_sents(include_edgelabels)
        tiger_prods = set(prod for sent in sents
                          for prod in sent.productions())
        pcfg = induce_pcfg(Nonterminal(TigerCorpusReader.GRAMMAR_START),
                           list(tiger_prods))
        return pcfg
Example #3
File: retriever.py Project: mcbsf/pln2
    def __init__(self):
        self.sentences = floresta.parsed_sents()
        productions = []

        for fileid in floresta.fileids()[:2]:
            for t in floresta.parsed_sents(fileid):
                t = self.simpifly_tree_tag(t)
                t.chomsky_normal_form()
                productions += t.productions()

        print(productions)

        np = nltk.Nonterminal('np')
        grammar = induce_pcfg(np, productions)
        print(grammar)
Example #4
def induce_grammar(sents):
    UNK = '*UNKNOWN*'
    productions = []
    for sent in sents:
        # intended: remove -NONE- tags and simplify hyphenated nonterminals
        # (no such step is implemented in this snippet)
        sent.chomsky_normal_form()
        sent.collapse_unary(collapsePOS=True, collapseRoot=True)
        productions.extend(sent.productions())
    # intended: add UNK rules here; UNK is otherwise unused (see the sketch below)
    grammar = induce_pcfg(Nonterminal('S'), productions)
    return grammar
Example #5
File: cyk.py Project: helange23/ULL-mini
def getGrammar(sentence):
    """
    Constructs an ad-hoc split-head DMV grammar for the given sentence.
    @param sentence: input sentence as a list of tokens
    @return: NLTK grammar with weighted productions
    """
    productions = []
    for i, head in enumerate(sentence):
        S = Nonterminal("S")
        Y_head = Nonterminal("Y_" + head)
        L_head = Nonterminal("L_" + head)
        R_head = Nonterminal("R_" + head)
        L1_head = Nonterminal("L1_" + head)
        R1_head = Nonterminal("R1_" + head)
        LP_head = Nonterminal("LP_" + head)
        RP_head = Nonterminal("RP_" + head)
        productions.append(Production(S, [Y_head]))
        productions.append(Production(Y_head, [L_head, R_head]))
        productions.append(Production(L_head, [head + "_l"]))
        productions.append(Production(R_head, [head + "_r"]))
        productions.append(Production(L_head, [L1_head]))
        productions.append(Production(R_head, [R1_head]))
        productions.append(Production(LP_head, [head + "_l"]))
        productions.append(Production(RP_head, [head + "_r"]))
        productions.append(Production(LP_head, [L1_head]))
        productions.append(Production(RP_head, [R1_head]))
    grammar = induce_pcfg(Nonterminal("S"), productions)
    # attachment rules are appended directly to the grammar's production list
    # so their probabilities come from the DMV model, not from induce_pcfg's counts
    for i, head in enumerate(sentence):
        L1_head = Nonterminal("L1_" + head)
        R1_head = Nonterminal("R1_" + head)
        LP_head = Nonterminal("LP_" + head)
        RP_head = Nonterminal("RP_" + head)
        for j in range(0, i):  # range, not Python 2's xrange
            arg = sentence[j]
            prob = model_wrapper.getProb(head, arg, direction="left")
            grammar.productions().append(
                # ProbabilisticProduction is NLTK 3's name for the old WeightedProduction
                ProbabilisticProduction(L1_head, [Nonterminal("Y_" + sentence[j]), LP_head], prob=prob)
            )
        for j in range(i + 1, len(sentence)):
            arg = sentence[j]
            prob = model_wrapper.getProb(head, arg, direction="right")
            grammar.productions().append(
                ProbabilisticProduction(R1_head, [RP_head, Nonterminal("Y_" + sentence[j])], prob=prob)
            )

    return grammar
Example #7
File: pparse.py Project: Web5design/mbmp
    def set_grammar(self, productions):
        """
        Add the grammar rules from MBMA to the parser. Transforms all rules
        to Chomsky normal form, induces a weighted context-free grammar on
        the basis of these rules, and indexes the right-hand sides of the
        productions.

        Args:
            - productions (list): a list of :class:`nltk.Production` instances
        """
        cnf_prods = []
        for p in productions:
            # transform each production from MBMA into CNF
            cnf_prods.extend(prod_to_chomsky_normal_form(p))
        # induce_pcfg expects a Nonterminal start symbol, not the string 'S'
        self._local_grammar = induce_pcfg(Nonterminal('S'), cnf_prods)
        self._local_productions = self._build_productions(self._local_grammar)
        self.initialize = _initialize(self._local_grammar)
Example #8
def induce_grammar(train):
    """Induces a PCFG from the given set of Penn Treebank sentences

    Args:
        train (Any): iterable of Penn Treebank file IDs

    Returns:
        PCFG: A PCFG grammar instance
    """
    productions = []
    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            # collapse unary chains A -> B -> C into A+B -> C
            tree.collapse_unary(collapsePOS=False)
            # binarize A -> B C D into A -> B A|<C-D> and A|<C-D> -> C D
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

    S = Nonterminal('S')
    return induce_pcfg(S, productions)
Example #9
def learn_trees(trees, collapse=True, markov_order=None):
    """
    Given a list of parsed sentences, return the maximum-likelihood PCFG
    for those sentences.

    If 'collapse' is True, the trees are collapsed before learning the
    grammar, so that there are no unary productions.

    Productions of length more than 2 are reduced using Chomsky normal
    form. You can Markov-smooth the result by setting markov_order to a
    number such as 2.
    """
    productions = []
    for tree in trees:
        if collapse:
            tree.collapse_unary(collapsePOS=False)
        # binarize productions longer than 2, Markov-smoothed if requested
        if markov_order:
            tree.chomsky_normal_form(horzMarkov=markov_order)
        else:
            tree.chomsky_normal_form()
        productions += tree.productions()
    grammar_p = grammar.induce_pcfg(Nonterminal('S'), productions)
    return grammar_p
Example #10
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        """
        # Nonterminal start symbol of the PCFG.
        # Note that the start symbol is specified by the init parameter
        # 'start', not by the root label of the trees in parsed_sents
        self.start = N(start)
        self.horzMarkov = horzMarkov
        # keep repeated productions (repetitions are needed to induce probabilities)
        productions = []
        for t in parsed_sents:
            unlex_t = unlexicalize(t.copy(deep=True))
            # Set node label
            unlex_t.set_label(start)
            unlex_t.chomsky_normal_form(horzMarkov=horzMarkov)
            # collapse unary chains, including the root (collapseRoot=True)
            unlex_t.collapse_unary(collapsePOS=True, collapseRoot=True)
            productions += unlex_t.productions()

        self.pcfg = induce_pcfg(self.start, productions)
        self._probabilistic_productions = self.pcfg.productions()
        self._parser = CKYParser(self.pcfg)
Example #11
from typing import List, Set
from nltk import Nonterminal, Production, induce_pcfg
from nltk.grammar import PCFG, ProbabilisticProduction


def fill_missing_words(grammar: PCFG, missing_words: Set[str]):
    # UNK -> word1 | word2 | ... | wordN
    unknown = Nonterminal('UNK')
    unk_rules = [
        Production(unknown, [missing_word]) for missing_word in missing_words
    ]

    # Add UNK as a possibility to all rules with strings in the right hand side
    corrected_rules: List[Nonterminal] = []
    rule: ProbabilisticProduction
    for rule in grammar.productions():

        # right hand side has a string somewhere
        if any(isinstance(element, str) for element in rule.rhs()):

            # rule has already been corrected
            if rule.lhs() in corrected_rules:
                continue

            unk_rules.append(Production(rule.lhs(), [unknown]))

            corrected_rules.append(rule.lhs())

    return induce_pcfg(grammar.start(), grammar.productions() + unk_rules)
Example #12
def induce_grammar(productions: List[Production]):
    S = Nonterminal("S")
    return induce_pcfg(S, productions)
Example #13
#########################

print "Combining extended lexicon with the training trees and "
print "building grammar."

#trainTrees currently has the productions in a list of lists
#this turns them into a list
trainProds = reduce(lambda x,y: x+y,\
		map(lambda x: x.productions(), trainTrees))
parserProds = trainProds+extraLexicals

#remove duplicates
print "Removing duplicates from productions."
parserProds = list(set(parserProds))

for each in parserProds: print each

parser = ViterbiParser(induce_pcfg(Nonterminal("S"),parserProds))

#I've got this thing somewhat working. I'm tired of running it over
# and over. I think it's taking nearly 10 minutes.
# PICKLE AND EXPORT TIME
prsFilename = "hw4_vitParser.pkl"
print "Saving parser to " + prsFilename
dump(parser, open(prsFilename, "w"))
dump(testTrees, open("hw4_testTrees.pkl","w"))




Example #14
File: Q1.2.py Project: dianagastrin/NLP
def learn_pcfg(trees):
    productions = []
    for tree in trees:
        productions += tree.productions()
    return induce_pcfg(Nonterminal('S'), productions)
Example #15
productions = list(treebank_productions)
output = set()
for p in productions:
    p = str(p)
    # skip traces, quotes, punctuation, and rare tags
    if 'NONE' in p or "'" in p or ":" in p or "," in p or "PDT" in p or "FW" in p:
        continue
    lhs = remove_dash(p.split()[0])
    rhs = []
    for strr in p.split()[2:]:
        rhs.append(remove_dash(strr))

    temp_rhs = rhs[:]
    if len(temp_rhs) == 1:
        output.add(Production(Nonterminal(lhs), [Nonterminal(rhs[0])]))
    else:
        # right-binarize: S -> NP VP PP becomes S -> NP VP-PP and VP-PP -> VP PP
        while len(temp_rhs) > 1:
            rhs_head = "-".join(temp_rhs[1:])
            output.add(
                Production(Nonterminal(lhs),
                           [Nonterminal(temp_rhs[0]),
                            Nonterminal(rhs_head)]))
            lhs = rhs_head
            temp_rhs = temp_rhs[1:]

grammar = induce_pcfg(Nonterminal('S'), list(output))

print('ROOT -> S 1')
for g in grammar.productions():
    temp = str(g).split('[')
    print(temp[0], temp[1].strip(']'))
Example #16
    sent.chomsky_normal_form()
    for production in sent.productions():
        tbank_productions.append(production)

# To avoid problems with unknown tokens/words, add all lexical productions,
# including those from the test_set
for word, tag in treebank.tagged_words():
    t = Tree.fromstring("(" + tag + " " + word + ")")
    for production in t.productions():
        tbank_productions.append(production)

print(tbank_productions[2])

# Automatically build the grammar (in particular, estimate the rule
# probabilities) from the list of production rules in tbank_productions
tbank_grammar = induce_pcfg(Nonterminal('S'), tbank_productions)

print(tbank_grammar)

# PARSING
parser = ViterbiParser(tbank_grammar)
s = time.time()
# parse the second raw test sentence
for t in parser.parse(raw_test_set[1]):
    print(t)

# measure parsing time
s = time.time() - s

# gold standard for the second test sentence
print(test_set[1])
Example #17
    def learning_CNF_probabilities(self):
        S = Nonterminal('SENT')
        self.grammar = induce_pcfg(S, self.CNF_rules).productions()
Example #18
def extract_simple_pcfg(n):
    rules = extract_simple_productions(n)
    pcfg = grammar.induce_pcfg(Nonterminal("S"), rules)
    return PCFG(pcfg.start(), sort_rules(pcfg.productions()))
Example #19
File: Q1.2.py Project: dianagastrin/NLP
def trees_to_pcfg(trees):
    productions = trees_to_productions(trees)
    pcfg = induce_pcfg(Nonterminal('S'), productions)
    return pcfg
Example #21
                ta /= len(list_tag_val)

                # store the result
                r = {'lp': lp, 'lr': lr, 'f1': f1, 'ta': ta}
                resultados.append(r)
            else:
                print("Sentence with more than 18 words.")
        except Exception:
            print("Malformed tree.")

    # compute the average of each metric
    media_lp = sum(item['lp'] for item in resultados)/len(resultados)
    media_lr = sum(item['lr'] for item in resultados)/len(resultados)
    media_f1 = sum(item['f1'] for item in resultados)/len(resultados)
    media_ta = sum(item['ta'] for item in resultados)/len(resultados)
    print("media_lp", media_lp, "media_lr", media_lr, "media_f1", media_f1, "media_ta", media_ta)

# extract the trees, with their tags, from the floresta corpus
filter_errors(floresta.parsed_sents())

roots = []
ROOT = Nonterminal('ROOT')  # nonterminal for the grammar's single start symbol
initial_symbols = list(set(initial_symbols))  # remove duplicates
for t in initial_symbols:
    roots += [Production(ROOT, [t])]  # unify the grammar under one start symbol

productions += roots
productions += [Production(Nonterminal("n"), ["UNK"])]  # rule for unknown words (noun)

pcfg = induce_pcfg(ROOT, productions)  # build the PCFG from the start symbol and the rules
do_cky(pcfg)  # apply the CKY algorithm (ViterbiParser)
Example #22
B -> B D [.5] | C [.5]
C -> 'a' [.1] | 'b' [0.9]
D -> 'b' [1.0]
""")
grammar.productions()

# In[ ]:

# get grammar from parsed sentences
from nltk import Nonterminal, induce_pcfg

productions = []
for fileid in treebank.fileids()[:2]:
    for t in treebank.parsed_sents(fileid):
        productions += t.productions()
grammar = induce_pcfg(Nonterminal('S'), productions)

# In[ ]:

print(grammar)

# In[ ]:

sorted(grammar.productions(lhs=Nonterminal('PP')))[:2]

# In[ ]:

sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2]

# In[ ]: