Exemplo n.º 1
0
    def test_simple(self):
        """Exercise CNF detection and conversion on two small grammars.

        The first grammar is converted with ``flexible=True`` and must end up
        in flexible CNF only; the second is converted with the default
        arguments and must pass both the flexible and the strict check.
        """
        flexible_case = CFG.fromstring(
            """
          S -> NP VP
          PP -> P NP
          NP -> Det N | NP PP P
          VP -> V NP | VP PP
          VP -> Det
          Det -> 'a' | 'the'
          N -> 'dog' | 'cat'
          V -> 'chased' | 'sat'
          P -> 'on' | 'in'
        """
        )
        # Before conversion neither normal-form check may succeed.
        self.assertFalse(flexible_case.is_flexible_chomsky_normal_form())
        self.assertFalse(flexible_case.is_chomsky_normal_form())
        # After a flexible conversion only the flexible check may succeed.
        flexible_case = flexible_case.chomsky_normal_form(flexible=True)
        self.assertTrue(flexible_case.is_flexible_chomsky_normal_form())
        self.assertFalse(flexible_case.is_chomsky_normal_form())

        strict_case = CFG.fromstring(
            """
          S -> NP VP
          NP -> VP N P
          VP -> P
          N -> 'dog' | 'cat'
          P -> 'on' | 'in'
        """
        )
        # Before conversion neither normal-form check may succeed.
        self.assertFalse(strict_case.is_flexible_chomsky_normal_form())
        self.assertFalse(strict_case.is_chomsky_normal_form())
        # After the default conversion both checks must succeed.
        strict_case = strict_case.chomsky_normal_form()
        self.assertTrue(strict_case.is_flexible_chomsky_normal_form())
        self.assertTrue(strict_case.is_chomsky_normal_form())
Exemplo n.º 2
0
def demo(N=23):
    """Print up to ``N`` sentences generated from ``demo_grammar``.

    ``demo_grammar`` and ``generate`` are names defined outside this block.

    :param N: passed to ``generate`` as its ``n`` argument.
    """
    from nltk.grammar import CFG

    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(demo_grammar)
    parsed = CFG.fromstring(demo_grammar)
    position = 0
    for tokens in generate(parsed, n=N):
        # Numbering is 1-based.
        position += 1
        print('%3d. %s' % (position, ' '.join(tokens)))
Exemplo n.º 3
0
def demo():
    """Open a Tk window with a ``CFGEditor`` preloaded with a sample grammar.

    The editor is given a callback that prints whatever grammar it receives.
    The call blocks in ``mainloop`` until the window is closed.
    """
    from nltk import Nonterminal, CFG

    # These nonterminal objects are not referenced again in this function.
    symbol_names = 'S VP NP PP P N Name V Det'.split()
    (S, VP, NP, PP, P, N, Name, V, Det) = map(Nonterminal, symbol_names)

    sample = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def show(new_grammar):
        print(new_grammar)

    root = Tk()
    editor = CFGEditor(root, sample, show)
    banner = Label(root, text='\nTesting CFG Editor\n')
    banner.pack()
    quit_button = Button(root, text='Quit', command=root.destroy)
    quit_button.pack()
    root.mainloop()
Exemplo n.º 4
0
def app():
    """
    Launch the recursive descent parser demo on a fixed sentence, using a
    small built-in grammar.  Blocks in ``mainloop`` until the app exits.
    """
    from nltk.grammar import CFG

    # The sentence handed to the demo, split on whitespace.
    tokens = 'the dog saw a man in the park'.split()

    demo_grammar = CFG.fromstring(
        """
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """
    )

    window = RecursiveDescentApp(demo_grammar, tokens)
    window.mainloop()
Exemplo n.º 5
0
 def generate_context_free_grammar_novel_text(
     self, number_of_words_in_sentence=0, number_of_sentences_per_record=0, number_of_records=0
 ):
     """
     Generate novel text from the grammar held in ``self._corpus``.

     ``self._corpus`` may be a ``FeatureGrammar``, ``PCFG`` or ``CFG`` instance; any other value is handed to
     ``CFG.fromstring``. Each generated sentence has its punctuation stripped, a randomly chosen terminator
     (". ", "! " or "? ") appended, and is capitalized. Any exception is logged through ``self.logger`` and
     not re-raised.

     @param number_of_words_in_sentence: Forwarded to ``generate_text`` as its ``n`` argument.
     @type number_of_words_in_sentence: int
     @param number_of_sentences_per_record: An indicator as to the number of sentences per record to generate.
     @type number_of_sentences_per_record: int
     @param number_of_records: An indicator as to the total number of records to generate.
     @type number_of_records: int
     @return: NOTE(review): documented as str, but no return statement is visible in this block; the records
         are accumulated in the local ``words`` list. Confirm against the complete source.
     """
     words = []
     punct_selector = [". ", "! ", "? "]
     punctuation_stop_symbols = dict((ord(char), None) for char in string.punctuation)
     grammar = None
     try:
         corpus = self._corpus
         # The more specific grammar types are tested first: if these are NLTK's classes, FeatureGrammar and
         # PCFG both derive from CFG, so testing CFG first would make their branches unreachable.
         if isinstance(corpus, FeatureGrammar):
             grammar = FeatureChartParser(corpus).grammar()
         elif isinstance(corpus, PCFG):
             grammar = InsideChartParser(corpus).grammar()
         elif isinstance(corpus, CFG):
             # grammar() must be called; the bare attribute is a bound method, not a grammar.
             grammar = ChartParser(corpus).grammar()
         else:
             grammar = CFG.fromstring(corpus)
         if grammar is not None:
             for _ in range(number_of_records):
                 novel_sentence = []
                 for _ in range(number_of_sentences_per_record):
                     # NOTE(review): str.join requires generate_text to yield strings - confirm; NLTK's own
                     # generate() yields lists of tokens.
                     sentence = " ".join(generate_text(grammar, depth=2, n=number_of_words_in_sentence))
                     sentence = sentence.translate(punctuation_stop_symbols) + random.choice(punct_selector)
                     novel_sentence.append(sentence.capitalize())
                 words.append("".join(novel_sentence))
     except Exception as error:
         self.logger.error(
             "TextGenerator.generate_context_free_grammar_novel_text: Error occurred - {0}".format(str(error))
         )
Exemplo n.º 6
0
def generate_text(grammar, N):
    """Print up to ``N`` sentences generated from a grammar given as text.

    :param grammar: grammar source; printed as-is, then parsed with
        ``CFG.fromstring``.
    :param N: passed to ``generate`` as its ``n`` argument.
    """
    from nltk.grammar import CFG
    import nltk.parse.generate as gen

    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(grammar)
    parsed = CFG.fromstring(grammar)

    # Numbering is 0-based here.
    position = 0
    for tokens in gen.generate(parsed, n=N):
        print('%3d. %s' % (position, ' '.join(tokens)))
        position += 1
def convert_grammar(cfg_grammar):
    """
    Convert a grammar to Chomsky normal form.

    A grammar that already reports ``is_chomsky_normal_form()`` is returned
    unchanged.  Otherwise every production with two or more right-hand-side
    symbols has its terminals replaced by fresh nonterminals (each with a
    helper production rewriting it to that terminal) and is passed to
    ``break_large_rhs``; the result then goes through
    ``remove_empty_productions`` and ``remove_unitary_productions``.

    ``is_terminal``, ``break_large_rhs``, ``remove_empty_productions`` and
    ``remove_unitary_productions`` are helpers defined outside this block.

    :param cfg_grammar: the grammar to convert
    :returns: ``cfg_grammar`` itself if already in CNF, else a new grammar
    :raises AssertionError: if an intermediate or final check fails
    """
    if cfg_grammar.is_chomsky_normal_form():
        return cfg_grammar

    # Go through every rule, and do the following conversions:
    # - remove terminals in non-solitary rules
    # - break up greater-than-2 rules
    # Productions with fewer than two symbols are copied through untouched.
    new_productions = []
    for production in cfg_grammar.productions():
        lhs = production.lhs()
        rhs = production.rhs()
        if len(rhs) < 2:
            new_productions.append(Production(lhs, rhs))
            continue

        # Work on a list copy so individual symbols can be replaced.
        symbols = list(rhs)
        for i, symbol in enumerate(symbols):
            if is_terminal(symbol):
                newnonterm = Nonterminal(symbol)
                # The helper rule rewrites the new nonterminal to exactly
                # this one terminal, not to the whole right-hand side.
                new_productions.append(Production(newnonterm, [symbol]))
                symbols[i] = newnonterm

        # Now break up large groups; a tuple is passed, like rhs() returns.
        new_productions += break_large_rhs(lhs, tuple(symbols))

    new_cfg = CFG(cfg_grammar.start(), new_productions)
    assert(new_cfg.is_binarised())

    # Remove empty productions
    new_cfg = remove_empty_productions(new_cfg)

    # Go through the rules again, removing non-terminals in solitary rules
    new_cfg = remove_unitary_productions(new_cfg)
    assert(new_cfg.is_chomsky_normal_form())
    return new_cfg
Exemplo n.º 8
0
def generateRawTemplates(depth):
    """Write each sentence generated from ``grammarstring`` to its own file.

    Files are named ``./templates/template<index>``; the directory is not
    created here.  ``grammarstring`` and ``generate`` are names defined
    outside this block.

    :param depth: passed to ``generate`` as its ``depth`` argument.
    :returns: list of the file names written, in generation order.
    """
    gram = CFG.fromstring(grammarstring)
    rawTemplates = generate(gram, depth=depth)
    templatefiles = []

    for index, state in enumerate(rawTemplates):
        filename = os.path.join("./templates", "template" + str(index))
        with open(filename, 'w') as templatefile:
            templatefile.write(' '.join(state))
            templatefiles.append(filename)

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(str(len(templatefiles)) + " template files generated")

    return templatefiles
Exemplo n.º 9
0
def generate_tweet(grammar):
    """Print one randomly chosen sentence generated from a grammar.

    The grammar text is printed, parsed, and used to generate at most
    ``SIZE`` sentences (``SIZE`` is defined outside this block).  One of the
    sentences actually produced is picked uniformly at random and printed
    with its 0-based index.  Nothing more is printed if the grammar yields
    no sentence.

    :param grammar: grammar source accepted by ``CFG.fromstring``.
    """
    from random import randrange

    from nltk.grammar import CFG
    import nltk.parse.generate as gen

    print(grammar)
    parsed = CFG.fromstring(grammar)
    # Materialise the sentences so the index is drawn from the number really
    # generated: the grammar may yield fewer than SIZE, and an inclusive
    # randint(0, SIZE) could select an index that never occurs.
    sentences = list(gen.generate(parsed, n=SIZE))
    if not sentences:
        return
    n = randrange(len(sentences))
    print ("Your tweet : ")
    print('%3d. %s' % (n, ' '.join(sentences[n])))
Exemplo n.º 10
0
    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """Create the editor window as a ``Toplevel`` child of ``parent``.

        :param parent: widget passed to ``Toplevel`` as the window's parent.
        :param cfg: grammar to start from; when None, an empty grammar with
            start symbol ``S`` is created.
        :param set_cfg_callback: stored as ``self._set_cfg_callback``.
        """
        self._parent = parent
        self._cfg = CFG(Nonterminal('S'), []) if cfg is None else cfg
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        # Each frame is built and then packed before the next one is built.
        self._init_startframe()
        self._startframe.pack(expand=0, fill='x', side='top')

        self._init_prodframe()
        self._prodframe.pack(expand=1, fill='both', side='top')

        self._init_buttons()
        self._buttonframe.pack(expand=0, fill='x', side='bottom')

        self._textwidget.focus()
Exemplo n.º 11
0
Arquivo: cfg.py Projeto: Geolem/nltk
def demo():
    """Show a ``CFGEditor`` for a sample grammar inside a new Tk root window.

    The editor receives a callback that prints the grammar passed to it.
    The call blocks in ``mainloop`` until the window is closed.
    """
    from nltk import Nonterminal, CFG

    # These nonterminal objects are not referenced again in this function.
    symbol_names = "S VP NP PP P N Name V Det".split()
    (S, VP, NP, PP, P, N, Name, V, Det) = map(Nonterminal, symbol_names)

    sample = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def show(new_grammar):
        print(new_grammar)

    root = Tk()
    editor = CFGEditor(root, sample, show)
    banner = Label(root, text="\nTesting CFG Editor\n")
    banner.pack()
    quit_button = Button(root, text="Quit", command=root.destroy)
    quit_button.pack()
    root.mainloop()
Exemplo n.º 12
0
def app():
    """
    Launch the shift reduce parser app on a fixed sentence, using a small
    grammar assembled from explicit ``Production`` objects.  Blocks in
    ``mainloop`` until the app exits.
    """

    from nltk.grammar import Nonterminal, Production, CFG

    (S, VP, NP, PP, P, N, Name, V, Det) = map(
        Nonterminal, 'S VP NP PP P N Name V Det'.split()
    )

    # Syntactic productions as (lhs, rhs symbols).
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
    ]
    # Lexical productions as (lhs, word).
    lexical = [
        (NP, 'I'), (Det, 'the'),
        (Det, 'a'), (N, 'man'),
        (V, 'saw'), (P, 'in'),
        (P, 'with'), (N, 'park'),
        (N, 'dog'), (N, 'statue'),
        (Det, 'my'),
    ]
    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical]
    )

    shift_reduce_grammar = CFG(S, productions)

    # tokenize the sentence
    tokens = 'my dog saw a man in the park with a statue'.split()

    window = ShiftReduceApp(shift_reduce_grammar, tokens)
    window.mainloop()
Exemplo n.º 13
0
def remove_unary_rules(grammar):
    """Build a grammar without unary nonlexical productions.

    Every production of the form ``A -> B`` (one nonlexical symbol on the
    right) is replaced by ``A -> rhs`` for each production ``B -> rhs`` of
    the input grammar, repeating until no unary nonlexical rule is left.
    Each distinct unary rule is expanded only once, so grammars with unary
    cycles such as ``A -> B, B -> A`` terminate.

    :param grammar: input grammar; it is not modified.
    :returns: tuple ``(new_grammar, grammar)`` - the rewritten grammar
        followed by the input grammar object itself.
    """
    result = []
    unary = []
    for rule in grammar.productions():
        if len(rule) == 1 and rule.is_nonlexical():
            unary.append(rule)
        else:
            result.append(rule)

    # Unary rules already queued; guards against endless re-expansion.
    seen = set(unary)
    cursor = 0
    while cursor < len(unary):
        rule = unary[cursor]
        cursor += 1
        for item in grammar.productions(lhs=rule.rhs()[0]):
            new_rule = Production(rule.lhs(), item.rhs())
            if len(new_rule) != 1 or new_rule.is_lexical():
                result.append(new_rule)
            elif new_rule not in seen:
                seen.add(new_rule)
                unary.append(new_rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar, grammar
Exemplo n.º 14
0
def app():
    """
    Launch the recursive descent parser demo on a fixed sentence, using a
    small built-in grammar.  Blocks in ``mainloop`` until the app exits.
    """
    from nltk.grammar import CFG

    # The sentence handed to the demo, split on whitespace.
    tokens = 'the dog saw a man in the park'.split()

    demo_grammar = CFG.fromstring("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    window = RecursiveDescentApp(demo_grammar, tokens)
    window.mainloop()
Exemplo n.º 15
0
# Text of the tree currently being assembled from consecutive input lines.
# NOTE(review): `prod` and `prod_cnf` are extended below but not initialised
# in this excerpt - presumably defined earlier in the file; verify.
s = ''

print('Bulding tree from parsed sentences')
with open('parsed_sentences.txt') as f:
    # The extra empty entry makes the last tree get processed even when the
    # file does not end with a blank line.
    sentences = list(f) + ['']
    for line in sentences:
        line = line.strip()
        if line:
            # Lines starting with '#' are skipped; others extend the tree.
            if not line.startswith('#'):
                s += line
            continue
        if not s:
            continue
        # A blank line ends the current tree: collect its productions both
        # before and after the in-place CNF / unary-collapse transforms.
        t = tree.Tree.fromstring(s)
        prod += t.productions()
        t.chomsky_normal_form()
        t.collapse_unary(collapsePOS=True)
        prod_cnf += t.productions()
        s = ''

# Drop duplicate productions.
prod = set(prod)
prod_cnf = set(prod_cnf)

print('Writing CFG to file with %d productions' % len(prod))
grammar = CFG(Nonterminal('ROOT'), prod)
with open('grammar.cfg', 'w') as f:
    f.write('\n'.join(str(p) for p in grammar.productions()))

print('Writing CFG (CNF) to file with %d productions' % len(prod_cnf))
grammar_cnf = CFG(Nonterminal('ROOT'), prod_cnf)
with open('grammar_cnf.cfg', 'w') as f:
    f.write('\n'.join(str(p) for p in grammar_cnf.productions()))
Exemplo n.º 16
0
# -*- coding: utf-8 -*-
import pytest
from nltk.grammar import CFG
from nltk.parse.chart import BottomUpChartParser

# Load the grammar under test once, at import time.
with open("subject-verb.grammar") as f:
    grammar = CFG.fromstring(f.read(), encoding="utf-8")

# Sentences grouped by the name of the test case they belong to.
tests = {
    "subject_verb_agreement": [
        "Je regarde la television",
        "Tu regardes la television",
        "Il regarde la television",
        "Nous regardons la television",
        "Vous regardez la television",
        "Ils regardent la television",
    ],
    "test_noun_phrases_and_proper_names": [
        "le chat",
        "la television",
        "les chats",
        "les televisions",
        "Jackie",
        "Montreal",
    ],
    "test_direct_object_pronouns": [
        "il la regarde",
    ],
    "test_attribute_adjectives": [
        "le chat noir",
        "le chat heureux",
        "le beau chat",
        "le joli chat",
        "la derniere semaine",
        "la semaine derniere",
        "les chats noirs",
        "la television noire",
        "les televisions noires",
    ],
}


@pytest.mark.parametrize("test", ((test_name, sentence)
                                  for test_name, sentences in tests.items()
                                  for sentence in sentences))
def test(test):
Exemplo n.º 17
0
def parse(text):
    """Parse some text with the module grammar plus text-specific literals.

    The base ``productions`` are copied and extended with literal
    productions for every number and ``(x,y)`` coordinate found in ``text``,
    for the fixed relation and field names, and for every entry of
    ``values``.  The text is split on whitespace and parsed with a
    ``RecursiveDescentParser`` built from the extended grammar.

    ``productions``, ``grammar``, ``values`` and ``literal_production`` are
    names defined outside this block.

    :param text: the input string.
    :returns: the result of ``parser.parse(tokens)``.
    """
    numbers = {match.group(0) for match in re.finditer(r"\d+", text)}
    coordinates = {
        match.group(0) for match in re.finditer(r"\(\d+,\d+\)", text)
    }
    relations = [
        "segitiga", "kotak", "titik", "garis", "poligon", "negara", "kota",
        "provinsi"
    ]
    fields = ["nama", "ibukota", "geom", "id", "id_ibukota"]

    # segitiga: id, nama, geom
    # kotak: id, nama, geom
    # titik: id, nama, geom
    # garis: id, nama, geom
    # poligon: id, nama, geom
    # negara: id, nama, id_ibukota, geom
    # provinsi: id, nama, id_ibukota, geom
    # kota: id, nama, geom

    # Make a local copy of productions
    lproductions = list(productions)

    # Add a production for every literal of each category
    lproductions.extend(
        literal_production("NUMBER", number) for number in numbers)
    lproductions.extend(
        literal_production("RELATION", relation) for relation in relations)
    lproductions.extend(
        literal_production("VALUE", value) for value in values)
    lproductions.extend(
        literal_production("FIELD", field) for field in fields)
    lproductions.extend(
        literal_production("COOR", coor) for coor in coordinates)

    # One two-token value is added by hand.
    lproductions.append(Production(Nonterminal("VALUE"), ["bengawan", "solo"]))

    # Make a local copy of the grammar with extra productions
    lgrammar = CFG(grammar.start(), lproductions)

    # Load grammar into a parser
    parser = nltk.RecursiveDescentParser(lgrammar)

    tokens = text.split()

    return parser.parse(tokens)
Exemplo n.º 18
0
# Base grammar for the query language.
# NOTE(review): RELATION, NUMBER, VALUE, FIELD, COOR, SIZE, RU and LB are used
# on right-hand sides but have no production in this string - presumably
# supplied elsewhere; verify.
grammar = CFG.fromstring("""
S -> COMMAND QUERY
COMMAND -> COMMAND1 | COMMAND2 | COMMAND3
COMMAND1 -> 'tampil'
COMMAND2 -> 'tunjuk' | 'lihat'
COMMAND3 -> 'hitung' | 'kalkulasi'
QUERY -> RELATION | CONDITION | CONDITION CONDITION | CONDITION CONJ CONDITION | CONDITION QUERY | CONDITION CONJ QUERY
CONJ -> AND | OR
AND -> 'dan' | 'serta'
OR -> 'atau'
CONDITION -> FIELDS OPERATOR NUMBER | FIELDS RELATION | FIELDS RELATION SPATIALOP RELCOND | FIELDS RELATION NOT SPATIALOP RELCOND | FIELDS RELCOND | PART RELATION SPATIALOP GEOCOND | RELCOND | RELATION SPATIALOP GEOCOND | RELATION NOT SPATIALOP GEOCOND | RELATION SPATIALOP RELCOND | RELATION NOT SPATIALOP RELCOND | SPATIALOP RELATION SPATIALOP RELCOND | SPATIALOP RELATION NOT SPATIALOP RELCOND | SPATIALOP RELCOND |  SPATIALOP RELCOND RELCOND | SPATIALOP OPERATOR NUMBER | VALUES
PART -> 'daerah' | 'bagian' | 'potong'
GEOCOND -> GEOMETRY POINT COOR CONJ POINT COOR | GEOMETRY COOR SIZE NUMBER
GEOMETRY -> SQUARE | RECTANGLE
SQUARE -> 'persegi'
RECTANGLE -> 'segiempat' | 'persegi' 'panjang'
POINT -> LU | RU | LB | RB
LU -> 'titik' 'kiri' 'atas'
RB -> 'titik' 'kanan' 'bawah'
RELCOND -> RELATION VALUES | RELATION FIELDS VALUE | RELATION FIELDS NUMBER | RELATION
OPERATOR -> 'lebih' 'dari' | 'kurang' 'dari' | 'sama' 'dengan' | 'lebih' 'dari' 'sama' 'dengan' | 'kurang' 'dari' 'sama' 'dengan'
NOT -> 'tidak' | 'bukan'
SPATIALOP -> PANJANG | LUAS | KELILING | INSIDE | OUTSIDE | JARAK
JARAK -> 'jarak'
INSIDE -> 'dalam' | 'pada'
OUTSIDE -> 'luar'
PANJANG -> 'panjang'
LUAS -> 'luas'
KELILING -> 'keliling'
FIELDS -> FIELD FIELD | FIELD | FIELD FIELDS | FIELD CONJ FIELDS
VALUES -> VALUE VALUE | VALUE | VALUE VALUES
""")
Exemplo n.º 19
0
Arquivo: cfg.py Projeto: Geolem/nltk
 def _apply(self, *e):
     """Build a grammar from the editor contents and hand it to the callback.

     The productions come from ``self._parse_productions()`` and the start
     symbol from the ``self._start`` entry.  Nothing is called when
     ``self._set_cfg_callback`` is None.  Positional arguments are ignored.
     """
     rules = self._parse_productions()
     start_symbol = Nonterminal(self._start.get())
     new_grammar = CFG(start_symbol, rules)
     callback = self._set_cfg_callback
     if callback is None:
         return
     callback(new_grammar)
Exemplo n.º 20
0
# S -> NP VP        # Start state S
# A -> B | C        # Arrow and vbar
# C -> "a" | "b"    # terminals in quotes
#
##################################################

# Replace with your file name here
filename = "a2q2.txt"

with open(filename) as f:
    # The grammar text is lower-cased and the accented vowels below are
    # replaced one by one with their plain form (author's note: a more
    # general approach was abandoned in favour of this manual conversion).
    content = f.read().lower()
    for accented, plain in (
        ('é', 'e'), ('è', 'e'), ('ê', 'e'),
        ('á', 'a'), ('à', 'a'), ('â', 'a'),
        ('ó', 'o'), ('ò', 'o'), ('ô', 'o'),
    ):
        content = content.replace(accented, plain)
    grammar = CFG.fromstring(content, encoding="utf-8")

parser = BottomUpChartParser(grammar)


def parse(sentence, nonempty):
    """Assert that ``sentence`` does, or does not, parse.

    The sentence is lower-cased, split on whitespace and given to the
    module-level ``parser``.

    :param sentence: the sentence text.
    :param nonempty: if true, the parses are printed and at least one must
        exist; otherwise there must be none.
    :raises AssertionError: when the expectation is not met.
    """
    data = list(parser.parse(sentence.lower().split()))
    if not nonempty:
        assert len(data) == 0
        return
    print(data)
    assert len(data) > 0


validSentences = [
Exemplo n.º 21
0
    def guess(self, verbose=None):
        """
        Makes a guess based on the next observation.
        Updates self._curr_guess.

        The next item of ``self._text`` is read and wrapped in a
        ``Sentence``.  If that sentence is already in ``self._data`` the
        current guess is returned unchanged.  Otherwise the stored data,
        terminals, contexts, substrings and nonterminals are extended, the
        lexical, binary and start productions are rebuilt from scratch, and
        a new grammar plus a ``ChartParser`` for it are stored on the
        instance.

        :param verbose: if not None, stored in ``self._verbose`` first
        :rtype: CFG
        :returns: The next guess
        """
        if verbose is not None:
            self._verbose = verbose

        sentence = Sentence(next(self._text))
        self._num_steps += 1
        self._log("String {}: {}".format(self._num_steps, sentence))

        if sentence in self._data:
            self._log("String already seen")
            return self._curr_guess

        # Info from previous guess (used only for the log messages below)
        num_contexts = len(self._contexts)
        num_subs = len(self._substrings)
        if self._curr_guess is not None:
            num_nts = len(set(p.lhs()
                              for p in self._curr_guess.productions())) - 1
        else:
            num_nts = 0

        total_timer = Timer()
        total_timer.start()

        # Update data and terminals
        words = sentence.get_words()
        self._data.add(sentence)
        self._terminals.update(set(words))

        # Update contexts: every (prefix, suffix) pair around a substring
        self._log("Updating contexts...")
        inds = range(0, len(words) + 1)
        contexts = [
            Context(words[:i], words[j:]) for i in inds for j in inds[i:]
        ]
        self._contexts.update(ContextSet(contexts))
        self._log(
            "{} new contexts added".format(len(self._contexts) - num_contexts))

        # Update substrings
        self._log("Updating substrings...")

        # The sentence counts as new unless the current guess parses it.
        is_new_sentence = True
        if self._curr_guess_parser is not None:
            try:
                parses = self._curr_guess_parser.parse(words)
                is_new_sentence = len(list(parses)) == 0
            except Exception:
                # Best effort: a parser failure means "not generated yet".
                is_new_sentence = True

        if is_new_sentence:
            subs = [Sentence(words[i:j]) for i in inds for j in inds[i:]]
            self._substrings.update(SentenceSet(subs))
            self._log("{} new substrings added".format(
                len(self._substrings) - num_subs))
        else:
            self._log("Sentence already generated by current guess")

        # Construct the nonterminals: one kernel per set of up to
        # self._k substrings
        self._log("Constructing nonterminals...")

        kernels = set()
        for i in range(1, self._k + 1):
            subsets = [
                SentenceSet(j) for j in combinations(self._substrings, i)
            ]
            kernels.update(subsets)

        for kernel in kernels:
            if kernel not in self._nonterminals:
                nt_name = self._new_name()
                contexts = self._oracle.restr_right_triangle(
                    kernel, self._contexts)
                nt = Nonterminal(nt_name)
                self._nonterminals[kernel] = nt
                self._nt_contexts[nt] = contexts

        # Get a set of nonterminals with unique contexts
        # (.items() works on both Python 2 and Python 3)
        self._log("Removing equivalent nonterminals...")
        context_nts = {con: nt for nt, con in self._nt_contexts.items()}
        self._log(
            "{} nonterminals removed".format(len(kernels) - len(context_nts)))
        self._log("{} new nonterminals constructed".format(
            len(context_nts) - num_nts))

        # Construct the rules
        self._log("Constructing rules...")
        self._productions = set()
        timer = Timer()

        # Lexical rules
        timer.start()
        for t in self._terminals:
            t_kernel = SentenceSet([Sentence([t])])
            t_nt = self._nonterminals[t_kernel]
            t_contexts = self._nt_contexts[t_nt]

            for contexts, nt in context_nts.items():
                rule = Production(nt, [t])
                if rule in self._productions:
                    continue
                if rule in self._eliminated_rules:
                    continue

                if contexts.issubset(t_contexts):
                    self._productions.add(rule)
                else:
                    self._eliminated_rules.add(rule)

        timer.stop()
        num_lex = len(self._productions)
        self._log("{} lexical rules ({:.2f} secs)".format(
            num_lex, timer.elapsed()))

        # Binary rules
        timer.reset()
        timer.start()
        for kernel_l in self._nonterminals:
            for kernel_r in self._nonterminals:
                kernel_rhs = kernel_l + kernel_r
                sents_rhs = list(kernel_rhs.intersection(self._substrings))

                # Split the known sentences into chunks of at most self._k;
                # floor division keeps the range() argument an int.
                inds = range(len(sents_rhs) // self._k + 1)
                kers_rhs = [
                    sents_rhs[self._k * i:self._k * (i + 1)] for i in inds
                ]
                kers_rhs = [SentenceSet(k) for k in kers_rhs if len(k) > 0]

                nts_rhs = [self._nonterminals[k] for k in kers_rhs]
                contexts_nts_rhs = [self._nt_contexts[nt] for nt in nts_rhs]
                if len(contexts_nts_rhs) > 0:
                    contexts_rhs = contexts_nts_rhs[0].intersection(
                        *contexts_nts_rhs)
                else:
                    # NOTE(review): this aliases self._contexts, which the
                    # intersection_update below then appears to modify in
                    # place - confirm that is intended.
                    contexts_rhs = self._contexts

                # Membership queries
                new_strs_rhs = kernel_rhs.difference(SentenceSet(sents_rhs))
                new_contexts_rhs = self._oracle.restr_right_triangle(
                    new_strs_rhs, contexts_rhs)
                contexts_rhs.intersection_update(new_contexts_rhs)

                # Building the rules
                for contexts, nt in context_nts.items():
                    nt_l = context_nts[self._nt_contexts[
                        self._nonterminals[kernel_l]]]
                    nt_r = context_nts[self._nt_contexts[
                        self._nonterminals[kernel_r]]]
                    rule = Production(nt, [nt_l, nt_r])
                    if rule in self._productions:
                        continue
                    if rule in self._eliminated_rules:
                        continue

                    if contexts.issubset(contexts_rhs):
                        self._productions.add(rule)
                    else:
                        self._eliminated_rules.add(rule)

        timer.stop()
        num_bin = len(self._productions) - num_lex
        self._log("{} binary rules ({:.2f} secs)".format(
            num_bin, timer.elapsed()))

        # Start rules: nonterminals whose contexts include the empty context
        timer.reset()
        timer.start()
        for contexts, nt in context_nts.items():
            rule = Production(self._start_symbol, [nt])
            if rule in self._productions:
                continue
            if rule in self._eliminated_rules:
                continue
            if Context([], []) in contexts:
                self._productions.add(rule)
            else:
                self._eliminated_rules.add(rule)

        timer.stop()
        num_start = len(self._productions) - num_lex - num_bin
        self._log("{} start rules ({:.2f} secs)".format(
            num_start, timer.elapsed()))

        # Construct the grammar
        self._curr_guess = CFG(self._start_symbol, self._productions)
        self._curr_guess_parser = ChartParser(self._curr_guess)

        total_timer.stop()
        elapsed = total_timer.elapsed()
        num_rules = len(self._curr_guess.productions())
        self._log("Constructed grammar with {} rules ({:.2f} secs)".format(
            num_rules, elapsed))

        return self._curr_guess
Exemplo n.º 22
0
class CFGEditor(object):
    """
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word
      characters.
    - All terminals must be strings consisting of word characters
      and space characters.
    """

    # Regular expressions used by _analyze_line.  Precompile them, so
    # we can process the text faster.  Every fragment that contains a
    # regex backslash escape is written so that the backslash reaches
    # the regex engine without relying on invalid string escapes.
    ARROW = SymbolWidget.SYMBOLS['rightarrow']
    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
    _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*")
    _PRODUCTION_RE = re.compile(
        r"(^\s*\w+\s*)"  # LHS
        + "(->|("  # arrow
        + ARROW
        + r"))\s*"
        + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$"  # RHS
    )
    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
    _BOLD = ('helvetica', -12, 'bold')

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """
        Build the editor window and fill it with the given grammar.

        :param parent: The widget that owns the new top-level window.
        :param cfg: The grammar to edit.  When ``None``, an empty
            grammar with start symbol ``S`` is used.
        :param set_cfg_callback: Optional callable invoked with a
            ``CFG`` whenever the grammar is applied or reset.
        """
        self._parent = parent
        if cfg is not None:
            self._cfg = cfg
        else:
            self._cfg = CFG(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        """
        Create the frame holding the start-symbol entry and its labels.
        """
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side='right')
        Label(frame, text='Start Symbol:').pack(side='right')
        Label(frame, text='Productions:').pack(side='left')
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        """
        Create the Ok / Apply / Reset / Cancel / Help button row.
        """
        frame = self._buttonframe = Frame(self._top)
        Button(frame, text='Ok', command=self._ok, underline=0, takefocus=0).pack(
            side='left'
        )
        Button(frame, text='Apply', command=self._apply, underline=0, takefocus=0).pack(
            side='left'
        )
        Button(frame, text='Reset', command=self._reset, underline=0, takefocus=0).pack(
            side='left'
        )
        Button(
            frame, text='Cancel', command=self._cancel, underline=0, takefocus=0
        ).pack(side='left')
        Button(frame, text='Help', command=self._help, underline=0, takefocus=0).pack(
            side='right'
        )

    def _init_bindings(self):
        """
        Set the window title and bind the keyboard shortcuts.
        """
        self._top.title('CFG Editor')
        self._top.bind('<Control-q>', self._cancel)
        self._top.bind('<Alt-q>', self._cancel)
        self._top.bind('<Control-d>', self._cancel)
        # self._top.bind('<Control-x>', self._cancel)
        self._top.bind('<Alt-x>', self._cancel)
        self._top.bind('<Escape>', self._cancel)
        # self._top.bind('<Control-c>', self._cancel)
        self._top.bind('<Alt-c>', self._cancel)

        self._top.bind('<Control-o>', self._ok)
        self._top.bind('<Alt-o>', self._ok)
        self._top.bind('<Control-a>', self._apply)
        self._top.bind('<Alt-a>', self._apply)
        self._top.bind('<Control-r>', self._reset)
        self._top.bind('<Alt-r>', self._reset)
        self._top.bind('<Control-h>', self._help)
        self._top.bind('<Alt-h>', self._help)
        self._top.bind('<F1>', self._help)

    @staticmethod
    def _format_symbol(elt):
        """
        Return one right-hand-side element as it is shown in the
        editor, with a leading space: nonterminals via ``%s``, anything
        else via ``%r``.
        """
        if isinstance(elt, Nonterminal):
            return ' %s' % elt
        return ' %r' % elt

    def _init_prodframe(self):
        """
        Create the production text widget with its scrollbar, tags and
        event bindings, then insert the current grammar's productions.
        """
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(
            self._prodframe, background='#e0e0e0', exportselection=1
        )
        self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient='vertical')
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side='right', fill='y')
        self._textwidget.pack(expand=1, fill='both', side='left')

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config('terminal', foreground='#006000')
        self._textwidget.tag_config('arrow', font='symbol')
        self._textwidget.tag_config('error', background='red')

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind('>', self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind('<<Paste>>', self._analyze)
        self._top.bind('<KeyPress>', self._check_analyze)
        self._top.bind('<ButtonPress>', self._check_analyze)

        # Tab cycles focus. (why doesn't this work??)
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()

        self._textwidget.bind('<Tab>', cycle)

        # Merge adjacent productions that share a left-hand side into a
        # single "lhs -> a | b" entry.  Walking backwards keeps the
        # indices valid while entries are deleted.  Entries involving an
        # empty right-hand side are never merged.
        prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        for i in range(len(prod_tuples) - 1, 0, -1):
            if prod_tuples[i][0] == prod_tuples[i - 1][0]:
                if () in prod_tuples[i][1]:
                    continue
                if () in prod_tuples[i - 1][1]:
                    continue
                prod_tuples[i - 1][1].extend(prod_tuples[i][1])
                del prod_tuples[i]

        # One text line per entry; alternatives are separated by " |".
        for lhs, rhss in prod_tuples:
            alternatives = [
                ''.join(self._format_symbol(elt) for elt in rhs) for rhs in rhss
            ]
            line = '%s ->' % lhs + ' |'.join(alternatives) + '\n'
            self._textwidget.insert('end', line)

        self._analyze()

    def _clear_tags(self, linenum):
        """
        Remove all tags (except ``arrow`` and ``sel``) from the given
        line of the text widget used for editing the productions.
        """
        start = '%d.0' % linenum
        end = '%d.end' % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ('arrow', 'sel'):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index('insert').split('.')[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``'->'`` text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search('->', arrow, 'end+1char')
            if arrow == '':
                break
            self._textwidget.delete(arrow, arrow + '+2char')
            self._textwidget.insert(arrow, self.ARROW, 'arrow')
            self._textwidget.insert(arrow, '\t')

        # Second pass: make sure every arrow character carries the tag.
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search(self.ARROW, arrow + '+1char', 'end+1char')
            if arrow == '':
                break
            self._textwidget.tag_add('arrow', arrow, arrow + '+1char')

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        # What type of token is it?
        if match.group()[0] in "'\"":
            tag = 'terminal'
        elif match.group() in ('->', self.ARROW):
            tag = 'arrow'
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = 'nonterminal_' + match.group()
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = '%d.%d' % (linenum, match.start())
        end = '%d.%d' % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground='blue'):
        """
        Configure the text tag for one nonterminal and, when matching
        highlighting is enabled, bind mouse enter/leave handlers that
        toggle the tag's background colour.
        """
        self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='#80ff80')

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='')

        self._textwidget.tag_bind(tag, '<Enter>', enter)
        self._textwidget.tag_bind(tag, '<Leave>', leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line's text string.
        line = self._textwidget.get(repr(linenum) + '.0', repr(linenum) + '.end')

        # If it's a valid production, then colorize each token.
        if CFGEditor._PRODUCTION_RE.match(line):
            # It's valid; use _TOKEN_RE to tokenize the production,
            # and colorize each token in turn.
            for match in CFGEditor._TOKEN_RE.finditer(line):
                self._analyze_token(match, linenum)
        elif line.strip() != '':
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = '%d.0' % linenum
            end = '%d.%d' % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = '%d.%d' % (linenum, arrowmatch.end())
            end = '%d.end' % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, '==', end):
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        self._textwidget.tag_add('error', start, end)

    def _analyze(self, *e):
        """
        Replace ``->`` with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index('end').split('.')[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.

        :return: The productions read from every non-blank line, in
            buffer order.
        """
        # Get the text and normalize it.  Plain string replacement is
        # used because the arrow is a literal character, not a pattern.
        text = self._textwidget.get('1.0', 'end')
        text = text.replace(self.ARROW, '->').replace('\t', ' ')

        # Convert each non-blank line to CFG production(s).
        productions = []
        for line in text.split('\n'):
            line = line.strip()
            if line:
                productions += _read_cfg_production(line)

        return productions

    def _destroy(self, *e):
        """
        Destroy the top-level window; a no-op once it is already gone.
        """
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        """
        Apply the edited grammar, then close the window.
        """
        self._apply()
        self._destroy()

    def _apply(self, *e):
        """
        Build a ``CFG`` from the buffer and the start-symbol entry and
        hand it to the callback, if one was given.
        """
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = CFG(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        """
        Restore the buffer to the grammar the editor was opened with
        and pass that grammar to the callback, if one was given.
        """
        self._textwidget.delete('1.0', 'end')
        for production in self._cfg.productions():
            self._textwidget.insert('end', '%s\n' % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        """
        Try to reset to the original grammar, then close the window.
        """
        try:
            self._reset()
        except Exception:
            # Best effort: a failed reset must not keep the window open.
            pass
        self._destroy()

    def _help(self, *e):
        """
        Show the help text in a separate window.
        """
        title = 'Help: Chart Parser Demo'
        text = (_CFGEditor_HELP).strip()
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent, title, text, width=75, font='fixed')
        except Exception:
            ShowText(self._parent, title, text, width=75)
Exemplo n.º 23
0
class PrimalLearner(Learner):
    """
        Implementation of the primal algorithm of Yoshinaka (2011).
    """
    def __init__(self, text, oracle, k):
        """
        Initialize from a Text and an Oracle.

        :type text: oracles.Text
        :param text: A text

        :type oracle: oracles.Oracle
        :param oracle: An oracle

        :type k: int
        :param k: The grammar learned will have the k-FKP.
        """
        super(PrimalLearner, self).__init__()
        self._text = text
        self._oracle = oracle
        self._k = k

        # Algorithm state
        self._data = SentenceSet([])
        self._substrings = SentenceSet([])
        self._contexts = ContextSet([])
        self._eliminated_rules = set()
        self._num_steps = 0

        self._verbose = False

        # Current guess
        self._name_ctr = 0
        self._kernels = []
        self._nonterminals = dict()
        self._nt_contexts = dict()
        self._terminals = set()
        self._productions = set()
        self._start_symbol = Nonterminal("start")
        self._curr_guess = None
        self._curr_guess_parser = None

    def _new_name(self):
        """
        Generates a unique name.

        :rtype: str
        :return: The decimal string of a counter that is incremented on
            every call, so no two calls return the same name
        """
        self._name_ctr += 1
        return str(self._name_ctr - 1)

    def _log(self, message):
        """
        Print ``message`` when verbose mode is on; otherwise do nothing.
        """
        if self._verbose:
            print(message)

    def guess(self, verbose=None):
        """
        Makes a guess based on the next observation.
        Updates self._curr_guess.

        :type verbose: bool or None
        :param verbose: When not None, replaces the learner's verbose
            flag (and stays in effect for later calls)

        :rtype: CFG
        :returns: The next guess
        """
        if verbose is not None:
            self._verbose = verbose

        sentence = Sentence(next(self._text))
        self._num_steps += 1
        self._log("String {}: {}".format(self._num_steps, sentence))

        if sentence in self._data:
            self._log("String already seen")
            return self._curr_guess

        # Info from previous guess
        num_contexts = len(self._contexts)
        num_subs = len(self._substrings)
        if self._curr_guess is not None:
            # The start symbol is also a left-hand side, hence the - 1.
            num_nts = len(set(p.lhs()
                              for p in self._curr_guess.productions())) - 1
        else:
            num_nts = 0

        total_timer = Timer()
        total_timer.start()

        # Update data and terminals
        words = sentence.get_words()
        self._data.add(sentence)
        self._terminals.update(set(words))

        # Update contexts: every (prefix, suffix) pair around a span i..j
        self._log("Updating contexts...")
        inds = range(0, len(words) + 1)
        contexts = [
            Context(words[:i], words[j:]) for i in inds for j in inds[i:]
        ]
        self._contexts.update(ContextSet(contexts))
        self._log(
            "{} new contexts added".format(len(self._contexts) - num_contexts))

        # Update substrings
        self._log("Updating substrings...")

        # The sentence counts as new unless the current guess parses it.
        is_new_sentence = True
        if self._curr_guess_parser is not None:
            try:
                parses = self._curr_guess_parser.parse(words)
                is_new_sentence = len(list(parses)) == 0
            except Exception:
                # A parser failure is treated like "no parse found".
                is_new_sentence = True

        if is_new_sentence:
            subs = [Sentence(words[i:j]) for i in inds for j in inds[i:]]
            self._substrings.update(SentenceSet(subs))
            self._log("{} new substrings added".format(
                len(self._substrings) - num_subs))
        else:
            self._log("Sentence already generated by current guess")

        # Construct the nonterminals: one per set of 1..k substrings
        self._log("Constructing nonterminals...")

        kernels = set()
        for i in range(1, self._k + 1):
            subsets = [
                SentenceSet(j) for j in combinations(self._substrings, i)
            ]
            kernels.update(subsets)

        for kernel in kernels:
            if kernel not in self._nonterminals:
                nt_name = self._new_name()
                contexts = self._oracle.restr_right_triangle(
                    kernel, self._contexts)
                nt = Nonterminal(nt_name)
                self._nonterminals[kernel] = nt
                self._nt_contexts[nt] = contexts

        # Get a set of nonterminals with unique contexts: nonterminals
        # sharing a context set collapse onto a single representative.
        self._log("Removing equivalent nonterminals...")
        context_nts = {con: nt for nt, con in self._nt_contexts.items()}
        self._log(
            "{} nonterminals removed".format(len(kernels) - len(context_nts)))
        self._log("{} new nonterminals constructed".format(
            len(context_nts) - num_nts))

        # Construct the rules
        self._log("Constructing rules...")
        self._productions = set()
        timer = Timer()

        # Lexical rules
        timer.start()
        for t in self._terminals:
            t_kernel = SentenceSet([Sentence([t])])
            t_nt = self._nonterminals[t_kernel]
            t_contexts = self._nt_contexts[t_nt]

            for contexts, nt in context_nts.items():
                rule = Production(nt, [t])
                if rule in self._productions:
                    continue
                if rule in self._eliminated_rules:
                    continue

                if contexts.issubset(t_contexts):
                    self._productions.add(rule)
                else:
                    self._eliminated_rules.add(rule)

        timer.stop()
        num_lex = len(self._productions)
        self._log("{} lexical rules ({:.2f} secs)".format(
            num_lex, timer.elapsed()))

        # Binary rules
        timer.reset()
        timer.start()
        for kernel_l in self._nonterminals:
            for kernel_r in self._nonterminals:
                kernel_rhs = kernel_l + kernel_r
                sents_rhs = list(kernel_rhs.intersection(self._substrings))

                # Split the known strings into chunks of at most k; floor
                # division keeps the chunk count an int on Python 2 and 3.
                inds = range(len(sents_rhs) // self._k + 1)
                kers_rhs = [
                    sents_rhs[self._k * i:self._k * (i + 1)] for i in inds
                ]
                kers_rhs = [SentenceSet(k) for k in kers_rhs if len(k) > 0]

                nts_rhs = [self._nonterminals[k] for k in kers_rhs]
                contexts_nts_rhs = [self._nt_contexts[nt] for nt in nts_rhs]
                if len(contexts_nts_rhs) > 0:
                    contexts_rhs = contexts_nts_rhs[0].intersection(
                        *contexts_nts_rhs)
                else:
                    # NOTE(review): this binds contexts_rhs to
                    # self._contexts itself; if ContextSet follows the
                    # built-in set semantics, the intersection_update
                    # below shrinks the learner's context set in place.
                    # Confirm whether a copy is intended here.
                    contexts_rhs = self._contexts

                # Membership queries
                new_strs_rhs = kernel_rhs.difference(SentenceSet(sents_rhs))
                new_contexts_rhs = self._oracle.restr_right_triangle(
                    new_strs_rhs, contexts_rhs)
                contexts_rhs.intersection_update(new_contexts_rhs)

                # Building the rules.  The right-hand side depends only
                # on the kernel pair, so resolve it once per pair.
                nt_l = context_nts[self._nt_contexts[
                    self._nonterminals[kernel_l]]]
                nt_r = context_nts[self._nt_contexts[
                    self._nonterminals[kernel_r]]]
                for contexts, nt in context_nts.items():
                    rule = Production(nt, [nt_l, nt_r])
                    if rule in self._productions:
                        continue
                    if rule in self._eliminated_rules:
                        continue

                    if contexts.issubset(contexts_rhs):
                        self._productions.add(rule)
                    else:
                        self._eliminated_rules.add(rule)

        timer.stop()
        num_bin = len(self._productions) - num_lex
        self._log("{} binary rules ({:.2f} secs)".format(
            num_bin, timer.elapsed()))

        # Start rules: only nonterminals that accept the empty context
        timer.reset()
        timer.start()
        for contexts, nt in context_nts.items():
            rule = Production(self._start_symbol, [nt])
            if rule in self._productions:
                continue
            if rule in self._eliminated_rules:
                continue
            if Context([], []) in contexts:
                self._productions.add(rule)
            else:
                self._eliminated_rules.add(rule)

        timer.stop()
        num_start = len(self._productions) - num_lex - num_bin
        self._log("{} start rules ({:.2f} secs)".format(
            num_start, timer.elapsed()))

        # Construct the grammar
        self._curr_guess = CFG(self._start_symbol, self._productions)
        self._curr_guess_parser = ChartParser(self._curr_guess)

        total_timer.stop()
        elapsed = total_timer.elapsed()
        num_rules = len(self._curr_guess.productions())
        self._log("Constructed grammar with {} rules ({:.2f} secs)".format(
            num_rules, elapsed))

        return self._curr_guess

    def save_as(self, filename, verbose=False):
        """
        Saves this PrimalLearner object to a file.

        The current parser is detached while pickling and re-attached
        afterwards, even when pickling fails.

        :type filename: str
        :param filename: The name of the file to save to

        :type verbose: bool
        :param verbose: Accepted for interface compatibility; currently
            unused

        :return: None
        """
        parser = self._curr_guess_parser
        self._curr_guess_parser = None
        try:
            # pickle.dump needs the open file object, not its name.
            with open(filename, "wb") as f:
                pickle.dump(self, f)
        finally:
            self._curr_guess_parser = parser

    @staticmethod
    def from_grammar(grammar, k):
        """
        Instantiate a PrimalLearner from a grammar.

        :type grammar: CFG
        :param grammar: A grammar

        :type k: int
        :param k: The grammar learned will have the k-FKP.

        :rtype: PrimalLearner
        :return: A PrimalLearner
        """
        text = oracles.GrammarText(grammar)
        oracle = oracles.GrammarOracle(grammar)
        return PrimalLearner(text, oracle, k)
Exemplo n.º 24
0
# Call draw() on every parse the parser yields for each sentence.
for sent in sentences:
    for p in parser.parse(sent):
        p.draw()

from nltk.corpus import treebank
# Print the first two parsed sentences of the treebank corpus.
print(treebank.parsed_sents()[0])
print(treebank.parsed_sents()[1])

from nltk.grammar import CFG, Nonterminal

# Collect the productions of all treebank parses (the set comprehension
# drops duplicates) and build a CFG with start symbol 'S' from them.
prods = list({
    production
    for sent in treebank.parsed_sents() for production in sent.productions()
})
t_grammar = CFG(Nonterminal('S'), prods)

# Tokenized test sentences.
sents = [
    'Mr. Vinken is chairman .'.split(), 'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]

t_parser = BottomUpChartParser(t_grammar)

# Parse only the first test sentence (sents[:1]); print at most five
# parses, but keep counting every parse in `parses`.
parses = 0
for s in sents[:1]:
    for p in t_parser.parse(s):
        if parses < 5:
            print(p)
        parses += 1
        if m != "":
            stimtrees.append(
                ("(ROOT" + m, num)
            )  # add ROOT tag back at the beginning of the tree, and output a tuple with tree and the stimulus ID (num)

"""Fix up the last tree in stimulus 0.. stimulus 0 excluded the last IU (="stating that") in the speaker's turn, to make it fit with the desired IU count. Those two words were included in the text given to the parser in case the omission would have caused the parser difficulty, but we don't want to include them in our analysis since the subjects didn't actually hear them.. I removed those two words from the stanford parse tree text file that we read in earlier, but now I need to add in the final parentheses to make the parse processable by the Tree function
"""
# Append the closing brackets and final punctuation nodes to the text of
# tree 5 so that Tree.fromstring can process it; the stimulus ID in the
# second tuple slot is kept as is.
stimtrees[5] = (stimtrees[5][0] + ")))))))\n(. ?))\n(. .)))\n\n", stimtrees[5][1])

processed_trees = [
    Tree.fromstring(tree[0]) for tree in stimtrees
]  # create Tree structure and viewable tree image for each tree
processed_trees[0]  # shows tree image for stimulus 0
# Flatten the per-tree production lists into one list.  Unlike summing
# the lists pairwise with reduce(), this is linear, needs no functools
# import on Python 3, and yields [] when there are no trees.
rules = [rule for t in processed_trees for rule in t.productions()]
mycfg = CFG(Nonterminal("ROOT"), rules)
mycfg.start()
mycfg.productions(
    lhs=Nonterminal("PP")
)  # Will print productions for the specified nonterminal item (e.g. "PP", a prepositional phrase), where the PP is the left-hand side of the rule (e.g. PP -> whatever)

#%%
# ==============================================================================
# Loop through Production rules to extract Syntactic Tags and Terminal Words, keep track of Clause boundaries by looking for the first word appearing after an "S" tag
# ==============================================================================
words = []
counter = 0
tags = []
ruleset = []
ClauseBoundary = (
    False
Exemplo n.º 26
0
def load_grammar(grammar_path):
    """Read the grammar file at ``grammar_path`` and parse it into a CFG.

    :param grammar_path: Path of a text file holding the grammar in the
        format accepted by ``CFG.fromstring``.
    :return: The value returned by ``CFG.fromstring`` for the file's text.
    """
    # Pass the path as a logger argument so the message is only
    # formatted when the INFO level is actually enabled.
    logger.info('Loading grammar in %s', grammar_path)
    with open(grammar_path) as fin:
        grammar_string = fin.read()

    return CFG.fromstring(grammar_string)
# Grammar over tag/determiner tokens.  A sentence (S) is either a plain
# run of tokens (Fallback / AllTags) or such a run with one Err
# constituent in the middle.
# - Fallback, CDList and JJList each have an empty production, so they
#   can match zero tokens.
# - Err lists ErrAGV as an alternative, but this string contains no
#   production with ErrAGV on the left-hand side.
# - NotNPHead is defined here but is not used on any right-hand side
#   within this string.
grammar = CFG.fromstring("""
S -> Fallback Err Fallback
S -> Fallback
Fallback -> AllTags Fallback
Fallback ->
S -> AllTags
AllTags -> 'END' | 'QUOT' | '(' | ')' | ',' | '--' | '.' | 'CC' | 'CD' | 'DT' | 'EX' | 'FW' | 'IN' | 'JJ' | 'JJR' | 'JJS' | 'LS' | 'MD' | 'NN' | 'NNP' | 'NNPS' | 'NNS' | 'PDT' | 'POS' | 'PRP' | 'PRP$' | 'RB' | 'RBR' | 'RBS' | 'RP' | 'SYM' | 'TO' | 'UH' | 'VB' | 'VBD' | 'VBG' | 'VBN' | 'VBP' | 'VBZ' | 'WDT' | 'WP' | 'WP$' | 'WRB' | '``' | Det | ':'
Det -> DetPl | DetSg | DetNeut
DetNeut -> 'the' | 'some' | 'another' | 'no' | 'his' | 'her' | 'his/her' | 'any'
DetSg -> 'a' | 'an' | 'this' | 'every' | 'another' | 'that' | 'each' | 'neither'
DetPl -> 'all' | 'both' | 'these' | 'those'
Err -> ErrUD | ErrAGD | ErrFD | ErrAGV

NotNPHead -> 'END' | 'QUOT' | '(' | ')' | ',' | '--' | '.' | 'CC' | 'DT' | 'EX' | 'FW' | 'IN' | 'LS' | 'MD' | 'NN' | 'NNP' | 'NNPS' | 'NNS' | 'PDT' | 'POS' | 'PRP' | 'PRP$' | 'RB' | 'RBR' | 'RBS' | 'RP' | 'SYM' | 'TO' | 'UH' | 'VB' | 'VBD' | 'VBG' | 'VBN' | 'VBP' | 'VBZ' | 'WDT' | 'WP' | 'WP$' | 'WRB' | '``' | ':'

CDList -> 'CD' CDList
CDList ->

JJList -> 'JJ' JJList
JJList -> 'JJR' JJList
JJList -> 'JJS' JJList
JJList ->


ErrAGD -> DetPl JJList 'NN'
ErrAGD -> DetSg JJList CDList JJList 'NNS'

ErrFD -> 'a' AllTags
ErrFD -> 'an' AllTags

ErrUD -> Det JJList 'NNP'
ErrUD -> Det JJList CDList JJList 'NNPS'

""")
Exemplo n.º 28
0
    def generate_context_free_grammar_novel_text(
            self, corpus, number_of_words_in_sentence,
            number_of_sentences_per_record, number_of_records):
        '''
        This method utilizes NLTK's Context Free Grammar parser objects to
        parse an available .*cfg file and generate novel text from it.

        Params:
        -------
        - corpus: Either a grammar object (CFG, FeatureGrammar or PCFG) or
        a grammar string accepted by CFG.fromstring.
        - number_of_words_in_sentence (int): Passed to the text generator
        as its ``n`` argument for each sentence.
        - number_of_sentences_per_record (int): An indicator as to the number
        of sentences per record to generate.
        - number_of_records (int): An indicator as to the total number of
        records to generate.

        Any exception raised while building the grammar or generating text
        is logged and not propagated.

        NOTE(review): the generated records are collected in ``words`` but
        no return statement is present in this code - confirm what the
        caller is expected to receive.
        '''
        words = []
        punct_selector = ['. ', '! ', '? ']
        # Translation table that deletes every punctuation character.
        punctuation_stop_symbols = dict(
            (ord(char), None) for char in string.punctuation)
        grammar = None
        try:
            # FeatureGrammar and PCFG are subclasses of CFG, so they must
            # be tested first; otherwise the CFG branch would claim them.
            if isinstance(corpus, FeatureGrammar):
                grammar = FeatureChartParser(corpus).grammar()
            elif isinstance(corpus, PCFG):
                grammar = InsideChartParser(corpus).grammar()
            elif isinstance(corpus, CFG):
                # grammar() is a method; it has to be called.
                grammar = ChartParser(corpus).grammar()
            else:
                grammar = CFG.fromstring(corpus)
            if grammar is not None:
                for _ in range(number_of_records):
                    novel_sentence = []
                    for _ in range(number_of_sentences_per_record):
                        sentence = ' '.join(
                            generate_text(grammar,
                                          depth=2,
                                          n=number_of_words_in_sentence))
                        sentence = sentence.translate(
                            punctuation_stop_symbols) + random.choice(
                                punct_selector)
                        sentence = sentence.capitalize()
                        novel_sentence.append(sentence)
                    words.append(''.join(novel_sentence))
        except Exception as error:
            logging.error('TextGenerator: Error occurred - {0}'.format(
                str(error)))
from nltk.grammar import CFG
from nltk.parse import EarleyChartParser
# Toy grammar used by the parsing demo below.
cfg = CFG.fromstring("""
S -> NP VP
NP -> DET NN
NP -> DET NP
NP -> JJ NN
VP -> VB NP
DET -> 'a' | 'the'
JJ -> 'lucky'
NN -> 'man' | 'woman'
VB -> 'loves' | 'shoots'
""")

cfgparser = EarleyChartParser(cfg)

# Parse each example sentence in turn; every tree the parser yields is
# pretty-printed and then shown in a drawing window.
for text in (
    'a man loves a woman',
    'the man shoots a woman',
    'a lucky woman loves a man',
):
    s = text.split()
    for tree in cfgparser.parse(s):
        print(tree.pformat())
        tree.draw()
Exemplo n.º 30
0
Arquivo: cfg.py Projeto: Geolem/nltk
class CFGEditor:
    """
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word
      characters.
    - All terminals must be strings consisting of word characters
      and space characters.
    """

    # Regular expressions used by _analyze_line.  Precompile them, so
    # we can process the text faster.
    ARROW = SymbolWidget.SYMBOLS["rightarrow"]
    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
    _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*")
    _PRODUCTION_RE = re.compile(
        r"(^\s*\w+\s*)"  # LHS
        + "(->|(" + ARROW + r"))\s*"  # arrow
        + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$"  # RHS
    )
    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
    _BOLD = ("helvetica", -12, "bold")

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """
        Build the editor window as a ``Toplevel`` child of ``parent``.

        :param parent: The widget that owns the new top-level window.
        :param cfg: The grammar to edit.  When None, an empty grammar
            with start symbol ``S`` is used.
        :param set_cfg_callback: Optional callable that is invoked
            with a grammar whenever the user applies or resets.
        """
        self._parent = parent
        if cfg is not None:
            self._cfg = cfg
        else:
            self._cfg = CFG(Nonterminal("S"), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side="top", fill="x", expand=0)
        self._init_prodframe()
        self._prodframe.pack(side="top", fill="both", expand=1)
        self._init_buttons()
        self._buttonframe.pack(side="bottom", fill="x", expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        """
        Create the frame holding the start-symbol entry, pre-filled
        with the start symbol of the grammar being edited.
        """
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side="right")
        Label(frame, text="Start Symbol:").pack(side="right")
        Label(frame, text="Productions:").pack(side="left")
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        """
        Create the button frame with the Ok, Apply, Reset, Cancel and
        Help buttons.
        """
        frame = self._buttonframe = Frame(self._top)
        Button(frame, text="Ok", command=self._ok, underline=0,
               takefocus=0).pack(side="left")
        Button(frame,
               text="Apply",
               command=self._apply,
               underline=0,
               takefocus=0).pack(side="left")
        Button(frame,
               text="Reset",
               command=self._reset,
               underline=0,
               takefocus=0).pack(side="left")
        Button(frame,
               text="Cancel",
               command=self._cancel,
               underline=0,
               takefocus=0).pack(side="left")
        Button(frame,
               text="Help",
               command=self._help,
               underline=0,
               takefocus=0).pack(side="right")

    def _init_bindings(self):
        """
        Set the window title and bind the keyboard shortcuts for the
        cancel, ok, apply, reset and help actions.
        """
        self._top.title("CFG Editor")
        self._top.bind("<Control-q>", self._cancel)
        self._top.bind("<Alt-q>", self._cancel)
        self._top.bind("<Control-d>", self._cancel)
        # self._top.bind('<Control-x>', self._cancel)
        self._top.bind("<Alt-x>", self._cancel)
        self._top.bind("<Escape>", self._cancel)
        # self._top.bind('<Control-c>', self._cancel)
        self._top.bind("<Alt-c>", self._cancel)

        self._top.bind("<Control-o>", self._ok)
        self._top.bind("<Alt-o>", self._ok)
        self._top.bind("<Control-a>", self._apply)
        self._top.bind("<Alt-a>", self._apply)
        self._top.bind("<Control-r>", self._reset)
        self._top.bind("<Alt-r>", self._reset)
        self._top.bind("<Control-h>", self._help)
        self._top.bind("<Alt-h>", self._help)
        self._top.bind("<F1>", self._help)

    def _init_prodframe(self):
        """
        Create the production editing area (text widget plus
        scrollbar), set up its colorization tags and event bindings,
        and fill it with the productions of the current grammar.
        """
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(self._prodframe,
                                background="#e0e0e0",
                                exportselection=1)
        self._textscroll = Scrollbar(self._prodframe,
                                     takefocus=0,
                                     orient="vertical")
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side="right", fill="y")
        self._textwidget.pack(expand=1, fill="both", side="left")

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config("terminal", foreground="#006000")
        self._textwidget.tag_config("arrow", font="symbol")
        self._textwidget.tag_config("error", background="red")

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind(">", self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind("<<Paste>>", self._analyze)
        self._top.bind("<KeyPress>", self._check_analyze)
        self._top.bind("<ButtonPress>", self._check_analyze)

        # Tab cycles focus. (why doesn't this work??)
        # NOTE(review): possibly because the handler does not return
        # "break", so the default Text binding still runs -- TODO confirm.
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()

        self._textwidget.bind("<Tab>", cycle)

        # Group adjacent productions that share a left-hand side so they
        # are displayed on one line, separated by "|".  Empty right-hand
        # sides are never merged with their neighbours.  Walk backwards
        # so that deleting an entry does not disturb pending indices.
        prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        for i in range(len(prod_tuples) - 1, 0, -1):
            if prod_tuples[i][0] == prod_tuples[i - 1][0]:
                if () in prod_tuples[i][1]:
                    continue
                if () in prod_tuples[i - 1][1]:
                    continue
                prod_tuples[i - 1][1].extend(prod_tuples[i][1])
                del prod_tuples[i]

        for lhs, rhss in prod_tuples:
            # Nonterminals are written bare; terminals use their repr
            # (i.e. they appear quoted).
            alternatives = [
                "".join(
                    " %s" % elt if isinstance(elt, Nonterminal) else " %r" % elt
                    for elt in rhs
                )
                for rhs in rhss
            ]
            line = "%s ->" % lhs + " |".join(alternatives) + "\n"
            self._textwidget.insert("end", line)

        self._analyze()

    def _clear_tags(self, linenum):
        """
        Remove all tags (except ``arrow`` and ``sel``) from the given
        line of the text widget used for editing the productions.
        """
        start = "%d.0" % linenum
        end = "%d.end" % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ("arrow", "sel"):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index("insert").split(".")[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``'->'`` text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        # First pass: swap every textual "->" for a tab followed by the
        # arrow symbol (tagged "arrow").
        arrow = "1.0"
        while True:
            arrow = self._textwidget.search("->", arrow, "end+1char")
            if arrow == "":
                break
            self._textwidget.delete(arrow, arrow + "+2char")
            self._textwidget.insert(arrow, self.ARROW, "arrow")
            self._textwidget.insert(arrow, "\t")

        # Second pass: make sure every arrow symbol already in the
        # buffer carries the "arrow" tag.
        arrow = "1.0"
        while True:
            arrow = self._textwidget.search(self.ARROW, arrow + "+1char",
                                            "end+1char")
            if arrow == "":
                break
            self._textwidget.tag_add("arrow", arrow, arrow + "+1char")

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        # What type of token is it?
        if match.group()[0] in "'\"":
            tag = "terminal"
        elif match.group() in ("->", self.ARROW):
            tag = "arrow"
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = "nonterminal_" + match.group()
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = "%d.%d" % (linenum, match.start())
        end = "%d.%d" % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground="blue"):
        """
        Configure the text tag for one nonterminal and, when
        highlighting of matching nonterminals is enabled, bind
        mouse enter/leave handlers that toggle its background.
        """
        self._textwidget.tag_config(tag,
                                    foreground=foreground,
                                    font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background="#80ff80")

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background="")

        self._textwidget.tag_bind(tag, "<Enter>", enter)
        self._textwidget.tag_bind(tag, "<Leave>", leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line's text string.
        line = self._textwidget.get(
            repr(linenum) + ".0",
            repr(linenum) + ".end")

        # If it's a valid production, then colorize each token.
        if CFGEditor._PRODUCTION_RE.match(line):
            # It's valid; Use _TOKEN_RE to tokenize the production,
            # and call analyze_token on each token.
            def analyze_token(match, self=self, linenum=linenum):
                self._analyze_token(match, linenum)
                return ""

            CFGEditor._TOKEN_RE.sub(analyze_token, line)
        elif line.strip() != "":
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start = "%d.0" % linenum
            end = "%d.end" % linenum
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = "%d.0" % linenum
            end = "%d.%d" % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = "%d.%d" % (linenum, arrowmatch.end())
            end = "%d.end" % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, "==", end):
            start = "%d.0" % linenum
            end = "%d.end" % linenum
        self._textwidget.tag_add("error", start, end)

    def _analyze(self, *e):
        """
        Replace ``->`` with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index("end").split(".")[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.
        """
        productions = []

        # Get the text, normalize it, and split it into lines.  Plain
        # string replacement is used because both substitutions are
        # literal, not patterns.
        text = self._textwidget.get("1.0", "end")
        text = text.replace(self.ARROW, "->")
        text = text.replace("\t", " ")
        lines = text.split("\n")

        # Convert each line to a CFG production
        for line in lines:
            line = line.strip()
            if line == "":
                continue
            productions += _read_cfg_production(line)

        return productions

    def _destroy(self, *e):
        """
        Destroy the top-level window, if it has not been destroyed
        already.
        """
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        """
        Apply the edited grammar, then close the window.
        """
        self._apply()
        self._destroy()

    def _apply(self, *e):
        """
        Build a grammar from the start-symbol entry and the edited
        productions, and pass it to the callback (if one was given).
        """
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = CFG(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        """
        Discard the edits: refill the text widget with the productions
        of the original grammar, re-colorize it, and pass the original
        grammar to the callback (if one was given).
        """
        self._textwidget.delete("1.0", "end")
        for production in self._cfg.productions():
            self._textwidget.insert("end", "%s\n" % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        """
        Reset the editor (best effort) and close the window.
        """
        try:
            self._reset()
        except Exception:
            # Resetting is best-effort: the window is closed regardless.
            # Only ordinary errors are ignored, so KeyboardInterrupt and
            # SystemExit still propagate.
            pass
        self._destroy()

    def _help(self, *e):
        """
        Show the editor's help text in a separate window.
        """
        # NOTE(review): the window title mentions the chart parser demo;
        # it is user-facing text and is left unchanged here.
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(
                self._parent,
                "Help: Chart Parser Demo",
                (_CFGEditor_HELP).strip(),
                width=75,
                font="fixed",
            )
        except Exception:
            # Fall back to the default font if the first attempt fails.
            ShowText(
                self._parent,
                "Help: Chart Parser Demo",
                (_CFGEditor_HELP).strip(),
                width=75,
            )
Exemplo n.º 31
0
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

# Small grammar over part-of-speech style categories for the demo sentence.
grammar = CFG.fromstring("""
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 -> IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

# Chart parser using the BU_LC_STRATEGY rule set, with tracing switched on.
cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)

sentence = "Bangalore is the capital of Karnataka"
tokens = sentence.split()
chart = cp.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
print("Total Edges :", len(chart.edges()))
for tree in parses:
    print(tree)
# Draw the last parse, as before, but only when the sentence was actually
# parsed; otherwise there is no tree and nothing to draw.
if parses:
    parses[-1].draw()
Exemplo n.º 32
0
def extract_simple_cfg(n):
    """
    Build a ``CFG`` with start symbol ``S`` from simple productions.

    ``n`` is forwarded unchanged to ``extract_simple_productions``; the
    productions it returns are de-duplicated and then ordered by
    ``sort_rules`` before being handed to the grammar.
    """
    unique_rules = list(set(extract_simple_productions(n)))
    start_symbol = Nonterminal("S")
    return CFG(start_symbol, sort_rules(unique_rules))
Exemplo n.º 33
0
    return len(rhs) == 1 and isinstance(rhs[0], str)


# Client for a CoreNLP server expected to be listening locally.
parser = CoreNLPParser(url="http://localhost:9000")

sentences = brown.sents()

# FILTER SHORT AND LONG SENTENCES
# Keep only sentences with between 5 and 10 words (inclusive).
filter_sentences = []
for sentence in tqdm(sentences):
    nb_words = number_of_words(sentence)
    if 5 <= nb_words <= 10:
        filter_sentences.append(sentence)

# PARSE SENTENCES
# Only the first tree produced for each sentence is used.
productions = []
for sentence in tqdm(filter_sentences):
    parse_tree = next(iter(parser.parse(sentence)))
    productions += parse_tree.productions()

unique_productions = list(set(productions))

# REMOVE TERMINAL SYMBOLS
productions_wo_term = [
    prod for prod in unique_productions if not is_rhs_terminal(prod)
]

grammar = CFG(start=Nonterminal("ROOT"), productions=productions_wo_term)
# Use a context manager so the pickle file is flushed and closed
# deterministically instead of leaking the handle.
with open("brown_grammar.pickle", "wb") as grammar_file:
    pickle.dump(grammar, grammar_file)