def generate_phrase(self, pool):
    """Generate a comparison phrase from the grammar and the word pool.

    Picks a random adjective, derives a noun it is compared with, chooses
    the right article, and substitutes all placeholder POS tags in the
    produced phrase.  Returns a list of Word objects, or None on any
    failure (missing pool entries, unreplaceable tag, ...).
    """
    # FIX: removed a dead commented-out duplicate of the `adj` assignment
    # and narrowed the bare `except:` clauses to `except Exception:` so
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        adj = choice(list(pool.adjectives))
        parser = ChartParser(self.grammar)
        gr = parser.grammar()
        phrase = self.produce(gr, gr.start())
        noun = choice(list(pool.comparisons[adj]))
        # Plural nouns take "the"; singular nouns take their indefinite article.
        if en.noun.plural(noun.name) == noun.name:
            article = "the"
        else:
            article = en.noun.article(noun.name).split(" ")[0]
        replace_words = {'adj': adj, 'n': noun, 'det': article}
        for pos in replace_words:
            while pos in phrase:
                try:
                    phrase = self.replace_pos(pos, replace_words[pos], phrase)
                except Exception:
                    return
        # Normalise every remaining raw token into a Word object.
        for w in phrase:
            if not isinstance(w, Word):
                phrase[phrase.index(w)] = Word(w)
        return phrase
    except Exception:
        return
def generate_phrase(self, pool):
    """Produce a phrase from the grammar, substituting every placeholder
    POS tag with a randomly chosen word drawn from the pool.

    Returns a list of Word objects, or None when a required pool entry is
    missing or a tag cannot be replaced.
    """
    chart_parser = ChartParser(self.grammar)
    grammar = chart_parser.grammar()
    phrase = self.produce(grammar, grammar.start())
    noun = choice(list(pool.nouns))
    try:
        # Candidate replacements, keyed by the placeholder tag they fill.
        substitutions = {
            'n': [noun],
            'v': [Word(self.conjugate(v.name)) for v in list(pool.verbs[noun])],
            'adj': pool.epithets[noun],
            'atv': [Word(self.conjugate(v, self.person)) for v in self.atv],
            'eva': [Word(self.conjugate(v, self.person)) for v in self.eva],
            'ej': pool.emotional_adjectives,
            'en': pool.emotional_nouns,
            'erb': pool.emotional_adverbs,
            'person': [Word(self.persons[self.person][0])],
            'pron': [Word(self.persons[self.person][1])],
        }
    except:
        return
    for tag in substitutions:
        while tag in phrase:
            try:
                replacement = choice(substitutions[tag])
                phrase = self.replace_pos(tag, replacement, phrase)
            except:
                return
    # Wrap any leftover plain tokens in Word objects.
    for token in phrase:
        if not isinstance(token, Word):
            phrase[phrase.index(token)] = Word(token)
    return phrase
def execute(text: str):
    """Tokenize *text* and print every parse tree the toy grammar admits.

    BUG FIX: the original ignored its ``text`` argument entirely — it
    tokenized the global ``SAMPLE_3``, printed those tokens, and then
    parsed an unrelated hard-coded token list.  Now the function actually
    parses what it is given.
    """
    groucho_grammer = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = ChartParser(groucho_grammer)
    tokens = word_tokenize(text=text)
    print(type(tokens))
    print(tokens)
    for tree in parser.parse(tokens=tokens):
        print(tree)
def context_free_grammar():
    """Parse every line of the module-level ``text`` with a hand-written
    CFG and print each sentence followed by all of its parse trees.
    """
    cfg = CFG.fromstring("""\
    ################# Rules #################
    S -> NP VP
    S -> PP NP VP
    S -> Wh Aux NP VP
    NP -> ProperNoun | CC ProperNoun | N | ProperNoun NP | AP N | DET NP | N PP
    VP -> V | V NP | Adv VP | V NP VP
    AP -> Adj | Adj AP
    PP -> P NP | P NP VP
    ################# Lexicons #################
    N -> 'milk'| 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
    V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear'
    ProperNoun -> 'Bart' | 'Homer' | 'Lisa'
    Aux -> 'do' | 'does'
    CC -> 'and'
    Adj -> 'blue' | 'healthy' | 'green'
    DET -> 'a' | 'the'
    Adv -> 'always' | 'never'
    P -> 'in' | 'before' | 'on' | 'when'
    Wh -> 'when'
    """)
    cfparser = ChartParser(cfg)
    for sent in text.splitlines():
        print(sent)
        for tree in cfparser.parse(sent.split()):
            print(tree)
def generate_phrase(self, pool):
    """Expand the grammar into a phrase, then fill each placeholder POS
    tag with a random pick from the pool.  Returns a list of Word
    objects, or None if a pool lookup or a replacement fails.
    """
    gr = ChartParser(self.grammar).grammar()
    phrase = self.produce(gr, gr.start())
    noun = choice(list(pool.nouns))
    try:
        verbs = [Word(self.conjugate(v.name)) for v in list(pool.verbs[noun])]
        active = [Word(self.conjugate(v, self.person)) for v in self.atv]
        evaluative = [Word(self.conjugate(v, self.person)) for v in self.eva]
        replace_words = {'n': [noun],
                         'v': verbs,
                         'adj': pool.epithets[noun],
                         'atv': active,
                         'eva': evaluative,
                         'ej': pool.emotional_adjectives,
                         'en': pool.emotional_nouns,
                         'erb': pool.emotional_adverbs,
                         'person': [Word(self.persons[self.person][0])],
                         'pron': [Word(self.persons[self.person][1])]}
    except:
        return
    for pos in replace_words:
        while pos in phrase:
            try:
                phrase = self.replace_pos(pos, choice(replace_words[pos]), phrase)
            except:
                return
    # Promote raw strings left in the phrase to Word instances.
    for w in phrase:
        if not isinstance(w, Word):
            phrase[phrase.index(w)] = Word(w)
    return phrase
def parse_original_sentences(grammar):
    '''
    Uses given grammar to parse sentences from the file corpus.txt
    Writes the parse trees of each sentence in parsed_corpus.txt
    :param grammar: A context free grammar in the form of nltk.grammar.CFG
    :return: None (Output in parsed_corpus.txt)
    '''
    # FIX: the files were opened without context managers and leaked if
    # parsing raised; `with` guarantees both are closed.
    parser = ChartParser(grammar)
    count = 1
    working = []  # indices of sentences that received at least one parse
    with open("corpus.txt", "r") as f, open("parsed_corpus.txt", "w") as f_write:
        for line in f.readlines():
            line = line.replace("didnt", "did not")
            s = "Tree {}:\n".format(count)
            # Drop the trailing punctuation + newline before tokenizing.
            sent = word_tokenize(line[:-2])
            for tree in parser.parse(sent):
                # Only the first parse tree is recorded.
                s += str(tree) + "\n\n"
                working.append(count)
                break
            count += 1
            f_write.write(s)
    print(
        "Parsed form of original corpus sentences using this CFG can be found in parsed_corpus.txt\n"
    )
def __init__(self, grammar):
    """
    Initialize from a CFG.

    :type grammar: CFG
    :param grammar: The grammar for this oracle
    """
    # The chart parser is the oracle's only state; it is reused for
    # every membership query.
    self._parser = ChartParser(grammar)
def parse_sentences(grammar):
    """Interactively read sentences from stdin, parse each one with
    *grammar*, and print its parse trees.  Typing "Q" quits.
    """
    parser = ChartParser(grammar)
    while True:
        sent = input("Parse a sentence (Q to quit): ")
        if sent == "Q":
            break
        print_trees(parser.parse(word_tokenize(sent)))
def generate_name(G):
    """Randomly expand the CFG given as the string *G* and return the
    generated tokens joined and title-cased as a name.
    """
    gr = ChartParser(CFG.fromstring(G)).grammar()
    tokens = produce(gr, gr.start())
    return ''.join(tokens).title()
def generate_parse_tree(sentence, grammar):
    """Tokenize *sentence* and return the iterator of its parse trees
    under *grammar*.

    Returns a sentinel ``Tree('Error', ['Error'])`` when the sentence
    cannot be parsed (e.g. contains tokens outside the grammar), so
    callers can keep iterating a batch without try/except.
    """
    # FIX: removed commented-out debug prints; the broad `except` is kept
    # deliberately so one bad sentence does not abort batch processing.
    tokens = word_tokenize(sentence)
    parser = ChartParser(grammar)
    try:
        return parser.parse(tokens)
    except Exception:
        return Tree('Error', ['Error'])
def generate_impacts_question(attr, impacts, phase):
    """Build a question dict about the impact attribute *attr*.

    The question text is generated by randomly expanding the impacts
    grammar for the given phase; topicId 4 marks impact questions.
    """
    impact = get_attribute_name(attr, impacts)
    gr = ChartParser(generate_impacts_grammar(impact, phase)).grammar()
    text = ' '.join(produce(gr, gr.start()))
    return {
        'text': text,
        'answer': 0,
        'questionId': 0,
        'attrId': attr,
        'topicId': 4,
    }
def generate_entities_question(attr, entities, phase):
    """Build a question dict about the entity attribute *attr*.

    The question text is generated by randomly expanding the entities
    grammar for the given phase; topicId 3 marks entity questions.
    """
    entity = get_attribute_name(attr, entities)
    gr = ChartParser(generate_entities_grammar(entity, phase)).grammar()
    text = ' '.join(produce(gr, gr.start()))
    return {
        'text': text,
        'answer': 0,
        'questionId': 0,
        'attrId': attr,
        'topicId': 3,
    }
def get_productions(sentence, grammar):
    """Return the productions of the first parse of *sentence*.

    :param sentence: space-separated sentence to parse
    :param grammar: CFG source string
    :return: list of nltk productions used by the first parse tree
    :raises IndexError: if the sentence has no parse under the grammar
    """
    # FIX: removed the stray Python-2 debug statement `print sent` (a
    # SyntaxError under Python 3) and the commented-out debug print.
    cfg_grammar = CFG.fromstring(grammar)
    parser = ChartParser(cfg_grammar)
    trees = []
    for tree in parser.parse(sentence.split(' ')):
        # Flatten the pretty-printed tree onto one line.
        trees.append(str(tree).replace("\n", " "))
    t = Tree.fromstring(trees[0])
    return t.productions()
class GrammarOracle(Oracle):
    """An oracle from a grammar."""

    def __init__(self, grammar):
        """
        Initialize from a CFG.

        :type grammar: CFG
        :param grammar: The grammar for this oracle
        """
        self._parser = ChartParser(grammar)

    def generates(self, sentence):
        """
        Decides whether the grammar generates the sentence.

        :type sentence: Sentence
        :param sentence: A sentence
        :rtype: bool
        :return: Whether the grammar generates the sentence
        """
        try:
            # Membership holds iff at least one parse tree exists.
            return list(self._parser.parse(sentence.get_words())) != []
        except:
            # Tokens outside the grammar's vocabulary raise; treat as
            # "not generated".
            return False
def generate_sources_question(attr, parent_attr, sources, phase):
    """Build a question dict about the source attribute *attr*.

    The original attribute id is preserved in the returned dict because
    ``attr``'s name (and ``parent_attr``) are resolved in place before
    the grammar is built; topicId 1 marks source questions.
    """
    # FIX: the saved id was stored in a local named `id`, shadowing the
    # builtin; renamed to attr_id.
    attr_id = attr
    attribute = get_attribute_name(attr, sources)
    attribute = analyze_numerus(attribute)
    if parent_attr is not None:
        parent_attr = get_attribute_name(parent_attr, sources)
    parser = ChartParser(
        generate_sources_grammar(attribute, parent_attr, phase))
    gr = parser.grammar()
    return {
        'text': ' '.join(produce(gr, gr.start())),
        'answer': 0,
        'questionId': 0,
        'attrId': attr_id,
        'topicId': 1,
    }
def recognizes(cfg, word):
    """Decide whether *cfg* recognizes *word*.

    cfg : a nltk.grammar.CFG instance
    word : a string with tokens separated with spaces.

    Note: a fresh parser is built on every call; use recognizesAll for
    batches.
    """
    tokens = word.split()
    parser = ChartParser(cfg)
    return _recognizes(parser, tokens)
def parse_blazon(blazon):
    """Parse a heraldic blazon string into a JSON-serialisable dict.

    Normalises the text, tokenizes it, rewrites abbreviations, parses it
    with the CFG in app/parser_cfg.txt, and converts the resulting tree
    to a dict.  Returns None (after printing an error) when no parse is
    found.
    """
    # Lower-case, then strip punctuation — but keep '&', which is
    # rewritten to 'and' below.
    blazon = blazon.lower()
    to_discard = set(string.punctuation)
    to_discard.remove("&")
    blazon = ''.join(c for c in blazon if c not in to_discard)
    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(blazon)
    # Replace instances of '1st', '2nd', etc with their non abbreviated forms
    for (index, item) in enumerate(tokens):
        if (item in abbr_to_full):
            tokens[index] = abbr_to_full[item]
        elif (item == "&"):
            tokens[index] = "and"
    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)
    # Construct grammar and parser
    with open('app/parser_cfg.txt') as f:
        raw_cfg = f.read()
    parser_grammar = CFG.fromstring(raw_cfg)
    parser = ChartParser(parser_grammar)
    # Parse data into tree
    # NOTE(review): exhausting the iterator keeps only the LAST parse
    # tree — presumably any parse is acceptable; confirm, since taking
    # the first would avoid enumerating all parses.
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree
    if (output_data is None):
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)
        # If a tincture is in the top level of the dictionary, change its name to "field"
        if ("tincture" in output_data.keys()):
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")
    # Convert dict to JSON
    return (output_data)
def verify(self, grammar, tags): """ Verify tag sequence as grammatically correct or not """ # rd_parser = RecursiveDescentParser(grammar) rd_parser = ChartParser(grammar) valid = False try: for tree in rd_parser.parse(tags): valid = True break except ValueError: print "This is a grammatical structure I don't understand yet." return if valid: print "Valid" return True else: print "Invalid" return False
def accepted_length(cfg, x):
    """
    Return the list of every word of exactly *x* tokens accepted by the
    context-free grammar *cfg*, enumerated over its terminal symbols.
    """
    parser = ChartParser(cfg)
    terminals = _get_terminal_symbols(cfg)
    return [' '.join(candidate)
            for candidate in product(terminals, repeat=x)
            if _recognizes(parser, candidate)]
def generate_phrase(self):
    """Build an adjective/noun comparison phrase from the blackboard
    pool and substitute it into a grammar-generated template.

    Returns a list of Word objects, or None if a replacement fails.
    """
    pool = self.blackboard.pool
    # Only adjectives that actually have comparison nouns qualify.
    adj = choice([a for a in pool.comparisons if len(pool.comparisons[a]) > 0])
    gr = ChartParser(self.grammar).grammar()
    phrase = self.produce(gr, gr.start())
    noun = choice(list(pool.comparisons[adj]))
    noun.name = en.singularize(noun.name)
    # en.referenced returns e.g. "an apple"; keep just the article.
    article = en.referenced(noun.name).split(" ")[0]
    substitutions = {'adj': adj, 'n': noun, 'det': article}
    for tag in substitutions:
        while tag in phrase:
            try:
                phrase = self.replace_pos(tag, substitutions[tag], phrase)
            except:
                return
    for token in phrase:
        if not isinstance(token, Word):
            phrase[phrase.index(token)] = Word(token)
    return phrase
def recognizesAll(cfg, words):
    """
    Returns a list of boolean values corresponding to
    [recognizes(cfg,w) for w in words].

    cfg : a nltk.grammar.CFG instance
    words must be a list of string with tokens separated with spaces.
    The parser is built once and reused for every word.
    """
    parser = ChartParser(cfg)
    return [_recognizes(parser, w.split()) for w in words]
def generate_phrase(self, pool):
    """Generate a question phrase (terminated with '?') about a random
    noun from the pool.  Returns None when the noun has no epithets or
    a tag replacement fails.
    """
    noun = random.choice(list(pool.nouns))
    gr = ChartParser(self.grammar).grammar()
    phrase = self.produce(gr, gr.start())
    phrase.append("?")
    try:
        adj = choice(pool.epithets[noun])
    except:
        return
    substitutions = {'adj': adj, 'n': noun, 'be': self.conjugate("be")}
    for tag in substitutions:
        while tag in phrase:
            try:
                phrase = self.replace_pos(tag, substitutions[tag], phrase)
            except:
                return
    for token in phrase:
        if not isinstance(token, Word):
            phrase[phrase.index(token)] = Word(token)
    return phrase
def generate_phrase(self, pool):
    """Generate a '?'-terminated question phrase about a random noun.

    Returns a list of Word objects, or None when the noun has no
    epithets or a placeholder cannot be replaced.
    """
    noun = random.choice(list(pool.nouns))
    parser = ChartParser(self.grammar)
    gr = parser.grammar()
    phrase = self.produce(gr, gr.start())
    phrase.append("?")
    try:
        adj = choice(pool.epithets[noun])
    except:
        return
    replacements = {'adj': adj, 'n': noun, 'be': self.conjugate("be")}
    for tag, replacement in list(replacements.items()):
        while tag in phrase:
            try:
                phrase = self.replace_pos(tag, replacement, phrase)
            except:
                return
    # Ensure every element of the phrase is a Word instance.
    for item in phrase:
        if not isinstance(item, Word):
            phrase[phrase.index(item)] = Word(item)
    return phrase
def generate_phrase(self, pool):
    """Generate a phrase about a random noun using its epithets and the
    current grammatical person.

    Returns a list of Word objects, or None on failure (no epithets for
    the chosen noun, or an unreplaceable tag).
    """
    parser = ChartParser(self.grammar)
    gr = parser.grammar()
    phrase = self.produce(gr, gr.start())
    noun = random.choice(list(pool.nouns))
    try:
        # FIX (consistency): the sibling generators guard this lookup;
        # here a noun with no epithets previously raised KeyError /
        # IndexError out of the method instead of returning None.
        adj = choice(pool.epithets[noun])
    except:
        return
    replace_words = {
        "adj": adj,
        "n": noun,
        "be": self.conjugate("be", self.person),
        "person": self.persons[self.person][0],
    }
    for pos in replace_words:
        while pos in phrase:
            try:
                phrase = self.replace_pos(pos, replace_words[pos], phrase)
            except:
                return
    for w in phrase:
        if not isinstance(w, Word):
            phrase[phrase.index(w)] = Word(w)
    return phrase
def accepted_under(cfg, length):
    """
    Returns a list of every accepted word of a context-free grammar
    under a given length (i.e. of 1 .. length-1 tokens).

    cfg : a nltk.grammar.CFG instance.
    """
    parser = ChartParser(cfg)
    terminals = _get_terminal_symbols(cfg)
    accepted = []
    for size in range(1, length):
        # Enumerate all candidate token sequences of this size.
        accepted.extend(' '.join(candidate)
                        for candidate in product(terminals, repeat=size)
                        if _recognizes(parser, candidate))
    return accepted
def main(): cfparser = ChartParser(cfg) index = 0 for sent in text: index += 1 print_tree(sent, cfparser, index) print "Input testing sentece or the number of the above one: (q to quit)" str = sys.stdin.readline().strip() while str != "q": try: index = int(str) print_tree(text[index], cfparser, index) except IndexError: print "Index out of range. Please check." except ValueError: print_tree(str, cfparser, -1) print "Input testing sentece or the number of the above one: (q to quit)" str = sys.stdin.readline().strip()
Nominal -> NOUN | Nominal PP | ADJ Nominal | Nominal NOUN PP -> Prep NP AdvC -> CONJ S ProperNoun -> 'Bart' | 'Homer' | 'Lisa' CONJ -> 'and' | 'when' ADV -> 'always' | 'never' V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear' DET -> 'a' | 'the' NOUN -> 'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table' ADJ -> 'blue' | 'healthy' | 'green' Prep -> 'in' | 'before' | 'on' WH -> 'when' Aux -> 'do' | 'does' """) cfparser = ChartParser(cfg) text = """ Bart laughs Homer laughed Bart and Lisa drink milk Bart wears blue shoes Lisa serves Bart a healthy green salad Homer serves Lisa Bart always drinks milk Lisa thinks Homer thinks Bart drinks milk Homer never drinks milk in the kitchen before midnight when Homer drinks milk Bart laughs when does Lisa drinks the milk on the table when do Lisa and Bart wear shoes """
import nltk
from nltk import ChartParser

# Load grammar.
grammar = nltk.data.load('labelgrammar.cfg')
parser = ChartParser(grammar)


def analyze_label(label):
    """Analyze a label using our CFG.

    Prints every parse tree; returns the (already exhausted) parse
    iterator when at least one tree exists, else None.
    """
    tokenized_label = label.split()
    try:
        analysis = parser.parse(tokenized_label)
        trees = list(analysis)
        for tree in trees:
            print(tree)
        if len(trees) > 0:
            return analysis
        else:
            print('No analysis possible')
            return None
    except ValueError as e:
        # FIX: ValueError has no `.strerror` attribute (that is OSError);
        # the original raised AttributeError here and masked the real
        # parse error.  Print the exception itself instead.
        print('No analysis possible:', e)
        return None
from nltk import data, ChartParser from nltk import pos_tag from nltk.corpus import inaugural data.clear_cache() G = data.load("file:mygrammar.cfg") RDP = ChartParser(G) # extract_short_sents :: Int?, Int?, Corpus?-> [[(String, String)]] def extract_short_sents(num=8, max_len=8, corpus=inaugural): li = [] num = num if num < len(corpus.fileids()) else len(corpus.fileids()) for i in range(num): for sent in corpus.sents(corpus.fileids()[i]): if len(sent) <= max_len: li.append(pos_tag(sent)) if len(li) / 3.0 == i: break return li # parse :: String -> ParseTree def parse(s): return RDP.parse(s.split()) if __name__ == "__main__": sents = [
from nltk import CFG,ChartParser
from nltk.tokenize import SpaceTokenizer

# Toy grammar that generates exactly the sentence "the man walks".
grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> IV
Det -> 'the'
N -> 'man'
IV -> 'walks'
""")
#>>> grammar
#<Grammar with 14 productions>
#>>> grammar.start()
#S
#>>> grammar.productions()
#[S -> NP VP, NP -> Det N, VP -> IV, Det -> 'the', N -> 'man', IV -> 'walks']

parser = ChartParser(grammar)
# parse_all materialises every parse tree for the space-tokenized input.
parses = parser.parse_all(SpaceTokenizer().tokenize("the man walks"))
#>>> parses
#[Tree('S', [Tree('NP', [Tree('Det', ['the']), Tree('N', ['man'])]), Tree('VP', [Tree('IV', ['walks'])])])]
S -> LImports LRules LImports -> Import LImports | Import -> '@import' '"string"' ';' LRules -> Rule LRules | Rule -> Selectors '{' LDeclaretions '}' LDeclaretions -> Declaration ';' MoreDeclerations MoreDeclerations -> LDeclaretions | Selectors -> SimpleSelector MoreSelectors MoreSelectors -> Selectors | SimpleSelector -> Astrisk SelectorModifier Astrisk -> '*' | SelectorModifier -> '.' 'name' | ':' 'name' | '[' 'name' '=' Term ']' | '#hashid' | 'name' Declaration -> 'name' ':' LTerms Important Important -> '!ImPoRtAnT' | LTerms -> Term MoreTerms MoreTerms -> LTerms | Term -> '1337' | '15%' | '"string"' | 'name' | '#hashid' """) parser = ChartParser(grammar) gr = parser.grammar() test_name = "generated" with open(test_name + '.in', 'w+') as writer: writer.write(' '.join(produce(gr, gr.start()))) with open(test_name + '.out', 'w+') as writer: writer.write("\n".join(map(str, rules))) writer.write("\nSuccess\n")
else: words.extend(produce(grammar, sym, minlen)) return words grammar = parse_cfg(''' F -> N1 '(' P ')' | N2 '(' P ',' P ')' N1 -> 'half' N2 -> 'sum' P -> 'a' | 'b' | F ''') ''' S -> NP VP PP -> P NP NP -> Det N | Det N PP | 'I' VP -> V NP | VP PP V -> 'shot' | 'killed' | 'wounded' Det -> 'an' | 'my' N -> 'elephant' | 'pajamas' | 'cat' | 'dog' P -> 'in' | 'outside' ''' parser = ChartParser(grammar) gr = parser.grammar() print ' '.join(produce(gr, gr.start(),3))
class GDev:
    """Grammar development tool.

    Ties together a CFG file (<name>.cfg) and a file of labelled example
    sentences (<name>.sents) so a grammar can be developed by regression
    testing: call the instance to reload both files and report which
    sentences are (in)correctly accepted.
    """

    def __init__(self, name):
        """Store the grammar's base *name* (a string)."""
        self.name = name

    def load_grammar(self):
        """Read <name>.cfg and store the CFG in self.grammar."""
        source = open(self.name + '.cfg').read()
        self.grammar = CFG.fromstring(source)

    def reload(self):
        """(Re)load sentences and grammar, then rebuild the chart parser
        stored in self.parser.
        """
        self.load_sents()
        self.load_grammar()
        self.parser = ChartParser(self.grammar)

    def parse(self, s):
        """Return the first parse tree of sentence *s*, or None when the
        grammar does not generate it (or parsing fails for any reason).
        """
        try:
            return list(self.parser.parse(word_tokenize(s)))[0]
        except:
            return None

    def load_sents(self):
        """Read <name>.sents into self.sents as (good, sentence) pairs.

        One sentence per line; a leading '*' marks a bad sentence (one
        the grammar should NOT generate) and is stripped from the text.
        """
        self.sents = [(True, line.rstrip('\r\n'))
                      if line[0] != '*'
                      else (False, line.rstrip('\r\n')[1:])
                      for line in open(self.name + '.sents')]

    def parses(self):
        """For each loaded sentence print a blank line, the sentence,
        and the result of parsing it (a tree or None).
        """
        for entry in self.sents:
            print('\n' + entry[1])
            print(self.parse(entry[1]))

    def regress(self):
        """Compare the parser's accept/reject prediction against each
        sentence's good/bad label.

        Output per sentence: '!!' when the prediction is wrong ('  '
        when right), then '*' for bad sentences (' ' for good), then the
        sentence itself.
        """
        for entry in self.sents:
            prediction = self.parse(entry[1]) is not None
            print(('!!' if prediction != entry[0] else '  ') + ' ', end='')
            print('*' if entry[0] == False else ' ', end='')
            print(entry[1])

    def __call__(self):
        """Reload grammar + sentences and run the regression report —
        the edit/test loop of grammar development.
        """
        self.reload()
        self.regress()
def parse_sentences(grammar, sent):
    """Tokenize *sent* and return the (lazy) iterator of its parse
    trees under *grammar*.
    """
    return ChartParser(grammar).parse(word_tokenize(sent))
import nltk from nltk import parse_cfg, ChartParser from random import choice def produce(finalgrammar, symbol): words = [] productions = finalgrammar.productions(lhs = symbol) production = choice(productions) for sym in production.rhs(): if isinstance(sym, str): words.append(sym) else: words.extend(produce(finalgrammar, sym)) return words finalgrammar = nltk.data.load('file:grammarfinal.cfg',cache=False) parser = ChartParser(finalgrammar) gr = parser.finalgrammar() print ' '.join(produce(gr,gr.start()))
class QueryParser(object):
    """Parses a POS-tagged query into an NLTK AST and then into DSL
    strings plus data-filter references, via a pyparsing grammar over
    the AST's pretty-printed form.
    """

    # PYPARSING preterminal definitions: each Brown-corpus POS tag is
    # matched as "(TAG word)" with the tag suppressed and the word kept.
    LBRACE = Suppress(Literal('('))
    RBRACE = Suppress(Literal(')'))
    WRD = Regex("[0-9a-zA-Z_\-\—\,\.\?\!\>\<\=\/\:\;\&\{\}\+]+")
    ABL = LBRACE + Suppress(Literal('ABL')) + WRD + RBRACE
    ABN = LBRACE + Suppress(Literal('ABN')) + WRD + RBRACE
    ABX = LBRACE + Suppress(Literal('ABX')) + WRD + RBRACE
    AP = LBRACE + Suppress(Literal('AP')) + WRD + RBRACE
    AT = LBRACE + Suppress(Literal('AT')) + WRD + RBRACE
    BE = LBRACE + Suppress(Literal('BE')) + WRD + RBRACE
    BED = LBRACE + Suppress(Literal('BED')) + WRD + RBRACE
    BEDZ = LBRACE + Suppress(Literal('BEDZ')) + WRD + RBRACE
    BEG = LBRACE + Suppress(Literal('BEG')) + WRD + RBRACE
    BEM = LBRACE + Suppress(Literal('BEM')) + WRD + RBRACE
    BEN = LBRACE + Suppress(Literal('BEN')) + WRD + RBRACE
    BER = LBRACE + Suppress(Literal('BER')) + WRD + RBRACE
    BEZ = LBRACE + Suppress(Literal('BEZ')) + WRD + RBRACE
    CC = LBRACE + Suppress(Literal('CC')) + WRD + RBRACE
    CD = LBRACE + Suppress(Literal('CD')) + WRD + RBRACE
    CS = LBRACE + Suppress(Literal('CS')) + WRD + RBRACE
    DO = LBRACE + Suppress(Literal('DO')) + WRD + RBRACE
    DOD = LBRACE + Suppress(Literal('DOD')) + WRD + RBRACE
    DOZ = LBRACE + Suppress(Literal('DOZ')) + WRD + RBRACE
    DT = LBRACE + Suppress(Literal('DT')) + WRD + RBRACE
    DTI = LBRACE + Suppress(Literal('DTI')) + WRD + RBRACE
    DTS = LBRACE + Suppress(Literal('DTS')) + WRD + RBRACE
    DTX = LBRACE + Suppress(Literal('DTX')) + WRD + RBRACE
    EX = LBRACE + Suppress(Literal('EX')) + WRD + RBRACE
    FW = LBRACE + Suppress(Literal('FW')) + WRD + RBRACE
    HL = LBRACE + Suppress(Literal('HL')) + WRD + RBRACE
    HV = LBRACE + Suppress(Literal('HV')) + WRD + RBRACE
    HVD = LBRACE + Suppress(Literal('HVD')) + WRD + RBRACE
    HVG = LBRACE + Suppress(Literal('HVG')) + WRD + RBRACE
    HVN = LBRACE + Suppress(Literal('HVN')) + WRD + RBRACE
    HVZ = LBRACE + Suppress(Literal('HVZ')) + WRD + RBRACE
    IN = LBRACE + Suppress(Literal('IN')) + WRD + RBRACE
    JJ = LBRACE + Suppress(Literal('JJ')) + WRD + RBRACE
    JJR = LBRACE + Suppress(Literal('JJR')) + WRD + RBRACE
    JJS = LBRACE + Suppress(Literal('JJS')) + WRD + RBRACE
    JJT = LBRACE + Suppress(Literal('JJT')) + WRD + RBRACE
    MD = LBRACE + Suppress(Literal('MD')) + WRD + RBRACE
    NC = LBRACE + Suppress(Literal('NC')) + WRD + RBRACE
    NN = LBRACE + Suppress(Literal('NN')) + WRD + RBRACE
    NNS = LBRACE + Suppress(Literal('NNS')) + WRD + RBRACE
    NP = LBRACE + Suppress(Literal('NP')) + WRD + RBRACE
    NPS = LBRACE + Suppress(Literal('NPS')) + WRD + RBRACE
    NR = LBRACE + Suppress(Literal('NR')) + WRD + RBRACE
    NRS = LBRACE + Suppress(Literal('NRS')) + WRD + RBRACE
    OD = LBRACE + Suppress(Literal('OD')) + WRD + RBRACE
    PN = LBRACE + Suppress(Literal('PN')) + WRD + RBRACE
    PPL = LBRACE + Suppress(Literal('PPL')) + WRD + RBRACE
    PPLS = LBRACE + Suppress(Literal('PPLS')) + WRD + RBRACE
    PPO = LBRACE + Suppress(Literal('PPO')) + WRD + RBRACE
    PPS = LBRACE + Suppress(Literal('PPS')) + WRD + RBRACE
    PPSS = LBRACE + Suppress(Literal('PPSS')) + WRD + RBRACE
    QL = LBRACE + Suppress(Literal('QL')) + WRD + RBRACE
    QLP = LBRACE + Suppress(Literal('QLP')) + WRD + RBRACE
    RB = LBRACE + Suppress(Literal('RB')) + WRD + RBRACE
    RBR = LBRACE + Suppress(Literal('RBR')) + WRD + RBRACE
    RBT = LBRACE + Suppress(Literal('RBT')) + WRD + RBRACE
    RN = LBRACE + Suppress(Literal('RN')) + WRD + RBRACE
    RP = LBRACE + Suppress(Literal('RP')) + WRD + RBRACE
    TL = LBRACE + Suppress(Literal('TL')) + WRD + RBRACE
    TO = LBRACE + Suppress(Literal('TO')) + WRD + RBRACE
    UH = LBRACE + Suppress(Literal('UH')) + WRD + RBRACE
    VB = LBRACE + Suppress(Literal('VB')) + WRD + RBRACE
    VBD = LBRACE + Suppress(Literal('VBD')) + WRD + RBRACE
    VBG = LBRACE + Suppress(Literal('VBG')) + WRD + RBRACE
    VBN = LBRACE + Suppress(Literal('VBN')) + WRD + RBRACE
    VBZ = LBRACE + Suppress(Literal('VBZ')) + WRD + RBRACE
    WDT = LBRACE + Suppress(Literal('WDT')) + WRD + RBRACE
    WPO = LBRACE + Suppress(Literal('WPO')) + WRD + RBRACE
    WPS = LBRACE + Suppress(Literal('WPS')) + WRD + RBRACE
    WQL = LBRACE + Suppress(Literal('WQL')) + WRD + RBRACE
    WRB = LBRACE + Suppress(Literal('WRB')) + WRD + RBRACE
    # Any single preterminal ('^' = pyparsing Or: longest match wins).
    PRETERM = ABL ^ ABN ^ ABX ^ AP ^ AT ^ BE ^ BED ^ BEDZ ^ BEG ^ BEM ^ BEN ^ BER ^ BEZ ^ CC ^ CD ^ CS ^ DO ^ DOD ^ DOZ ^ DT ^ DTI ^ DTS ^ DTX ^ EX ^ FW ^ HL ^ HV ^ HVD ^ HVG ^ HVN ^ HVZ ^ IN ^ JJ ^ JJR ^ JJS ^ JJT ^ MD ^ NC ^ NN ^ NNS ^ NP ^ NPS ^ NR ^ NRS ^ OD ^ PN ^ PPL ^ PPLS ^ PPO ^ PPS ^ PPSS ^ QL ^ QLP ^ RB ^ RBR ^ RBT ^ RN ^ RP ^ TL ^ TO ^ UH ^ VB ^ VBD ^ VBG ^ VBN ^ VBZ ^ WDT ^ WPO ^ WPS ^ WQL ^ WRB
    UKWORD = Group(LBRACE + Literal('WORD') + PRETERM + RBRACE)

    # PYPARSING - DSL primary entity
    company = Group(LBRACE + Literal('company') + OneOrMore(WRD) + RBRACE)
    entity = Group(LBRACE + Literal('entity') + OneOrMore(WRD) + RBRACE)
    relation = LBRACE + Literal('relation') + OneOrMore(WRD) + RBRACE
    attribute = LBRACE + Literal('attribute') + OneOrMore(WRD) + RBRACE
    CASHFLOW = LBRACE + Literal('CASHFLOW') + OneOrMore(WRD) + RBRACE
    BALANCESHEET = LBRACE + Literal('BALANCESHEET') + OneOrMore(WRD) + RBRACE
    INCOMESTMT = LBRACE + Literal('INCOMESTMT') + OneOrMore(WRD) + RBRACE
    REPORT = Group(LBRACE + Suppress(Literal('REPORT')) + (CASHFLOW ^ BALANCESHEET ^ INCOMESTMT) + RBRACE)
    DATE = Group(LBRACE + Literal('DATE') + WRD + RBRACE)
    RELATION = LBRACE + Suppress(Literal('RELATION')) + relation + RBRACE
    ATTRIBUTE = LBRACE + Suppress(Literal('ATTRIBUTE')) + attribute + RBRACE
    COMPANY = LBRACE + Suppress(Literal('COMPANY')) + company + RBRACE
    ENTITY = LBRACE + Suppress(Literal('ENTITY')) + entity + RBRACE
    GREATERTHAN = LBRACE + Literal('GREATERTHAN') + Suppress(WRD) + RBRACE
    LESSTHAN = LBRACE + Literal('LESSTHAN') + Suppress(WRD) + RBRACE
    EQUAL = LBRACE + Literal('EQUAL') + Suppress(WRD) + RBRACE
    GTEQUAL = LBRACE + Literal('GTEQUAL') + Suppress(WRD) + RBRACE
    LTEQUAL = LBRACE + Literal('LTEQUAL') + Suppress(WRD) + RBRACE
    USD = LBRACE + Literal('USD') + Suppress(Regex("[$]+")) + RBRACE
    UNIT = LBRACE + Literal('UNIT') + USD + RBRACE
    EQUALITY = LBRACE + Suppress(Literal('EQUALITY')) + (GREATERTHAN ^ LESSTHAN ^ EQUAL ^ GTEQUAL ^ LTEQUAL) + RBRACE
    QUANTITY = LBRACE + Suppress(Literal('QUANTITY')) + Optional(UNIT) + CD + RBRACE
    QUANTIFIER = LBRACE + Suppress(Literal('QUANTIFIER')) + EQUALITY + QUANTITY + RBRACE

    # PYPARSING - AST parsing rules
    FILTER = Group(LBRACE + Literal('FILTER') + (ATTRIBUTE ^ RELATION) + RBRACE)
    MODIFIER = Group(LBRACE + Literal('MODIFIER') + (DATE ^ QUANTIFIER) + RBRACE)
    FUNCTIONLIST = Forward()  # recursive: a FUNCTION optionally followed by more
    FUNCTION = LBRACE + Suppress(Literal('FUNCTION')) + FILTER + Optional(MODIFIER) + RBRACE
    FUNCTIONLIST << LBRACE + Suppress('FUNCTIONLIST') + FUNCTION + Optional(FUNCTIONLIST) + RBRACE
    SUBJECT = LBRACE + Suppress(Literal('SUBJECT')) + (ENTITY ^ COMPANY) + RBRACE
    FILTEROBJECT = Group(LBRACE + Literal('FILTEROBJECT') + REPORT + RBRACE)
    DSLI = Group(LBRACE + Literal('DSLI') + (SUBJECT ^ FUNCTION) + RBRACE)
    QBODY = Forward()  # recursive query-body chain
    QUERYOBJ = LBRACE + Suppress(Literal("QUERYOBJ")) + (DSLI ^ FILTEROBJECT ^ UKWORD) + RBRACE
    QBODY << LBRACE + Suppress(Literal('QBODY')) + QUERYOBJ + Optional(QBODY) + RBRACE
    IS = LBRACE + Suppress(Literal('IS')) + (BE ^ BED ^ BEDZ ^ BER ^ BEZ) + RBRACE
    WHICHQ = LBRACE + Suppress(Literal('WHICHQ')) + WPS + IS + QBODY + RBRACE
    # NOTE(review): HOWQ and WHATQ also suppress the literal 'WHICHQ' —
    # this looks like a copy-paste slip ('HOWQ'/'WHATQ' expected); confirm
    # against the CFG in tokenGrammar before changing.
    HOWQ = LBRACE + Suppress(Literal('WHICHQ')) + WRB + IS + QBODY + RBRACE
    WHATQ = LBRACE + Suppress(Literal('WHICHQ')) + WDT + IS + QBODY + RBRACE
    QUESTION = Group(LBRACE + Suppress(Literal('QUESTION')) + (WHICHQ ^ HOWQ ^ WHATQ ^ QBODY) + RBRACE)
    QUERY = LBRACE + Suppress(Literal('QUERY')) + OneOrMore(QUESTION) + RBRACE
    DSLOBJ = Suppress(SkipTo(company ^ FILTER)) + (company ^ FILTER)

    def __init__(self, tokens):
        """init parser with tokens and parser build from CFG

        :param tokens: tagged query tokens
        """
        self.tokens = tokens
        self.CFGParser = ChartParser(self.__getCFG())

    def _getAST(self):
        """Gets the words from the token list and passes them through the
        parser to build an AST

        :return nltk AST
        """
        parseTokens = [t[0] for t in self.tokens]
        ASTs = []
        try:
            syntaxTrees = self.CFGParser.parse(parseTokens)
            for tree in syntaxTrees:
                ASTs.append(tree)
                devLogger.info("AST generated: " + str(tree))
            if not(len(ASTs)):
                devLogger.warn("Did not generate any AST. AST list empty.")
        except Exception as e:
            devLogger.error("Could not parse tokens into AST: " + str(e))
        return ASTs

    def __getCFG(self):
        """Creates the CFG by combining the class defined rules, the standard
        preterminal rules for POS tags -> e, and finally the POS to word rules
        for the given query

        :return nltk CFG
        """
        tg = tokenGrammar
        for t in self.tokens:
            # Append one production per token: "<POS> -> '<word>'".
            tg += "\n" + t[1] + ' -> ' + "'" + t[0] + "'"
            devLogger.info("Preterminal added to grammar: " + str(t))
        return nltk.CFG.fromstring(tg)

    def parseAST(self):
        """Parses the NLTK AST into a DSL string and view filters

        :return (List(DSL String),List(Filter references))
        """
        ast = self._getAST()
        dslItems = []
        filterObjects = []
        #TODO right now only consider the first AST. In furutre we will have to pick best AST
        if len(ast) >= 1:
            astLimmited = ast[0]
        else:
            astLimmited = False
        if astLimmited:
            try:
                # The AST's pretty-printed text is re-parsed with the
                # pyparsing QUERY grammar defined above.
                parsedAST = self.QUERY.parseString(astLimmited.pprint())
                devLogger.info("Parsed AST: " + str(parsedAST))
            except Exception as e:
                parsedAST = []
                devLogger.error("Could not parse AST: " + str(e))
            for parsed in parsedAST.asList():
                filterObjects = [self.getFilterObjects(item) for item in parsed if item[0] == 'FILTEROBJECT']
                dslStr = DSLString(filterObjects)
                for item in parsed:
                    if item[0] == 'DSLI':
                        dslStr.addDSLI(item[1:])
                dslItems.append(dslStr.getString())
        if len(filterObjects) < 1:
            # Fall back to the default filter when the query named none.
            filterObjects = [DefaultDataFilter]
        devLogger.info('DSL query list is: ' + str(dslItems))
        devLogger.info('Filter reference list is: ' + str(filterObjects))
        return dslItems, filterObjects

    def getFilterObjects(self, parsedItem):
        """Links to the appropriate filter class

        :param parsedItems: List(List()) of parsed query items
        :return Filter reference
        """
        def filterSwitch(x):
            return {
                'CASHFLOW': CashFlowFilter,
                'BALANCESHEET': BalanceSheetFilter,
                'INCOMESTMT': IncomeStatementFilter,
            }.get(x, False)
        return filterSwitch(parsedItem[1][0])
def __init__(self, tokens):
    """init parser with tokens and parser build from CFG

    :param tokens: tagged query tokens
    """
    self.tokens = tokens
    # __getCFG builds the grammar from the class rules plus one
    # preterminal production per tagged token (name-mangled private).
    self.CFGParser = ChartParser(self.__getCFG())
def reload(self):
    """Reload the sentence list and the grammar from disk, then rebuild
    the chart parser from the freshly loaded grammar.
    """
    self.load_sents()
    self.load_grammar()
    self.parser = ChartParser(self.grammar)
def make_sentence(corpus, term_rules, *args, **kwargs):
    '''
    Generate sentences with random structure and word choice using a
    context-free grammar. The start point is taken from the sentence itself.

    Parameters
    ----------
    corpus : str
        a string containing the full, cleaned corpus

    term_rules : str
        a string containing all the terminal rules for the corpus

    maxdepth : int
        The maximum allowed recursion depth before throwing a ValueError

    fixed_grammar : bool
        Turn off the random sentence selection and use a fixed grammar
        instead.

    sample_sentence : str
        When fixed_grammar is turned on, this is the sentence that will be
        parsed. This can be finicky with grammars containing specially
        punctuated constructions like quotations or possessives.

    args[0] : dict()
        Optional: a dictionary of kgrams and their subsequent words. If this
        variable exists then cfgen will use this to pick the next words with
        conditional weighting. (The presence of this argument turns on
        Markov text generation features.)
    '''
    # Positional arg present -> Markov-chain word selection is enabled.
    markov_flag = (not len(args) == 0)
    if markov_flag:
        kgram_dict = args[0]
    fixed_grammar = kwargs.pop('fixed_grammar', False)
    sample_sentence = kwargs.pop('sample_sentence', '')
    maxdepth = kwargs.pop('maxdepth', 25)
    if fixed_grammar:
        if sample_sentence == '':
            warnings.warn('When using fixed_grammar, user should specify ' \
                'the keyword argument "sample_sentence." Using a default simple sentence.')
            sample_sentence = 'The cow jumped over the moon.'
        else:
            pass
    # Retry loop: resample the corpus until a sentence produces a
    # well-formed grammar, up to 30 attempts.
    flag = False
    attempts = 0
    while not flag and attempts < 30:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Pick the template sentence whose parse tree supplies the grammar.
        if has_parser and not fixed_grammar:
            rsent = choice(tokenizer.tokenize(corpus))
        elif fixed_grammar:
            rsent = sample_sentence
        elif not has_parser and not fixed_grammar:
            # select from a parsed corpus of pre-approved grammars
            print("Usage library being built")
            rsent = "The dog walked up the stairs slowly."
        else:
            print("Usage library being built")
            rsent = "The dog walked up the stairs slowly."
        parsed_syntax = parse_sentence(rsent)
        # print(parsed_syntax)
        cfg_str = term_rules + parsed_syntax
        try:
            # Start symbol is the LHS of the first production in the parse.
            startpt = parsed_syntax[:parsed_syntax.find(' ->')]
            startpt = nltk.grammar.Nonterminal(startpt)
            grammar = CFG.fromstring(cfg_str)
            parser = ChartParser(grammar)
            gr = parser.grammar()
            if markov_flag:
                out_txt = (' '.join(
                    produce_kgram(gr, startpt, kgram_dict, maxdepth=maxdepth, sent=[])))
            else:
                out_txt = (' '.join(produce(gr, startpt, maxdepth=maxdepth)))
            flag = True
        except ValueError:
            warnings.warn(
                'Badly formed sentence encountered, resampling the corpus.')
        # Increment unconditionally; equivalent either way since a success
        # sets flag and exits the loop.
        attempts = attempts + 1
    # NOTE(review): if all 30 attempts fail, out_txt is never assigned and
    # the line below raises NameError — confirm whether the caller relies
    # on that or a clearer error should be raised.
    # now re-tag special characters
    # replacements/to_replace are presumably module-level parallel lists
    # of (placeholder, original) character pairs — verify at module scope.
    swappairs = zip(replacements, to_replace)
    for member in swappairs:
        out_txt = out_txt.replace(member[0], member[1])
    return out_txt
Sp -> P
Sa -> 'tells' 'you' 'that' | 'says' | 'says' 'that' | 'claims' | 'claims' 'that' | 'tells you'
St -> PG Is Class | PG Quant Is Class |
Quant -> Comp Count
Comp -> 'exactly'
Count -> 'one'
Not -> 'neither' | 'nor'
PG -> 'i' | PG PG | Not P | P | 'of' PG | PG 'and' PG
P -> 'zoey' | 'mel' | 'peggy' | 'zippy' | 'sue' | 'sally' | 'homer' | 'bozo' | 'marge' | 'zed' | 'alice' | 'ted' | 'bart' | 'bob' | 'betty'
Is -> 'is' 'a' | 'are'
Class -> Kni | Kna
Kni -> 'knight' | 'knights'
Kna -> 'knaves' | 'knaves'
""")
# NOTE(review): the lines above are the tail of a CFG.fromstring(""" ... """)
# literal whose opening (and the kk_grammar assignment) lies before this
# chunk; production-per-line layout reconstructed — verify against original.


def preprocess(sent):
    """Lowercase the sentence, keep only ascii letters and spaces, and
    split it into a token list for the parser."""
    return "".join([letter for letter in sent.lower()
                    if letter in "qwertyuiopasdfghjklzxcvbnm "]).split()


# Three knights-and-knaves puzzle statements to parse.
sents = ["Zoey tells you that mel is a Knave",
         "Mel says, `Neither Zoey nor I are knaves.'",
         "Peggy tells you that 'of Zippy and I, exactly one is a knight'."]
sents = [preprocess(sent) for sent in sents]
parser = ChartParser(kk_grammar)
for sent in sents:
    # Print every parse tree the grammar admits for each statement.
    for tree in parser.parse(sent):
        print(tree)
import nltk
from nltk import ChartParser

# Check which human-annotated noun chunks are covered by the full grammar,
# write the uncovered ones to not-covered.txt, and report coverage stats.

# Load grammar.
grammar = nltk.data.load('../../Grammar/full_grammar.cfg')
parser = ChartParser(grammar)

# Each line of human_chunks.txt is one noun chunk; split it into tokens.
with open('human_chunks.txt') as f:
    noun_chunks = [line.strip().split() for line in f]

not_covered = []
for chunk in noun_chunks:
    try:
        # parse() raises ValueError when the grammar's lexicon does not
        # cover one of the tokens; the returned iterator itself is unused.
        # (Fix: the original bound it to an unused `result` variable.)
        parser.parse(chunk)
        print(f"Valid: {chunk}")
    except ValueError:
        print(f"Not covered: {chunk}")
        not_covered.append(' '.join(chunk) + '\n')

with open("not-covered.txt", 'w') as f:
    f.writelines(not_covered)

num_chunks = len(noun_chunks)
num_covered = num_chunks - len(not_covered)
num_not_covered = len(not_covered)
print(f"Number of unique noun chunks: {num_chunks}")
# Fix: guard the percentage math — an empty input file previously raised
# ZeroDivisionError here.
if num_chunks:
    print(f"Covered: {num_covered} ({(num_covered/num_chunks) * 100}%)")
    print(
        f"Not covered: {num_not_covered} ({(num_not_covered/num_chunks) * 100}%)")