def demo(): """ A demonstration that shows the output of several different tokenizers on the same string. """ from nodebox_linguistics_extended.parser.nltk_lite import tokenize # Define the test string. s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." print 'Input text:' print ` s ` print print 'Tokenize using whitespace:' _display(tokenize.whitespace(s)) print print 'Tokenize sequences of alphanumeric characters:' _display(tokenize.regexp(s, pattern=r'\w+', gaps=False)) print print 'Tokenize sequences of letters and sequences of nonletters:' _display(tokenize.wordpunct(s)) print print 'Tokenize by lines:' _display(tokenize.line(s)) print print 'Tokenize by blank lines:' _display(tokenize.blankline(s)) print print 'A simple sentence tokenizer:' _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True)) print
def _parse(s):
    """
    Split annotated corpus text into sentence-level phrases.

    CODE and ID annotation groups are stripped first; a new phrase then
    starts at every line whose first character is '(' (a sentence
    marker).  Each completed phrase is space-stripped via
    ``_strip_spaces`` and yielded; empty phrases are skipped.

    @param s: the raw annotated text.
    @return: a generator over the phrase strings.
    """
    # Remove "(CODE ...)" and "(ID ...N)" annotation groups before
    # splitting into lines.
    rx_pattern = re.compile(r"""
        \(CODE .*\)
        |\(ID .*\d\)
        """, re.VERBOSE | re.UNICODE)
    # Use the compiled pattern's own .sub (was: re.sub(rx_pattern, ...)).
    s = rx_pattern.sub('', s)
    # str.split replaces the deprecated string.split(s, '\n') helper.
    lines = s.split('\n')

    fullPhrase = ""
    # Loop through the lines, accumulating a phrase; flush the buffer
    # every time a new sentence marker (a leading '(') is found.
    for sent in lines:
        if list(tokenize.regexp(sent, r'^\(')):
            fullPhrase = _strip_spaces(fullPhrase)
            if fullPhrase != "":
                yield fullPhrase
            fullPhrase = sent
        else:
            fullPhrase += sent

    # Get the last of the buffer and output a yield.
    fullPhrase = _strip_spaces(fullPhrase)
    if fullPhrase != "":
        yield fullPhrase
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    For the sake of NLP I've parsed the string using the nltk_lite
    context free grammar library.

    A query is a "sentence" and can either be a domain, hierarchy or
    a table.
    A domain is simply a word.
    A hierarchy is expressed as "domain/domain".
    A table is expressed as "table(sentence, sentence, sentence)".

    Internally the query is represented as a nltk_lite.parse.tree.

    Process:
    1. string is tokenized
    2. develop a context free grammar
    3. parse
    4. convert to a tree representation

    Side effects: sets self.string, self.nltktree, self.parseList,
    self.parseTree and self.xml.  On a parse failure only a message is
    printed and the method returns early.
    """
    self.nltktree = None

    # Store the query string
    self.string = p_string

    """
    1. Tokenize
    ------------------------------------------------------------------------
    """

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = tokenize.regexp(self.string, re_all)

    """
    2. Develop a context free grammar
    ------------------------------------------------------------------------
    """

    # Develop a context free grammar
    # S = sentence, T = table, H = hierarchy, D = domain
    O, T, H, D = cfg.nonterminals('O, T, H, D')

    # Specify the grammar
    productions = (
        # A sentence can be either a table, hierarchy or domain
        cfg.Production(O, [D]),
        cfg.Production(O, [H]),
        cfg.Production(O, [T]),
        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        cfg.Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        cfg.Production(H, [D, '/', O]))

    # Add domains to the cfg productions
    # A domain is a token that is entirely word chars
    # NOTE(review): 'compile' is presumably re.compile brought in by a
    # from-import elsewhere in this file -- confirm.
    re_domain = compile(r'^\w+$')
    # Try every token and add if it matches the above regular expression
    for tok in data_tokens:
        if re_domain.match(tok):
            # Trailing comma makes 'prod' a 1-tuple, so tuple
            # concatenation below appends a single production.
            prod = cfg.Production(D, [tok]),
            productions = productions + prod

    # Make a grammar out of our productions
    grammar = cfg.Grammar(O, productions)
    rd_parser = parse.RecursiveDescent(grammar)

    # Tokens need to be redefined.
    # It disappears after first use, and I don't know why.
    # (tokenize.regexp presumably yields a one-shot iterator, already
    # consumed by the domain loop above -- hence re-tokenizing here.)
    tokens = tokenize.regexp(self.string, re_all)
    toklist = list(tokens)

    """
    3. Parse using the context free grammar
    ------------------------------------------------------------------------
    """

    # Store the parsing.
    # Only the first one, as the grammar should be completely nonambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        print "Could not parse query."
        return

    """
    4. Refine and convert to a Tree representation
    ------------------------------------------------------------------------
    """

    # Set the nltk_lite.parse.tree tree for this query to the global sentence.
    # The string surgery below rewrites str(parseList) into plain bracketed
    # form that bracket_parse can read.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk_lite.parse.tree tree
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
def re2nfa(fsa, re):
    """
    Compile the regular expression *re* into the finite-state
    automaton *fsa*, marking the resulting end state as final.

    Raises ValueError if the expression cannot be parsed.
    """
    # NOTE: the parameter 're' shadows the stdlib re module within
    # this function's scope.
    char_tokens = tokenize.regexp(re, pattern=r'.')
    parse_tree = _parser.parse(char_tokens)
    if parse_tree is None:
        raise ValueError('Bad Regexp')
    end_state = re2nfa_build(fsa, fsa.start(), parse_tree)
    fsa.set_final([end_state])