def demo():
	gfile = GrammarFile.read_file('test.cfg')
	cp = gfile.earley_parser()
	sent = 'the police read the solutions that Poirot sent'
	tokens = list(tokenize.whitespace(sent))
	trees = cp.parse_n(tokens)
	for tree in trees: print tree
Example #2
    def processWhitespacesWithoutStopWords(self, corpus, caseSensitive):
        # initialise token buffer
        tokens = []

        # get tokens separated by whitespaces
        tokenizedCorpus = tokenize.whitespace(corpus)

        # compile regular expression for matching whitespaces
        whitespaces = re.compile(r'\s|\&nbsp\;')

        # go through each token in corpus
        for token in tokenizedCorpus:
            # if case-sensitive handling of tokens
            if caseSensitive == 1:
                pass
            else:
                token = token.lower()

            # remove white spaces at beginning
            token = whitespaces.sub('', token)

            # append token to list
            tokens.append(token)

        # return tokens
        return tokens
Example #3
    def processWhitespacesWithoutStopWords(self, corpus, caseSensitive):
	# initialise token buffer
	tokens = []

	# get tokens separated by whitespaces
	tokenizedCorpus = tokenize.whitespace(corpus)

	# compile regular expression for matching whitespaces
	whitespaces = re.compile(r'\s|\&nbsp\;')

	# go through each token in corpus
	for token in tokenizedCorpus:
	    # if case-sensitive handling of tokens
	    if caseSensitive == 1:
		pass
	    else:
		token = token.lower()

	    # remove white spaces at beginning
	    token = whitespaces.sub('', token)

	    # append token to list
	    tokens.append(token)

	# return tokens
	return tokens
Example #4
    def processWhitespaces(self,
                           corpus,
                           stopWordList,
                           caseSensitive,
                           minimumTokenLength=3,
                           maximumTokenLength=25):
        # initialise token list
        tokens = []

        # initialise token buffer
        tokenBuffer = ''

        # get tokens separated by whitespaces
        tokenizedCorpus = tokenize.whitespace(corpus)

        # compile regular expression for matching special characters
        specialCharacters = re.compile(r'\&.+\;')

        # compile regular expression for matching whitespaces
        whitespaces = re.compile(r'\s|\&nbsp\;')

        # compile regular expression for sentence-boundary matching
        sentenceBoundary = re.compile(r'[\.\:\!\?\,]')

        # go through each token in corpus
        for token in tokenizedCorpus:
            # get token length
            tokenLength = len(token)

            # see, if token contains special character
            specialCharacterMatches = specialCharacters.findall(token)

            # reduce special characters to size one
            if specialCharacterMatches:
                for match in specialCharacterMatches:
                    tokenLength -= (len(match) - 1)

            # if case-sensitive handling of tokens
            if caseSensitive == 1:
                pass
            else:
                token = token.lower()

            # remove white spaces at beginning and end
            token = whitespaces.sub('', token)

            # write token to buffer and remove punctuation
            tokenBuffer = sentenceBoundary.sub('', token)

            # mark stop words
            if (tokenLength < minimumTokenLength
                    or tokenLength > maximumTokenLength
                    or tokenBuffer in stopWordList
                    or tokenBuffer.lower() in stopWordList):
                tokens.append(token + '<STOPWORD>')
            else:
                tokens.append(token)

        # return tokens
        return tokens
def text_parse(grammar, sent, trace=2, drawtrees=False, latex=False):
	parser = grammar.earley_parser(trace=trace)
	print parser._grammar
	tokens = list(tokenize.whitespace(sent))
	trees = parser.parse_n(tokens)
	if drawtrees:
		from treeview import TreeView
		TreeView(trees)
	else:
		for tree in trees:
			if latex: print tree.latex_qtree()
			else: print tree
Example #6
def _demo_stemmer(stemmer):
    # Tokenize a sample text.
    from nltk_lite import tokenize

    text = "John was eating icecream"
    tokens = tokenize.whitespace(text)

    # Print the results.
    print stemmer
    for word in tokens:
        print "%20s => %s" % (word, stemmer.stem(word))
    print
Example #7
    def processWhitespaces(self, corpus, stopWordList, caseSensitive, minimumTokenLength = 3, maximumTokenLength = 25):
	# initialise token list
	tokens = []

	# initialise token buffer
	tokenBuffer = ''

	# get tokens separated by whitespaces
	tokenizedCorpus = tokenize.whitespace(corpus)

	# compile regular expression for matching special characters
	specialCharacters = re.compile(r'\&.+\;')

	# compile regular expression for matching whitespaces
	whitespaces = re.compile(r'\s|\&nbsp\;')

	# compile regular expression for sentence-boundary matching
	sentenceBoundary = re.compile(r'[\.\:\!\?\,]')

	# go through each token in corpus
	for token in tokenizedCorpus:
	    # get token length
	    tokenLength = len(token)

	    # see, if token contains special character
	    specialCharacterMatches = specialCharacters.findall(token)

	    # reduce special characters to size one
	    if specialCharacterMatches:
		for match in specialCharacterMatches:
		    tokenLength -= (len(match) - 1)

	    # if case-sensitive handling of tokens
	    if caseSensitive == 1:
		pass
	    else:
		token = token.lower()

	    # remove white spaces at beginning and end
	    token = whitespaces.sub('', token)

	    # write token to buffer and remove punctuation
	    tokenBuffer = sentenceBoundary.sub('', token)

	    # mark stop words
	    if tokenLength < minimumTokenLength or tokenLength > maximumTokenLength or tokenBuffer in stopWordList or tokenBuffer.lower() in stopWordList:
		tokens.append(token + '<STOPWORD>')
	    else:
		tokens.append(token)

	# return tokens
	return tokens
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),  cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),   cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),  cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
        ]
    
    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.parse_n(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree
def raw(files="english-kjv"):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t
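A minimal usage sketch for raw() above, assuming the nltk_lite genesis corpus files are installed under get_basedir(); it prints the first ten whitespace-separated tokens of the English KJV text.

from itertools import islice

for token in islice(raw('english-kjv'), 10):
    print token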
Example #10
def getNGramStructure(sourceFile):
    # initialise n-gram dictionary
    ngrams = {}

    # read file
    corpus = sourceFile.read()

    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through each token
    for token in tokenizedCorpus:
        # split token in single characters
        characters = list(token)

        # copy character list
        charactersBuffer = list(characters)

        # initialise buffer
        buffer1 = ""

        # go through character list
        for char1 in characters:
            # write each n-gram to list
            buffer1 += char1
            ngrams[buffer1] = ngrams.get(buffer1, 0) + 1

            # shift from character list copy
            charactersBuffer.pop(0)

            # initialise buffer
            buffer2 = ""

            # go through copy of character list
            for char2 in charactersBuffer:
                buffer2 += char2
                ngrams[buffer2] = ngrams.get(buffer2, 0) + 1

    # return n-grams
    return ngrams
Example #11
def getNGramStructure(sourceFile):
    # initialise n-gram dictionary
    ngrams = {}

    # read file
    corpus = sourceFile.read()

    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through each token
    for token in tokenizedCorpus:
        # split token in single characters
        characters = list(token)

        # copy character list
        charactersBuffer = list(characters)

        # initialise buffer
        buffer1 = ''

        # go through character list
        for char1 in characters:
            # write each n-gram to list
            buffer1 += char1
            ngrams[buffer1] = ngrams.get(buffer1, 0) + 1

            # shift from character list copy
            charactersBuffer.pop(0)

            # initialise buffer
            buffer2 = ''

            # go through copy of character list
            for char2 in charactersBuffer:
                buffer2 += char2
                ngrams[buffer2] = ngrams.get(buffer2, 0) + 1

    # return n-grams
    return ngrams
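A minimal usage sketch for getNGramStructure() above; StringIO stands in for a real corpus file, and the tokenize import is assumed to be available in the function's module.

from StringIO import StringIO

ngrams = getNGramStructure(StringIO('abc ab'))
# every contiguous substring of every token is counted, e.g.
# ngrams['ab'] == 2, ngrams['b'] == 2, ngrams['abc'] == 1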
Example #12
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),   cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),  cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),  cfg.Production(P, ['in']),
        cfg.Production(P, ['with']), cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),  cfg.Production(N, ['telescope'])
        )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Define a list of parsers.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print p
def demo():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """    
    from nltk_lite.parse import cfg
    grammar = cfg.parse_grammar("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    sent = list(tokenize.whitespace('the dog saw a man in the park'))

    RecursiveDescentDemo(grammar, sent).mainloop()
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and
    text. 
    """
    
    from nltk_lite.parse import cfg
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s)
                                           for s in nonterminals.split()]
    
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),   cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),  cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),  cfg.Production(P, ['in']),
        cfg.Production(P, ['with']), cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),  cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
        )

    grammar = cfg.Grammar(S, productions)

    # tokenize the sentence
    sent = list(tokenize.whitespace('my dog saw a man in the park with a statue'))

    ShiftReduceDemo(grammar, sent).mainloop()
def string2words(s, sep="/"):
    return [tag2tuple(t, sep)[0] for t in tokenize.whitespace(s)]
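A minimal sketch of the expected behaviour of string2words() above, assuming the default '/' separator and the tag2tuple helper from the same module; each 'word/TAG' token is reduced to the bare word.

s = 'The/AT dog/NN barked/VBD'
print string2words(s)   # -> ['The', 'dog', 'barked']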
Example #16
    words = []
    sentences = []
    rowID = 0

    # open file
    file = open(path + category, 'r')

    # add each line to corpus
    for line in file:
        corpus += line

    # close file pointer
    file.close()

    # get tokens from corpus
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through tokens
    for token in tokenizedCorpus:
        # add token to sentence
        words.append(tag.sub('', token))

        # if sentence-boundary has been found in this token
        if sentenceBoundary.findall(token):
            # recompose sentence
            for word in words:
                sentenceString += word + ' '

            # add to sentence string list
            sentences.append(sentenceString)
Example #17
from nltk_lite import tokenize
from nltk_lite.parse import cfg
from nltk_lite.draw.rdparser import RecursiveDescentDemo

productions = """
NP -> NP AND NP
NP -> N
N -> "cabbages"
N -> "kings"
AND -> "and"
"""
grammar = cfg.parse_grammar(productions)
text = list(tokenize.whitespace('cabbages and kings'))
RecursiveDescentDemo(grammar, text).mainloop()

Example #18
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    # allow any kind of bracketing for flexibility

    L_BRACKET = re.compile(r"[\(\[\{<]")
    R_BRACKET = re.compile(r"[\)\]\}>]")

    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if len(eachItm) <= len(itm) and eachItm == itm[: len(eachItm)]:
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if chunk_types is not None and itm in chunk_types:
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket:
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    bracket -= len(tmpItm) - 1
                    while len(inTag) > 0 and bracket < inTag[-1]:
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack
def _list_sent(sent):
    return [tokenize.whitespace(line) for line in tokenize.line(sent)]
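A minimal sketch of what _list_sent() above returns, assuming tokenize.line and tokenize.whitespace are imported in that module: one whitespace-token iterator per line of the input.

for line_tokens in _list_sent('John saw Mary\nMary saw John'):
    print list(line_tokens)
# -> ['John', 'saw', 'Mary']
#    ['Mary', 'saw', 'John']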
Example #20
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk_lite import tokenize
    from nltk_lite.parse import cfg, pcfg, pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideParse(grammar),
        pchart.RandomParse(grammar),
        pchart.UnsortedParse(grammar),
        pchart.LongestParse(grammar),
        pchart.BeamParse(len(tokens)+1, grammar)
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(tokens)
        times.append(time.time()-t)
        if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses: all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print '       Parser      | Time (secs)   # Parses   Average P(parse)'
    print '-------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                         times[i],num_parses[i],average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '-------------------+------------------------------------------'
    print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
Example #21
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk_lite import tokenize
    from nltk_lite.parse import cfg, pcfg, ViterbiParse

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    parser = ViterbiParse(grammar)
    all_parses = {}

    print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar)
    parser.trace(3)
    t = time.time()
    parses = parser.get_parse_list(tokens)
    elapsed = time.time()-t
    if parses:
        average = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print 'Time (secs)   # Parses   Average P(parse)'
    print '-----------------------------------------'
    print '%11.4f%11d%19.14f' % (elapsed, num_parses, average)
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '------------------------------------------'
    print '%11s%11d%19.14f' % ('n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
    def set_sentence(self, sentence):
        self._sent = list(tokenize.whitespace(sentence)) #[XX] use tagged?
        self.reset()
Example #23
    # initialise co-occurrence matrix
    coOccurrences = {}

    # open file
    file = open(path + category, 'r')

    # add each line to corpus
    for line in file:
	corpus += line

    # close file pointer
    file.close()

    # get tokens from corpus
    tokenizedCorpus = tokenize.whitespace(corpus)

    # go through tokens
    for token in tokenizedCorpus:
	# add token to sentence
	words.append(tag.sub('', token))

	# if sentence-boundary has been found in this token
	if sentenceBoundary.findall(token):
	    # recompose sentence
	    for word in words:
		sentenceString += word + ' '

	    # add to sentence string list
	    sentences.append(sentenceString)