Example #1
File: nlp.py, Project: codehacken/Athena
 def __init__(self, sentence):
    # Train a Punkt sentence tokenizer on the project's training data,
    # then split the input into sentences.
    with open('data/training_data', 'r') as f:
        train_text = f.read()
    #data=open('data2','r')
    #test_data=data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
Example #2
class Tokenizer(object):

    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using the NLTK Penn Treebank word tokenizer and the Punkt sentence tokenizer.
        Params:
        language: Language to tokenize (currently doesn't do anything)
        normalize: Lowercase all tokens if True.
        train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize
        
        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)

        return tokenized
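
A brief usage sketch for the Tokenizer class above (hypothetical input text; word_tokenize needs the NLTK 'punkt' data, and with no train_text_gen the sentence splitter falls back to an untrained PunktSentenceTokenizer):

from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# Hypothetical usage of the Tokenizer class defined above.
tok = Tokenizer(language="english", normalize=True)
for sentence_tokens in tok.tokenize("The group arrived at two o'clock. Class started soon after."):
    # One list of lowercased word tokens per detected sentence.
    print(sentence_tokens)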
Example #3
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)

    # textfile = open("POS_tagged",'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)

    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []
    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            pass
            # print(str(e))
    process_content()
    return tuples_list
def extractNounPhrases(sentence):

    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        firstNN = False

        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue

            firstNN = False

    except Exception as e:
        print(str(e))

    return nounPhrases
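
An indicative call to extractNounPhrases above; it pairs consecutive NN-tagged tokens, so the exact output depends on the POS tagger (requires the NLTK 'punkt' and tagger data):

# Hypothetical example sentence; the output shown is only indicative.
phrases = extractNounPhrases("The computer science department bought new lab equipment.")
print(phrases)
# e.g. ['computer science', 'lab equipment']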
Example #5
    def get_sentences(self, remove_url=True):
        '''
        Generator.
        :param remove_url: replace URLs in sentences with a single space
        :return: tuple of sentences for each MIME part
        '''

        tokenizer = PunktSentenceTokenizer()

        for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):

            if 'html' in mime_type:
                soup = BeautifulSoup(raw_line)
                if not soup.body:
                    continue
                # exact sentences are needed; soup.body.strings yields lines plus CR/LF
                lines = tuple(soup.body.strings)
                raw_line = ''.join(lines)

            try:
                sents = tuple(tokenizer.tokenize(raw_line))
            except Exception as err:
                sents = (raw_line,)  # fall back to treating the whole line as one sentence

            if remove_url:
                sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))

            sents = (s.strip().lower() for s in sents)
            sents = tuple(s for s in tuple(sents) if s)
            if len(sents) == 0:
                continue

            yield sents
def normalize(text):
    p = PunktSentenceTokenizer()
    bullet1 = '\xe2\x80\xa2'.decode('utf-8')
    bullet2 = '\xc2\xb7'.decode('utf-8')
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Example #7
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
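
A small, hypothetical driver for tokenize_english_document above, showing the blank-line split into speaker turns followed by the 38-character, two-line subtitle wrapping:

# Hypothetical conversation: two speaker turns separated by a blank line.
conversation = (
    "Hello there, how are you doing today? I was hoping we could talk about the schedule.\n"
    "\n"
    "I am doing fine, thanks for asking."
)

for subtitle in tokenize_english_document(conversation):
    # Each subtitle is a list of one or two lines, each at most 38 characters
    # (apart from single words longer than the limit).
    print(" / ".join(subtitle))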
def tag(sentence):

    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        return tagged

    except Exception as e:
        print(str(e))
Example #9
 def aristo_get_named_entities(self, text):
     """
     Parses the texts to obtain named entities
     :param text: The text to parse
     :return: a named-entity tree
     """
     custom_sent_tokenizer = PunktSentenceTokenizer(text)
     tokenized = custom_sent_tokenizer.tokenize(text)
     for i in tokenized[5:]:
         words = nltk.word_tokenize(i)
         tagged = nltk.pos_tag(words)
         namedEnt = nltk.ne_chunk(tagged, binary=False)
         return ((namedEnt))
Example #10
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Example #11
def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            #chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))

    return taggedSentence
Example #12
	def pos(self, paragraph):

		wordsdict = collections.OrderedDict()
		sent_tokenizer = PunktSentenceTokenizer()

		for sentence in self.sent_detector.tokenize(paragraph):
			tokens = sent_tokenizer.tokenize(sentence)

			for token in tokens:
				words = nltk.word_tokenize(token)
				tagged = nltk.pos_tag(words)
				for word in tagged:
					if word[1] in self.tagdict:
						wordsdict[word[0]] = self.tagdict[word[1]][0]

		return wordsdict
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = input("1 for named_chunks. This provides some information about proper nouns.\n2 for process_chunks. This tells you if a noun phrase followed by an adverb occurs.\n3 for process_content, this just prints stuff, 4 for...")
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print "Invalid choice, try again."
Example #14
def get_sentence_occurrences(document, terms, doc_term=None):
    terms_present = get_document_occurrences(document, terms)

    # Use a Tokenizer from NLTK to build a sentence list
    tokenizer = Tokenizer(document)
    sentences = tokenizer.tokenize(document)
    
    # Create a list of lists containing the collection of terms which co-occur
    # in a sentence
    occurrences = []
    for sentence in sentences:
        sentence_occurrences = set() 

        for term in terms_present:
            if term != doc_term:
                if re.search(' %s ' % term.label, sentence):
                    sentence_occurrences.add(term)
        

        if len(sentence_occurrences) > 0:
            sentence_occurrences = list(sentence_occurrences)
            to_remove = set()

            for inside in sentence_occurrences:
                for term in sentence_occurrences:
                    if term != inside and\
                        term.label.find(inside.label) != -1:
                        to_remove.add(inside)
            
            if to_remove:
                print "removing", to_remove

            for term in to_remove:
                sentence_occurrences.remove(term)

            if doc_term:
                sentence_occurrences.append(doc_term)

            occurrences.append(sentence_occurrences)
    
    return occurrences
Example #15
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                     }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # #print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
Example #16
File: ner.py, Project: Mitgorakh/myproject
class NER:
    """docstring for ClassName"""
    def __init__(self, query):
        self.original_query = query
        conf = shelve.open('conf')
        self.train_text = conf['train_text']
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)

    def processContent(self):
        try:
            for i in self.tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                #print(namedEnt)
                #namedEnt.draw()
            return namedEnt
        except Exception as e:
            print(str(e))
        

    # Parse named entities from tree
    def structureNamedEntities(self):
        ne = []
        for subtree in self.named_entity_tree:
            if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne

    def performNER(self):
        self.named_entity_tree = self.processContent()
        #print(type(self.named_entity_tree))
        self.named_entity_tuple = self.structureNamedEntities()
        #print(ne)
        names = [element[0] for element in self.named_entity_tuple]
        return names
Example #17
    def extract_features(self):
        """
        All approach of extracting features from raw data implemented here
        """
        custom_tokenizer = PunktSentenceTokenizer()
        regex_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        ps = PorterStemmer()
        tokenized = []

        with open(self.file_path, 'r') as current_document:
            for each_line in current_document:
                tokenized.extend(custom_tokenizer.tokenize(each_line))  # tokenizing sentences line by line
        feature_list = []
        try:
            for each_sentence in tokenized:
                # words = nltk.word_tokenize(each_sentence)
                words = regex_tokenizer.tokenize(each_sentence)
                tagged = nltk.pos_tag(words)
                feature_list.extend([ps.stem(pos[0].lower()) for pos in tagged if pos[1] == 'NN'])  # listing the nouns in a list
        except Exception as E:
            print(str(E))
        feature_dictionary = Counter(feature_list)  # converts an iterable object(in this case, LIST) to dictionary
        return feature_dictionary
Example #18
def tokenize_sentence(input_text: str) -> List[str]:
    """ Converts a text into a list of sentence tokens """
    if input_text is None or len(input_text) == 0:
        return []
    tokenizer = PunktSentenceTokenizer()
    return tokenizer.tokenize(input_text)
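
A quick, indicative check of tokenize_sentence above; with no training text, the default Punkt heuristics split on ordinary sentence-final punctuation:

print(tokenize_sentence("This is one sentence. Here is another one!"))
# -> ['This is one sentence.', 'Here is another one!']
print(tokenize_sentence(""))
# -> []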
Example #19
def tokenizeText(text):
    text = text.replace("?", "?,")
    custom_sent_tokenizer = PunktSentenceTokenizer(text)
    tokenize = custom_sent_tokenizer.tokenize(text)
    return tokenize
Example #20
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  ## train tokenizer

tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()
            ##print(chunked)

    except Exception as e:
        print(str(e))


process_content()
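
chunked.draw() opens a GUI window for every sentence; for a headless run, the same chunks can be printed by walking the parse tree's subtrees, as some of the later examples do. A sketch under the assumption that nltk and the tokenized list from the script above are in scope:

def print_chunks():
    # Same grammar as above, but print each "Chunk" subtree instead of drawing it.
    chunk_parser = nltk.RegexpParser(r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""")
    for sentence in tokenized:
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        tree = chunk_parser.parse(tagged)
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'Chunk'):
            print(" ".join(word for word, tag in subtree.leaves()))


print_chunks()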
Example #21
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

trainText = state_union.raw("2005-GWBush.txt")
sampleText = state_union.raw("2006-GWBush.txt")

# train tokenizer:
customSentTokenizer = PunktSentenceTokenizer(trainText)

tokenized = customSentTokenizer.tokenize(sampleText)


def processContent():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


processContent()
Example #22
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

cst = PunktSentenceTokenizer(train_text)
tknd = cst.tokenize(sample_text)

try:
    for i in tknd[5:]:
        word = nltk.word_tokenize(i)
        # print(word)
        tgd = nltk.pos_tag(word)
        nER = nltk.ne_chunk(tgd)
        print(nER)

except Exception as e:
    print(str(e))

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_txt = state_union.raw("2005-GWBush.txt")
sample_txt = state_union.raw("2006-GWBush.txt")

custom_sent_tokenize = PunktSentenceTokenizer(train_txt)

tokenized = custom_sent_tokenize.tokenize(sample_txt)


def process_content():
    try:
        for w in tokenized:
            words = nltk.word_tokenize(w)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process_content()
Example #24
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
import nltk
import json
import re

train_text = state_union.raw('2005-GWBush.txt')
inputFile = 'data.json'
outputFile = 'sample_text.txt'

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(outputFile)

def createInput():
    
    with open(inputFile, 'r') as i:
        with open(outputFile, 'w') as o:
            for line in i:
                try:
                    tweet = json.loads(line)
                    text = tweet['text']
                    o.write(' '.join(re.sub("(RT)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split()))
                except BaseException as e:
                    continue

def tagInput():
    try:
        with open(outputFile, 'r') as f:
            for line in f:
                words = nltk.word_tokenize(line)
    except Exception as e:
        print(str(e))

# Part of speech tagging

tag_list = [nltk.pos_tag(w) for w in words]
print("Part of speech tags:", tag_list)

for i in words:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print(tagged)

# Chunking
custom_sent_tokenizer = PunktSentenceTokenizer(text)

tokenized = custom_sent_tokenizer.tokenize(text)


def process_content():
    try:

        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
Example #26
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_raw = state_union.raw("2005-GWBush.txt")
sample_raw = state_union.raw("2006-GWBush.txt")

tokenizer = PunktSentenceTokenizer(train_raw)
sentences = tokenizer.tokenize(sample_raw)


def process_data():
    try:
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()

    except Exception as e:
        print(str(e))


process_data()
Example #27
import nltk
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer

text_file = open(".\login2.txt", "r")
text = text_file.read()

word = nltk.word_tokenize(text)

custom_sen = PunktSentenceTokenizer(text)
tokenized = custom_sen.tokenize("hello. how can i login. where is otp?")


def process():
    try:
        for w in tokenized:
            word = nltk.word_tokenize(w)
            tagged = nltk.pos_tag(word)

            namedEnt = nltk.ne_chunk(tagged)
            namedEnt.draw()
            print(tagged)

    except Exception as e:
        print(str(e))


process()
Example #28
import nltk
import wikipedia
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw("2005-GWBush.txt")
_text = input("Enter a text:")
sample_text = wikipedia.summary(_text, sentences=1)
custom_SentTok = PunktSentenceTokenizer(train_text)
tokenized = custom_SentTok.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                   }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

    except Exception as e:
        print(str(e))


process_content()
Example #29
            m = textid_re.search(line)
            if m:
                sys.stdout.write(u".\n<{{{%s}}}!!!>\n" % m.group(1))
                continue
            if line == "\n":
                continue
    
        if normquotes == 1:
            line = line.replace(u"«", " ' ")
            line = line.replace(u"»", " ' ")
            line = line.replace(u"“", " ' ")
            line = line.replace(u"”", " ' ")
            line = line.replace(u"\"", " ' ")


        sentences = st.tokenize(line)

        for s in sentences:
            if wptokenizer == 1:
                for w1 in wtt.tokenize(s):
                    for w in wtw.tokenize(w1):
                        sys.stdout.write(w.encode("utf-8"))
                        sys.stdout.write("\n")
            else:
                for w in wtt.tokenize(s):
                    sys.stdout.write(w.encode("utf-8"))
                    sys.stdout.write("\n")
#            sys.stdout.write(".\n")
            sys.stdout.write("\n")
    sys.stdout.write("\n")
Example #30
"""

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # The Tokenizer which will be used.

'''This tokenizer is capable of unsupervised machine learning --> PunktSentenceTokenizer '''
'''So you can actually train it on any body of text that you use. First, let's get some imports out of the way that we're going to use:'''
def process_content():
    try:
        for i in tokenized[:5]:  # limit processing to the first five sentences
            words = nltk.word_tokenize(i)  # tokenize the sentence into words
            tagged = nltk.pos_tag(words)  # tag each word with its part of speech
            print(tagged)  # print the (word, tag) tuples


    except Exception as e:
        print(str(e))  # if there is an exception then this prints out the exception



if __name__ == '__main__':
    train_text = state_union.raw("2005-GWBush.txt")   # training text used to train the sentence tokenizer (unsupervised learning)
    sample_text = state_union.raw("2006-GWBush.txt")  # sample text which can be tokenized later on
    print(type(sample_text))
    custom_sent_tokenizer = PunktSentenceTokenizer(
        train_text)  # train the Punkt sentence tokenizer on the training text (unsupervised learning)
    #tokenized  = custom_sent_tokenizer.tokenize(sample_text) # Tokenizing he Custom sentence tokenize
    tokenized  = custom_sent_tokenizer.tokenize("Hi! my name is Shafay. I am 20 years old. I love playing games.")
    #print(tokenized) # this is just for the Debugging purposes!
    process_content()  # Calling the process content function!
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 24 06:45:40 2020

@author: Dell
"""

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

trainText = "A malapropism also called a malaprop, acyrologia, or Dogberryism is the mistaken use of an incorrect word in place of a word with a similar sound, resulting in a nonsensical, sometimes humorous utterance. An example is the statement by baseball player Yogi Berra, Texas has a lot of electrical votes, rather than electoral votes. Malapropisms often occur as errors in natural speech and are sometimes the subject of media attention, especially when made by politicians or other prominent individuals. Philosopher Donald Davidson has said that malapropisms show the complex process through which the brain translates thoughts into language. Humorous malapropisms are the type that attract the most attention and commentary, but bland malapropisms are common in speech and writing."

customSentTokenizer = PunktSentenceTokenizer(trainText)
tokenized = customSentTokenizer.tokenize(trainText)


def processContent():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            #chunkGram = """Chunk: {<RB}"""
            print(tagged)

    except Exception as e:
        print(str(e))


processContent()
Example #32
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
"""


# retrieving the corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# training the sentence tokenizer (unsupervised)
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

# tag the tokens: word-tokenize each sentence, then use a regular expression to chunk the tagged tokens
try:
    for s in sentence:
        token = word_tokenize(s)
        pos = pos_tag(token)
        print(pos)
        chunkreg = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkreg)
        chunked = chunkParser.parse(pos)
        chunked.draw()

except Exception as e:
    print(str(e))
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
'''

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = sent_tokenizer.tokenize(sample_text)


def processContent():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # print(tagged)

            # namedEnt = nltk.ne_chunk(tagged)
            namedEnt = nltk.ne_chunk(
                tagged, binary=True
            )  ## binary=True when the type of the named entity is not important (puts all named entities together).
            # namedEnt.draw()
            print(namedEnt)
Example #34
def get_sentence_occurrences(document, terms, terms_present=None, 
                             remove_overlap=False, remove_duplicates=False):
    """
    Returns a list of lists representing the terms occurring in each sentence.
    Semantically equivalent to: 
    [[term for term in terms if term in sent] for sent in document]

    Order of optional operations is: remove duplicates, remove overlap, 
    add doc terms, remove duplicate doc terms
    """
    # get list of terms in the document to narrow sentence-level search
    if terms_present is None:
        terms_present = set(get_document_occurrences(document, terms))

    # Use a Tokenizer from NLTK to build a sentence list
    tokenizer = Tokenizer(document)
    sentences = tokenizer.tokenize(document)
    logging.info("scanning %d sentences for %d terms" % (len(sentences), len(terms)))
    
    # Create a list of lists containing the collection of terms which co-occur
    # in a sentence
    occurrences = []
    for sentence in sentences:
        sentence_occurrences = [] 

        for term in terms_present:
            # build list of search patterns starting with label
            patterns = [r'\b%s\b' % term.label]
            patterns.extend(term.searchpatterns)

            for pattern in patterns:
                try:
                    # search for any occurrence of term, stop when found
                    if re.search(pattern, sentence, flags=re.IGNORECASE):
                        sentence_occurrences.append(term)
                        break
                except re.error:
                    logging.warning('Term %d (%s) pattern "%s" failed' % 
                                    (term.ID, term.label, pattern))
                    term.searchpatterns.remove(pattern)

        # remove duplicates
        if remove_duplicates:
            sentence_occurrences = list(set(sentence_occurrences))

        # remove overlapping elements
        if remove_overlap:
            to_remove = set()
            
            # build set of terms to remove
            for inside in sentence_occurrences:
                for term in sentence_occurrences:
                    if term != inside and\
                        inside.label.find(term.label) != -1:
                        to_remove.add(term)

            # remove terms
            for term in to_remove:
                sentence_occurrences.remove(term)

        # add to list of sentences if any terms are found
        if sentence_occurrences:
            occurrences.append(sentence_occurrences)
    
    return occurrences
Example #35
#01 Segmentation

sentences = brown.sents(categories=category)

tokens = brown.words(categories=category)
new_token = []
for w in tokens:
    word = re.sub(r'[-[_\],`!?():{}&$#@%*+;/\'"\t\n\b0-9]', r'', w.lower())
    if word != '' and word not in stop_Words:
        new_token.append(word)

row_text = ' '.join(new_token)
#unsupervised ML algorithm to detect the end of a sentence (EOS)
custom_sent_tokenizer = PunktSentenceTokenizer(row_text)
tokenized = custom_sent_tokenizer.tokenize(row_text)
last_text = ' '.join(tokenized)


#prediction Algorithm
def markov_chain(text):
    words = text.split(' ')
    myDict = defaultdict(list)
    for currentWord, nextWord in zip(words[0:-1], words[1:]):
        myDict[currentWord].append(nextWord)
    myDict = dict(myDict)
    return myDict


markov_return = markov_chain(last_text)
numOfKeys = len(markov_return)
Example #36
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

traintext = state_union.raw(
    "/home/varun/PycharmProjects/untitled/speechtrain.txt")
sampletext = state_union.raw(
    "/home/varun/PycharmProjects/untitled/speechsample.txt")
costum_sent_tokenizer = PunktSentenceTokenizer(traintext)
tokenized = costum_sent_tokenizer.tokenize(sampletext)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

    #chunking        chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
    #chunking        chunkgram = r"""Chunk: {<.*>+}
    #                                              }<VB.?|IN|DT|TO>+{"""
        nameEnt = nltk.ne_chunk(tagged, binary=True)
        print(nameEnt)
    #       chunkParser = nltk.RegexpParser(chunkgram)
    #      chunked = chunkParser.parse(tagged)
    #     print(chunked)

    except Exception as e:
        print(str(e))

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
#sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(
    "Please save my ass my no is 98437598237459, 47598437")


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<CD>+}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            #print(chunked.label())
            for subtree in chunked.subtrees():
                if subtree.label() == 'Chunk':
                    print(subtree[0][0])

    except Exception as e:
        print(str(e))
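
As excerpted, this script defines process_content() but never calls it; each "Chunk" subtree groups consecutive CD-tagged (cardinal number) tokens, and subtree[0][0] is the first such token. A hypothetical final call:

# Prints the leading token of every number chunk found in the sample string.
process_content()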
Example #38
import nltk
from nltk.tokenize import PunktSentenceTokenizer

sentence1 = """The group arrived at two o'clock on Monday afternoon to start
class."""
sentence2 = """The Little Mermaid (Danish: Den lille havfrue) is a fairy tale 
written by the Danish author Hans Christian Andersen about a young mermaid who 
is willing to give up her life in the sea and her identity as a mermaid to gain 
a human soul."""

#Chunking
custom_sent_tokenizer = PunktSentenceTokenizer(sentence1)
tokenized = custom_sent_tokenizer.tokenize(sentence2)


def process_content():
    for i in tokenized:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        # chunked.draw()
        print chunked


process_content()

# many named nouns
# chunking: a chunk is a 'noun phrase': a noun plus the modifiers around that noun,
# a descriptive group of words surrounding that noun. downside: can only use
Example #39
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer,word_tokenize

sample_test= state_union.raw("2006-GWBush.txt")
sample_train=state_union.raw("2005-GWBush.txt")

custom_sent_tokenizer=PunktSentenceTokenizer(sample_train)

tokenize=custom_sent_tokenizer.tokenize(sample_test)
#words =word_tokenize(tokenizerr)
#print(words)

for w in tokenize:
    words=word_tokenize(w)
    tagged=nltk.pos_tag(words)
    print(tagged)
    chunkgram= r"""Chunk: {<.*>+}
                             }<NNP.?>+{"""
    chunkParser =nltk.RegexpParser(chunkgram)
    chunked=chunkParser.parse(tagged)
    print(chunked)
    

Example #40
words = word_tokenize(mlk)
filtered_mlk = []

for w in words:
    if w not in stop_words:
        filtered_mlk.append(w)

# Fancy One Liner
# filtered_mlk = [w for w in words if w not in stop_words]

# pp(filtered_mlk)

train_text = state_union.raw('2005-GWBush.txt')
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

custom_tokenized = custom_sent_tokenizer.tokenize(mlk)

def process_content():
    for i in custom_tokenized[5:]:
        words = word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged);

        print(namedEnt)

    #         chunkGram = r"""Chunk: {<.*>+}
    #                         }<VB.?|IN|DT|TO>+{"""
    #         chunkParser = nltk.RegexpParser(chunkGram)
    #         chunked = chunkParser.parse(tagged)
    #         chunked.draw()
    # except Exception as e:
Example #41
Girl: Hola amigo... 
Hobo: his is all theatrical.
我说: "U.S.A 你好啊".
U.S.A is the abbreviation of United States. To use statistical parameters such as mean and standard deviation reliably, you need to have a good estimator for them. The maximum likelihood estimates (MLEs) provide one such estimator. However, an MLE might be biased, which means that its expected value of the parameter might not equal the parameter being estimated."""

sentences = sent_tokenize(article)

for sentence in sentences:
    tokens = word_tokenize(sentence)
    #print(sentence)

text = webtext.raw('overheard.txt')

print(text)
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
sents2 = sent_tokenize(text)

sents1_article = sent_tokenizer.tokenize(article)
sents2_article = sent_tokenize(article)

print(sents1[0])
print(sents2[0])
print()
print(sents1[677])
print(sents2[677])
print()
print(sents1[678])
print(sents2[678])
print()
print(sents1[679])
def natural_sentence(string):
	pst = PunktSentenceTokenizer(string)
	t = pst.tokenize(string)

	word = nltk.word_tokenize(t[0])  # here we split the first sentence into words
	tagged = nltk.pos_tag(word)  # each word is tagged, i.e. recognized as a noun, pronoun, etc.
	print tagged
	chunkGram = r"""WRB:{<WRB.?>*<WP>*<WDT>?}"""  # regexp for detecting wh-questions
	chunkParser = nltk.RegexpParser(chunkGram)  # parser that isolates wh-question chunks
	chunked = chunkParser.parse(tagged)  # parse the tagged words; the output is a tree
	for subtree in chunked.subtrees():
		if subtree.label() == 'WRB':			# for only wh question
			for j in subtree.leaves():
				f = 0
				final = ""
				final += j[0]

				chunk = r"""VB: {<VBZ>*<VBP>?}"""							#here we are detecting type of wording and arranging it to proper place
				cp = nltk.RegexpParser(chunk)
				word = nltk.word_tokenize(t[0])
				tagged = nltk.pos_tag(word)
				ch = cp.parse(tagged)
				flg = 0
				for subtree in ch.subtrees():
					if subtree.label() == 'VB':
						for j in subtree.leaves():
							final += " "+j[0]

							flg = 1
						break
				if flg == 0:
					final += " is"

				chunk = r"""PRP: {<PRP.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<JJ.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<RB.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<VB.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""NN: {<NN.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'NN':
						for j in subtree.leaves():
							if f == 0:
								final += " "+j[0]
								f = 1
							else:
								final += " of "+j[0]
				f = 0
				print final
				final_string = grammar(final)  # send the generated sentence to the Ginger grammar checker for correction
				print final_string
				ws.send(final_string.upper())  # send the final sentence to the board
				return
	chunkGram = r"""NN:{<PRP.?>*<NN.?>?}"""  # same approach as for wh-questions, but for simple present-tense sentences
	chunkParser = nltk.RegexpParser(chunkGram)
	chunked = chunkParser.parse(tagged)
	for subtree in chunked.subtrees():
		if subtree.label() == 'NN':
			for j in subtree.leaves():
				f = 0
				w = nltk.word_tokenize(string)
				w.remove(j[0])
				final = ""
				final += " "+j[0]
				chunk = r"""VB: {<VBP>*<VBZ>*<VB>*<VB.?>*<MD.?>?}"""
				cp = nltk.RegexpParser(chunk)
				word = nltk.word_tokenize(t[0])
				tagged = nltk.pos_tag(word)
				ch = cp.parse(tagged)
				flg = 0
				for subtree in ch.subtrees():
					if subtree.label() == 'VB':
						for j in subtree.leaves():
							w.remove(j[0])
							final += " "+j[0]
							flg = 1
						break
				if flg == 0:
					final += " is"
				chunk = r"""PRP: {<PRP.?>?}"""
				cp = nltk.RegexpParser(chunk)

				ch = cp.parse(nltk.pos_tag(w))
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

							w.remove(j[0])
				chunk = r"""NN: {<NN.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(nltk.pos_tag(w))
				for subtree in ch.subtrees():
					if subtree.label() == 'NN':
						for j in subtree.leaves():
							if f == 0:
								final += " "+j[0]
								f = 1
							else:
								final += " of "+j[0]
							w.remove(j[0])
				f = 0
				for wrd in w:
					final += " "+wrd
				print final
				final_string = grammar(final)
				print final_string
				ws.send(final_string.upper())
				return
Example #43
def create_sentence_list(text):
    from nltk.tokenize import PunktSentenceTokenizer
    p = PunktSentenceTokenizer()
    return [sentence for sentence in p.tokenize(text)]
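
Note that PunktSentenceTokenizer.tokenize() already returns a list, so the comprehension above is equivalent to returning p.tokenize(text) directly. A quick indicative call:

print(create_sentence_list("First sentence here. Second one follows!"))
# -> ['First sentence here.', 'Second one follows!']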
Example #44
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
35.	WP$	Possessive wh-pronoun
36.	WRB	Wh-adverb
'''

train = state_union.raw("2005-GWBush.txt")
# text = state_union.raw("2006-GWBush.txt")

text = "George W Bush is the president of United States. Sky is blue and so are you."

# PunktSentenceTokenizer is a unsupervised ML tokenizer
training = PunktSentenceTokenizer(train)
tokenized_text = training.tokenize(text)


def process_content():
    try:
        for i in tokenized_text:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            #print tagged
            chunk_gram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunk_gram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))
group_seq = Sequence(1)
for group in os.listdir(input_dir):
    group_id = group_seq.id_for(group)
    sys.stderr.write("%s\t%s\n" % (group, group_id))
    for article_path in os.listdir("%s/%s" % (input_dir, group)):
        # stitch article into a single string
        article = ""
        for line in open("%s/%s/%s" % (input_dir, group, article_path)):
            line = line.strip()
            if len(line) > 0 and not line.startswith(">"):
                article += " "
                article += line
        # tokenise sentences then some simple normalisation, collect just binary features
        all_tokens = set()
        for sentence in sent_tokenizer.tokenize(article):
            tokens = token_tokenizer.tokenize(sentence)
            tokens = map(lambda t: t.lower().replace(":","").replace("|",""), tokens)  # : and | reserved for vw format
            tokens = filter(lambda t: len(t) > 3, tokens)
            all_tokens.update(tokens)
        # write out token features ; weight of 0.5
        sys.stdout.write("1 1 '%s_%s |tokens" % (group, article_path))  # 1 weight=1 label
        for token in all_tokens:
            sys.stdout.write(" %s:0.5" % token)
        # followed by single group feature (with weighting = magic 10, therefore quadratic features have weight 5)
        sys.stdout.write(" |group %s:10" % group)
        sys.stdout.write("\n")



Example #46
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
#PunktSentenceTokenizer
#is the trainable class behind the default sentence tokenizer,
#i.e. sent_tokenize()

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sentence_tokenizer = PunktSentenceTokenizer(
    train_text)  #training our model, which is optional

tokenized = custom_sentence_tokenizer.tokenize(
    sample_text
)  # you can also do, tokenized = PunktSentenceTokenizer().tokenize(sample_text)


def process_content():
    try:
        example_sent = "This is merely an example sentence, which shows the use of stopwords"
        stop_words = set(stopwords.words("english"))
        #we can also add our own stop_words
        stop_words.add("Hiiii")

        # stopwords are common words that carry little meaning, so removing them is usually helpful
        word_tokens = nltk.word_tokenize(example_sent)
        filtered_sentence = [w for w in word_tokens
                             if not w in stop_words]  #list comprehension
        #            OR
Example #47
+ = match 1 or more
? = match 0 or 1 repetitions.
* = match 0 or MORE repetitions	  
. = Any character except a new line
'''

from nltk.tag import pos_tag
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk import RegexpParser

sample_text_file = open("../sample.txt", "r")
text = sample_text_file.read()

pst = PunktSentenceTokenizer()

tokenized = pst.tokenize(text)


def process_content():
    try:
        for s in tokenized:
            words = word_tokenize(s)
            tagged = pos_tag(words)
            chunkGram = r"""Chunk: {<VB.?>*<NNP>+<NN>?}"""
            # look for any verb, at least one proper noun and zero or one noun
            chunkParser = RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
            # print(chunked)

    except Exception as e:
        print(str(e))
Example #48
Chinking is the counterpart of chunking: it removes a sequence of tokens (a "chink") from a chunk.
In other words, it is the removal of part of a chunk from that chunk.
You denote it with }{ placed after the chunking pattern, so that those tokens are explicitly excluded from the chunk.

'''
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")  # load the raw training text from the state_union corpus
sample_text = state_union.raw("2006-GWBush.txt")  # load the raw sample text the same way

custom_sent_tokenizer = PunktSentenceTokenizer(
    train_text)  # a custom sentence tokenizer: a PunktSentenceTokenizer trained on the training text

tokenized = custom_sent_tokenizer.tokenize(
    sample_text)  # Using the Custom text Tokenizer for the Tokenizing of the sample Text


def process_content ():
    try:
        for i in tokenized[0:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            # use the regular expression grammar to build the chunk parser
            chunked = chunkParser.parse(tagged)
            # parse the tagged words with the chunk/chink grammar
import nltk
from nltk.corpus import state_union
# An unsupervised machine learning sentence tokenizer (it can be trained on your own text).
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# Training text: a Clinton speech from 1993
TRAIN_TEXT = state_union.raw("1993-Clinton.txt")
# Sample text to be tokenized: a Clinton speech from 1994
SAMPLE_TEXT = state_union.raw("1994-Clinton.txt")

# Actual training of the PunktSentenceTokenizer
SENTENCE_TOKENIZER = PunktSentenceTokenizer(TRAIN_TEXT)

# Tokenizing using the trained model.
TOKENIZED = SENTENCE_TOKENIZER.tokenize(SAMPLE_TEXT)


# Processing function.
def process_content():
    try:
        for i in TOKENIZED:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print "POS of tagged words", (tagged)
    except Exception as e:
        print(str(e))


process_content()
Example #51
File: all_nltk.py, Project: Utkagr/NLPrel
ps = PorterStemmer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
	print(ps.stem(w))

##Part of Speech Tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#Unsupervised machine learning tokenizer -> PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text) #training on train_text
	
tokenized = custom_sent_tokenizer.tokenize(sample_text) #applying model to sample_text
#this will generate sentences

def process_content():
	try:
		for i in tokenized:
			words= nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)
			print(tagged)
	except Exception as e:
		print(str(e))
		
process_content()

#POS tag list
"""
Example #52
#read from txt file

# inputText = state_union.raw(os.path.abspath(os.path.join(os.getcwd(),"..\Dataset\RabindranathTagore.txt")))
inputText = state_union.raw(
    os.path.abspath(os.getcwd() + "\Dataset\RabindranathTagore.txt"))
experimentText = state_union.raw(
    "I:\Information\WorkSpace\AdiRepo\MachineLearning\DataSet\SubhasChandraBose.txt"
)

#train the tokenizer, if required.
trainedTokenizer = PunktSentenceTokenizer()
# trainedTokenizer = PunktSentenceTokenizer(inputText)

#tokenizing experimentText
sentences = trainedTokenizer.tokenize(experimentText)


def partOfSpeechTaggig():
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)

        #  pos_tag() take list of words or sentence as input and tag part of speech
        taggedWords = nltk.pos_tag(words)

        #region Chunking
        grammer = R"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>*}"""
        chunckingParser = nltk.RegexpParser(grammer)
        chunk = chunckingParser.parse(taggedWords)

        #for tWord in taggedWords:
Example #53
pa = parser.parse_args()
lang = pa.lang
filePath = pa.file
outputPath = filePath + '.sent'


if __name__ == "__main__":
    file = open(filePath, 'r')
    output = open(outputPath, 'w')
    sst = None
    if lang == 'EN':
        sst = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    elif lang == 'ES':
        sst = nltk.data.load('nltk:tokenizers/punkt/spanish.pickle')
    else:
        sst = PunktSentenceTokenizer()
    for line in file:
        if line == "\n":
            sys.stdout.write(line)
            continue
        line = line.replace("«", "'")
        line = line.replace("»", "'")
        line = line.replace("“", "'")
        line = line.replace("”", "'")
        line = line.replace("\"", "'")
        sentences = sst.tokenize(line.decode("utf-8"))
        for s in sentences:
            output.write((s+'\n').encode('utf-8'))
    file.close()
    output.close()
Example #54
)

#------------------------------------------------------------------------------------ Preprocessing
# Stop words
stop_words = set(stopwords.words('english'))
# Initializes the lemmatizer
normalizer = WordNetLemmatizer()
# Creates an empty list of processed speeches
preprocessed_speeches = []
# ---------------------- Preprocessing loop
for speech in speeches:
    # ------------------ Tokenizing
    # Initializes sentence tokenizer
    sentence_tokenizer = PunktSentenceTokenizer()
    # Tokenizes speech into sentences
    sentence_tokenized_speech = sentence_tokenizer.tokenize(speech)
    # ------------------ Normalizing loop
    # Creates an empty sentences list
    word_sentences = []
    for sentence in sentence_tokenized_speech:
        # ----------- Removes noise from sentence and tokenizes the sentence into words
        word_tokenized_sentence = [re.sub('[^a-zA-Z0-9]+', '', word.lower()) \
                                   for word in sentence.replace(",", "").replace("-", " ").replace(":", "").split()]
        # ---------------- Removes stopwords from sentences
        sentence_no_stopwords = [
            word for word in word_tokenized_sentence if word not in stop_words
        ]
        # ---------------- Before lemmatizing, adds a 's' to the word 'us'
        word_sentence_us = [
            'uss' if word == 'us' else word for word in sentence_no_stopwords
        ]
Example #55
#!/usr/bin/env python

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #unsupervised tokenizer


train_text = state_union.raw('2005-GWBush.txt')

#print train_text

test_text = state_union.raw('2006-GWBush.txt')

custom_sent_token = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_token.tokenize(test_text)

#print tokenized
#print type(tokenized)

def chunk():
	try:
		for i in tokenized:
			words = nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)

			regexp = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} 
								}<VB.?|IN|DT|TO>+{"""

			parser = nltk.RegexpParser(regexp)
[] = quant[ia]tative = will find either quantitative, or quantatative.
[a-z] = return any lowercase letter a-z
[1-5a-qA-Z] = return all numbers 1-5, lowercase letters a-q and uppercase A-Z"""

# In[3]:

train = state_union.raw("2005-GWBush.txt")
test = state_union.raw("2006-GWBush.txt")

# In[4]:

pst = PunktSentenceTokenizer(train)  #training the tokenizer

# In[5]:

tokenised = pst.tokenize(test)

# In[7]:

for i in tokenised:
    words = word_tokenize(i)
    tokenise = nltk.pos_tag(words)

    chunkgram = r"""chunk :{<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkprase = nltk.RegexpParser(chunkgram)
    chunkd = chunkprase.parse(tokenise)
    print(chunkd)

# In[11]:

Example #57
## file myself or do I feed it straight into NLTK?
## option 1
content = file.read()
#print content
match = re.findall(r"msgstr\s\"([^\"\\]*)\"", content)
match = " ".join(match)
match = match.decode('utf8')
#print match
#print(sent_tokenize(match))

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(match)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print (str(e))

processed_content = process_content()

print(processed_content)
#labeling part of speech
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  #unsupervised [we can re train]

#train_text = state_union.raw("2005-GWBush.txt")
#sample_text = state_union.raw("2006-GWBush.txt")

train_text = "This is a training text, which consists of many place names like: India, America and USA. Since China has corona virus in the country. Donald Trump refuses to add them in the meeting."
sample_text = "This is a sample text which goes through the program and tests out whether it is working or not. Country: India, China, USA and Name: Donald Trump, Modi, Varun etc."

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  #training the text

tokenized = custom_sent_tokenizer.tokenize(
    sample_text)  #after training we apply it to different data

#Prints which tokens are verbs, adjectives, and so on.


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()
Example #59
File: chinkikng.py, Project: MIS407/pyFiles
"""
Created on Thu Nov 19 09:15:11 2015

@author: nilakant
"""


import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#unsupervised tokenizer
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
            
            #print(tagged)
Example #60
                    "--train",
                    type=str,
                    nargs="*",
                    metavar="language(s)",
                    default=None,
                    help="Train a model of given language(s)")

# parse the arguments from standard input
args = parser.parse_args()

if args.intra is not None:
    if len(args.intra) == 0:
        sentence = input('Enter a sentence for POS tagging: ')
        print('sent for POS tagging:  {}'.format(sentence))
        punktok = PunktSentenceTokenizer()
        tokenized = punktok.tokenize(text=sentence)
        POSGenerator = pos.POSGenerator(method='nltk')
        POSGenerator.process_content(tokenized_text=tokenized)

    else:  # file name input
        print('file path for POS tagging: {}'.format(args.intra[0]))
        try:
            f = open(args.intra[0])
            print('file read')
        except Exception as e:
            print(e)

if args.train is not None:
    if len(args.train) == 0:
        print('Please specify a language')
    else: