Example #1
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

traintext = state_union.raw(
    "/home/varun/PycharmProjects/untitled/speechtrain.txt")
sampletext = state_union.raw(
    "/home/varun/PycharmProjects/untitled/speechsample.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(traintext)
tokenized = custom_sent_tokenizer.tokenize(sampletext)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            # Chunking alternatives from the original, kept commented out:
            # chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
            # chunkgram = r"""Chunk: {<.*>+}
            #                         }<VB.?|IN|DT|TO>+{"""
            # chunkParser = nltk.RegexpParser(chunkgram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)

            # Named entity recognition; binary=True lumps all entity types together
            nameEnt = nltk.ne_chunk(tagged, binary=True)
            print(nameEnt)

    except Exception as e:
        print(str(e))
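
# A minimal standalone sketch (not part of the example above) of walking the
# binary NE tree returned by nltk.ne_chunk to pull out just the entity strings.
# Assumes the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words
# NLTK data packages are installed.
def extract_named_entities(sentence):
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
    tree = nltk.ne_chunk(tagged_words, binary=True)
    return [" ".join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees()
            if subtree.label() == "NE"]


print(extract_named_entities("George Washington lived in Virginia."))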

Example #2
# Note: this fragment assumes that `checkpoint` (a path to a saved model),
# `device`, and a `preprocess()` helper are defined elsewhere in the original script.
import json
import os

import torch
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

checkpoint = torch.load(checkpoint, map_location=device)
model = checkpoint['model']
model = model.to(device)
model.eval()

# Pad limits, can use any high-enough value since our model does not compute over the pads
sentence_limit = 15
word_limit = 20

# Word map to encode with
data_folder = './han_data'
with open(os.path.join(data_folder, 'word_map.json'), 'r') as j:
    word_map = json.load(j)

# Tokenizers
sent_tokenizer = PunktSentenceTokenizer()
word_tokenizer = TreebankWordTokenizer()

classes = ["1", "2", "3", "4", "5"]
label_map = {k: v for v, k in enumerate(classes)}
rev_label_map = {v: k for k, v in label_map.items()}


def classify(document):
    doc = list()

    # Tokenize document into sentences
    sentences = list()
    for paragraph in preprocess(document).splitlines():
        sentences.extend([s for s in sent_tokenizer.tokenize(paragraph)])
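
# A hedged sketch (not the original author's code) of the usual next step in
# classify(): word-tokenize each sentence and truncate to the pad limits above.
example_sentences = sent_tokenizer.tokenize("This is one sentence. Here is another.")
example_doc = [word_tokenizer.tokenize(s)[:word_limit]
               for s in example_sentences[:sentence_limit]]
print(example_doc)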
Example #3
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenize = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenize[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                            }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()
#labeling part of speech
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # unsupervised tokenizer (we can retrain it)

#train_text = state_union.raw("2005-GWBush.txt")
#sample_text = state_union.raw("2006-GWBush.txt")

train_text = "This is a training text, which consists of manya place name like: India, America and USA. Since Chinahas corona virus in the contry. Donald trump refuse to add them in the meeting."
sample_text = "This is a sample text which goes under the program and test  out that either it working or not. Contry : India , China, USA and Name : Donald Trump, Modi, Varun better etc."

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  # training the tokenizer on the text

tokenized = custom_sent_tokenizer.tokenize(
    sample_text)  # after training, apply the tokenizer to different data

# Print each word along with its part-of-speech tag (verbs, adjectives, and so on).


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()
def tokenizeText(text):
    text = text.replace("?", "?,")
    custom_sent_tokenizer = PunktSentenceTokenizer(text)
    tokenize = custom_sent_tokenizer.tokenize(text)
    return tokenize
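
# Hedged usage sketch for tokenizeText defined above (the input text is made up):
for sentence in tokenizeText("Where is the station? It is near the park."):
    print(sentence)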
Example #6
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

train_txt = "Shooting is a very popular sport. Hunters use guns to shoot animals. Terrorist also use them to kill."
sample = "Sharpshooter Mark shoots a dangerous animal with a gun."

custom_tokenizer = PunktSentenceTokenizer(train_text=train_txt)
tokenized = custom_tokenizer.tokenize(sample)


def process():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunk_gram = r"""chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(e)


process()
import nltk
import os
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

#read from txt file

# inputText = state_union.raw(os.path.abspath(os.path.join(os.getcwd(), r"..\Dataset\RabindranathTagore.txt")))
inputText = state_union.raw(
    os.path.abspath(os.getcwd() + r"\Dataset\RabindranathTagore.txt"))
experimentText = state_union.raw(
    r"I:\Information\WorkSpace\AdiRepo\MachineLearning\DataSet\SubhasChandraBose.txt"
)

# train the tokenizer, if required.
trainedTokenizer = PunktSentenceTokenizer()
# trainedTokenizer = PunktSentenceTokenizer(inputText)

#tokenizing experimentText
sentences = trainedTokenizer.tokenize(experimentText)


def partOfSpeechTagging():
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)

        # pos_tag() takes a list of words (a sentence) and tags each word's part of speech
        taggedWords = nltk.pos_tag(words)

        #region Chunking
        grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>*}"""
Example #8
import nltk
from nltk.tokenize import PunktSentenceTokenizer

sentence1 = """The group arrived at two o'clock on Monday afternoon to start
class."""
sentence2 = """The Little Mermaid (Danish: Den lille havfrue) is a fairy tale 
written by the Danish author Hans Christian Andersen about a young mermaid who 
is willing to give up her life in the sea and her identity as a mermaid to gain 
a human soul."""

#Chunking
custom_sent_tokenizer = PunktSentenceTokenizer(sentence1)
tokenized = custom_sent_tokenizer.tokenize(sentence2)


def process_content():
    for i in tokenized:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        # chunked.draw()
        print(chunked)


process_content()

# many named nouns
# Chunking: a "chunk" is a noun phrase - a noun plus the modifiers around it,
# the descriptive group of words surrounding that noun. downside: only can use
def natural_sentence(string):
	pst = PunktSentenceTokenizer(string)
	t = pst.tokenize(string)

	word = nltk.word_tokenize(t[0])  # split the first sentence into words
	tagged = nltk.pos_tag(word)  # tag each word (noun, pronoun, verb, etc.)
	print(tagged)
	chunkGram = r"""WRB:{<WRB.?>*<WP>*<WDT>?}"""  # regexp for detecting wh-questions
	chunkParser = nltk.RegexpParser(chunkGram)  # parser that picks out the wh-question words
	chunked = chunkParser.parse(tagged)  # parse the tagged words into a tree
	for subtree in chunked.subtrees():
		if subtree.label() == 'WRB':			# for only wh question
			for j in subtree.leaves():
				f = 0
				final = ""
				final += j[0]

				chunk = r"""VB: {<VBZ>*<VBP>?}"""							#here we are detecting type of wording and arranging it to proper place
				cp = nltk.RegexpParser(chunk)
				word = nltk.word_tokenize(t[0])
				tagged = nltk.pos_tag(word)
				ch = cp.parse(tagged)
				flg = 0
				for subtree in ch.subtrees():
					if subtree.label() == 'VB':
						for j in subtree.leaves():
							final += " "+j[0]

							flg = 1
						break
				if flg == 0:
					final += " is"

				chunk = r"""PRP: {<PRP.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<JJ.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<RB.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<VB.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""NN: {<NN.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'NN':
						for j in subtree.leaves():
							if f == 0:
								final += " "+j[0]
								f = 1
							else:
								final += " of "+j[0]
				f = 0
				print(final)
				final_string = grammar(final)  # send the generated sentence to the Ginger grammar checker
				print(final_string)
				ws.send(final_string.upper())  # send the corrected sentence to the board
				return
	chunkGram = r"""NN:{<PRP.?>*<NN.?>?}"""					#same thing like wh question is here for simple present tence sentance
	chunkParser = nltk.RegexpParser(chunkGram)
	chunked = chunkParser.parse(tagged)
	for subtree in chunked.subtrees():
		if subtree.label() == 'NN':
			for j in subtree.leaves():
				f = 0
				w = nltk.word_tokenize(string)
				w.remove(j[0])
				final = ""
				final += " "+j[0]
				chunk = r"""VB: {<VBP>*<VBZ>*<VB>*<VB.?>*<MD.?>?}"""
				cp = nltk.RegexpParser(chunk)
				word = nltk.word_tokenize(t[0])
				tagged = nltk.pos_tag(word)
				ch = cp.parse(tagged)
				flg = 0
				for subtree in ch.subtrees():
					if subtree.label() == 'VB':
						for j in subtree.leaves():
							w.remove(j[0])
							final += " "+j[0]
							flg = 1
						break
				if flg == 0:
					final += " is"
				chunk = r"""PRP: {<PRP.?>?}"""
				cp = nltk.RegexpParser(chunk)

				ch = cp.parse(nltk.pos_tag(w))
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

							w.remove(j[0])
				chunk = r"""NN: {<NN.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(nltk.pos_tag(w))
				for subtree in ch.subtrees():
					if subtree.label() == 'NN':
						for j in subtree.leaves():
							if f == 0:
								final += " "+j[0]
								f = 1
							else:
								final += " of "+j[0]
							w.remove(j[0])
				f = 0
				for wrd in w:
					final += " "+wrd
				print(final)
				final_string = grammar(final)
				print(final_string)
				ws.send(final_string.upper())
				return
Example #10
from typing import List

from nltk.tokenize import PunktSentenceTokenizer


def tokenize_sentence(input_text: str) -> List[str]:
    """ Converts a text into a list of sentence tokens """
    if input_text is None or len(input_text) == 0:
        return []
    tokenizer = PunktSentenceTokenizer()
    return tokenizer.tokenize(input_text)
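
# Hedged usage sketch for tokenize_sentence (the input text is made up):
print(tokenize_sentence("Dr. Smith arrived at 10 a.m. He left an hour later."))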
Example #11
# Note: REGENERATE and OUTFILE are assumed to be set earlier in the original
# script (e.g. from the --generate flag and an output-file path).
import pickle

from gensim.models.phrases import Phrases
from nltk.tokenize import PunktSentenceTokenizer

# This section triggers if you supply the --generate flag, indicating
# you want to recreate the training data/labels
if REGENERATE:

    print("Generating data from scratch.")

    texts = pickle.load(open(OUTFILE, 'rb'))[0]

    # This splits your list of texts into a list of sentences
    # At this point (in the training data) document borders
    # are removed.

    sentences = [
        item for text in texts
        for item in PunktSentenceTokenizer().tokenize(text.decode("utf8"))
    ]
    sentences = [
        i.strip(' \n,.;:').replace('\n', ' ').split(' ') for i in sentences
    ]

    # Create and train bigram/trigram converters
    unigram = Phrases(sentences, threshold=float("inf"))
    unigrams = unigram.export_phrases(sentences)

    grams = []  #[gmp.Phraser(unigram)]

    sentences_copy = sentences

    threshold = 8.0
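
# A hedged sketch (not the original script) of how a bigram Phrases model built
# from these sentences would typically be applied; the example token list is made up.
bigram = Phrases(sentences, min_count=1, threshold=threshold)
print(bigram[["new", "york", "is", "a", "city"]])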
Example #12
# ? = match 0 or 1 repetitions.
# * = match 0 or MORE repetitions
# . = Any character except a new line

import nltk
from nltk.corpus import state_union
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
# PunktSentenceTokenizer is the class behind the default sentence tokenizer,
# i.e. the one used by sent_tokenize()

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sentence_tokenizer = PunktSentenceTokenizer(
    train_text)  # training our tokenizer, which is optional

tokenized = custom_sentence_tokenizer.tokenize(
    sample_text
)  # you can also do, tokenized = PunktSentenceTokenizer().tokenize(sample_text)


def process_content():
    try:
        example_sent = "This is merely an example sentence, which shows the use of stopwords"
        stop_words = set(stopwords.words("english"))
        #we can also add our own stop_words
        stop_words.add("Hiiii")

        # Stopwords carry little meaning on their own, so they are commonly removed.
        word_tokens = word_tokenize(example_sent)

    except Exception as e:
        print(str(e))
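
# A hedged sketch (separate from the truncated function above) of the stopword
# filtering the comments describe:
example_words = word_tokenize(
    "This is merely an example sentence, which shows the use of stopwords")
filtered_words = [w for w in example_words
                  if w.lower() not in set(stopwords.words("english"))]
print(filtered_words)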
Example #13
30.	VBN	Verb, past participle
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
35.	WP$	Possessive wh-pronoun
36.	WRB	Wh-adverb
'''

train = state_union.raw("2005-GWBush.txt")
# text = state_union.raw("2006-GWBush.txt")

text = "George W Bush is the president of United States. Sky is blue and so are you."

# PunktSentenceTokenizer is an unsupervised ML tokenizer
training = PunktSentenceTokenizer(train)
tokenized_text = training.tokenize(text)


def process_content():
    try:
        for i in tokenized_text:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            #print tagged
            chunk_gram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunk_gram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))
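
# A hedged, self-contained sketch of printing the chunks instead of drawing them
# (made-up sentence; nltk and word_tokenize are assumed imported as above):
demo_tagged = nltk.pos_tag(word_tokenize("President George W Bush spoke briefly today."))
demo_chunked = nltk.RegexpParser(r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>+<NN>?}""").parse(demo_tagged)
for subtree in demo_chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
    print(subtree)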
Example #14
'''
Chinking is closely related to chunking: it is the removal of a chunk from a chunk
(the part that gets removed is called a chink).
You denote the chink with }{ after the chunking pattern, so those tags are
explicitly excluded from the chunk.
'''
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")  # Adding the Raw text using the state_unio from the Text file
sample_text = state_union.raw("2006-GWBush.txt")  # Adding the Sample Text With the same process using the state union

custom_sent_tokenizer = PunktSentenceTokenizer(
    train_text)  # using the Custom sentence Tokenizer which uses the Punksentence tokenizer for the training of the text

tokenized = custom_sent_tokenizer.tokenize(
    sample_text)  # Using the Custom text Tokenizer for the Tokenizing of the sample Text


def process_content():
    try:
        for i in tokenized[0:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)

    except Exception as e:
        print(str(e))
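
# A small, self-contained sketch of the chink syntax described in the docstring
# above (made-up sentence): chunk everything, then chink verbs, prepositions,
# determiners and "to" back out with }<...>{.
demo_tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog"))
demo_chunked = nltk.RegexpParser(r"""Chunk: {<.*>+}
                                             }<VB.?|IN|DT|TO>+{""").parse(demo_tagged)
print(demo_chunked)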
Example #15
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train = state_union.raw('2005-GWBush.txt')
sample = state_union.raw('2006-GWBush.txt')
custom_sent_tokenizer = PunktSentenceTokenizer(train)

tokenized = custom_sent_tokenizer.tokenize(sample)

words = nltk.word_tokenize(tokenized[0])
tagged = nltk.pos_tag(words)
print(tagged)


# # chunking
# def process_content():
#     try:
#         for i in tokenized:
#             words = nltk.word_tokenize(i)
#             tagged = nltk.pos_tag(words)
#             chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} """
            
#             chunkParser = nltk.RegexpParser(chunkGram)
#             chunked = chunkParser.parse(tagged)
            
#             print(chunked)
            

#     except Exception as e:
#         print(str(e))
Example #16
# Named Entity Recognition

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEnt = nltk.ne_chunk(tagged, binary=True)

            namedEnt.draw()

    except Exception as e:
        print(str(e))


process_content()
Example #17
Mainly from regular expressions, we are going to utilize the following:

+ = match 1 or more
? = match 0 or 1 repetitions.
* = match 0 or MORE repetitions	  
. = Any character except a new line
'''

import re

from nltk.tag import pos_tag
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk import RegexpParser

sample_text_file = open("../sample.txt", "r")
text = sample_text_file.read()

pst = PunktSentenceTokenizer()

tokenized = pst.tokenize(text)


def process_content():
    try:
        for s in tokenized:
            words = word_tokenize(s)
            tagged = pos_tag(words)
            chunkGram = r"""Chunk: {<VB.?>*<NNP>+<NN>?}"""
            # look for zero or more verbs, at least one proper noun and zero or one noun
            chunkParser = RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
            # print(chunked)
    except Exception as e:
        print(str(e))


# Helper used by the sentiment-analysis snippet below: a case-insensitive,
# whole-word search for w. (df, st (statistics), TextBlob and
# SentimentIntensityAnalyzer used below are assumed to be imported/defined
# elsewhere in the original script.)
def find_whole_word(w):
    return re.compile(r'({0})'.format(w), flags=re.IGNORECASE).search


POLARITY_TEXTBLOB = []
SUBJECTIVITY = []
POLARITY_VADER = []
POLARITY_ARTICLE = []
TEXTBLOB_FULL_ARTICLE = []
for news in df["Content"]:
    VADER_ARTICLE_COMPOUND = []
    TEXTBLOB_ARTICLE_POLARITY = []
    TEXTBLOB_ARTICLE_SUBJECTIVITY = []
    try:
        a = find_whole_word('/Bloomberg')(news).span()[1]
        #       b = find_whole_word('Reporting by')(news).span()[0]
        sentences = PunktSentenceTokenizer().tokenize(news[a + 1:])
    except:
        sentences = PunktSentenceTokenizer().tokenize(news)

    for sentence in sentences:
        vaderAnalyzer = SentimentIntensityAnalyzer()
        vs = vaderAnalyzer.polarity_scores(sentence)
        textBlobAnalyzer = TextBlob(sentence)
        VADER_ARTICLE_COMPOUND.append(vs["compound"])
        TEXTBLOB_ARTICLE_POLARITY.append(textBlobAnalyzer.sentiment.polarity)
        TEXTBLOB_ARTICLE_SUBJECTIVITY.append(
            textBlobAnalyzer.sentiment.subjectivity)
    POLARITY_TEXTBLOB.append(st.mean(TEXTBLOB_ARTICLE_POLARITY))
    SUBJECTIVITY.append(st.mean(TEXTBLOB_ARTICLE_SUBJECTIVITY))
    POLARITY_VADER.append(st.mean(VADER_ARTICLE_COMPOUND))
    TEXTBLOB_FULL_ARTICLE.append(TextBlob(news).sentiment.polarity)
# Opinion mining and Sentiment analysis using the Natural Language Tool-kit
# Exercise number 4.

import nltk
from nltk.corpus import state_union
# An unsupervised machine learning tokenizer(comes pre-trained.)
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# The PunktSentenceTokenizer is trained on a Clinton speech from 1993
TRAIN_TEXT = state_union.raw("1993-Clinton.txt")
# The sample text to tokenize is a Clinton speech from 1994
SAMPLE_TEXT = state_union.raw("1994-Clinton.txt")

# Actual training of the PunktSentenceTokenizer
SENTENCE_TOKENIZER = PunktSentenceTokenizer(TRAIN_TEXT)

# Tokenizing using the trained model.
TOKENIZED = SENTENCE_TOKENIZER.tokenize(SAMPLE_TEXT)


# Processing function.
def process_content():
    try:
        for i in TOKENIZED:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print "POS of tagged words", (tagged)
    except Exception as e:
        print(str(e))

Example #20
"""

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # The Tokenizer which will be used.

'''This tokenizer, PunktSentenceTokenizer, is capable of unsupervised machine learning,'''
'''so you can actually train it on any body of text that you use. First, let's get some imports out of the way that we're going to use:'''
def process_content():
    try:
        for i in tokenized[:5]:  # limit processing to the first five sentences
            words = nltk.word_tokenize(i)  # tokenize the sentence into words
            tagged = nltk.pos_tag(words)  # tag each word with its part of speech
            print(tagged)  # print the (word, tag) tuples


    except Exception as e:
        print(str(e))  # if there is an exception then this prints out the exception



if __name__ == '__main__':
    train_text = state_union.raw("2005-GWBUSH.txt")   # This is the train text which will be used to tokenize the sample Test(unsupervised learning)
    sample_text = state_union.raw("2006-GWBUSH.txt")  # This is the sample text which will be tokenized later onward
    print(type(sample_text))
    custom_sent_tokenizer = PunktSentenceTokenizer(
        train_text)  # This is the Train Text in the form of sentence being tokenized using the unsupervised learning.!
    #tokenized  = custom_sent_tokenizer.tokenize(sample_text) # Tokenizing he Custom sentence tokenize
    tokenized  = custom_sent_tokenizer.tokenize("Hi! my name is Shafay. I am 20 years old. I love playing games.")
    #print(tokenized) # this is just for the Debugging purposes!
    process_content()  # Calling the process content function!
. + * ? [ ] $ ^ ( ) { } | \
Brackets:

[] = quant[ia]tative = will find either quantitative, or quantatative.
[a-z] = return any lowercase letter a-z
[1-5a-qA-Z] = return all numbers 1-5, lowercase letters a-q and uppercase A-Z"""

# In[3]:

train = state_union.raw("2005-GWBush.txt")
test = state_union.raw("2006-GWBush.txt")

# In[4]:

pst = PunktSentenceTokenizer(train)  #training the tokenizer

# In[5]:

tokenised = pst.tokenize(test)

# In[7]:

for i in tokenised:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)

    chunkgram = r"""chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkparser = nltk.RegexpParser(chunkgram)
    chunked = chunkparser.parse(tagged)
    print(chunked)
Example #22
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))
    
#%% POS (Part Of Speech) TAGGING
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text) #Pretrained tokenizer, can be retrained
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))

process_content()

#%% Lemmatizing 
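
# A hedged sketch of what a lemmatizing cell typically contains (the original
# cell is not shown above); WordNetLemmatizer comes from NLTK.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))             # -> cat
print(lemmatizer.lemmatize("better", pos="a"))  # -> good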
Example #23
parser.add_argument("-t",
                    "--train",
                    type=str,
                    nargs="*",
                    metavar="language(s)",
                    default=None,
                    help="Train a model of given language(s)")

# parse the arguments from standard input
args = parser.parse_args()

if args.intra is not None:
    if len(args.intra) == 0:
        sentence = input('Enter a sentence for POS tagging: ')
        print('sent for POS tagging:  {}'.format(sentence))
        punktok = PunktSentenceTokenizer()
        tokenized = punktok.tokenize(text=sentence)
        POSGenerator = pos.POSGenerator(method='nltk')
        POSGenerator.process_content(tokenized_text=tokenized)

    else:  # file name input
        print('file path for POS tagging: {}'.format(args.intra[0]))
        try:
            f = open(args.intra[0])
            print('file read')
        except Exception as e:
            print(e)

if args.train is not None:
    if len(args.train) == 0:
        print('Please specify a language')
Example #24
import nltk
import wikipedia
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw("2005-GWBush.txt")
_text = input("Enter a text:")
sample_text = wikipedia.summary(_text, sentences=1)
custom_SentTok = PunktSentenceTokenizer(train_text)
tokenized = custom_SentTok.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                   }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

    except Exception as e:
        print(str(e))


process_content()
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 24 06:45:40 2020

@author: Dell
"""

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

trainText = "A malapropism also called a malaprop, acyrologia, or Dogberryism is the mistaken use of an incorrect word in place of a word with a similar sound, resulting in a nonsensical, sometimes humorous utterance. An example is the statement by baseball player Yogi Berra, Texas has a lot of electrical votes, rather than electoral votes. Malapropisms often occur as errors in natural speech and are sometimes the subject of media attention, especially when made by politicians or other prominent individuals. Philosopher Donald Davidson has said that malapropisms show the comple process through which the brain translates thoughts into language.Humorous malapropisms are the type that attract the most attention and commentary, but bland malapropisms are common in speech and writing."

customSentTokenizer = PunktSentenceTokenizer(trainText)
tokenized = customSentTokenizer.tokenize(trainText)


def processContent():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            #chunkGram = """Chunk: {<RB}"""
            print(tagged)

    except Exception as e:
        print(str(e))


processContent()
Example #26
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_raw = state_union.raw("2005-GWBush.txt")
sample_raw = state_union.raw("2006-GWBush.txt")

tokenizer = PunktSentenceTokenizer(train_raw)
sentences = tokenizer.tokenize(sample_raw)


def process_data():
    try:
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()

    except Exception as e:
        print(str(e))


process_data()
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
'''

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = sent_tokenizer.tokenize(sample_text)


def processContent():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # print(tagged)

            # namedEnt = nltk.ne_chunk(tagged)
            namedEnt = nltk.ne_chunk(
                tagged, binary=True
            )  # binary=True: used when the type of the named entity is not important (it puts all named entities together)
            print(namedEnt)

    except Exception as e:
        print(str(e))
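
# A hedged, self-contained sketch of the non-binary variant: keep the entity
# types (PERSON, GPE, ...) listed in the docstring above (made-up sentence).
demo_tagged = nltk.pos_tag(word_tokenize("Barack Obama visited Mount Everest in June."))
for subtree in nltk.ne_chunk(demo_tagged).subtrees():
    if subtree.label() != "S":
        print(subtree.label(), " ".join(word for word, tag in subtree.leaves()))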
Example #28
import nltk
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer

text_file = open(".\login2.txt", "r")
text = text_file.read()

word = nltk.word_tokenize(text)

custom_sen = PunktSentenceTokenizer(text)
tokenized = custom_sen.tokenize("hello. how can i login. where is otp?")


def process():
    try:
        for w in tokenized:
            word = nltk.word_tokenize(w)
            tagged = nltk.pos_tag(word)

            namedEnt = nltk.ne_chunk(tagged)
            namedEnt.draw()
            print(tagged)

    except Exception as e:
        print(str(e))


process()
Example #29
'''

#01 Segmentation

sentences = brown.sents(categories=category)

tokens = brown.words(categories=category)
new_token = []
for w in tokens:
    word = re.sub(r'[-[_\],`!?():{}&$#@%*+;/\'"\t\n\b0-9]', r'', w.lower())
    if word != '' and word not in stop_Words:
        new_token.append(word)

row_text = ' '.join(new_token)
# unsupervised ML algorithm to detect the end of sentences (EOS)
custom_sent_tokenizer = PunktSentenceTokenizer(row_text)
tokenized = custom_sent_tokenizer.tokenize(row_text)
last_text = ' '.join(tokenized)


#prediction Algorithm
def markov_chain(text):
    words = text.split(' ')
    myDict = defaultdict(list)
    for currentWord, nextWord in zip(words[0:-1], words[1:]):
        myDict[currentWord].append(nextWord)
    myDict = dict(myDict)
    return myDict


markov_return = markov_chain(last_text)
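
# A hedged sketch (not part of the original snippet) of generating text from the
# word -> next-words mapping built by markov_chain above:
import random


def generate_from_chain(chain, n_words=15):
    word = random.choice(list(chain.keys()))
    out = [word]
    for _ in range(n_words - 1):
        followers = chain.get(word)
        if not followers:
            break
        word = random.choice(followers)
        out.append(word)
    return ' '.join(out)


print(generate_from_chain(markov_return))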
Example #30
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

test_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_toz = PunktSentenceTokenizer(test_text)

to = custom_toz.tokenize(sample_text)


def process_content():
    try:
        for i in to[:100]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            print(tagged)
    except Exception as e:
        print(str(e))


process_content()