Example #1
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)

    # textfile = open("POS_tagged",'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)

    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []
    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            # Skip sentences that fail to tokenize or tag.
            pass
            # print(str(e))
    process_content()
    return tuples_list
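# Usage sketch (not part of the original snippet): assumes the imports the function
# relies on (nltk, nltk.corpus.state_union, PunktSentenceTokenizer) plus the NLTK
# "punkt" and "averaged_perceptron_tagger" data; the sample text is made up.
sample_tags = POS_tagging("The president addressed Congress. He spoke for an hour.")
print(sample_tags[:5])  # e.g. [('The', 'DT'), ('president', 'NN'), ...]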
Example #2
 def __init__(self,sentence):
    f = open('data/training_data', 'r')
    train_text=f.read()
    #data=open('data2','r')
    #test_data=data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
def extractNounPhrases(sentence):

    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        firstNN = False

        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue

            firstNN = False

    except Exception as e:
        print(str(e))

    return nounPhrases
Example #4
    def get_sentences(self, remove_url=True):
        '''
        Generator.
        :param remove_url: replace URLs in sentences with a single space character
        :return: tuple of sentences for each MIME part
        '''

        tokenizer = PunktSentenceTokenizer()

        for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):

            if 'html' in mime_type:
                soup = BeautifulSoup(raw_line)
                if not soup.body:
                    continue
                # we need whole sentences; soup.body.strings yields lines ending in CR/LF
                lines = tuple(soup.body.strings)
                raw_line = ''.join(lines)

            try:
                sents = tuple(tokenizer.tokenize(raw_line))
            except Exception as err:
                sents = (raw_line,)  # fall back to treating the whole text as one sentence

            if remove_url:
                sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))

            sents = (s.strip().lower() for s in sents)
            sents = tuple(s for s in tuple(sents) if s)
            if len(sents) == 0:
                continue

            yield sents
def normalize(text):
    p = PunktSentenceTokenizer()
    bullet1 = '\xe2\x80\xa2'.decode('utf-8')  # Python 2; the bullet character u'\u2022'
    bullet2 = '\xc2\xb7'.decode('utf-8')      # Python 2; the middle dot u'\xb7'
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Example #6
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
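# Usage sketch (not from the original): assumes the tokenizer imports used above
# (BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer from nltk.tokenize);
# the dialogue is made up. Each returned item is one subtitle: a list of at most two
# lines, each at most 38 characters long.
dialogue = ("Hello there, how are you doing today?\n\n"
            "I am doing quite well, thank you very much for asking me.")
for subtitle in tokenize_english_document(dialogue):
    print(subtitle)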
def tag(sentence):

    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        return tagged

    except Exception as e:
        print(str(e))
Example #8
 def aristo_get_named_entities(self, text):
     """
     Parses the texts to obtain named entities
     :param text: The text to parse
     :return:returns a named entity treexw
     """
     custom_sent_tokenizer = PunktSentenceTokenizer(text)
     tokenized = custom_sent_tokenizer.tokenize(text)
     for i in tokenized[5:]:
         words = nltk.word_tokenize(i)
         tagged = nltk.pos_tag(words)
         namedEnt = nltk.ne_chunk(tagged, binary=False)
         return namedEnt  # note: returns after the first processed sentence
Example #9
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Example #10
def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            #chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))

    return taggedSentence
Example #11
	def pos(self, paragraph):

		wordsdict = collections.OrderedDict()
		sent_tokenizer = PunktSentenceTokenizer()

		for sentence in self.sent_detector.tokenize(paragraph):
			tokens = sent_tokenizer.tokenize(sentence)

			for token in tokens:
				words = nltk.word_tokenize(token)
				tagged = nltk.pos_tag(words)
				for word in tagged:
					if word[1] in self.tagdict:
						wordsdict[word[0]] = self.tagdict[word[1]][0]

		return wordsdict
Example #12
class Tokenizer(object):

    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using NLTK Penn Treebank tokenizer, and the Punkt sentence tokenizer.
        Params:
        language: Language to tokenize (currently doesn't do anything)
        train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize
        
        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)

        return tokenized
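# Usage sketch (not part of the original): assumes the word_tokenize import the class
# relies on (from nltk.tokenize import word_tokenize) and the NLTK punkt data.
tok = Tokenizer("english", normalize=True)
print(tok.tokenize("NLTK is handy. It splits text into sentences."))
# e.g. [['nltk', 'is', 'handy', '.'], ['it', 'splits', 'text', 'into', 'sentences', '.']]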
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = input("1 for named_chunks. This provides some information about proper nouns.\n2 for process_chunks. This tells you if a noun phrase followed by an adverb occurs.\n3 for process_content. This just prints stuff.\n4 for...")
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print "try again, bitch!"
Example #14
def get_sentence_occurrences(document, terms, doc_term=None):
    terms_present = get_document_occurrences(document, terms)

    # Use a Tokenizer from NLTK to build a sentence list
    tokenizer = Tokenizer(document)
    sentences = tokenizer.tokenize(document)
    
    # Create a list of lists containing the collections of terms which co-occur
    # in a sentence
    occurrences = []
    for sentence in sentences:
        sentence_occurrences = set() 

        for term in terms_present:
            if term != doc_term:
                if re.search(' %s ' % term.label, sentence):
                    sentence_occurrences.add(term)
        

        if len(sentence_occurrences) > 0:
            sentence_occurrences = list(sentence_occurrences)
            to_remove = set()

            for inside in sentence_occurrences:
                for term in sentence_occurrences:
                    if term != inside and\
                        term.label.find(inside.label) != -1:
                        to_remove.add(inside)
            
            if to_remove:
                print "removing", to_remove

            for term in to_remove:
                sentence_occurrences.remove(term)

            if doc_term:
                sentence_occurrences.append(doc_term)

            occurrences.append(sentence_occurrences)
    
    return occurrences
Example #15
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                     }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # #print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
Example #16
    def extract_features(self):
        """
        All approach of extracting features from raw data implemented here
        """
        custom_tokenizer = PunktSentenceTokenizer()
        regex_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        ps = PorterStemmer()
        tokenized = []

        with open(self.file_path, 'r') as current_document:
            for each_line in current_document:
                tokenized.extend(custom_tokenizer.tokenize(each_line))  # tokenize into sentences line by line
        feature_list = []
        try:
            for each_sentence in tokenized:
                # words = nltk.word_tokenize(each_sentence)
                words = regex_tokenizer.tokenize(each_sentence)
                tagged = nltk.pos_tag(words)
                feature_list.extend([ps.stem(pos[0].lower()) for pos in tagged if pos[1] == 'NN'])  # listing the nouns in a list
        except Exception as E:
            print(str(E))
        feature_dictionary = Counter(feature_list)  # count how often each stemmed noun occurs
        return feature_dictionary
Example #17
 def __init__(self, language, normalize=False, train_text_gen=None):
     """
     A tokenizer using NLTK Penn Treebank tokenizer, and the Punkt sentence tokenizer.
     Params:
     language: Language to tokenize (currently doesn't do anything)
     train_text_gen: A generator of training text for the sentence tokenizer.
     """
     self.language = language
     self.train_text_gen = train_text_gen
     self.normalize = normalize
     
     if train_text_gen:
         self.sent_tokenizer = self._train_sentence_tokenizer()
     else:
         self.sent_tokenizer = PunktSentenceTokenizer()
Example #18
class NER:
    """docstring for ClassName"""
    def __init__(self, query):
        self.original_query = query
        conf = shelve.open('conf')
        self.train_text = conf['train_text']
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)

    def processContent(self):
        try:
            for i in self.tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                #print(namedEnt)
                #namedEnt.draw()
            return namedEnt
        except Exception as e:
            print(str(e))
        

    # Parse named entities from tree
    def structureNamedEntities(self):
        ne = []
        for subtree in self.named_entity_tree:
            if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne

    def performNER(self):
        self.named_entity_tree = self.processContent()
        #print(type(self.named_entity_tree))
        self.named_entity_tuple = self.structureNamedEntities()
        #print(ne)
        names = [element[0] for element in self.named_entity_tuple]
        return names
def natural_sentence(string):
	pst = PunktSentenceTokenizer(string)
	t = pst.tokenize(string)

	word = nltk.word_tokenize(t[0])  # split the first sentence into words
	tagged = nltk.pos_tag(word)  # tag each word with its part of speech (noun, pronoun, etc.)
	print tagged
	chunkGram = r"""WRB:{<WRB.?>*<WP>*<WDT>?}"""  # regexp for detecting wh-questions
	chunkParser = nltk.RegexpParser(chunkGram)  # build the wh-question chunker
	chunked = chunkParser.parse(tagged)  # parse the tagged words into a chunk tree
	for subtree in chunked.subtrees():
		if subtree.label() == 'WRB':  # wh-questions only
			for j in subtree.leaves():
				f = 0
				final = ""
				final += j[0]

				chunk = r"""VB: {<VBZ>*<VBP>?}"""							#here we are detecting type of wording and arranging it to proper place
				cp = nltk.RegexpParser(chunk)
				word = nltk.word_tokenize(t[0])
				tagged = nltk.pos_tag(word)
				ch = cp.parse(tagged)
				flg = 0
				for subtree in ch.subtrees():
					if subtree.label() == 'VB':
						for j in subtree.leaves():
							final += " "+j[0]

							flg = 1
						break
				if flg == 0:
					final += " is"

				chunk = r"""PRP: {<PRP.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<JJ.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<RB.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""PRP: {<VB.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

				chunk = r"""NN: {<NN.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(tagged)
				for subtree in ch.subtrees():
					if subtree.label() == 'NN':
						for j in subtree.leaves():
							if f == 0:
								final += " "+j[0]
								f = 1
							else:
								final += " of "+j[0]
				f = 0
				print final
				final_string = grammar(final)  # send the generated sentence to the Ginger grammar checker
				print final_string
				ws.send(final_string.upper())  # send the final sentence to the board
				return
	chunkGram = r"""NN:{<PRP.?>*<NN.?>?}"""					#same thing like wh question is here for simple present tence sentance
	chunkParser = nltk.RegexpParser(chunkGram)
	chunked = chunkParser.parse(tagged)
	for subtree in chunked.subtrees():
		if subtree.label() == 'NN':
			for j in subtree.leaves():
				f = 0
				w = nltk.word_tokenize(string)
				w.remove(j[0])
				final = ""
				final += " "+j[0]
				chunk = r"""VB: {<VBP>*<VBZ>*<VB>*<VB.?>*<MD.?>?}"""
				cp = nltk.RegexpParser(chunk)
				word = nltk.word_tokenize(t[0])
				tagged = nltk.pos_tag(word)
				ch = cp.parse(tagged)
				flg = 0
				for subtree in ch.subtrees():
					if subtree.label() == 'VB':
						for j in subtree.leaves():
							w.remove(j[0])
							final += " "+j[0]
							flg = 1
						break
				if flg == 0:
					final += " is"
				chunk = r"""PRP: {<PRP.?>?}"""
				cp = nltk.RegexpParser(chunk)

				ch = cp.parse(nltk.pos_tag(w))
				for subtree in ch.subtrees():
					if subtree.label() == 'PRP':
						for j in subtree.leaves():
							final += " "+j[0]

							w.remove(j[0])
				chunk = r"""NN: {<NN.?>?}"""
				cp = nltk.RegexpParser(chunk)
				ch = cp.parse(nltk.pos_tag(w))
				for subtree in ch.subtrees():
					if subtree.label() == 'NN':
						for j in subtree.leaves():
							if f == 0:
								final += " "+j[0]
								f = 1
							else:
								final += " of "+j[0]
							w.remove(j[0])
				f = 0
				for wrd in w:
					final += " "+wrd
				print final
				final_string = grammar(final)
				print final_string
				ws.send(final_string.upper())
				return
#!/usr/bin/env python
""" convert from 20news group to vw input """
import sys, os, nltk
from nltk.tokenize import PunktSentenceTokenizer, WordPunctTokenizer
from collections import defaultdict
from sequence import *

input_dir = sys.argv[1]

sent_tokenizer = PunktSentenceTokenizer()
token_tokenizer = WordPunctTokenizer()

group_seq = Sequence(1)
for group in os.listdir(input_dir):
    group_id = group_seq.id_for(group)
    sys.stderr.write("%s\t%s\n" % (group, group_id))
    for article_path in os.listdir("%s/%s" % (input_dir, group)):
        # stitch article into a single string
        article = ""
        for line in open("%s/%s/%s" % (input_dir, group, article_path)):
            line = line.strip()
            if len(line) > 0 and not line.startswith(">"):
                article += " "
                article += line
        # tokenise sentences then some simple normalisation, collect just binary features
        all_tokens = set()
        for sentence in sent_tokenizer.tokenize(article):
            tokens = token_tokenizer.tokenize(sentence)
            tokens = map(lambda t: t.lower().replace(":","").replace("|",""), tokens)  # : and | reserved for vw format
            tokens = filter(lambda t: len(t) > 3, tokens)
            all_tokens.update(tokens)
Example #21
##Sometimes a single sentence can contain two relevant opinions,
##for example: Apple launched the iPhone and Ferrari launched the 911.
##When I state an opinion about something, chunking helps me identify
##which object the opinion is about (see the short sketch after process_content below).

#train_text=state_union.raw("2005-GWBush.txt")
#sample_text=state_union.raw("2006-GWBush.txt")

myfile =  open('Text2.txt', 'r') 
data1 = myfile.read().replace('\n', '')

#myfile =  open('Text1.txt', 'r') 
#data=myfile.read().replace('\n', '')


custom_sent_tokenizer= PunktSentenceTokenizer(data1)

tokenized = custom_sent_tokenizer.tokenize(data1)


def process_content():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            #.? and * are regular-expression quantifiers.
            chunkGram="""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
            chunkParser=nltk.RegexpParser(chunkGram)
            chunked=chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
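# Hedged illustration of the comments above (not from the original example, and it
# assumes "import nltk" as in the snippet): instead of drawing the tree, keep only the
# "Chunk" subtrees so you can see which entity each opinion is about. The sentence is
# made up and the exact chunks depend on the POS tagger.
def chunks_for(sentence):
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    tree = nltk.RegexpParser(r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}""").parse(tagged)
    return [" ".join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees(lambda t: t.label() == "Chunk")]

print(chunks_for("Apple launched the iPhone and Ferrari launched the 911"))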
def natural_sentence(string):
    pst = PunktSentenceTokenizer(string)
    t = pst.tokenize(string)

    word = nltk.word_tokenize(t[0])  # split the first sentence into words
    tagged = nltk.pos_tag(word)  # tag each word with its part of speech (noun, pronoun, etc.)
    print tagged
    chunkGram = r"""WRB:{<WRB.?>*<WP>*<WDT>?}"""  # regexp for detecting wh-questions
    chunkParser = nltk.RegexpParser(chunkGram)  # build the wh-question chunker
    chunked = chunkParser.parse(tagged)  # parse the tagged words into a chunk tree
    for subtree in chunked.subtrees():
        if subtree.label() == 'WRB':  # wh-questions only
            for j in subtree.leaves():
                f = 0
                final = ""
                final += j[0]

                chunk = r"""VB: {<VBZ>*<VBP>?}"""  #here we are detecting type of wording and arranging it to proper place
                cp = nltk.RegexpParser(chunk)
                word = nltk.word_tokenize(t[0])
                tagged = nltk.pos_tag(word)
                ch = cp.parse(tagged)
                flg = 0
                for subtree in ch.subtrees():
                    if subtree.label() == 'VB':
                        for j in subtree.leaves():
                            final += " " + j[0]

                            flg = 1
                        break
                if flg == 0:
                    final += " is"

                chunk = r"""PRP: {<PRP.?>?}"""
                cp = nltk.RegexpParser(chunk)
                ch = cp.parse(tagged)
                for subtree in ch.subtrees():
                    if subtree.label() == 'PRP':
                        for j in subtree.leaves():
                            final += " " + j[0]

                chunk = r"""PRP: {<JJ.?>?}"""
                cp = nltk.RegexpParser(chunk)
                ch = cp.parse(tagged)
                for subtree in ch.subtrees():
                    if subtree.label() == 'PRP':
                        for j in subtree.leaves():
                            final += " " + j[0]

                chunk = r"""PRP: {<RB.?>?}"""
                cp = nltk.RegexpParser(chunk)
                ch = cp.parse(tagged)
                for subtree in ch.subtrees():
                    if subtree.label() == 'PRP':
                        for j in subtree.leaves():
                            final += " " + j[0]

                chunk = r"""PRP: {<VB.?>?}"""
                cp = nltk.RegexpParser(chunk)
                ch = cp.parse(tagged)
                for subtree in ch.subtrees():
                    if subtree.label() == 'PRP':
                        for j in subtree.leaves():
                            final += " " + j[0]

                chunk = r"""NN: {<NN.?>?}"""
                cp = nltk.RegexpParser(chunk)
                ch = cp.parse(tagged)
                for subtree in ch.subtrees():
                    if subtree.label() == 'NN':
                        for j in subtree.leaves():
                            if f == 0:
                                final += " " + j[0]
                                f = 1
                            else:
                                final += " of " + j[0]
                f = 0
                print final
                final_string = grammar(final)  # send the generated sentence to the Ginger grammar checker
                print final_string
                ws.send(final_string.upper())  # send the final sentence to the board
                return
    chunkGram = r"""NN:{<PRP.?>*<NN.?>?}"""  # same approach as the wh-question branch, for simple present-tense sentences
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    for subtree in chunked.subtrees():
        if subtree.label() == 'NN':
            for j in subtree.leaves():
                f = 0
                w = nltk.word_tokenize(string)
                w.remove(j[0])
                final = ""
                final += " " + j[0]
                chunk = r"""VB: {<VBP>*<VBZ>*<VB>*<VB.?>*<MD.?>?}"""
                cp = nltk.RegexpParser(chunk)
                word = nltk.word_tokenize(t[0])
                tagged = nltk.pos_tag(word)
                ch = cp.parse(tagged)
                flg = 0
                for subtree in ch.subtrees():
                    if subtree.label() == 'VB':
                        for j in subtree.leaves():
                            w.remove(j[0])
                            final += " " + j[0]
                            flg = 1
                        break
                if flg == 0:
                    final += " is"
                chunk = r"""PRP: {<PRP.?>?}"""
                cp = nltk.RegexpParser(chunk)

                ch = cp.parse(nltk.pos_tag(w))
                for subtree in ch.subtrees():
                    if subtree.label() == 'PRP':
                        for j in subtree.leaves():
                            final += " " + j[0]

                            w.remove(j[0])
                chunk = r"""NN: {<NN.?>?}"""
                cp = nltk.RegexpParser(chunk)
                ch = cp.parse(nltk.pos_tag(w))
                for subtree in ch.subtrees():
                    if subtree.label() == 'NN':
                        for j in subtree.leaves():
                            if f == 0:
                                final += " " + j[0]
                                f = 1
                            else:
                                final += " of " + j[0]
                            w.remove(j[0])
                f = 0
                for wrd in w:
                    final += " " + wrd
                print final
                final_string = grammar(final)
                print final_string
                ws.send(final_string.upper())
                return
Example #23
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
"""

#loading corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# Training custom sentence tokenizer
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

for s in sentence:
    token = word_tokenize(s)
    pos = pos_tag(token)
    entity = nltk.ne_chunk(pos, binary=True)
    entity.draw()


Example #24
#!/usr/bin/env python

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #unsupervised tokenizer


train_text = state_union.raw('2005-GWBush.txt')

#print train_text

test_text = state_union.raw('2006-GWBush.txt')

custom_sent_token = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_token.tokenize(test_text)

#print tokenized
#print type(tokenized)

def chunk():
	try:
		for i in tokenized:
			words = nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)

			regexp = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} 
								}<VB.?|IN|DT|TO>+{"""

			parser = nltk.RegexpParser(regexp)
Example #25
 def __init__(self, train_text, sample_text):
     self.train_text = state_union.raw(train_text)
     self.sample_text = state_union.raw(sample_text)
     self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
     self.tokenized = self.custom_sent_tokenizer.tokenize(self.sample_text)
Example #26
 def punktSplit(self, text):
     tokenizer = PunktSentenceTokenizer()
     print(tokenizer.tokenize(text))
Example #27
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

trainText = state_union.raw("2005-GWBush.txt")
sampleText = state_union.raw("2006-GWBush.txt")

customSentTokenizer = PunktSentenceTokenizer(trainText)

tokenized = customSentTokenizer.tokenize(sampleText)


def processContent():
    try:
        j = 0
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEnt = nltk.ne_chunk(tagged, binary=True)
            if j < 10:
                namedEnt.draw()
                j += 1
    except Exception as e:
        print(str(e))


processContent()
Example #28
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # Pretrained unsupervised model, can be trained again if required

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)      # To train the model, no labelling required as it is unsupervised

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)  # Part of speech tagging, tuple with (word, pos)

            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} """

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            print(chunked)
            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()
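# Side note (an assumption, not part of the original example): NLTK also ships a
# pretrained Punkt model via sent_tokenize; training on your own text, as above, only
# matters when the default model splits your domain's sentences badly.
from nltk.tokenize import sent_tokenize
default_sents = sent_tokenize(sample_text)                   # pretrained Punkt model
custom_sents = custom_sent_tokenizer.tokenize(sample_text)   # tokenizer trained above
print(len(default_sents), len(custom_sents))                 # counts usually differ only slightly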
Example #29
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

#Settings
inputPath = "wsj_0010_sample.txt"
outputPath = "wsj_0010_sample.txt.ne.nltk"

#Extracting data
file = open(inputPath, "r+")
content = file.read()
file.close()

#Tokenization
tokenizer = PunktSentenceTokenizer()
tokens = tokenizer.tokenize(content)

words = []
tagged = []
namedEnt = []

for sentence in tokens:
  # print(sentence)
  words += nltk.word_tokenize(sentence)
  tagged = nltk.pos_tag(words)
  #Named Entity Recognition
  namedEnt = nltk.ne_chunk(tagged, binary=True)

print("NameEnt: ", namedEnt); #Testing
Example #30
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import WordPunctTokenizer, PunktSentenceTokenizer
import theano
import theano.tensor as T
import numpy as np

w2v_path = 'brown.w2v'
corpora_path = 'brown.txt'  #of course it's an overfit
ngram_order = 3
tokenizer = WordPunctTokenizer()
sent_tok = PunktSentenceTokenizer()
w2v_model = Word2Vec.load(w2v_path)
data = []  #for autoencoder it's a matrix, not tensor-3
centroid = np.mean(w2v_model.syn0, axis=0)
#using sigmoid, so we need to normalize vectors
min_w2v = np.min(w2v_model.syn0, axis=0)
max_w2v = np.max(w2v_model.syn0, axis=0)
print 'loading data'
with open(corpora_path) as inp:
    text = inp.read()
sentences = sent_tok.tokenize(text)

print 'vectorizing data'
for sent in sentences:
    tokens = tokenizer.tokenize(sent)

    for i in xrange(0, len(tokens) - ngram_order):
        ngram_slice = tokens[i:i + ngram_order]
        ngram = []
        for t in ngram_slice:
            try:
Example #31
This example demonstrates chinking,
which is removing the things that you don't want in your chunks.

@author: jay
"""

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer


train_text = state_union.raw("2005-GWBush.txt")
sample = state_union.raw("2006-GWBush.txt")

cus_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = cus_tokenizer.tokenize(sample)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk:{<.*>+?}
                               }<VB.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
       #     chunked.draw()
            
    except Exception as e:
Example #32
import json
import argparse
from collections import Counter
from nltk.tokenize import PunktSentenceTokenizer
import nltk
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument("--in-file", type=str)
args = parser.parse_args()

claims = []

bigrams = Counter()

with open(args.in_file) as f:
    for line in tqdm(f):
        line = json.loads(line)

        claims.append(line["claim"])

tok = PunktSentenceTokenizer()
for claim in tqdm(claims):
    bigrams.update(nltk.bigrams(nltk.word_tokenize(claim)))

for bigram in bigrams.most_common(20):
    print(bigram)
Example #33
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
	print(ps.stem(w))

##Part of Speech Tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#Unsupervised machine learning tokenizer -> PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text) #training on train_text
	
tokenized = custom_sent_tokenizer.tokenize(sample_text) #applying model to sample_text
#this will generate sentences

def process_content():
	try:
		for i in tokenized:
			words= nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)
			print(tagged)
	except Exception as e:
		print(str(e))
		
process_content()
Example #34
# *************************************************************************************************
def find_whole_word(w):
    """A REGEX to find out the location of metadata about the article"""
    return re.compile(r'({0})'.format(w), flags=re.IGNORECASE).search

POLARITY_TEXTBLOB = []
SUBJECTIVITY = []
POLARITY_VADER = []
for news in df["Content"]:
    # Reset the per-article score lists so each article's mean is computed independently.
    VADER_ARTICLE_COMPOUND = []
    TEXTBLOB_ARTICLE_POLARITY = []
    TEXTBLOB_ARTICLE_SUBJECTIVITY = []
    try:
        a = find_whole_word('/Bloomberg')(news).span()[1]
#       b = find_whole_word('Reporting by')(news).span()[0]
        sentences = PunktSentenceTokenizer().tokenize(news[a + 1: ])
    except:
        sentences = PunktSentenceTokenizer().tokenize(news)

    for sentence in sentences:
        vaderAnalyzer = SentimentIntensityAnalyzer()
        vs = vaderAnalyzer.polarity_scores(sentence)
        textBlobAnalyzer = TextBlob(sentence)
        VADER_ARTICLE_COMPOUND.append(vs["compound"])
        TEXTBLOB_ARTICLE_POLARITY.append(textBlobAnalyzer.sentiment.polarity)
        TEXTBLOB_ARTICLE_SUBJECTIVITY.append(textBlobAnalyzer.sentiment.subjectivity)
    POLARITY_TEXTBLOB.append(st.mean(TEXTBLOB_ARTICLE_POLARITY))
    SUBJECTIVITY.append(st.mean(TEXTBLOB_ARTICLE_SUBJECTIVITY))
    POLARITY_VADER.append(st.mean(VADER_ARTICLE_COMPOUND))

df["Polarity TextBlob"] = pd.Series(POLARITY_TEXTBLOB, index=df.index)
Example #35
pa = parser.parse_args()
lang = pa.lang
filePath = pa.file
outputPath = filePath + '.sent'


if __name__ == "__main__":
    file = open(filePath, 'r')
    output = open(outputPath, 'w')
    sst = None
    if lang == 'EN':
        sst = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    elif lang == 'ES':
        sst = nltk.data.load('nltk:tokenizers/punkt/spanish.pickle')
    else:
        sst = PunktSentenceTokenizer()
    for line in file:
        if line == "\n":
            sys.stdout.write(line)
            continue
        line = line.replace("«", "'")
        line = line.replace("»", "'")
        line = line.replace("“", "'")
        line = line.replace("”", "'")
        line = line.replace("\"", "'")
        sentences = sst.tokenize(line.decode("utf-8"))
        for s in sentences:
            output.write((s+'\n').encode('utf-8'))
    file.close()
    output.close()
Example #36
 def __init__(self, query):
     self.original_query = query
     conf = shelve.open('conf')
     self.train_text = conf['train_text']
     self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
     self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)
Example #37
import nltk
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer

train_txt = state_union.raw("2005-GWBush.txt")
test_txt = state_union.raw("2006-GWBush.txt")

train = PunktSentenceTokenizer(train_txt)
test = train.tokenize(test_txt)

def post():
    try:
        for i in test:
            words = word_tokenize(i)
            tag = nltk.pos_tag(words)
            print(tag)

    except Exception as e:
        print(str(e))
        
post()     
        
Example #38
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import gutenberg

sample = gutenberg.raw("bible-kjv.txt")

pst = PunktSentenceTokenizer()
token = pst.tokenize(sample)

for i in range(5):
    print(token[i + 6])
Example #39
def break_sentences(text):
    tokenize = PunktSentenceTokenizer()
    doc = tokenize.tokenize(text)
    return doc
Example #40
words = word_tokenize(Example_text)

filtered_sentence = [w for w in words if not w in stop_words]

# print(filtered_sentence)

ps = PorterStemmer()

stemmed_words = [ps.stem(w) for w in words]

# print(stemmed_words)

train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenize = custom_sent_tokenizer.tokenize(test_text)


def process_content():
    try:
        for i in tokenize[:5]:
            word = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(word)
            chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            # chunkParser = nltk.RegexpParser(chunkgram)
            # chunkgram = r"""Chunk: {<.*>+}
            # }<VB.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkgram)
            chunked = chunkParser.parse(tagged)
Example #41
#Chunking (also called shallow parsing) is the identification of parts of speech and short phrases (like noun phrases).
#One case where chunking is preferable is Named Entity Recognition (NER): named entities tend to be noun phrases, though not always.

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text=state_union.raw("2005-GWBush.txt")
sample_text=state_union.raw("2006-GWBush.txt")
print(train_text)
print(sample_text)

custom_sent_tokenizer=PunktSentenceTokenizer(train_text)
tokenized=custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            
            chunkGram=r"""Chunk:{<RB.?>*<VB.?>*<NNP>+<NN>} """
            ChunkParser=nltk.RegexpParser(chunkGram)
            chunked=ChunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))
Example #42
from __future__ import generator_stop
import nltk
import pandas as pd
from nltk.tokenize import regexp_tokenize, PunktSentenceTokenizer, RegexpTokenizer
#from spellchecker import SpellChecker
import string
import re
from nltk.stem import SnowballStemmer
from pattern.text.es import conjugate, INFINITIVE
import spacy
from textacy import preprocessing

stemmer = SnowballStemmer('spanish')
regexpTokenizer = RegexpTokenizer(r'\w+')
punktSentenceTokenizer = PunktSentenceTokenizer()
stopwords = nltk.corpus.stopwords.words('spanish')
table = str.maketrans('', '', string.punctuation)
nlp = spacy.load("es_core_news_md")
otherwords = [
    'eramos', 'estabamos', 'estais', 'estan', 'estara', 'estaran', 'estaras',
    'estare', 'estareis', 'estaria', 'estariais', 'estariamos', 'estarian',
    'estarias', 'esteis', 'esten', 'estes', 'estuvieramos', 'estuviesemos',
    'fueramos', 'fuesemos', 'habeis', 'habia', 'habiais', 'habiamos', 'habian',
    'habias', 'habra', 'habran', 'habras', 'habre', 'habreis', 'habria',
    'habriais', 'habriamos', 'habrian', 'habrias', 'hayais', 'hubieramos',
    'hubiesemos', 'mas', 'mia', 'mias', 'mio', 'mios', 'seais', 'sera',
    'seran', 'seras', 'sere', 'sereis', 'seria', 'seriais', 'seriamos',
    'serian', 'serias', 'si', 'tambien', 'tendra', 'tendran', 'tendras',
    'tendre', 'tendreis', 'tendria', 'tendriais', 'tendriamos', 'tendrian',
    'tendrias', 'teneis', 'tengais', 'tenia', 'teniais', 'teniamos', 'tenian',
    'tenias', 'tuvieramos', 'tuviesemos'
Example #43
def processing_contents():
    input_from_user = str(input("Enter a Sentence: "))

    stop_words = set(stopwords.words("english"))
    # stopwords are words that carry little meaning on their own,
    # so it is better to remove them before tagging



    words = nltk.word_tokenize(input_from_user) # Tokenizing the given String to add the POS-tags
    filtered_sentence = [w for w in words if not w in stop_words] #Filtering the input with list-comprehension
    #                               OR
    # filtered_sentence = []
    # for w in words:
    #     if w not in stop_words:
    #         filtered_sentence.append(w)
    tokenized  = nltk.pos_tag(filtered_sentence) # Adding the POS-tags
    #Variables that counts
    #------------------------
    #This is a Sentence Counter..
    sentence_sum = 0
    sentence_counter = PunktSentenceTokenizer().tokenize(input_from_user)
    for counter in sentence_counter:
        sentence_sum = sentence_sum + 1
   #---------------------
    nouns_count = 0
    nouns_display = []           #- making some lists to store the appended values
    adverbs_count = 0           
    adverbs_display = []
    verbs_count = 0
    verbs_display = []
    adjective_count = 0
    adjective_display = []
    proper_noun_count = 0
    proper_noun_display = []
    #---------------------
    for i in tokenized:
            x,y = i
            if y in ["NN", "NNS"]:
                nouns_count += 1
                nouns_display.append(x)
            elif y in ["RB", "RBR", "RBS", "RP"]:
                adverbs_count += 1
                adverbs_display.append(x)
            elif y in ["JJ", "JJR", "JJS"]:
                adjective_count +=1
                adjective_display.append(x)
            elif y in ["NNP", "NNPS"]:
                proper_noun_count += 1
                proper_noun_display.append(x)
            elif y in ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]:
                verbs_count +=1
                verbs_display.append(x)

    """I could have used decorators here but that would have consumed too much of my time; altering content, creating global variables and calling other functions xd
    Hence I approached a ugly looking cheap trick here,  pray pardon my laziness"""
#========================================  
    if proper_noun_display is not None:
        a = ", ".join(proper_noun_display)
        print("proper-Noun ---> {}".format(a))
    if nouns_display is not None:
        a = ", ".join(nouns_display)
        print("Noun ---> {}".format(a))
    if adverbs_display is not None:
        a = ", ".join(adverbs_display)
        print("Adverb ---> {}".format(a))
    if adjective_display is not None:
        a = ", ".join(adjective_display)
        print("Adjective ---> {}".format(a))
    if verbs_display is not None:
        a = ", ".join(verbs_display)
        print("Verb ---> {}".format(a))

    print("There are {}: proper-nouns, {}:nouns, {}:verbs, {}: adverbs and {}: adjectives in the sentence.".format(proper_noun_count, nouns_count, verbs_count, adverbs_count, adjective_count))

    print("Total Sentences: {}".format(sentence_sum))
Example #44
def init_prepare():
    """ Initialize content preparation """

    try:
        sys.argv[2]
    except IndexError:
        print(
            "Tool needs an argument - second argument (search query) non existent."
        )
        sys.exit()

    query = sys.argv[2]
    slug = slugify(query)

    base_path = os.path.abspath(
        os.path.dirname(sys.modules['__main__'].__file__))

    save_path = base_path + "/data/csv/" + slug + "/"
    save_path_models = base_path + "/data/models/" + slug + "/dict/"
    statics_path = base_path + "/data/statics/"
    file_to_content = save_path + "content.csv"
    slugstopword_path = base_path + "/data/slug-stopwords/" + slug + ".txt"

    # create dict dir if not existent
    if not os.path.exists(save_path_models):
        os.makedirs(save_path_models)

    # load custom wordlist from file
    custom_stoplist = []

    with open(statics_path + "german-stopwords.txt", 'r') as csvfile:

        reader = csv.reader(csvfile, delimiter=',', quotechar='"')

        for row in reader:

            custom_stoplist.extend(row)

    slug_stoplist = []

    if os.path.exists(slugstopword_path) is True:

        with open(slugstopword_path, 'r') as csvfile:

            reader = csv.reader(csvfile, delimiter=',', quotechar='"')

            for row in reader:

                slug_stoplist.extend(row)
    else:
        print('No slug-specific stoplist found, creating...')

        os.mknod(slugstopword_path)

        print('You can add stopwords to ' + slugstopword_path +
              ', and rerun training!')

    # load standard german stoplist
    de_stop = get_stop_words('de')

    # set the minimum length for tokens/terms (i.e. words)
    token_min_length = sys.argv[3] if len(sys.argv) >= 4 else 3
    token_min_length = int(token_min_length)

    # set the min occurence count for a word
    token_min_count = sys.argv[4] if len(sys.argv) >= 5 else 1
    token_min_count = int(token_min_count)

    max_docs = sys.argv[5] if len(sys.argv) >= 6 else 100
    max_docs = int(max_docs)

    # concatenate stoplists
    stopword_list = de_stop + custom_stoplist + slug_stoplist
    """
    Part 1: Prepare input data/content for gensim ml algorithms
    """

    # get tokens/terms
    terms = tokenize_content(file_to_content=file_to_content,
                             stopword_list=stopword_list,
                             token_min_length=token_min_length,
                             token_min_count=token_min_count,
                             max_docs=max_docs)

    # create dictionary and save for future use
    dictionary = corpora.Dictionary(terms)
    dictionary.save(save_path_models + "dictionary.dict")

    # create corpus and save for future use
    corpus = [dictionary.doc2bow(term) for term in terms]
    corpora.MmCorpus.serialize(save_path_models + "corpus.mm", corpus)
    """
    Part 2: Prepare data/content for some sklearn ml algorithms
    """

    # use sklearn's TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=token_min_count,
                                       max_features=100,
                                       stop_words=stopword_list,
                                       ngram_range=(1, 3),
                                       sublinear_tf=True,
                                       norm='l2')

    # tokenize sentences using PunktSentenceTokenizer
    sentences = []

    with open(file_to_content, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')

        current_row = 0

        for row in reader:

            if current_row >= max_docs:
                break

            current_row += 1

            tokens = PunktSentenceTokenizer().tokenize(row[1])

            for sent in tokens:
                sentences.append(sent)

    # transform the sentences into a TF-IDF matrix
    tfidf_tokens = tfidf_vectorizer.fit_transform(sentences)

    # save tokens for future use
    pickle.dump(tfidf_tokens, open(save_path_models + "tfidf-tokens.sk", "wb"))

    # save this vectorizer for future use
    pickle.dump(tfidf_vectorizer,
                open(save_path_models + "tfidf-vectorizer.sk", "wb"))
Example #45
Hobo: Oh, this is all theatrical.
Girl: Hola amigo... 
Hobo: his is all theatrical.
我说: "U.S.A 你好啊".
U.S.A is the abbreviation of United States. To use statistical parameters such as mean and standard deviation reliably, you need to have a good estimator for them. The maximum likelihood estimates (MLEs) provide one such estimator. However, an MLE might be biased, which means that its expected value of the parameter might not equal the parameter being estimated."""

sentences = sent_tokenize(article)

for sentence in sentences:
    tokens = word_tokenize(sentence)
    #print(sentence)

text = webtext.raw('overheard.txt')

print(text)
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
sents2 = sent_tokenize(text)

sents1_article = sent_tokenizer.tokenize(article)
sents2_article = sent_tokenize(article)

print(sents1[0])
print(sents2[0])
print()
print(sents1[677])
print(sents2[677])
print()
print(sents1[678])
print(sents2[678])
print()
DIMENSIONS = 300
nltkTokenizer = RegexpTokenizer(patterns)
if os.path.isfile('tfidf_vectorizer.joblib'):
    tfidf_vectorizer = load('tfidf_vectorizer.joblib')
    punkt_sent_tokenizer = load('punkt_sent_tokenizer.joblib')
else:
    tfidf_vectorizer = TfidfVectorizer()
    training_text = ""
    corpus = []
    for file in os.listdir(data_dir):
        if file.endswith('.txt'):
            f = open(os.path.join(data_dir, file), 'r')
            raw = f.read()
            training_text += raw
            corpus.append(raw.lower())
    punkt_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tfidf_vectorizer.fit(corpus)
    dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
    dump(punkt_sent_tokenizer, 'punkt_sent_tokenizer.joblib')
drugToAliasDict = {}
aliasToDrugDict = {}
word2VecModel = KeyedVectors.load_word2vec_format('glove.6B.300d.txt.word2vec',
                                                  binary=False)


def getFirstVP(constituencyTree):
    queue = [constituencyTree]
    while len(queue) != 0:
        currentNode = queue.pop(0)
        if currentNode.value == "VP":
            return currentNode
Example #47
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
import nltk
import json
import re

train_text = state_union.raw('2005-GWBush.txt')
inputFile = 'data.json'
outputFile = 'sample_text.txt'

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(outputFile)

def createInput():
    
    with open(inputFile, 'r') as i:
        with open(outputFile, 'w') as o:
            for line in i:
                try:
                    tweet = json.loads(line)
                    text = tweet['text']
                    o.write(' '.join(re.sub("(RT)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split()))
                except BaseException as e:
                    continue

def tagInput():
    try:
        with open(outputFile, 'r') as f:
            for line in f:
                words = nltk.word_tokenize(line)
Example #48
import nltk

from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
train_text = state_union.raw("2005-GWBush.txt")
_text = input("Enter a text:")
#sample_text=wikipedia.summary(_text,sentences=1)
custom_SentTok = PunktSentenceTokenizer(train_text)
tokenized = custom_SentTok.tokenize(_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            print(tagged)

    except Exception as e:
        print(str(e))


process_content()

#POS tag list:
#CC	coordinating conjunction
#CD	cardinal digit
#DT	determiner
#EX	existential there (like: "there is" ... think of it like "there exists")
#FW	foreign word
Example #49
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
"""


# retrieving the corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# training the sentence tokenizer (unsupervised)
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

# tag the tokens by word-tokenizing each sentence, then chunk them with a regular expression
try:
    for s in sentence:
        token = word_tokenize(s)
        pos = pos_tag(token)
        print(pos)
        chunkreg = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkreg)
        chunked = chunkParser.parse(pos)
        chunked.draw()

except Exception as e:
    print(str(e))
"""
This is based on the tutorial from this website: 
https://pythonprogramming.net/chunking-nltk-tutorial/
"""

# importing libraries
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # unsupervised machine learning tokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(
    train_text)  # training tokenizer first
tokenized = custom_sent_tokenizer.tokenize(sample_text)


# processing content
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()
Example #51
def get_sentence_occurrences(document, terms, terms_present=None, 
                             remove_overlap=False, remove_duplicates=False):
    """
    Returns a list of lists representing the terms occurring in each sentence.
    Semantically equivalent to: 
    [[term for term in terms if term in sent] for sent in document]

    Order of optional operations is: remove duplicates, remove overlap, 
    add doc terms, remove duplicate doc terms
    """
    # get list of terms in the document to narrow sentence-level search
    if terms_present is None:
        terms_present = set(get_document_occurrences(document, terms))

    # Use a Tokenizer from NLTK to build a sentence list
    tokenizer = Tokenizer(document)
    sentences = tokenizer.tokenize(document)
    logging.info("scanning %d sentences for %d terms" % (len(sentences), len(terms)))
    
    # Create a list of lists containing the collections of terms which co-occur
    # in a sentence
    occurrences = []
    for sentence in sentences:
        sentence_occurrences = [] 

        for term in terms_present:
            # build list of search patterns starting with label
            patterns = [r'\b%s\b' % term.label]  # raw string so \b is a word boundary, not a backspace
            patterns.extend(term.searchpatterns)

            for pattern in patterns:
                try:
                    # search for any occurrence of term, stop when found
                    if re.search(pattern, sentence, flags=re.IGNORECASE):
                        sentence_occurrences.append(term)
                        break
                except re.error:
                    logging.warning('Term %d (%s) pattern "%s" failed' % 
                                    (term.ID, term.label, pattern))
                    term.searchpatterns.remove(pattern)

        # remove duplicates
        if remove_duplicates:
            sentence_occurrences = list(set(sentence_occurrences))

        # remove overlapping elements
        if remove_overlap:
            to_remove = set()
            
            # build set of terms to remove
            for inside in sentence_occurrences:
                for term in sentence_occurrences:
                    if term != inside and\
                        inside.label.find(term.label) != -1:
                        to_remove.add(term)

            # remove terms
            for term in to_remove:
                sentence_occurrences.remove(term)

        # add to list of sentences if any terms are found
        if sentence_occurrences:
            occurrences.append(sentence_occurrences)
    
    return occurrences
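
# A minimal usage sketch for get_sentence_occurrences (an assumption, not code from
# the original project): it treats `Tokenizer` as NLTK's PunktSentenceTokenizer and
# uses a stand-in Term class, since the real Term type and the get_document_occurrences
# helper are not shown in this snippet; terms_present is passed explicitly so the
# missing helper is not needed.
import re
import logging
from nltk.tokenize import PunktSentenceTokenizer as Tokenizer


class Term(object):
    """Hypothetical stand-in for the term objects the function expects."""
    def __init__(self, ID, label, searchpatterns=None):
        self.ID = ID
        self.label = label
        self.searchpatterns = searchpatterns or []


terms = [Term(1, 'tokenizer'), Term(2, 'sentence tokenizer')]
doc = "The sentence tokenizer is trained first. Then the tokenizer is applied."

found = get_sentence_occurrences(doc, terms, terms_present=set(terms),
                                 remove_overlap=True)
for sentence_terms in found:
    print([t.label for t in sentence_terms])
# expected: ['sentence tokenizer'] for the first sentence, ['tokenizer'] for the second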
Example #52
myString = 'This is a large text. It is usually separated into sentences. We know that, but computers do not.'
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(myString)
print(sentences)

import nltk.data
sentences = "Hola. Esta es una frase espanola"
spanish_sentence_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
sentences = spanish_sentence_tokenizer.tokenize(sentences)
print(sentences)
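
# The same nltk.data pattern works for the other punkt models that ship with NLTK
# (a small illustrative addition; it assumes the 'punkt' resource is installed):
english_sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(english_sentence_tokenizer.tokenize("Dr. Smith arrived. He was late."))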

import nltk
nltk.download('webtext')

from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
print(text)

sent_tokenizer = PunktSentenceTokenizer(text)

sents1 = sent_tokenizer.tokenize(text)
sents1[0]

from nltk.tokenize import sent_tokenize
sents2 = sent_tokenize(text)
sents2[0]

print(sents1[678])

print(sents2[678])
Example #53
import nltk.data
import nltk
from nltk.tokenize import PunktSentenceTokenizer
import sys
import re

#classifier = nltk.data.load("classifiers/test_data_sklearn.LinearSVC.pickle")
#file = open("validate_data/thedailybell/Survey_Says_Americans_Trust_No_One.txt","rb")
classifier = nltk.data.load("classifiers/test_data_sklearn.LinearSVC.pickle")
pst = PunktSentenceTokenizer()
files = []
words = []
## allow it to take a directory
for i in sys.argv:
    match = re.match(r".*\.txt$", i)
    if match:
        files.append(i)

print 'file_name' + '\t' + 'political_stance'

for f in files:
    with open(f, "rb") as class_file:
        if sys.argv[1] == '--sents':
            data = class_file.read().replace('\n', '')
            sents = pst.sentences_from_text(data)
            for sent in sents:
                sent_words = nltk.word_tokenize(sent)
                for word in sent_words:
                    words.append(word)
            feats = dict([(word, True) for word in words])
Example #54
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk.corpus import state_union
import nltk

text_train = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_text_tokenizer = PunktSentenceTokenizer(text_train)
# tokenized = custom_text_tokenizer.tokenize(sample_text) # sentence tokenizer
tokenized = custom_text_tokenizer.tokenize("Poornima is an unclear Gemini!")

f = open('support/pos_tags.txt','r')
line = f.read()
lines = line.split('\n')

pos_dict = {}
for line in lines:
    key_val = line.split('\t')
    pos_dict[key_val[0]] = key_val[1]

POS = []

def process_content():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = list(nltk.pos_tag(words))

            for j in range(len(tagged)):
                tagged[j] = list(tagged[j])
                tagged[j][1] = pos_dict[tagged[j][1]]

            # assumed completion: collect the (word, readable tag name) pairs
            POS.extend(tagged)

    except Exception as e:
        print(str(e))


process_content()
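
# Side note (an addition, not part of the snippet above): NLTK bundles its own
# Penn Treebank tag documentation, which can be used to cross-check the contents
# of support/pos_tags.txt. It needs the 'tagsets' resource: nltk.download('tagsets').
import nltk.help
nltk.help.upenn_tagset('NNP')  # prints the definition and examples for the NNP tag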
Example #55
qi = QueryIterator(get_config(), {
    'query': 'wid:%s AND iscontent:true' % wid,
    'fields': 'id,url,html_en'
})
#qi = QueryIterator(get_config(), {'query': 'id:3125_199499', 'fields':'id,url,html_en' })
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'filterquery': 'views:[3500000 TO *]', 'fields':'id,url,html_en,views' }) #test

service = ParserService()

config = json.loads("".join(open('worker-config.json').readlines()))
host = config["common"]["solr_endpoint"]

entities = {}
confirmed_entities = {}

p = PunktSentenceTokenizer()

doc_count = 0

bullet1 = '\xe2\x80\xa2'.decode('utf-8')
bullet2 = '\xc2\xb7'.decode('utf-8')

start_time = time.time()

for doc in qi:
    print '========== %s ==========' % doc['id']
    #    if doc_count < 4780:
    #        continue
    text = as_string(doc.get('html_en', ''))
    #print text
    usable = ''
Example #56
 def __init__(self):
     from nltk.parse.stanford import StanfordDependencyParser
     from nltk.tokenize import PunktSentenceTokenizer, WordPunctTokenizer
     self.parser = StanfordDependencyParser()
     self.sentenceTokenizer = PunktSentenceTokenizer()
     self.wordTokenizer = WordPunctTokenizer()
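
# Hedged usage sketch for the two tokenizers wired up in __init__ above (the rest of
# the class is not shown here, so this only demonstrates the sentence-then-word
# tokenization step that would feed the dependency parser):
from nltk.tokenize import PunktSentenceTokenizer, WordPunctTokenizer

sentence_tokenizer = PunktSentenceTokenizer()
word_tokenizer = WordPunctTokenizer()
for sent in sentence_tokenizer.tokenize("Parsing works per sentence. Each one is word-tokenized first."):
    print(word_tokenizer.tokenize(sent))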
Example #57
import re
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

## not sure about the approach here: do I parse the content of the .po
## file myself, or do I feed it straight into NLTK?
## option 1
content = file.read()
#print content
match = re.findall(r"msgstr\s\"([^\"\\]*)\"", content)
match = " ".join(match)
match = match.decode('utf8')
#print match
#print(sent_tokenize(match))

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(match)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print (str(e))

processed_content = process_content()
Example #58
# process of grouping the broken-down information to get at the meaning of what it is saying

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # unsupervised [we can retrain it]

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)  # training the tokenizer on the text

tokenized = custom_sent_tokenizer.tokenize(
    sample_text)  # after training, we apply the tokenizer to different data

# Gives the result of which tokens are verbs, adjectives, and so on.


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            #print(tagged)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            # chinking excludes the parts we don't need, instead of selecting everything and then listing a few exceptions

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)

    except Exception as e:
        print(str(e))


process_content()
Example #59
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 19 09:15:11 2015

@author: nilakant
"""


import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#unsupervised tokenizer
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))


process_content()
Example #60
"""Process the Yelp converted csv into proper form for the network."""

import sys
from nltk.tokenize import PunktSentenceTokenizer
import csv
from nltk import word_tokenize
import string
import re

tokenizer = PunktSentenceTokenizer()

fix_re = re.compile(r"[^a-z0-9.!,]+")
num_re = re.compile(r'[0-9]+')


def fix_word(word):
    word = word.lower()
    word = fix_re.sub('', word)
    word = num_re.sub('#', word)

    if not any((c.isalpha() or c in string.punctuation) for c in word):
        word = ''
    return word
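
# Quick sanity check of fix_word (illustrative examples added here, not from the
# original script):
assert fix_word('Great!!') == 'great!!'   # letters and . ! , are kept, lowercased
assert fix_word('$3.50') == '#.#'         # digit runs collapse to '#', other symbols are dropped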


stars_and_reviews = []

num_rows = 0
with open(sys.argv[1]) as inp_file, open(sys.argv[2], 'w') as out_file:
    reader = csv.reader(inp_file)
    writer = csv.writer(out_file)