Example #1
def tokenization(corpus, stop_words=nltk.corpus.stopwords.words('portuguese')):
    '''Input : corpus is a Series of documents (sentences)
       Output : a list of lists of words

    stop_words : list of words to be removed
    '''

    # Tokenization
    spacetok = SpaceTokenizer()
    corpus = [spacetok.tokenize(phrases) for phrases in corpus]

    # Remove stop words
    if stop_words is not None:
        tmp_corpus = list()
        tmp_words = list()

        for phrases in corpus:
            for word in phrases:
                if (word not in stop_words):
                    tmp_words.append(word)
                else:
                    pass
            tmp_corpus.append(tmp_words)
            tmp_words = list()

        corpus = tmp_corpus
    else:
        pass

    return corpus
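A minimal usage sketch (the sample sentences are invented and the expected output is only indicative; these imports must be in place before the function definition above is executed, and the Portuguese stop word list must have been downloaded):

import nltk
from nltk.tokenize import SpaceTokenizer

nltk.download('stopwords')  # Portuguese stop word list used by tokenization()
corpus = ["o gato subiu no telhado", "eu gosto de futebol"]
print(tokenization(corpus))
# e.g. [['gato', 'subiu', 'telhado'], ['gosto', 'futebol']]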
Example #2
def displayPageView(request):
    # Clear the logs table, then re-parse the selected log file line by line.
    mycursor.execute('TRUNCATE table logs_c')
    filePath = request.GET['input-file']
    filePath = "C:/Users/Rhishabh/Documents/mithi hackathon/" + filePath
    log = readfile(filePath)

    tk = SpaceTokenizer()
    line = log.readline()
    while line:
        tokens = tk.tokenize(line)
        process(tokens)
        line = log.readline()

    mydb.commit()
    
    result1 = query_1()
    result2 = query2()
    result3 = query3()
    result4 = query4()
    result5 = query5()
    result7 = query7()

    # mydb.close()
    temp = [['test', 'test'], ['test', 'test']]
    test = 'sdsds'
    return render(request, 'display.htm', {'ipfile': filePath, 'result1': result1, 'result2': result2, 'result3': result3, 'result4': result4, 'result5': result5, 'result7': result7})
Example #3
def extract_name(tweet):
    token = SpaceTokenizer()
    toks = token.tokenize(tweet)
    pos = pos_tag(toks)
    chunked_nes = ne_chunk(pos)
    nes = [
        ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
        if isinstance(ne, nltk.tree.Tree)
    ]
    return nes
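An illustrative call. It assumes the imports below plus the NLTK 'averaged_perceptron_tagger', 'maxent_ne_chunker' and 'words' data packages; the sample text and the printed result are only indicative:

import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

print(extract_name("Meeting Mahatma Gandhi in New Delhi today"))
# e.g. ['Mahatma Gandhi', 'New Delhi']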
class NLTKSpaceTokenizeBody(BaseEnrichment):
    """Use the NLTK SpaceTokenizer to parse the Tweet body."""
    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])
Example #5
def read_doc(doc, labels):
    doc = SpaceTokenizer().tokenize(doc.strip())
    # doc = doc.strip().split()
    labels = labels.strip().split('|')
    labels = [la.split() for la in labels]
    for i in range(len(labels)):
        for j in range(len(labels[i])):
            labels[i][j] = int(labels[i][j])

    res_labels = [0] * len(doc)
    for la in labels:
        if la[2] != 0:
            start = la[0]
            end = la[1]
            res_labels[start:end + 1] = [1] * (end + 1 - start)
    return [(doc[i], str(res_labels[i])) for i in range(len(doc))]
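A small invented example of the expected inputs: the label string appears to hold '|'-separated "start end dataset_id" triples whose indices refer to the space-tokenized document (SpaceTokenizer imported as in the other snippets):

doc = "We analysed the National Health Survey for this study"
labels = "3 5 42"
print(read_doc(doc, labels))
# [('We', '0'), ('analysed', '0'), ('the', '0'), ('National', '1'),
#  ('Health', '1'), ('Survey', '1'), ('for', '0'), ('this', '0'), ('study', '0')]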
class NLTKSpaceTokenizeBody(BaseEnrichment):
    def __init__(self):
        self.tokenizer = SpaceTokenizer()
    def enrichment_value(self,tweet):
        return self.tokenizer.tokenize(tweet['body'])
    def __repr__(self):
        return "Use the NLTK SpaceTokenizer to parse the Tweet body."
Example #7
import csv
import nltk
import nltk.tag, nltk.data
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer, sent_tokenize


def extract_entities(text):
    entities = []
    for sentence in sent_tokenize(text):
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(sentence)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
            #model = {'_': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
        model = {'extn': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        pos = pos_tag(toks)
        pos = tagger.tag(toks)
        #print pos
        chunks = ne_chunk(pos)
        #chunks = ne_chunk(pos_tag(word_tokenize(sentence)))
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities



# Disabled CSV input branch (kept commented out):
#with open("D:/R/BOA/PySrc/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.csv", "r") as csvfile:
#    datareader = csv.reader(csvfile, quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
#    csv_out = open('D:/R/BOA/Noun/FNoun.csv', 'wb')
#    mywriter = csv.writer(csv_out)
#    count = 0
#    for row in datareader:
#        count = count + 1
#        print "COUNT is :%d" % count
#        print ''.join(row)
#        #mywriter.writerow(extract_entities(''.join(row)))
#    csv_out.close()

file = open('D:/R/BOA/txtfiles/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.txt', 'r')
#print file.read()  # reading the whole file here would leave nothing for the loop below
filew = open('D:/R/BOA/Noun/FNoun.txt', "w")
for line in file:
    print line
    # extract_entities() returns a list of chunk trees, so join their string forms
    filew.write(','.join(str(ne) for ne in extract_entities(line)) + '\n')
    #filew.write("yeah its me")

filew.close()
file.close()
Example #8
def tokenize(s):
    out = []
    tokens = SpaceTokenizer().tokenize(s)
    for w in tokens:
        if w[:1] == "\n":
            out.append("\n")
            out.append(w[1:])
        else:
            out.append(w)
    return out
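A quick check of the newline handling above (sample string invented; SpaceTokenizer imported as in the other snippets): SpaceTokenizer splits only on ' ', so a token that starts with '\n' is emitted as a newline marker followed by the word.

print(tokenize("Hello \nworld again"))
# ['Hello', '\n', 'world', 'again']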
Example #9
def read_sent(sent):
    sent = SpaceTokenizer().tokenize(sent)
    start = int(sent[0])
    end = int(sent[1])
    dataset = int(sent[2])
    sent = sent[4:]
    labels = [0] * len(sent)
    if dataset != 0:
        labels[start:end + 1] = [1] * (end + 1 - start)
    return [(sent[i], str(labels[i])) for i in range(len(sent))]
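An invented input line; the leading fields appear to follow the "start end data_set_id publication_id tokens ..." pointer format written by get_train_data in Example #32 below:

line = "2 2 7 161 We used ANES data here"
print(read_sent(line))
# [('We', '0'), ('used', '0'), ('ANES', '1'), ('data', '0'), ('here', '0')]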
Example #10
def analyze_line(line):
    tokens = pos_tag(SpaceTokenizer().tokenize(line))

    names = []
    for token in tokens:
        if token[1] == 'NNP':
            names.append(re.sub('[' + string.punctuation + ']', '', token[0]))

    return {
        "names": names,
        "sentiment": SentimentIntensityAnalyzer().polarity_scores(line)
    }
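An illustrative run. It assumes the imports below (using NLTK's VADER implementation) and the 'averaged_perceptron_tagger' and 'vader_lexicon' data packages; the shown values are only indicative:

import re
import string
from nltk import pos_tag
from nltk.tokenize import SpaceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

result = analyze_line("Alice loves the new Tesla")
print(result["names"])      # e.g. ['Alice', 'Tesla']
print(result["sentiment"])  # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}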
Example #11
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    print "regexp_tokenizer:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern="\w+|\$[\d\.]+|\S+")
    # Split on whitespace (gaps=True treats the pattern as the separator)
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    # Select only words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)
    # How a subclass of RegexpTokenizer uses a predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)
    # Strings can be split on spaces, gaps, line breaks, and so on
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)
    # WordPunctTokenizer splits text with the regular expression \w+|[^\w\s]+,
    # separating alphabetic from non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)
    # Splitting with the str.split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')
    # Like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)
    # SpaceTokenizer works much like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)
    # The nltk.tokenize.util module tokenizes by returning a sequence of tuples
    # holding each token's position and offset within the sentence
    print "Token spans:", list(WhitespaceTokenizer().span_tokenize(sent))
    # Given a sequence of token spans, their relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print "位置和偏移:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))
    # By splitting at each occurrence of the separator,
    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the offsets of the tokens in sent:
    from nltk.tokenize.util import string_span_tokenize
    print "标识符序列:", list(string_span_tokenize(sent, " "))
Example #12
    def get_vocab(self, start_index=2, min_count=10):
        text = ''.join(list(self.publications['full_text'].values))
        all_words = SpaceTokenizer().tokenize(text + text.lower())
        vocab = Counter(all_words).most_common()
        vocab_out_json = {}
        for items in vocab:
            if items[1] > min_count:
                vocab_out_json[items[0].decode(
                    'utf-8', 'replace')] = len(vocab_out_json) + start_index

        print(len(vocab) - len(vocab_out_json), ' words are discarded as OOV')
        print(len(vocab_out_json), ' words are in vocab')

        with codecs.open(self.outdir + 'vocab.json', 'wb') as vocabfile:
            json.dump(vocab_out_json, vocabfile)
Example #13
def space_tokenizer(text, strip=None):
    ''' Only " " blank character
    Same as s.split(" ")
    >>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
    >>> SpaceTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.',
    "It's", 'inexpensive.', 'Free-for-all.',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.',
    "It's", 'inexpensive.', 'Free-for-all.',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>>'''

    for token in SpaceTokenizer().tokenize(text):
        if token not in patterns.PUNCTUATION and not token.isspace():
            yield token.strip(strip)
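patterns.PUNCTUATION comes from a module that is not shown in this excerpt; the self-contained sketch below (space_tokens is just an illustrative stand-in name) substitutes string.punctuation to demonstrate the same filtering idea:

import string
from nltk.tokenize import SpaceTokenizer

def space_tokens(text, strip=None, punctuation=frozenset(string.punctuation)):
    # Same idea as space_tokenizer() above, with string.punctuation standing in
    # for the external patterns.PUNCTUATION collection.
    for token in SpaceTokenizer().tokenize(text):
        if token not in punctuation and not token.isspace():
            yield token.strip(strip)

print(list(space_tokens("Good muffins cost $ 3.88 in New York .")))
# ['Good', 'muffins', 'cost', '3.88', 'in', 'New', 'York']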
Example #14
def tokenizarPorTipo():
    cadena = "Sorry, I can't go to the meeting.\n"
    print("TreebankWordTokenizer - 1")
    print("WhitespaceTokenizer - 2")
    print("SpaceTokenizer - 3")
    print("WordPunctTokenizer - 4")
    num = input("Enter a tokenizer: ")
    if num == "1":
        tokenizer = TreebankWordTokenizer()
    elif num == "2":
        tokenizer = WhitespaceTokenizer()
    elif num == "3":
        tokenizer = SpaceTokenizer()
    elif num == "4":
        tokenizer = WordPunctTokenizer()
    else:
        return

    tokens = tokenizer.tokenize(cadena)
    print(tokens)
Example #15
from nltk import CFG,ChartParser
from nltk.tokenize import SpaceTokenizer
grammar = CFG.fromstring("""
  S -> NP VP
  NP -> Det N
  VP -> IV
  Det -> 'the'
  N -> 'man'
  IV -> 'walks'
  """)
#>>> grammar
#<Grammar with 6 productions>
#>>> grammar.start()
#S
#>>> grammar.productions()
#[S -> NP VP, NP -> Det N, VP -> IV, Det -> 'the', N -> 'man', IV -> 'walks']
parser = ChartParser(grammar)
parses = parser.parse_all(SpaceTokenizer().tokenize("the man walks"))
#>>> parses
#[Tree('S', [Tree('NP', [Tree('Det', ['the']), Tree('N', ['man'])]), Tree('VP', [Tree('IV', ['walks'])])])]
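The parse trees returned by parse_all can be printed directly; the expected output is shown in the comment:

for tree in parses:
    print(tree)
    # (S (NP (Det the) (N man)) (VP (IV walks)))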
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:37
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : tokenizer.py
# @Software   : PyCharm
# @Description: Tokenization

# Import the required libraries
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

text = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
       "loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. " \
       "\nAnd I will have my vengeance, in this life or the next. "
ITokenizer = LineTokenizer()
print("按照换行分词 ", ITokenizer.tokenize(text))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("按照空格符分词 :", sTokenizer.tokenize(rawText))  # 表达符号和单词连在一起
print("按照单词分词 :", word_tokenize(rawText))  # 表达符号和单词分开

tweet = "This is a cooool #dummysmiley: :-) :-P <3"
tTokenizer = TweetTokenizer()
print("处理特殊字符 ", tTokenizer.tokenize(tweet))
Example #17
import csv

from nltk import pos_tag, ne_chunk
import nltk.tag, nltk.data
from nltk.tokenize import SpaceTokenizer


with open("D:/R/email_Analysis/FINAL/pyhton_mssg.csv", "r") as csvfile:
        datareader = csv.reader(csvfile,quotechar='"' ,lineterminator='\n',quoting=csv.QUOTE_ALL)
        csv_out = open('D:/R/email_Analysis/FINAL/Noun.csv.csv', 'wb')
	mywriter = csv.writer(csv_out)
	count=0
	for row in datareader:
				count = count + 1
				print "COUNT is :%d" % count
				tokenizer = SpaceTokenizer()
                                toks = tokenizer.tokenize((''.join(row)))
				default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
				model = {'Almost': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
				tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
                                #pos = pos_tag(toks)
				pos=tagger.tag(toks)
				print pos
                                chunked_nes = ne_chunk(pos) 

                                nes = [' '.join(map(lambda x: x[0], ne.leaves()))
                                       for ne in chunked_nes
                                             if isinstance(ne, nltk.tree.Tree)]
                                        #data.append(nes)
                                print nes
                                mywriter.writerow(nes)
Example #18
# import all necessary libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import stopwords
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define each steps
pipeline1 = [lambda s: re.sub('[^\w\s]', '', s),     # remove special characters
             lambda s: re.sub('[\d]', '', s),        # remove numbers
             lambda s: s.lower(),                    # lower case
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s))),   # remove stop words
             lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))   # stem (using Porter stemmer)
             ]
pipeline2 = [lambda s: re.sub('[^\w\s]', '', s),
             lambda s: re.sub('[\d]', '', s),
             lambda s: s.lower(),
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))
             ]
stopword_removal_pipeline = [lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))]

# pipeline handling
def preprocess_text(text, pipeline):
    return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:])
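An example run of the pipelines above (nltk.download('stopwords') must have been run once; the sample sentence and the shown results are illustrative):

sample = "The children are playing football in the garden!"
print(preprocess_text(sample, pipeline1))  # e.g. 'children play football garden'
print(preprocess_text(sample, pipeline2))  # e.g. 'children playing football garden'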
Example #19
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
 def __init__(self):
     self.tokenizer = SpaceTokenizer()
Example #21
    def get_test_docs(self):
        test_doc_ids = open(self.outdir + '/test_doc_ids', 'w+')
        test_docs = open(self.outdir + '/test_docs', 'w+')
        golden_data = open(self.outdir + '/test_doc_gold', 'w+')

        test_doc_list = []
        for doc in set(self.data_set_citations['publication_id']):
            if np.random.randint(
                    0, 100) < 10 and doc not in self.zero_shot_doc_ids:
                test_doc_list.append(doc)
                test_doc_ids.write(str(doc) + '\n')
        logger.info(str(len(test_doc_list)) + ' test docs selected')

        pub_ids = list(self.data_set_citations['publication_id'])
        pos_tokens = 0
        neg_tokens = 0
        #to locate lines with relevant pubs
        for pub_id in test_doc_list:
            pub_text = self.full_text[str(pub_id) + '.txt']
            test_docs.write(pub_text + '\n')
            pub_text_tokens = list(SpaceTokenizer().tokenize(pub_text))
            pub_text_spans = list(SpaceTokenizer().span_tokenize(pub_text))
            cur_pos_tokens = 0
            cur_neg_tokens = len(pub_text_tokens)

            res_line = []
            rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
            for idx in rows:
                d_row = self.data_set_citations.loc[idx]
                for mention_text in d_row['mention_list']:
                    mention_text = re.sub('\d', ' ', mention_text)
                    # mention_text = re.sub('[^ ]- ', '', mention_text)
                    mention_text_spans = list(
                        SpaceTokenizer().span_tokenize(mention_text))

                    index_finder_lower = findall_lower(mention_text, pub_text)
                    found_indices = [idx for idx in index_finder_lower]

                    for find_index in found_indices:
                        try:
                            if find_index != -1:
                                new_mention_text_spans = [
                                    (indices[0] + find_index,
                                     indices[1] + find_index)
                                    for indices in mention_text_spans
                                ]
                                cur_pos_tokens += len(mention_text_spans)

                                res_line.append(
                                    (pub_text_spans.index(
                                        new_mention_text_spans[0]),
                                     pub_text_spans.index(
                                         new_mention_text_spans[-1]),
                                     d_row['data_set_id'],
                                     d_row['publication_id']))
                        except:
                            pass
            res_line = list(set(res_line))
            if len(res_line) == 0:
                # no mentions at all
                res_line.append((-1, -1, 0, pub_id))
            i = 0
            for c in res_line:
                if i > 0:
                    golden_data.write(' | ' + str(c[0]) + ' ' + str(c[1]) +
                                      ' ' + str(c[2]) + ' ' + str(c[3]))
                else:
                    golden_data.write(
                        str(c[0]) + ' ' + str(c[1]) + ' ' + str(c[2]) + ' ' +
                        str(c[3]))
                i += 1
            golden_data.write('\n')
            pos_tokens += cur_pos_tokens
            neg_tokens += (cur_neg_tokens - cur_pos_tokens)

        test_doc_ids.close()
        test_docs.close()
        golden_data.close()

        logger.info(str(pos_tokens) + " pos tokens added.")
        logger.info(str(neg_tokens) + " neg tokens added.")
        logger.info("neg token percentage: {}".format(
            neg_tokens * 100 / (pos_tokens + neg_tokens)))
Example #22
#print(question[1])
selected_k = []
for r in range(len(ranked_q)):
    pos = nltk.pos_tag(ranked_q[r])
    selective_pos = ['NN', 'VB']
    selective_pos_words = []
    for word, tag in pos:
        if tag in selective_pos:
            selective_pos_words.append((word, tag))
    selected_k.append(selective_pos_words)
#print(selected_k[1][0])

# In[14]:

from nltk.tokenize import SpaceTokenizer
tm = SpaceTokenizer()
to_rank = []
key_words = []

for i in range(len(ranked_q)):
    yn = 0

    #ranked_q[i][yn]
    question[i] = untokenize(question[i])

    yy = "_____"
    to_rank.append(tm.tokenize(ranked_q[i][0]))
    print("Q:", question[i].replace(to_rank[i][len(to_rank[i]) // 2], yy))
    print('Ans - ', to_rank[i][len(to_rank[i]) // 2])
    #quita = question[i].index(to_rank[i][len(to_rank[i])//2])
import nltk
import numpy
import os

# import all necessary libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import stopwords
from functools import partial
#from gensim import corpora
#from gensim.models import TfidfModel
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()
 
# define steps
pipeline = [lambda s: re.sub('[\n]', '', s),
            lambda s: re.sub('[^\w\s]', '', s),
            lambda s: re.sub('[\d\n]', '', s),
            lambda s: s.lower(),
            lambda s: ' '.join(filter(lambda s: not (s in stopwords.words('english')), tokenizer.tokenize(s))),
            lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))
           ]
 
# function that carries out the pipeline step-by-step
def preprocess_text(text, pipeline):
    if len(pipeline)==0:
        return text
    else:
        return preprocess_text(pipeline[0](text), pipeline[1:])
Example #24
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d,
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
Example #25
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer()
print(
    "Line tokenizer output :",
    lTokenizer.tokenize(
        "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. \nAnd I will have my vengeance, in this life or the next."
    ))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

print("Word Tokenizer output :", word_tokenize(rawText))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",
      tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))
Example #26
from nltk.tokenize import SpaceTokenizer
from gensim.models import Word2Vec
import nltk

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('portuguese')

tokenizer = SpaceTokenizer()
wiki_tokenized = []

wiki_files = [
    "soccer_teams_wiki/resources/wikipedia_corinthians.txt",
    "soccer_teams_wiki/resources/wikipedia_palmeiras.txt",
    "soccer_teams_wiki/resources/wikipedia_portuguesa.txt",
    "soccer_teams_wiki/resources/wikipedia_santos.txt",
    "soccer_teams_wiki/resources/wikipedia_sao_paulo.txt"
]

for file in wiki_files:
    with open(file, "r") as wiki_file:
        wiki_text = wiki_file.readlines()

    # TODO text cleanup. Remove stop words and other text treatment for articles
    for line in wiki_text:
        phrase = [
            word.lower() for word in tokenizer.tokenize(line)
            if word not in stop_words
        ]
        wiki_tokenized.append(phrase)

our_model = Word2Vec(wiki_tokenized,
Example #27
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer
sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos) 
print chunked_nes
nes = [' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]

print nes
'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
    
   ''' 
    
'''
d = cmudict.dict()
Example #28
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes
nes = [
    ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
    if isinstance(ne, nltk.tree.Tree)
]

print nes
'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
    
 def __init__(self):
     self.tokenizer = SpaceTokenizer()
Example #30
# Chapter 3 Preprocessing - Tokenization - using NLTK's built-in tokenizers
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Using LineTokenizer (split into lines)
lTokenizer = LineTokenizer()
print(
    "Line tokenizer output :",
    lTokenizer.tokenize(
        "My name is " +
        "Maximus Decimus Meridius, commander of the Armies of the North, " +
        "General of the Felix Legions and loyal servant to the true emperor, " +
        "Marcus Aurelius. \nFather to a murdered son, husband to a murdered " +
        "wife. \nAnd I will have my vengeance, in this life or the next."))

# Using SpaceTokenizer (split on the space character)
rawText = "By 11 o'clock on sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

# Using word_tokenize (split into words and punctuation)
print("word Tokenizer output :", word_tokenize(rawText))

# Using TweetTokenizer (handles special characters such as emoticons and hashtags)
tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",
      tTokenizer.tokenize("This is a coooool " + "#dummysmiley: :-) :-P <3"))
Example #31
import pandas as pn
from nltk.tokenize import SpaceTokenizer

#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

logs = pn.read_csv("logs_enhanced.csv", encoding='latin-1')

messages = logs['message_utf8']

print messages[0] + '\n' + messages[1]
ml = len(messages)
print ml
#34467
itr = 0

tokenized_messages = []
tokenizer = SpaceTokenizer()

while (itr < 10):
    try:

        if (messages[itr][0:1] == "`"):
            itr += 1
            print "code found"
        else:
            tokenized_messages.append(tokenizer.tokenize(messages[itr]))
            itr += 1
            print itr
    except TypeError:
        print "Skipped"
        itr += 1
        print itr
Example #32
    def get_train_data(self, MAX_LENGTH=60, full_neg=True):
        ## MAX_LENGTH: max length of segments to be split into
        ## neg ratio: how many neg data to use (out of 100), should be an integer
        ## full_neg: whether to extract all neg data

        max_length_token = MAX_LENGTH

        ## avoid taking docs from test set
        test_doc_ids = []
        zero_shot_doc_ids = []
        with open('../data/all_test_docs/test_doc_ids') as f:
            fl = f.readlines()
            test_doc_ids = [int(line.strip()) for line in fl]

        with open('../data/all_test_docs/zero_shot_doc_ids') as f:
            fl = f.readlines()
            zero_shot_doc_ids = [int(line.strip()) for line in fl]

        train_doc_len = len(
            set(self.data_set_citations['publication_id'].values)) - len(
                test_doc_ids) - len(zero_shot_doc_ids)
        logger.info('sample from ' + str(train_doc_len) + ' train docs')

        pos_count = 0
        neg_count = 0
        pos_tokens = 0
        neg_tokens = 0

        sampled = []
        with codecs.open(self.outdir + 'pos_data',
                         'w') as pos_data, codecs.open(
                             self.outdir + 'neg_data', 'w') as neg_data:
            for index, row in self.data_set_citations.iterrows():
                pub_id = row['publication_id']
                if pub_id in zero_shot_doc_ids or pub_id in test_doc_ids:
                    continue

                if pub_id in sampled:
                    continue
                else:
                    sampled.append(pub_id)

                pub_ids = list(self.data_set_citations['publication_id'])
                rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
                mention_list = []
                for r in rows:
                    d_row = self.data_set_citations.loc[r]
                    mention_list.extend(d_row['mention_list'])
                mention_list = set(mention_list)
                logger.info('pub id: {}, mentions: {}'.format(
                    pub_id, len(mention_list)))

                sample_text = self.full_text[str(pub_id) + '.txt']
                sample_text_tokens = list(
                    SpaceTokenizer().tokenize(sample_text))
                sample_text_spans = list(
                    SpaceTokenizer().span_tokenize(sample_text))

                pos_splits = []
                for mention_text in mention_list:
                    mention_text = re.sub('\d', ' ', mention_text)
                    # mention_text = re.sub('[^ ]- ', '', mention_text)
                    mention_text_spans = list(
                        SpaceTokenizer().span_tokenize(mention_text))

                    index_finder_lower = findall_lower(mention_text,
                                                       sample_text)

                    all_found_indices = [idx for idx in index_finder_lower]

                    for find_index in all_found_indices:
                        try:
                            if find_index != -1:
                                # logger.info('Found: '+mention_text)
                                new_mention_text_spans = [
                                    (indices[0] + find_index,
                                     indices[1] + find_index)
                                    for indices in mention_text_spans
                                ]
                                #write to training sample pointers here

                                for splits in range(
                                        len(sample_text_tokens) //
                                        max_length_token - 1):
                                    if sample_text_spans.index(new_mention_text_spans[0]) > splits*(max_length_token) and \
                                      sample_text_spans.index(new_mention_text_spans[-1]) < (splits+1)*(max_length_token):

                                        pos_splits.append(splits)
                                        pos_count += 1
                                        pos_tokens += len(
                                            new_mention_text_spans)
                                        neg_tokens += (
                                            MAX_LENGTH -
                                            len(new_mention_text_spans))

                                        #TODO Wrapper over full data reader
                                        pos_data.write(
                                            str(
                                                sample_text_spans.index(
                                                    new_mention_text_spans[0])
                                                - splits *
                                                (max_length_token)) + ' ' +
                                            str(
                                                sample_text_spans.index(
                                                    new_mention_text_spans[-1])
                                                - splits *
                                                (max_length_token)) + ' ' +
                                            str(row['data_set_id']) + ' ' +
                                            str(row['publication_id']) + ' ' +
                                            ' '.join(sample_text_tokens[
                                                splits * (max_length_token):
                                                (splits + 1) *
                                                (max_length_token) + 1]) +
                                            '\n')
                            else:
                                # print ('Annotation Error: Annotated gold standards not correct')
                                pass
                        except:
                            # print ('Indexing Logic Error: Some corner index case missed while parsing')
                            pass

                if not full_neg:
                    continue
                ## NOTE: index starts from 0
                ## -1 - 1 means no mention
                for splits in range(
                        len(sample_text_tokens) // (max_length_token) - 1):
                    if splits not in pos_splits:
                        neg_data.write(
                            str(-1) + ' ' + str(-1) + ' ' + str(0) + ' ' +
                            str(row['publication_id']) + ' ' +
                            ' '.join(sample_text_tokens[splits *
                                                        (max_length_token):
                                                        (splits + 1) *
                                                        (max_length_token)]) +
                            '\n')

                        neg_count += 1
                        neg_tokens += MAX_LENGTH

        logger.info(str(pos_count) + " mentions added.")
        logger.info(str(neg_count) + " no mentions added.")
        logger.info(str(pos_tokens) + " pos tokens added.")
        logger.info(str(neg_tokens) + " neg tokens added.")
        logger.info("neg token percentage: {}".format(
            neg_tokens * 100 / (pos_tokens + neg_tokens)))
Example #33
from nltk.tokenize import SpaceTokenizer

tokenizer = SpaceTokenizer()
start_tag = 0


def write_sentence_in_format(sentence, input_file, out):
    list_of_words_from_sentence = tokenize(sentence)
    write(list_of_words_from_sentence, out, input_file)


def tokenize(sentence):
    return tokenizer.tokenize(sentence)


def write(list_of_words, out, input_file):
    global start_tag
    for line in list_of_words:
        # write line to output file
        pass


def write_value(sentence, value, out):
    out.write(sentence + " " + value + "\n")
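A minimal check of write_value, using an in-memory buffer in place of the output file handle (io.StringIO here is only for illustration):

import io

buf = io.StringIO()
write_value("walks", "O", buf)
print(repr(buf.getvalue()))  # 'walks O\n'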
Example #34
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Line tokenizer
longSentence = 'My name is Maximus Decimus Meridius, Commander of the Armies '\
'of the North, General of the Felix Legions, loyal servant to '\
'the true emperor, Marcus Aurelius. Father to a murdered son, '\
'husband to a murdered wife. And I will have my vengeance, in '\
'this life or the next.'

lTokenizer = LineTokenizer()
sentenceTokens = lTokenizer.tokenize(longSentence)
print (sentenceTokens)

# Space tokenizer
sTokenizer = SpaceTokenizer()
spaceTokens = sTokenizer.tokenize(longSentence)
print (spaceTokens)

# Tweet tokenizer
tweet = 'This is a coool #dummysmiley: :-) :) :-P <3'
tTokenizer = TweetTokenizer()
tTokens = tTokenizer.tokenize(tweet)
print ('Tweet tokenizer output:')
print (tTokens)

# Word tokenizer
wTokenizer = word_tokenize(longSentence)
print (wTokenizer)

################
Example #35
indx ='\n'.join(res)
print ("\nThe sentences contaning '"+ inp +"'"+" are : \n" + indx)
#conversations containing input

con = re.findall(r'"(?:(?:(?!(?<!\\)").)*)"', str(res))
indx2 ='\n'.join(con)
print ("\nThe conversations contaning '"+ inp +"'"+" are : \n" + indx2)
#count of conversations
count = len(list(filter(lambda x: inp in x, con))) 
print ("\nThe count of conversations contaning '"+ inp +"'"+" are :\n"+str(count))
#All conversations in the excerpt
allconv = re.findall(r'"(.*?)"', str(token_text))
indx3 ='\n'.join(allconv)
print ("\nThe conversations in the excerpt are : \n" + indx3)

from nltk.tag import pos_tag
tagged_sent = pos_tag(text_string.split())
#propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
#print( propernouns)

from nltk.tree import Tree
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(text_string)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos) 
nes = [' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes if isinstance(ne,Tree)]
indx4 ='\n'.join(nes)
print("\n Proper nouns used in the excerpt are:\n", indx4)
Example #36
# -*- coding: utf-8 -*-

from nltk.tokenize import SpaceTokenizer

import sys

s = sys.argv[1].decode('utf-8')
dt = sys.argv[2]

#print "dt = "+dt

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   ' 	'
tokens = []

if(dt == " "):
	tokens = SpaceTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \t ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   '\t'
from nltk.tokenize import TabTokenizer


if(dt == '\\t'):
	print "dt = "+dt
	s = s.replace(u'\\t','\t')
	tokens = TabTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ"   '\n'
from nltk.tokenize import LineTokenizer

if(dt == '\\n'):
	s = s.replace(u'\\n','\n')
	tokens = LineTokenizer().tokenize(s)
Example #37
File: Sry_BOA.py  Project: Aashu3739/PJS
import os
import glob
import re

import nltk.tag, nltk.data
from nltk import ne_chunk
from nltk.tokenize import SpaceTokenizer

os.chdir("D:/R/BOA/txtfiles")
for fileName in glob.glob("*.txt"):
    count = 0
    file = open('D:/R/BOA/txtfiles/' + fileName, 'r')
    filew = open('D:/R/BOA/Noun/' + fileName, "wb")
    for line in file:
        count = count + 1
        print count
        print line
        line = re.sub('\\f', '', line)
        #line = line.decode("utf-8")
        line = unicode(line, errors='ignore')
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(line)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {'Consumer': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [' '.join(map(lambda x: x[0], ne.leaves()))
               for ne in chunked_nes
               if isinstance(ne, nltk.tree.Tree)]
        #data.append(nes)
        print nes
        filew.write((','.join(nes)) + '\n')
        #filew.write("yeah its me")