from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import SnowballStemmer
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from nltk.util import ngrams


class DataFilter:
    def __init__(self):
        self.bl_tokenizer = LineTokenizer()
        self.re_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.stemmer = SnowballStemmer('english')
        self.NGRAM_RANGE = 3

    def rm_blanklines(self, text):
        return " ".join([word for word in self.bl_tokenizer.tokenize(text)])

    def rm_stopwords(self, text):
        return " ".join([
            word for word in word_tokenize(text)
            if word.lower() not in stopwords.words()
        ])

    def ngram_tokenize(self, text):
        return [
            word for sent in sent_tokenize(text)
            for word in ngrams(self.re_tokenizer.tokenize(sent), self.NGRAM_RANGE)
        ]

    def tokenize_(self, text):
        return [
            word for sent in sent_tokenize(text)
            for word in self.re_tokenizer.tokenize(sent)
        ]

    def tokenize_and_stem(self, text):
        return [
            self.stemmer.stem(word)
            for sent in sent_tokenize(text)
            for word in self.re_tokenizer.tokenize(sent)
        ]

    def rm_nonwords(self, text):
        return " ".join([
            word for word in word_tokenize(text)
            if word.lower() in words.words()
        ])
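# Minimal usage sketch for the DataFilter class above (illustrative only; the
# sample string is made up and the NLTK 'punkt', 'stopwords' and 'words'
# corpora are assumed to be downloaded already):
df = DataFilter()
sample = "The quick brown fox\n\njumps over the lazy dog."
print(df.rm_blanklines(sample))       # blank lines dropped, text joined on one line
print(df.rm_stopwords(sample))        # common stop words such as 'the' and 'over' removed
print(df.tokenize_and_stem(sample))   # Snowball stems, e.g. 'jump', 'lazi'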
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer()
print("Line tokenizer output :", lTokenizer.tokenize(
    "My name is Maximus Decimus Meridius, commander of the Armies of the North, "
    "General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. "
    "\nFather to a murdered son, husband to a murdered wife. "
    "\nAnd I will have my vengeance, in this life or the next."))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))
print("Word Tokenizer output :", word_tokenize(rawText))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :", tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))
from nltk.tokenize import LineTokenizer


def line_tokenizer(data, blanklines):
    '''Tokenize the text at line level, i.e. split on newlines (\\n).'''
    tokenizer = LineTokenizer(blanklines=blanklines)
    return tokenizer.tokenize(data)
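# Usage sketch (illustrative): the blanklines argument controls how blank lines
# are handled -- 'discard' drops them, 'keep' keeps them as empty strings, and
# 'discard-eof' only drops a trailing blank line at the end of the text.
sample = "first line\n\nsecond line\n"
print(line_tokenizer(sample, 'discard'))  # ['first line', 'second line']
print(line_tokenizer(sample, 'keep'))     # ['first line', '', 'second line']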
from nltk.tokenize import LineTokenizer


def sentence_tokenizer(corpus):
    line_tokenizer = LineTokenizer()
    song_lines = line_tokenizer.tokenize(corpus)
    return song_lines
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:37
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : tokenizer.py
# @Software   : PyCharm
# @Description: Tokenization

# Import the required libraries
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

text = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
       "loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. " \
       "\nAnd I will have my vengeance, in this life or the next. "
lTokenizer = LineTokenizer()
print("Tokenized by line breaks:", lTokenizer.tokenize(text))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Tokenized by spaces:", sTokenizer.tokenize(rawText))  # punctuation stays attached to the words
print("Tokenized by words:", word_tokenize(rawText))  # punctuation is separated from the words

tweet = "This is a cooool #dummysmiley: :-) :-P <3"
tTokenizer = TweetTokenizer()
print("Handling special characters:", tTokenizer.tokenize(tweet))
import nltk
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')  # Loads the Spanish sentence tokenizer
print(tokenizer.tokenize(para))  # Tokenizes the text (para is defined earlier in the source)

# Tokenize based on lines, spaces or tweets (special class)

# Line tokenizer
longSentence = 'My name is Maximus Decimus Meridius, Commander of the Armies '\
               'of the North, General of the Felix Legions, loyal servant to '\
               'the true emperor, Marcus Aurelius. Father to a murdered son, '\
               'husband to a murdered wife. And I will have my vengeance, in '\
               'this life or the next.'
lTokenizer = LineTokenizer()
sentenceTokens = lTokenizer.tokenize(longSentence)
print(sentenceTokens)

# Space tokenizer
sTokenizer = SpaceTokenizer()
spaceTokens = sTokenizer.tokenize(longSentence)
print(spaceTokens)

# Tweet tokenizer
tweet = 'This is a coool #dummysmiley: :-) :) :-P <3'
tTokenizer = TweetTokenizer()
tTokens = tTokenizer.tokenize(tweet)
print('Tweet tokenizer output:')
print(tTokens)

# Word tokenizer
from nltk.tokenize import LineTokenizer


def load_stop_words(lang):
    # read_file and STOP_WORDS_PATH are defined elsewhere in the module
    content = read_file(STOP_WORDS_PATH)
    lt = LineTokenizer()
    return lt.tokenize(content)
import difflib
from nltk.tokenize import LineTokenizer

# ---------text_match-------------------------------------------------------------------------------
# text_match is a user-defined helper (defined elsewhere) that appears to return a regex match object
CCC = text_match(
    '''澎湃新闻(www.thepaper.cn)梳理发现,9月24日,青岛市在对青岛港大港公司进口冷链产品装卸工人进行定期例行检测时,发现2例新型冠状病毒肺炎无症状感染者。10月11日,青岛市又发现3例新冠肺炎无症状感染者
随后,青岛将上述3例无症状感染者的密切接触者和市胸科医院相关人员作为''', "青岛")
# CCC = text_match('''asdccc''', "ccc")
print("CCC text_match", CCC)
print("CCC text_match", CCC.start())
print("CCC text_match", CCC.group(0))  # 青岛
print("CCC text_match", CCC.groups())
print("CCC text_match", CCC.group())   # 青岛
print("-------------------------------------------------------------")
print("-------------------------------------------------------------")

# ---------line split-------------------------------------------------------------------------------
lTokenizer = LineTokenizer()
# print("lTokenizer result:", ...)
AAA = lTokenizer.tokenize(
    '''澎湃新闻(www.thepaper.cn)梳理发现,9月24日,青岛市在对青岛港大港公司进口冷链产品装卸工人进行定期例行检测时,发现2例新型冠状病毒肺炎无症状感染者。10月11日,青岛市又发现3例新冠肺炎无症状感染者
随后,青岛将上述3例无症状感染者的密切接触者和市胸科医院相关人员作为高风险人群进行重点监测,共排查到密切接触者和市胸科医院所有在院患者及陪护人员377人,其中新增核酸检测结果阳性9人,包括8名市胸科医院在院患者及陪护人员、1名患者家属,经专家组判定,其中4例为确诊病例、5例为无症状感染者。
青岛市卫健委通报称,截至10月11日23时,青岛市共发现6例确诊病例,6例无症状感染者。到目前发现的所有确诊病例和无症状感染者均与市胸科医院高度关联。而市胸科医院部分独立区域承担着收治境外输入新冠病毒感染者的任务。
澎湃新闻(www.thepaper.cn)注意到,山东正调集全省流调和检测方面的机动力量,赴青岛提供支持。山东省委、省政府已经在青岛设立前方指挥部,青岛市正开展大规模核酸检测,全面彻底排查可能的感染者,以尽快实现城区人员检测全覆盖。
''')
print(AAA)

# ---------get_close_matches------------------------------------------------------------------------
aaa = difflib.get_close_matches(
    '(www.thepaper.cn)注意到,山东正调集全省流调和检测方面的机动力量,赴青岛提供支持。山东省委、省政府已经在青岛设立前方指挥部,青岛市正开展大规模核酸检测,全面彻底排查可能的感染者,以尽快实现城区人员检测全覆盖。',
    AAA, 1, cutoff=0.1)
print("get_close_matches - aaa match", aaa)
from nltk.tokenize import LineTokenizer


def load_stop_words():
    # join and read_file are defined elsewhere in the module
    STOP_WORDS_PATH = join('sw.txt')
    content = read_file(STOP_WORDS_PATH)
    lt = LineTokenizer()
    return lt.tokenize(content)
import re
import string

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TabTokenizer
from textblob import TextBlob


def text_pre_processing(text, remove_number=True, stop_word=True,
                        stop_word_language='english', remove_punctuation=True):
    # ---------------------------------------------
    # Patterns
    results_chunk = ''
    results_named_entity = ''
    patterns1 = r'@[A-Za-z0-9_]+'
    patterns2 = r'https?://[^ ]+'
    combined_patterns = r'|'.join((patterns1, patterns2))
    www_patterns = r'www.[^ ]+'
    negations_dic = {
        "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not",
        "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
        "mightn't": "might not", "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    # ---------------------------------------------
    # Coerce the input to a string
    results = str(text)
    # ---------------------------------------------
    # Text cleaning
    results = re.sub(combined_patterns, '', results)
    results = re.sub(www_patterns, '', results)
    results = results.lower()
    results = negations_pattern.sub(lambda x: negations_dic[x.group()], results)
    results = re.sub("[^a-zA-Z]", " ", results)
    results = results.replace("(<br/>)", "")
    results = results.replace('(<a).*(>).*(</a>)', '')
    results = results.replace('(&)', '')
    results = results.replace('(>)', '')
    results = results.replace('(<)', '')
    results = results.replace('(\xa0)', ' ')
    # ---------------------------------------------
    # Remove numbers
    if (remove_number) & (results != ''):
        results = re.sub(r'\d+', '', results)
    # ---------------------------------------------
    # Remove punctuation
    if remove_punctuation & (results != ''):
        translator = str.maketrans('', '', string.punctuation)
        results = results.translate(translator)
    # ---------------------------------------------
    # Remove whitespace
    results = results.strip()
    # ---------------------------------------------
    # Line tokenize
    if results != '':
        line_tokenizer = LineTokenizer()
        results = line_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]
    # ---------------------------------------------
    # Tab tokenize
    if results != '':
        tab_tokenizer = TabTokenizer()
        results = tab_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]
    # ---------------------------------------------
    # Space tokenize
    if results != '':
        space_tokenizer = SpaceTokenizer()
        results = space_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = ' '.join([w for w in results])
    # -----------------------------------------------
    # Lemmatization using NLTK
    # (get_word_net_pos_tag is a user-defined helper assumed to exist elsewhere)
    if results != '':
        lemmatizer_of_text = WordNetLemmatizer()
        word_list = word_tokenize(results)
        results = ' '.join([
            lemmatizer_of_text.lemmatize(w, get_word_net_pos_tag(w))
            for w in word_list
        ])
    # ---------------------------------------------
    # Stemming using NLTK
    if results != '':
        stemmer = PorterStemmer()
        if type(results) == list:
            results = ' '.join(str(w) for w in results)
        results = word_tokenize(str(results))
        results = [stemmer.stem(word) for word in results]
        results = ' '.join(str(w) for w in results)
    # ---------------------------------------------
    # Remove stop words
    if stop_word & (results != ''):
        nltk.download('stopwords')
        stop_words = set(stopwords.words(stop_word_language))
        word_tokens = word_tokenize(results)
        results = ' '.join(str(w) for w in word_tokens if not w in stop_words)
    # ---------------------------------------------
    # Chunking of the input, will be used for coloring of the text
    if results != '':
        result_str = TextBlob(results)
        reg_exp = 'NP: {<DT>?<JJ>*<NN>}'
        rp = nltk.RegexpParser(reg_exp)
        results_chunk = rp.parse(result_str.tags)
        # results_chunk.draw()
    # ---------------------------------------------
    # Named Entity Recognition
    if results != '':
        results_named_entity = ne_chunk(pos_tag(word_tokenize(results)))

    return results, results_chunk, results_named_entity
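# Usage sketch for text_pre_processing (illustrative input; needs the NLTK
# 'punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger', 'maxent_ne_chunker'
# and 'words' resources, plus the get_word_net_pos_tag helper defined elsewhere):
cleaned, chunks, named_entities = text_pre_processing(
    "Check https://example.com - the doctors weren't available on Sunday!")
print(cleaned)           # cleaned, lemmatized/stemmed text with stop words removed
print(chunks)            # nltk.Tree of NP chunks over the cleaned text
print(named_entities)    # nltk.Tree produced by ne_chunk over the cleaned text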
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
       " Software industry working \nfrom applications to products by using \n" \
       " C, C++, Java, Javascript and databases "\
       " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

lTokenizer = LineTokenizer()
print("Line tokenizer output: ", lTokenizer.tokenize(line))

sTokenizer = SpaceTokenizer()
print("Space Tokenizer output: ", sTokenizer.tokenize(line))

print("Word Tokenizer output: ", word_tokenize(line))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output: ", tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P <3"))
import codecs
import json
from collections import OrderedDict
from subprocess import Popen, PIPE

from nltk import sent_tokenize
from nltk.tokenize import LineTokenizer, WhitespaceTokenizer
from nltk.util import bigrams, trigrams


def _update_ngram_database(notes_directory, ngram_db_dir):

    line_tokenizer = LineTokenizer(blanklines='discard')
    word_tokenizer = WhitespaceTokenizer()

    grep_command = 'find {} | grep ".note$"'.format(notes_directory)
    proc = Popen(
        grep_command,
        stdout=PIPE, stderr=PIPE,
        shell=True)
    output, err = proc.communicate()
    all_notes_files = output.decode().split('\n')

    '''
    Create master list of all raw tokens. Will look like:
        tokens = {
            'unigrams': ['all', 'unigrams'],
            'bigrams': [('all', 'bigrams')],
            'trigrams': [('all', 'the', 'trigrams')]
        }
    '''
    tokens = {
        'unigrams': [],
        'bigrams': [],
        'trigrams': []
    }

    for note_file in all_notes_files:

        if not note_file:
            continue

        with codecs.open(note_file, mode="r", encoding="utf-8") \
                as note_file_object:
            note_file_content = note_file_object.read()

        note_file_content = note_file_content.lower()
        lines = line_tokenizer.tokenize(note_file_content)

        for line in lines:
            sentences = sent_tokenize(line)

            for sentence in sentences:
                sentence_safe_split = []
                all_words = word_tokenizer.tokenize(sentence)

                for word in all_words:

                    # Skip any word with a forbidden character
                    if any([char in word for char in FORBIDDEN_CHARS]):
                        continue

                    has_letters = False
                    for char in word:
                        if char.isalpha():
                            has_letters = True
                            break

                    if word and has_letters:
                        sentence_safe_split.append(word)

                tokens['unigrams'].extend(sentence_safe_split)
                tokens['bigrams'].extend(bigrams(sentence_safe_split))
                tokens['trigrams'].extend(trigrams(sentence_safe_split))

    '''
    Squash the list of tokens into a dict that tracks the
    number of occurrences of each token. Will look like:
        tokens = {
            'unigrams': {
                'foo': 17,
                'bar': 42,
                ...
            },
            ...
        }
    '''
    for token_type in tokens.keys():

        all_tokens_of_type = tokens[token_type]
        weighted_tokens = {}

        for single_token in all_tokens_of_type:
            if not isinstance(single_token, str):
                single_token = ' '.join(single_token)
            if not weighted_tokens.get(single_token):
                weighted_tokens[single_token] = 1
            else:
                weighted_tokens[single_token] = weighted_tokens[single_token] + 1

        tokens[token_type] = OrderedDict(sorted(
            weighted_tokens.items(),
            key=lambda t: t[1],
            reverse=True))

    # Write Unigrams to Disk
    unigrams_json_file_path = ngram_db_dir + '/unigrams.json'
    unigrams_text_file_path = ngram_db_dir + '/unigrams.txt'

    with open(unigrams_json_file_path, 'w') as unigrams_json_file_object:
        json.dump(tokens['unigrams'], unigrams_json_file_object)

    with codecs.open(unigrams_text_file_path, mode="w", encoding="utf-8") \
            as unigrams_text_file_object:
        for unigram, frequency in tokens['unigrams'].items():
            unigrams_text_file_object.write(unigram + '\n')

    # Write Bigrams to Disk
    bigrams_json_file_path = ngram_db_dir + '/bigrams.json'
    bigrams_text_file_path = ngram_db_dir + '/bigrams.txt'

    with open(bigrams_json_file_path, 'w') as bigrams_json_file_object:
        json.dump(tokens['bigrams'], bigrams_json_file_object)

    with codecs.open(bigrams_text_file_path, mode="w", encoding="utf-8") \
            as bigrams_text_file_object:
        for bigram, frequency in tokens['bigrams'].items():
            bigrams_text_file_object.write(bigram + '\n')

    # Write Trigrams to Disk
    trigrams_json_file_path = ngram_db_dir + '/trigrams.json'
    trigrams_text_file_path = ngram_db_dir + '/trigrams.txt'

    with open(trigrams_json_file_path, 'w') as trigrams_json_file_object:
        json.dump(tokens['trigrams'], trigrams_json_file_object)

    with codecs.open(trigrams_text_file_path, mode="w", encoding="utf-8") \
            as trigrams_text_file_object:
        for trigram, frequency in tokens['trigrams'].items():
            trigrams_text_file_object.write(trigram + '\n')
from nltk.tokenize import LineTokenizer


def lineTokenize(data):
    lineTokenize_object = LineTokenizer()
    lineTokenize_string = lineTokenize_object.tokenize(data)
    return lineTokenize_string