def test_wordsent(self):
    text = u"""Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "sự gây hấn nhằm vào một quốc gia có chủ quyền", gây tổn hại đến quan hệ Moscow-Washington."""
    word_tokenize(text)
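# A minimal usage sketch (an illustration, assuming underthesea is installed): by
# default word_tokenize returns a list of tokens in which multi-syllable
# Vietnamese words are kept together as single items. The sample sentence and
# the segmentation shown are illustrative; the exact output depends on the
# model version.
from underthesea import word_tokenize

tokens = word_tokenize(u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò")
print(tokens)  # e.g. ['Chàng trai', '9X', 'Quảng Trị', 'khởi nghiệp', 'từ', 'nấm sò']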
def tokenizer(row):
    return word_tokenize(row, format="text")
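# A hypothetical usage sketch for the wrapper above: with format="text",
# word_tokenize returns a single string in which the syllables of multi-syllable
# words are joined by underscores, which suits scikit-learn style vectorizers.
# The DataFrame and column names here are assumptions, not part of the original.
import pandas as pd

df = pd.DataFrame({"text": [u"Công nghệ thông tin phát triển nhanh"]})
df["text_tokenized"] = df["text"].apply(tokenizer)  # reuses tokenizer() defined above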
    if label not in label_map:
        continue
    data_with_label[label_map[label]].append(content)

x_train = []
y_train = []
x_test = []
y_test = []

for label, contents in data_with_label.items():
    contents_length = len(contents)
    separate_index = (contents_length * 2) // 3
    for index, content in enumerate(contents):
        # for content in contents:
        content = content.lower()
        words = word_tokenize(content)
        new_words = list(map(lambda word: '_'.join(word.split(' ')), words))
        content_after_handling = ' '.join(new_words)
        if index <= separate_index:
            x_train.append(content_after_handling)
            y_train.append(label)
        else:
            x_test.append(content_after_handling)
            y_test.append(label)

# x_test = []
print('Start training')
(count_vectorizer, tf_idf_transformer,
 x_train_counts, x_train_tf_idf) = calculate_tf_idf(x_train)
# with open('sentiment_analysis_test.v1.0.txt') as file:
#     for line in file:
def clean_data(data):
    sentences = word_tokenize(data)
    # pre-process
    sentences = [stemmer.stem(w.lower()) for w in sentences if w not in stopwords]
    return sentences
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', keep_sentences=False): tokens_sentences = [] if lang not in main.settings_global['word_tokenizers']: lang = 'other' if word_tokenizer == 'default': word_tokenizer = main.settings_custom['word_tokenization'][ 'word_tokenizers'][lang] if keep_sentences: wordless_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer) else: wordless_text_utils.check_word_tokenizers( main, lang=lang, word_tokenizer=word_tokenizer) if 'NLTK' in word_tokenizer: sentences = wordless_sentence_tokenize(main, text, lang) if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'): treebank_tokenizer = nltk.TreebankWordTokenizer() for sentence in sentences: tokens_sentences.append(treebank_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'): tweet_tokenizer = nltk.TweetTokenizer() for sentence in sentences: tokens_sentences.append(tweet_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'): nist_tokenizer = nltk.tokenize.nist.NISTTokenizer() for sentence in sentences: tokens_sentences.append(nist_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'): toktok_tokenizer = nltk.ToktokTokenizer() for sentence in sentences: tokens_sentences.append(toktok_tokenizer.tokenize(sentence)) if not keep_sentences: tokens_sentences = [ itertools.chain.from_iterable(tokens_sentences) ] elif 'Sacremoses' in word_tokenizer: if keep_sentences: sentences = wordless_sentence_tokenize(main, text, lang) else: sentences = [text] if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'): moses_tokenizer = sacremoses.MosesTokenizer( lang=wordless_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_sentences.append( moses_tokenizer.tokenize(sentence, escape=False)) elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'): moses_tokenizer = sacremoses.MosesTokenizer( lang=wordless_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_sentences.append( moses_tokenizer.penn_tokenize(sentence)) elif 'spaCy' in word_tokenizer: nlp = main.__dict__[f'spacy_nlp_{lang}'] doc = nlp(text) # See Issue #3479: https://github.com/explosion/spaCy/issues/3479 doc.is_parsed = True if keep_sentences: for sentence in doc.sents: tokens_sentences.append( [token.text for token in sentence.as_doc()]) else: tokens_sentences.append([token.text for token in doc]) # Chinese & Japanese elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer): if keep_sentences: sentences = wordless_sentence_tokenize(main, text, lang=lang) else: sentences = [text] # Chinese if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'): for sentence in sentences: tokens_sentences.append(jieba.cut(sentence)) elif word_tokenizer == main.tr( 'Wordless - Chinese Character Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wordless_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # English if wordless_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or 
wordless_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_sentences.extend(tokens) # Japanese elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'): import nagisa for sentence in sentences: tokens_sentences.append(nagisa.tagging(str(sentence)).words) elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wordless_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # Japanese Kana if wordless_checking_unicode.is_kana(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_kana( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='jpn')) non_han_start = i + j + 1 break # English elif wordless_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wordless_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_sentences.extend(tokens) # Thai elif 'PyThaiNLP' in word_tokenizer: sentences = wordless_sentence_tokenize( main, text, lang='tha', sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer') if word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching Algorithm + TCC'): for sentence in sentences: tokens_sentences.append( pythainlp.tokenize.word_tokenize(sentence, engine='newmm')) elif word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching Algorithm'): for sentence in sentences: tokens_sentences.append( pythainlp.tokenize.word_tokenize(sentence, engine='mm')) elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'): for sentence in sentences: tokens_sentences.append( pythainlp.tokenize.word_tokenize( sentence, engine='longest-matching')) # Tibetan elif 'pybo' in word_tokenizer: if keep_sentences: sentences = wordless_sentence_tokenize(main, text, lang='bod') else: sentences = [text] if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'): for sentence in sentences: tokens_sentences.append([ token.text for token in main.pybo_tokenizer_gmd.tokenize(sentence) ]) elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'): for sentence in sentences: tokens_sentences.append([ token.text for token in main.pybo_tokenizer_pos.tokenize(sentence) ]) elif word_tokenizer == main.tr( 'pybo - Tibetan Word Tokenizer (tsikchen)'): for sentence in sentences: tokens_sentences.append([ token.text for token in main.pybo_tokenizer_tsikchen.tokenize(sentence) ]) # Vietnamese elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'): if keep_sentences: sentences = wordless_sentence_tokenize( main, text, lang='vie', sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer' ) else: sentences = [text] for sentence in sentences: tokens_sentences.append(underthesea.word_tokenize(str(sentence))) # Remove empty tokens and strip whitespace for i, tokens in enumerate(tokens_sentences): tokens_sentences[i] = 
[ token.strip() for token in tokens if token.strip() ] # Record token boundaries if lang in ['zho_cn', 'zho_tw', 'jpn']: for tokens in tokens_sentences: if tokens: tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary='', sentence_ending=True) else: for tokens in tokens_sentences: if tokens: tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary=' ', sentence_ending=True) return tokens_sentences
def funcOpenFileTrainCheckPost():
    file_path = os.path.join(script_dir, "train_NLTK/checkwords_vn_train.txt")
    f = codecs.open(file_path, "r", "utf8")
    keywords_for_check = f.read()
    ArrCheck = word_tokenize(keywords_for_check)
    print(str(ArrCheck) + "\n")
words = []
classes = []
documents = []

fileName = "../process/StopWords"
file_Stop_word = open(fileName, "r", encoding="utf-8")
stopWords = set()
for line in file_Stop_word:
    line = line.strip("\n")
    stopWords.add(line)
ignore_words = list(stopWords)

for intent in intents['intents']:
    for question in intent['questions']:
        w = word_tokenize(question)
        words.extend(w)
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

words = [w.lower() for w in words if w not in ignore_words if len(w) != 1]
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))

pickle.dump(words, open('../deploy/words.pkl', 'wb'))
pickle.dump(classes, open('../deploy/classes.pkl', 'wb'))
pickle.dump(documents, open('../deploy/documents.pkl', 'wb'))
pickle.dump(ignore_words, open('../deploy/ignore_words.pkl', 'wb'))
def preprocessing(s, show_stepbystep=False):
    if show_stepbystep:
        print("original:")
        print(s)
        print()

    # remove 'Xem thêm'
    s = re.sub('Xem thêm', '', s)
    if show_stepbystep:
        print("remove 'Xem thêm':")
        print(s)
        print()

    # convert to lower case
    s = s.lower()
    if show_stepbystep:
        print("lowercase:")
        print(s)
        print()

    # abbreviate some names
    s = re.sub('kỹ thuật phần mềm', 'ktpm', s)
    s = re.sub('công nghệ phần mềm', 'cnpm', s)
    s = re.sub('khoa học máy tính', 'khmt', s)
    s = re.sub('hệ thống thông tin', 'httt', s)
    s = re.sub('kỹ thuật máy tính', 'ktmt', s)
    s = re.sub('thương mại điện tử', 'tmđt', s)
    s = re.sub('công nghệ thông tin', 'cntt', s)
    s = re.sub('an toàn thông tin', 'attt', s)
    s = re.sub('công tác sinh viên', 'ctsv', s)
    s = re.sub('ban học tập', 'bht', s)
    if show_stepbystep:
        print("abbreviate faculty name:")
        print(s)
        print()

    # remove urls and hashtags
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'#\S+', '', s)
    if show_stepbystep:
        print('remove urls and hashtags:')
        print(s)
        print()

    # remove email addresses
    s = re.sub(r'\S*@\S*\s?', '', s)
    if show_stepbystep:
        print('remove email addresses:')
        print(s)
        print()

    # split into words
    tokens = word_tokenize(s)
    if show_stepbystep:
        print('tokenize:')
        print(tokens)
        print()

    # remove punctuation and numbers
    words = [word for word in tokens if re.sub(r"\s+", "", word).isalpha()]
    if show_stepbystep:
        print('remove punctuation:')
        print(words)
        print()

    # remove consecutive duplicate characters
    words = [removeConsecutiveDuplicates(word) for word in words]
    if show_stepbystep:
        print('remove consecutive duplicates character:')
        print(words)
        print()

    # replace word abbreviations
    words = [replace_abbr(word) for word in words]
    if show_stepbystep:
        print('replace abbreviation:')
        print(words)
        print()

    # replace " " with "_"
    words = [re.sub(r"\s+", "_", word) for word in words]
    if show_stepbystep:
        print('replace space with "_" :')
        print(words)
        print()

    # remove single characters
    words = [word for word in words if len(word) > 1]
    if show_stepbystep:
        print('remove single character:')
        print(words)
        print()

    return ' '.join(words)
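# A hypothetical usage sketch for preprocessing() above. The input post is made
# up, and the helpers removeConsecutiveDuplicates / replace_abbr are assumed to
# be defined elsewhere in the same project, exactly as the function above assumes.
raw_post = u"Ban học tập tuyển CTV ngành công nghệ thông tin http://example.com #tuyendung"
print(preprocessing(raw_post, show_stepbystep=True))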
for sentences in file:
    corpus.append(transform_row(sentences))

for sentences in corpus:
    W = None
    Text = sentences
    ListSentence.append(Text)
    W = ViWordSegment()
    W.parseword()

words = list(set(words))
words.sort()
X = np.zeros([len(words), len(words)])

for sentences in corpus:
    # likewise, remove stopwords from each sentence
    tokens = []
    for word in word_tokenize(sentences):
        NewWord = word.replace('.', '').replace(',', '').strip()
        if NewWord != '':
            if not (NewWord in StopWordsInput):
                tokens.append(NewWord.lower())

data = []
for sentences in corpus:
    tokens = []
    for word in word_tokenize(sentences):
        NewWord = word.replace('.', '').replace(',', '').strip()
        if NewWord != '':
            if not (NewWord in StopWordsInput):
                tokens.append(NewWord.lower())
    data.append(tokens)
# HANDLE REQUEST_STATUS 1:
time = [year]
for i in numSeeker(sw_remover(standardize(datve['date'], 0))):
    time.append(i)
print(time)
for j in timeSeeker(sw_remover(standardize(datve['time'], 2))):
    time.append(j)

if len(time) == 6:
    d = datetime.datetime(int(time[0]), int(time[1]), int(time[2]),
                          int(time[3]), int(time[4]), int(time[5]))
    timestamp = d.replace(tzinfo=timezone.utc).timestamp()
    seats = numSeeker(sw_remover(standardize(datve['seats'], 2)))
    request01 = {
        "status": 0,
        "data": {
            "pickupAddress": placecorrect(word_tokenize(sw_remover(standardize(datve['pickupAddress'], 1)))),
            "takeoffAddress": placecorrect(word_tokenize(sw_remover(standardize(datve['takeoffAddress'], 1)))),
            "time": timestamp,
            "seats": seats[0]
        }
    }
    print(request01)
else:
    print("error")


### COMPLAINT-HANDLING FUNCTIONS ###
def abbriviateCorrect(mss):
    # handle abbreviations
    final = ''
def buil_new_model():
    with open('deploy/intents.json', encoding="utf-8") as json_data:
        intents = json.load(json_data)

    words = []
    classes = []
    documents = []

    fileName = "process/StopWords"
    file_Stop_word = open(fileName, "r", encoding="utf-8")
    stopWords = set()
    for line in file_Stop_word:
        line = line.strip("\n")
        stopWords.add(line)
    ignore_words = list(stopWords)

    for intent in intents['intents']:
        for question in intent['questions']:
            w = word_tokenize(question)
            words.extend(w)
            documents.append((w, intent['tag']))
            if intent['tag'] not in classes:
                classes.append(intent['tag'])

    words = [w.lower() for w in words if w not in ignore_words if len(w) != 1]
    words = sorted(list(set(words)))
    classes = sorted(list(set(classes)))

    pickle.dump(words, open('deploy/words.pkl', 'wb'))
    pickle.dump(classes, open('deploy/classes.pkl', 'wb'))
    pickle.dump(documents, open('deploy/documents.pkl', 'wb'))
    pickle.dump(ignore_words, open('deploy/ignore_words.pkl', 'wb'))

    dataset = []
    output = []
    output_empty = [0] * len(classes)
    for doc in documents:
        bag = []
        # print(doc)
        question_words = doc[0]
        question_words = [
            word.lower() for word in question_words
            if word not in ignore_words if len(word) != 1
        ]
        # print(question_words)
        for w in words:
            if w in question_words:
                bag.append(1)
            else:
                bag.append(0)
        output_row = list(output_empty)
        output_row[classes.index(doc[1])] = 1
        dataset.append([bag, output_row])
        # print(bag)

    random.shuffle(dataset)
    len_dataset = len(dataset)
    len_train = int(len_dataset * 0.75)
    training = dataset[0:len_train]
    testing = dataset[len_train:len_dataset]
    training = np.array(training)
    testing = np.array(testing)
    train_x = list(training[:, 0])
    test_x = list(testing[:, 0])
    train_y = list(training[:, 1])
    test_y = list(testing[:, 1])

    model = Sequential()
    model.add(Dense(128, input_shape=(len(train_x[0]), ), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(train_y[0]), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['acc'])

    history = model.fit(np.array(train_x),
                        np.array(train_y),
                        epochs=10000,
                        batch_size=64)

    model_path = "deploy/model_h3d.h5"
    model.save(model_path)

    # Evaluate the model on the test data using `evaluate`
    print("Evaluate on test data")
    results = model.evaluate(np.array(test_x), np.array(test_y), batch_size=64)
    print("test loss, test acc:", results)
des = doc["Description"] news_link = doc["NewspaperLink"] ### Kiểm tra bài báo đã được đẩy vào Elasticsearch ### Vì dữ liệu đẩy lên Elasticsearch được tokenize bởi standard tokenizer 1 lần nữa ### nên đoạn code dưới không còn chính xác với dữ liệu text. Nhưng dữ liệu số thì được # result_ck = es.search( # index="my-index", # body={"query": {"bool": {"must": {"term": {"NewspaperLink": news_link}}}}}, # ) # if result_ck["hits"]["total"]["value"] > 0: # continue ### Tokenize ### Tokenize title title_tokenized = underthesea.word_tokenize(title, format="text") ### Tokenize description des_tokenized = underthesea.word_tokenize(des, format="text").replace("\n", "") ### Tokenize content content_tokenized = [] for ct in content: if ct["type"] == "text": paragraph = ct["content"] sentences = underthesea.sent_tokenize(paragraph) paragraph_tokenized = "" for sentence in sentences: ### Tokenize sentence in paragraph sentence = underthesea.word_tokenize(sentence, format="text")
from underthesea import pos_tag
from underthesea import word_tokenize

sen = 'kjkhigyf yêu cầu phát triển và ứng dụng công nghệ thông tin trong sản xuất kinh doanh và quản lý hướng tới mục tiêu nâng cao toàn diện năng lực cạnh tranh quốc gia coi đây là con đường ngắn nhất để Việt Nam tiến kịp các nước phát triển tiến cùng thời đại'
seg = word_tokenize(sen)
tag = pos_tag(sen)
tag2 = [i[1] for i in tag]
print('done')
def auto_tags(self):
    self.tokens = word_tokenize(self.text)
def func_DataAnalysis(): func_pushcontent() file_path_file2 = os.path.join(script_dir, "file2.txt") file_path_pl_train = os.path.join(script_dir, "train_NLTK/pl_train.txt") file_path_checkwords = os.path.join(script_dir, "train_NLTK/checkwords_vn_train.txt") file_path_stopwords = os.path.join(script_dir, "train_NLTK/vn_stopwords.txt") # content post raw and luot bo cac ki tu ngoai chu va so f = codecs.open(file_path_file2, "r", "utf8") text_raw = f.read() # bien cac ki tu dac biet thanh " " text = re.sub(r"\W+|_", " ", text_raw) # print("*Text raw: "+text_raw) # print("\nNoi Dung Chinh: ") tokens = word_tokenize(text) # print(text) # -- # tim kiem ngon ngu lap trinh trong post programming_language = [] company_email = re.findall(r'\S+@\S+', text_raw) # strip loai bo cac ki tu phia ngoai cung link_post = ((re.findall(r'(https?://www.facebook.com/[^\s]+)', text_raw))[0]).strip(',"') print("link_post: " + link_post) job_position_check = [ "Senior", "Fresher", "Intern", "Junior", "Tester", "Dev", "Software Test Intern", "Software Test Fresher" ] company_syn = ["công ty", "cty"] job_position = [] f = open(file_path_pl_train, "r") p_languges_raw = f.read() p_languges = p_languges_raw.split() # niceword = word_tokenize(analy1) for i in company_syn: for j in range(len(tokens)): if (i.lower() == tokens[j].lower()): company_name = tokens[j + 1] break else: if company_email != None: try: company_name = ((( company_email[0].split('.'))[0]).split('@'))[1] except: company_name = tokens[0] else: company_name = tokens[0] if (i.lower() == tokens[j].lower()): break for i in tokens: for j in job_position_check: if (i.lower() == j.lower()): if (j not in job_position): job_position.append(j) for i in text.split(): for j in p_languges: if (i.lower() == j.lower()): if (j not in programming_language): programming_language.append(j) print("--Company name: " + str(company_name)) print("--Post nay nhac den cac nn lap trinh: " + str(programming_language)) print("--Gmail company: " + str(company_email)) print("--Link post: " + str(link_post)) print("--Vi tri can tuyen: " + str(job_position)) # print("--Desc: "+text) x = '' y = '' z = '' for i in programming_language: x = x + i + ", " programminglanguage = x.strip(", ") for i in list(set(company_email)): y = y + i + ", " companyemail = y.strip(", ") for i in job_position: z = z + i + ", " jobposition = z.strip(", ") # -- # print("ARR: "+str(tokens)) # day la noi dung file input da split # canh bao post co phai spam khong f = codecs.open(file_path_checkwords, "r", "utf8") text_check = f.read() Arr_check = word_tokenize(text_check) # print("ARR_check mang nay kiem tra day co phai post spam k ?: "+str(Arr_check)) alert = 1 for i in tokens: for j in Arr_check: if (i.lower() == j.lower()): alert = 0 # -- # luot nhung stopwords trong content cua post de thong ke sach hon f = codecs.open(file_path_stopwords, "r", "utf8") vnstopwords = f.read() vn_sw = vnstopwords.splitlines() clean_tokens = tokens[:] # bintrash = [] for token in tokens: if token in vn_sw: clean_tokens.remove(token) # bintrash.append(token) print("--Length clean_tokens : " + str(len(clean_tokens))) print("--Length tokens : " + str(len(tokens))) # print("\n--Show clean token : "+str(clean_tokens)) # print("\n--Show Bin : "+str(bintrash)) # thong ke so luong tu sau khi luot stopwords trong post # freq = nltk.FreqDist(clean_tokens) # ve bieu do sau khi luot stopwords trong post # freq.plot(20, cumulative=False) # freq = nltk.FreqDist(tokens) # thong ke so luong tu day du trong post # for key,val in freq.items(): 
# print(str(key) + ':' + str(val)) # -- # print("Data push: "+company_name+" "+companyemail+" "+programminglanguage+" "+jobposition+" "+link_post+" "+text+" "+str(alert)) database(company_name.capitalize(), companyemail, programminglanguage, jobposition, link_post, text.replace('description ', ''), alert) print("-------alert: " + str(alert)) if (alert != 0): print("WARNING!-Day co kha nang cao la post khong lien quan!\n") else: print("Khong co canh bao nao!") f.close()
if __name__ == "__main__": input = "./data/data_ver03.txt" output = "./data/data_offical.txt" content = None with open(input, "r") as f: content = f.read() content = normalize_content(content) content = change_teen_code(content) content = remove_emoji(content) content = remove_other_language(content) # word tokenize again. content = content.replace("_", " ").split("\n") result = [] for sent in content: words = word_tokenize(sent) words = [x.replace(" ", "_") for x in words if x != ""] result.append(" ".join(words)) result = "\n".join(result) with open(output, "w") as f: f.write(result)
def newfunc_DataAnalysis(): t1 = time.time() file_path = os.path.join(script_dir, "testnewmyfile20.txt") file_path_pl_train = os.path.join(script_dir, "train_NLTK/pl_train.txt") file_path_checkwords = os.path.join(script_dir, "train_NLTK/checkwords_vn_train.txt") f = codecs.open(file_path, "r", "utf8") txtjson = f.read() try: objjson = json.loads(txtjson) except: print("File json loi kiem tra lai!") quit() f.close() job_position_check = [ "Senior", "Fresher", "Intern", "Junior", "Tester", "Dev", "Software Test Intern", "Software Test Fresher" ] company_syn = ["công ty", "cty"] job_position = [] programming_language = [] count = 0 for i in range(len(objjson)): link_post = objjson[i]['post_url'] desc_insertdb = remove_emojis(objjson[i]['description']) desc_analysis = re.sub(r"\W+|_", " ", desc_insertdb) tokens = word_tokenize(desc_analysis) # print(tokens) f = open(file_path_pl_train, "r") p_languges_raw = f.read() p_languges = p_languges_raw.split() f.close() company_email = re.findall(r'\S+@\S+', desc_insertdb) for i in company_syn: for j in range(len(tokens)): if (i.lower() == tokens[j].lower()): company_name = tokens[j + 1] break else: if company_email != None: try: company_name = ((( company_email[0].split('.'))[0]).split('@'))[1] except: company_name = tokens[0] else: company_name = tokens[0] if (i.lower() == tokens[j].lower()): break for i in tokens: for j in job_position_check: if (i.lower() == j.lower()): if (j not in job_position): job_position.append(j) for i in desc_analysis.split(): for j in p_languges: if (i.lower() == j.lower()): if (j not in programming_language): programming_language.append(j) # chuyen mang thanh chuoi push vao db x = '' y = '' z = '' for i in programming_language: x = x + i + ", " programminglanguage = x.strip(", ") for i in list(set(company_email)): y = y + i + ", " companyemail = y.strip(", ") for i in job_position: z = z + i + ", " jobposition = z.strip(", ") # kiem tra co phai post spam khong f = codecs.open(file_path_checkwords, "r", "utf8") text_check = f.read() Arr_check = word_tokenize(text_check) alert = 1 for i in tokens: for j in Arr_check: if (i.lower() == j.lower()): alert = 0 # push vao db print("------" + str(count + 1) + "------") print("Company mail : " + companyemail + "| Company name : " + company_name.capitalize() + "| PL : " + programminglanguage + "| Job position : " + jobposition + "| Status: " + str(alert)) database(company_name.capitalize(), companyemail, programminglanguage, jobposition, link_post, desc_insertdb, alert) count = count + 1 noti = "Xu ly " + str( len(objjson)) + " post thanh cong " + str(count) + " post." print(noti) t2 = time.time() processingtime = "processing time: {:.3f}".format(t2 - t1) print(processingtime) history_path = os.path.join(script_dir, "history.txt") history = codecs.open(history_path, "a", "utf8") dt = datetime.datetime.now() history.write(noti + " || " + processingtime + " || " + dt.strftime("%x") + " " + dt.strftime("%X")) history.write("\n") history.close()
def receive_message(): global intention intention = 0 global loc_apply_flag if request.method == 'GET': """Before allowing people to message your bot, Facebook has implemented a verify token that confirms all requests that your bot receives came from Facebook.""" if (request.args.get('hub.verify_token') == VERIFY_TOKEN): return request.args.get('hub.challenge') #if the request was not get, it must be POST and we can just proceed with sending a message back to user else: # get whatever message a user sent the bot output = request.get_json() print('__check output variable:', output) messenger.handle(request.get_json(force=True)) inten_flow(intention) for event in output['entry']: messaging = event['messaging'] for message in messaging: if message.get('message'): #Facebook Messenger ID for user so we know where to send response back to if message['message'].get('text'): print('__check message variable:', message) if 'quick_reply' in message['message']: text = { 'text': message['message']['quick_reply']['payload'] } inten_flow( execute_flow( message['message']['quick_reply'] ['payload'], intention)) text['quick_replies'] = quick_replies.to_dict() messenger.send(text, 'RESPONSE') else: # print('message type: ', type(message['message'].get('text'))) # response_message = repOfficial(message['message'].get('text')) # text = {'text': response_message} # # # print("TEXT: ", text) # # text = {'text': 'A message Hi'} # # # text['quick_replies'] = quick_replies.to_dict() # # messenger.send('Bot say: {0}'.format(text['text']), 'RESPONSE') # # text = {'text': 'A Message'} # # text['quick_replies'] = quick_replies.to_dict() # messenger.send(text, 'RESPONSE') message_text = message['message'].get('text') list_of_out_of_work = [ 'cmnd', 'chứng minh', 'hộ khẩu', 'KT1', 'KT2', 'KT3' ] list_of_say_hello = [ 'hello', 'hi', 'chào', 'aloha', 'morning' ] message_check = word_tokenize(message_text) for i in range(len(message_check)): if message_check[i] in list_of_out_of_work: str_bot_rep = "Xin lỗi bạn, tôi chỉ có thể hỗ trợ bạn về vấn đề hộ chiếu, những thủ tục liên quan khác. \ Bạn xin chờ tính năng phát triển tiếp theo." text = {'text': str_bot_rep} messenger.send(text, 'RESPONSE') return '' elif message_check[i] in list_of_say_hello: str_bot_rep = "Chào bạn, tôi là Chatbot hỗ trợ bạn với những thủ tục cơ bản khi làm hộ chiếu lần đầu.\ Nếu bạn có thắc mắc gì về những việc cần làm khi làm hộ chiếu lần đầu thì cứ hỏi tôi." text = {'text': str_bot_rep} messenger.send(text, 'RESPONSE') return '' bot_rep = [] userIntent = ic_predict(message_text)[0] print("user's intent: ", userIntent) print('loc_apply_flag: ', loc_apply_flag) if loc_apply_flag == True: locQuantity, locApply = getNerName( 'Làm hộ chiếu ở ' + message_text) print('locApply: ', locApply) for i in range(len(locQuantity)): bot_rep.append(getLocApply(locApply[i])) loc_apply_flag = False else: if userIntent == "where_loc_apply": if isAnyLOC(message_text) == False: text = { 'text': 'Bạn đang ở tỉnh thành: ' } messenger.send(text, 'RESPONSE') loc_apply_flag = True return '' else: locQuantity, locApply = getNerName( message_text) print(locApply) if locApply == []: bot_rep = getLocApply(locApply) else: for i in range(len(locQuantity)): bot_rep.append( getLocApply(locApply[i])) else: bot_rep = normalRep(userIntent) str_bot_rep = '' for i in range(len(bot_rep)): str_bot_rep = str_bot_rep + bot_rep[i] text = { 'text': str_bot_rep } #We must add 'text' variable like this or it can not send to Messenger. 
print('text: ', text) messenger.send(text, 'RESPONSE') print('message sent') return ''
def wl_word_tokenize(main, text, lang, word_tokenizer='default', flat_tokens=True): tokens_multilevel = [] if lang not in main.settings_global['word_tokenizers']: lang = 'other' if word_tokenizer == 'default': word_tokenizer = main.settings_custom['word_tokenization'][ 'word_tokenizers'][lang] # Check initialization status of word (and sentence) tokenizers if flat_tokens: wl_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer) else: wl_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer) # NLTK if 'NLTK' in word_tokenizer: sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, text, lang) if word_tokenizer == main.tr('NLTK - NIST Tokenizer'): nist_tokenizer = nltk.tokenize.nist.NISTTokenizer() for sentence in sentences: tokens_multilevel.append(nist_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - NLTK Tokenizer'): nltk_tokenizer = nltk.NLTKWordTokenizer() for sentence in sentences: tokens_multilevel.append(nltk_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'): treebank_tokenizer = nltk.TreebankWordTokenizer() for sentence in sentences: tokens_multilevel.append(treebank_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'): toktok_tokenizer = nltk.ToktokTokenizer() for sentence in sentences: tokens_multilevel.append(toktok_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'): tweet_tokenizer = nltk.TweetTokenizer() for sentence in sentences: tokens_multilevel.append(tweet_tokenizer.tokenize(sentence)) # Sacremoses elif 'Sacremoses' in word_tokenizer: if flat_tokens: sentences = [text] else: sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, text, lang) moses_tokenizer = sacremoses.MosesTokenizer( lang=wl_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_multilevel.append( moses_tokenizer.tokenize(sentence, escape=False)) # spaCy elif 'spaCy' in word_tokenizer: nlp = main.__dict__[f'spacy_nlp_{lang}'] doc = nlp(text) # See Issue #3479: https://github.com/explosion/spaCy/issues/3479 doc.is_parsed = True if flat_tokens: tokens_multilevel.append([token.text for token in doc]) else: for sentence in doc.sents: tokens_multilevel.append( [token.text for token in sentence.as_doc()]) # syntok elif word_tokenizer == 'syntok - Word Tokenizer': syntok_tokenizer = syntok.tokenizer.Tokenizer() if flat_tokens: tokens_multilevel.append( [token.value for token in syntok_tokenizer.tokenize(text)]) else: for para in syntok.segmenter.analyze(text): for sentence in para: tokens_multilevel.append( [token.value for token in sentence]) # Chinese & Japanese elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer): if flat_tokens: sentences = [text] else: sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, text, lang=lang) # Chinese if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'): for sentence in sentences: tokens_multilevel.append(jieba.cut(sentence)) elif word_tokenizer == main.tr( 'Wordless - Chinese Character Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wl_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # English if wl_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wl_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( 
wl_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wl_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_multilevel.append(tokens) # Japanese elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'): import nagisa for sentence in sentences: tokens_multilevel.append(nagisa.tagging(str(sentence)).words) elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wl_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # Japanese Kana if wl_checking_unicode.is_kana(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wl_checking_unicode.is_kana( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='jpn')) non_han_start = i + j + 1 break # English elif wl_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wl_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wl_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_multilevel.append(tokens) # Russian elif word_tokenizer == 'razdel - Russian Word Tokenizer': if flat_tokens: sentences = [text] else: sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, text, lang='rus') for sentence in sentences: tokens_multilevel.append( [token.text for token in razdel.tokenize(sentence)]) # Thai elif 'PyThaiNLP' in word_tokenizer: # Preserve sentence boundaries sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang='tha') if word_tokenizer == main.tr('PyThaiNLP - Longest Matching'): for sentence in sentences: tokens_multilevel.append( pythainlp.word_tokenize(sentence, engine='longest')) elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching'): for sentence in sentences: tokens_multilevel.append( pythainlp.word_tokenize(sentence, engine='mm')) elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching + TCC'): for sentence in sentences: tokens_multilevel.append( pythainlp.word_tokenize(sentence, engine='newmm')) elif word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching + TCC (Safe Mode)'): for sentence in sentences: tokens_multilevel.append( pythainlp.word_tokenize(sentence, engine='newmm-safe')) # Tibetan elif 'botok' in word_tokenizer: if flat_tokens: sentences = [text] else: sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, text, lang='bod') for sentence in sentences: tokens_multilevel.append([ token.text for token in main.botok_word_tokenizer.tokenize(sentence) ]) # Vietnamese elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'): if flat_tokens: sentences = [text] else: sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, text, lang='vie', sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer' ) for sentence in sentences: 
tokens_multilevel.append(underthesea.word_tokenize(str(sentence))) # Remove empty tokens and strip whitespace for i, sentence in enumerate(tokens_multilevel): tokens_multilevel[i] = [ token.strip() for token in sentence if token.strip() ] # Record token boundaries if lang in ['zho_cn', 'zho_tw', 'jpn']: for sentence in tokens_multilevel: if sentence: sentence[-1] = wl_text.Wl_Token(sentence[-1], boundary='', sentence_ending=True) else: for sentence in tokens_multilevel: if sentence: sentence[-1] = wl_text.Wl_Token(sentence[-1], boundary=' ', sentence_ending=True) # Clause tokenization if not flat_tokens: for i, sentence in enumerate(tokens_multilevel): tokens_multilevel[i] = wl_sentence_tokenization.wl_clause_tokenize( main, sentence, lang) # Flatten tokens tokens_flat = list(wl_misc.flatten_list(tokens_multilevel)) if flat_tokens: return tokens_flat else: return tokens_multilevel
def evaluate_classify_model():
    test_df = pd.read_csv(
        'data/test-more-info.txt',
        sep='\t',
        header=None,
        names=['id', 'origin_q', 'compare_q', 'label', 'score_elastic'],
    )
    test_df['predict'] = 0.01

    doc2vec_model = Doc2Vec.load('gensim/model/question.d2v')
    classify_model = load_model('model/simple_classify_model.h5')

    for index, row in test_df.iterrows():
        origin_q = row['origin_q']
        compare_q = row['compare_q']
        origin_q_vector = doc2vec_model.infer_vector(
            simple_preprocess(word_tokenize(origin_q, format='text')))
        compare_q_vector = doc2vec_model.infer_vector(
            simple_preprocess(word_tokenize(compare_q, format='text')))
        concat_vector = np.concatenate((origin_q_vector, compare_q_vector))
        arr_wraper = np.array([concat_vector])
        test_df.at[index, 'predict'] = classify_model.predict(arr_wraper)[0][0]

    test = test_df.loc[test_df['id'] == 22972022]
    test_sort = test.sort_values(by='predict', ascending=False).reset_index(drop=True)

    id_queries = []
    for id_query in test_df['id']:
        if id_query not in id_queries:
            id_queries.append(id_query)

    mAP_df = pd.DataFrame(data=id_queries, columns=['id'])
    score_AP_model_alls = []
    score_AP_model_top10 = []
    score_AP_elastic_alls = []
    score_AP_elastic_top10 = []

    for id_query in mAP_df['id']:
        group_id = test_df.loc[test_df['id'] == id_query]

        # Calculate mAP of the model
        group_predict_sort = group_id.sort_values(
            by='predict', ascending=False).reset_index(drop=True)
        AP_model_all = convenion.caculate_AP(group_predict_sort['label'])
        AP_model_top10 = convenion.caculate_AP(group_predict_sort['label'][:10])
        score_AP_model_alls.append(AP_model_all)
        score_AP_model_top10.append(AP_model_top10)

        # Calculate mAP of Elasticsearch
        group_elastic_sort = group_id.sort_values(
            by='score_elastic', ascending=False).reset_index(drop=True)
        AP_elastic_all = convenion.caculate_AP(group_elastic_sort['label'])
        AP_elastic_top10 = convenion.caculate_AP(group_elastic_sort['label'][:10])
        score_AP_elastic_alls.append(AP_elastic_all)
        score_AP_elastic_top10.append(AP_elastic_top10)

    mAP_df['AP_model_all'] = score_AP_model_alls
    mAP_df['AP_model_top10'] = score_AP_model_top10
    mAP_df['AP_elastic_all'] = score_AP_elastic_alls
    mAP_df['AP_elastic_top10'] = score_AP_elastic_top10

    print('mAP elastic all: ', sum(score_AP_elastic_alls) / len(score_AP_elastic_alls))
    print('mAP model all: ', sum(score_AP_model_alls) / len(score_AP_model_alls))
    print('mAP elastic top10: ', sum(score_AP_elastic_top10) / len(score_AP_elastic_top10))
    print('mAP model top10: ', sum(score_AP_model_top10) / len(score_AP_model_top10))

    return mAP_df
# load data
file = open('vietnamese-stopwords.txt', 'r')
stopwords = file.readlines()

words = []
documents = []
classes = []

import json
with open('data.json') as json_data:
    intents = json.load(json_data)

for intent in intents['intents']:
    for pattern in intent['patterns']:
        w = word_tokenize(pattern)
        words.extend(w)
        print(intent['tag'])
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

words = [stemmer.stem(w.lower()) for w in words if w not in stopwords]
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))


def clean_data(data):
    sentences = word_tokenize(data)
    sentences = [
        stemmer.stem(w.lower()) for w in sentences if w not in stopwords
    ]
    return sentences
def search_tf_idf(category, tags_list):
    with open(TMP_PATH + 'Top_20_keyword_' + str(category) + '_tf_idf.csv',
              'w',
              encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=tags_list)
        writer.writeheader()

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        page = es.search(
            index='baomoi.com',
            doc_type='doc',
            scroll='2m',
            # size = 100,  # number of hits to return
            body={
                "query": {
                    "match_phrase": {
                        "categories": {
                            "query": category
                        }
                    }
                }
            })
        sid = page['_scroll_id']
        scroll_size_ = page['hits']['total']
        print(category)
        scroll_size = 1
        hits = page['hits']['hits']
        cnt = Counter()
        stop_word = [
            'bị', 'bởi', 'cả', 'các', 'cái', 'cần', 'càng', 'chỉ', 'chiếc',
            'cho', 'chứ', 'chưa', 'chuyện', 'có', 'có thể', 'cứ', 'của',
            'cùng', 'cũng', 'đã', 'đang', 'đây', 'để', 'đến', 'đến nỗi',
            'đều', 'điều', 'do', 'đó', 'được', 'dưới', 'gì', 'khi', 'không',
            'là', 'lại', 'lên', 'lúc', 'mà', 'mỗi', 'một', 'một cách', 'này',
            'năm', 'nên', 'nếu', 'ngay', 'nhiều', 'như', 'nhưng', 'những',
            'nơi', 'nữa', 'ở', 'phải', 'qua', 'ra', 'rằng', 'rằng', 'rất',
            'rất', 'rồi', 'sau', 'sẽ', 'so', 'sự', 'tại', 'theo', 'thì',
            'trên', 'trong', 'trước', 'từ', 'từng', 'và', 'vẫn', 'vào', 'vậy',
            'về', 'vì', 'việc', 'với', 'vừa', '!', '"', '#', '$', '%', '&',
            "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=',
            '>', '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~'
        ]
        word_list = list()
        while (scroll_size > 0):
            # print('Scrolling ...')
            page = es.scroll(scroll_id=sid, scroll='2m')
            for post in hits:
                post_ = post['_source']['summary']
                word_tok = word_tokenize(post_)
                word_fil = list(filter(lambda x: x not in stop_word, word_tok))
                cnt += Counter(word_fil)
                word_list.append(post_)
                # cal_idf(cnt, word_tok, scroll_size)
            # print(word_list)
            # print(cnt.most_common(10))
            # print(word_tok)
            hits = page['hits']['hits']
            # Update the scroll_id
            sid = page['_scroll_id']
            # Get the number of results
            scroll_size = len(page['hits']['hits'])

        # print(type(cnt))
        tf_dict = cal_tf(cnt, word_tok)
        print('############################')
        idf_dict = cal_idf(cnt, word_list, scroll_size_)
        # print(scroll_size_)
        tf_idf = {}
        for word, val in tf_dict.items():
            tf_idf[word] = val * idf_dict[word]
        sort = Counter(tf_idf)
        # print(sort.most_common(20))
        for item in sort.most_common(20):
            writer.writerow({
                category: str(item[0]),
                tags_list[1]: "{0:.2f}".format(item[1])
            })
def remove_stopword(text):
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stopwords)
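# A hypothetical usage sketch for remove_stopword() above: it relies on a
# module-level `stopwords` collection, so a tiny stand-in set is defined here
# purely for illustration.
from underthesea import word_tokenize

stopwords = {u"là", u"của", u"và"}
print(remove_stopword(u"Hà Nội là thủ đô của Việt Nam"))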
dt_file = [
    open('./saved_model/dt_model_group-12.pkl', 'rb'), 'Decision Tree'
]
nb_file = [
    open('./saved_model/nb_model_group-12.pkl', 'rb'), 'Naive Bayes'
]
rf_file = [
    open('./saved_model/rf_model_group-12.pkl', 'rb'), 'Random Forest'
]
svm_file = [
    open('./saved_model/svm_model_group-12.pkl', 'rb'), 'Support Vector Machine'
]
knn_file = [
    open('./saved_model/knn_model_group-12.pkl', 'rb'), 'K Nearest Neighbor'
]

for file in [svm_file, dt_file, rf_file, nb_file, knn_file]:
    name = file[1]
    model = pickle.load(file[0])
    tokenized_input_text = uts.word_tokenize(input_text, format="text")
    features = get_features(tokenized_input_text)  # type: dict
    feature_values = np.array(list(features.values()))
    feature_values = feature_values.reshape(1, -1)
    print(name, ': ', model.predict(feature_values))

# Get input feature
# print(dt_model.predict(input_text))
def vi2IPA_split(texts, delimit): content = [] with open(imp.find_module('viphoneme')[1] + "/Popular.txt", encoding="utf-8") as f: content = f.read().splitlines() tess = texts.split(".") Results = "" for text in tess: #print("------------------------------------------------------") TN = TTSnorm(text) #TN=text #print("------------------------------------------------------") #print("Text normalize: ",TN) TK = word_tokenize(TN) #print("Vietnamese Tokenize: ",TK) for iuv, under_valid in enumerate(TK): token_under = under_valid.split(" ") checkinvalid = 0 ##print(token_under) if len(token_under) > 1: for tok in token_under: if tok not in content or "[" in T2IPA(tok): checkinvalid = 1 if checkinvalid == 1: TK = TK[:iuv] + TK[iuv + 1:] for tok in reversed(token_under): TK.insert(iuv, tok) IPA = "" for tk in TK: ipa = T2IPA_split(tk, delimit).replace(" ", "_") if ipa == "": IPA += delimit + tk + delimit + " " elif ipa[0] == "[" and ipa[-1] == "]": eng = eng_to_ipa.convert(tk) if eng[-1] == "*": if tk.lower().upper() == tk: ##print("ENGLISH",tk) #Đọc tiếng anh từng chữ letter2sound = "" for char in tk: CHAR = str(char).lower() if CHAR in list(EN.keys()): letter2sound += EN[CHAR] + " " else: letter2sound += char + " " IPA += T2IPA_split(letter2sound, delimit) + " " else: #Giữ nguyên #Future: test experiment" Nếu từ unknow có thể dùng eng_norm để chuyển qua thay thế chứ không cần giữ nguyên như này IPA += Parsing("default", tk.lower(), delimit) + " " else: #This use for version english not splited by syllable #IPA+=Parsing("default",eng,delimit)+" " #This version will split english to each syllable IPA += normEng(tk, delimit) + delimit + " " #Check tu dien tieng anh Etrain bưc #Neu co Mapping #Neu khong, check co nguyen am #Neu co de nguyen #Neu khong danh van #print(" ..................Out of domain word: " ,ipa) else: IPA += ipa + " " IPA = re.sub(delimit + '+', delimit, IPA) IPA = re.sub(' +', ' ', IPA) #print("IPA Vietnamese: ",IPA) #print("------------------------------------------------------") Results += IPA.rstrip() + " " + delimit + "." + delimit + " " return Results.rstrip()
def word_tokenizer(document):
    document = word_tokenize(document, format="text")
    return document
def tachtu(sentence):
    # "tách từ" = word segmentation
    sentence = clean_text(sentence)
    sentence = give_emoji_free_text(sentence)
    sentence = word_tokenize(sentence)
    return sentence
def clear_unknown_letter(text):
    text = strip_non_alphanum(text)
    text = word_tokenize(text)
    return process_lower(text)
def standardized_sentence(sentence):
    sentence = word_tokenize(sentence.lower(), format="text")
    return sentence
def test_decomposed_from(self):
    text = u"yếu"
    actual = word_tokenize(text)
    expected = [u'yếu']
    self.assertEqual(actual, expected)
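# A minimal sketch of what "decomposed" means in the test above: the same word
# written in NFD (decomposed) Unicode form should still tokenize to the result
# the test expects. This is an illustration, not part of the original test suite.
import unicodedata
from underthesea import word_tokenize

decomposed = unicodedata.normalize("NFD", u"yếu")
print(word_tokenize(decomposed))  # the test above expects the composed form [u'yếu'] back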