def add_data(path='./data'):
    list_json = os.listdir(path)
    for file_name in list_json:
        paths = os.path.join(path, file_name)
        with open(paths) as json_file:
            data = json.load(json_file)
        data = list(data)
        for field in data:
            field["content"] = ViTokenizer.tokenize(field["content"]) if field['content'] else 'nothing'
            field["title"] = ViTokenizer.tokenize(field["title"]) if field['title'] else 'nothing'
            field["description"] = ViTokenizer.tokenize(field["description"]) if field['description'] else 'nothing'
            field["topic"] = ViTokenizer.tokenize(field["topic"]) if field['topic'] else 'nothing'
            field["author"] = field["author"].strip().replace(' ', '_') if (field['author'] and field['author'].strip()) else 'unknown'
            field['publish_date'] = field['publish_date'] if field['publish_date'] else 'unknown'
        solr.add(data)
    return jsonify("OK")
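# A minimal setup sketch for add_data above (not from the original source): it
# only shows the module-level names the function relies on -- os, json,
# ViTokenizer, a pysolr client bound to `solr`, and Flask's jsonify. The Solr
# core URL is hypothetical.
import os
import json

import pysolr
from flask import Flask, jsonify
from pyvi import ViTokenizer

app = Flask(__name__)
solr = pysolr.Solr('http://localhost:8983/solr/articles', timeout=10)  # hypothetical core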
def tokenizer(self, item):
    # tokenizer
    title_token = self.removeStopwords(
        re.sub(r"(\d+(\,*\.*\d+)+)", "_NUMBER", ViTokenizer.tokenize(item['title'])))
    description_token = self.removeStopwords(
        re.sub(r"(\d+(\,*\.*\d+)+)", "_NUMBER", ViTokenizer.tokenize(item['description'])))
    # word count
    item['title_wc'] = {word: title_token.count(word) for word in title_token}
    item['description_wc'] = {word: description_token.count(word) for word in description_token}
    # remove col
    self.list_key = ['title_wc', 'square', 'price', 'description_wc']
    for key in list(item.keys()):
        if key not in self.list_key:
            del item[key]
    return item
def make_w2vec_matrix(question, paragraph, model=word2vec):
    train_question = preprocess_sentence(question)
    train_answers = preprocess_sentence(paragraph)
    tokens_question = ViTokenizer.tokenize(train_question).split()
    tokens_answer = ViTokenizer.tokenize(train_answers).split()
    question_embs = []
    answer_embs = []
    for i in range(len(tokens_question)):
        if tokens_question[i] in model:
            question_embs.append(model[tokens_question[i]])
        else:
            question_embs.append(model['unknown'])
    for i in range(len(tokens_answer)):
        if tokens_answer[i] in model:
            answer_embs.append(model[tokens_answer[i]])
        else:
            answer_embs.append(model['unknown'])
    question_embs = np.array(question_embs)
    answer_embs = np.array(answer_embs)
    """
    if question_embs.shape[0] < MIN_LENGTH_QUESTION:
        question_embs = np.pad(question_embs, ((4, 4), (0, 0)))
    """
    if answer_embs.shape[0] < MIN_LENGTH_ANSWER:
        paddings = np.ceil(MIN_LENGTH_ANSWER / answer_embs.shape[0])
        d = np.copy(answer_embs)
        for i in range(int(paddings)):
            answer_embs = np.concatenate((answer_embs, d))
    return question_embs, answer_embs
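# A hedged usage sketch for make_w2vec_matrix (not from the original source):
# it assumes `word2vec` is a gensim KeyedVectors model that contains an
# 'unknown' entry, that MIN_LENGTH_ANSWER is a module-level constant, and that
# preprocess_sentence() is the project's own cleaning helper. The path and the
# value below are hypothetical.
from gensim.models import KeyedVectors

MIN_LENGTH_ANSWER = 16
word2vec = KeyedVectors.load_word2vec_format('baomoi.model.bin', binary=True)

q_embs, a_embs = make_w2vec_matrix(
    "Thủ đô của Việt Nam là gì?",
    "Hà Nội là thủ đô của Việt Nam.",
    model=word2vec,
)
# q_embs: one row per question token; a_embs: padded by repetition to at least
# MIN_LENGTH_ANSWER rows when the answer is shorter than that.
print(q_embs.shape, a_embs.shape)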
def predict_articles(articles):
    # Preprocess each article before vectorizing; the cleaned text must be
    # assigned back (gensim/pyvi return new strings, they do not modify in place).
    processed = []
    for article in articles:
        article = ' '.join(gensim.utils.simple_preprocess(article))
        article = ViTokenizer.tokenize(article)
        processed.append(article)
    content_data_tfidf = tfidf_vector.transform(processed)
    prediction = trained_model.predict(content_data_tfidf)
    return prediction
def __get_keywords_from_text(text):
    tokens = ViTokenizer.tokenize(text)
    tokens = ViTokenizer.spacy_tokenize(tokens)[0]
    tokens = list(filter(lambda x: len(x) > 1, tokens))
    counter_tokens = Counter(tokens)
    counter_tokens = dict(counter_tokens)
    counter_tokens = dict(sorted(counter_tokens.items(), key=lambda x: -x[1]))
    return counter_tokens
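# Example call for __get_keywords_from_text (illustrative only; the leading
# double underscore suggests it is a private module helper in the original
# code, and Counter is assumed to be imported from collections there).
counts = __get_keywords_from_text("Hà Nội là thủ đô của Việt Nam. Hà Nội rất đẹp.")
print(counts)  # e.g. {'Hà_Nội': 2, 'thủ_đô': 1, ...}, sorted by descending frequency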
def searchDongNghia(sentence):
    content = ViTokenizer.tokenize(sentence)
    list_word = content.split()
    stopwords = []
    f = open('vietnamese-stopwords.txt', 'r')
    for line in f:
        line = line.rstrip()
        # print(line)
        line = line.replace(' ', '_')
        stopwords.append(line)
    f.close()
    words = []
    for word in list_word:
        word = word.lower()
        if word not in stopwords:
            words.append(word)
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'model/baomoi.model.bin', binary=True)
    N = 3
    list_dongnghia = []
    for word in words:
        # For each word, take its N most similar words (synonyms) from the model.
        dongnghia = model.most_similar(positive=[word], topn=N)
        for i in range(0, N):
            list_dongnghia.append(dongnghia[i][0].replace('_', ' '))
    # print(list_dongnghia)
    results = []
    for word in list_dongnghia:
        # Query Solr for each synonym found above.
        key = "description : " + "\"" + word + "\""
        result = solr.search(key)
        results.append(result)
    return results
def convert_st_to_bow(self, st):
    bow = [0] * len(self.words)
    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))
    if not (len(tagger[1]) == 1 and tagger[1][0] == 'Np' and tagger[0][0] not in SKIP_WORDS):
        tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st.lower()))
    for i, j in enumerate(tagger[1]):
        if j in REPLACE:
            tagger[0][i] = REPLACE[tagger[1][i]]
        if tagger[0][i] in self.words:
            bow[self.words.index(tagger[0][i])] = tagger[0].count(tagger[0][i])
    return np.array(bow)
def makeSummary(sentences, n, scores):
    sentences = sorted(sentences, key=lambda x: x.getScore(), reverse=True)
    # Start the summary with the highest-scoring sentence, then keep adding
    # sentences until the token budget n is reached.
    i = 0
    summary = [sentences[i]]
    length_summary = len(
        ViTokenizer.tokenize(sentences[i].getOriginalWords().strip()).split())
    while length_summary < n:
        i += 1
        summary += [sentences[i]]
        length_summary += len(
            ViTokenizer.tokenize(sentences[i].getOriginalWords().strip()).split())
    return summary
def segment_tree(tree):
    for par in tree.findall(".//paragraph"):
        par_text = ""
        par_original_text = ""
        char_attrib_in_par = []
        char_in_par = []
        count_line = 0
        for lines in par:
            count_line += 1
            t = ''
            for c in lines:
                if (c.text is None) or (c.text == "\n"):
                    # t += r" "
                    c.text = " "
                if c.text == '':
                    continue
                t += c.text
                par_original_text += c.text
                char_attrib_in_par.append(c)
            par_text = par_text + " " + t[:-1].strip()
        if len(par) > 1:
            par_text = ViTokenizer.tokenize(par_text).replace("_", " ").replace("-", "").replace("+", "")
            list_sentences = segmenter.segment_long(par_text.strip(), n_window=10)
        else:
            list_sentences = [par_text.strip()]
        search_idx = 0
        for i in range(len(list_sentences)):
            sentence = etree.Element("sentence")
            list_word = ViTokenizer.tokenize(list_sentences[i]).split()
            lookup = link_coord(par_original_text, list_word, search_idx)
            if lookup:
                j = 0
                while j < len(lookup):
                    word = etree.Element("word")
                    start_word_idx = lookup[j]
                    end_word_idx = lookup[j + 1]
                    for idx in range(start_word_idx, end_word_idx + 1):
                        word.append(char_attrib_in_par[idx])
                    sentence.append(word)
                    j += 2
                # print(s_t)
                search_idx = lookup[-1] + 1
            par.append(sentence)
    for layout in tree.findall(".//textline"):
        layout.getparent().remove(layout)
    return tree
def PreprocessingData(i):
    # i = i.strip(SPECIAL_CHARACTER)
    my_words = i.split(" ")
    for word in i:
        if word in SPECIAL_CHARACTER:
            # print(word)
            i = i.replace(word, "")
            i = i.replace("  ", " ")
    # print(i)
    for word in my_words:
        if len(word) > 20:
            # print(word)
            i = i.replace(word, "")
            i = i.replace("  ", " ")
    # print(i)
    i = ViTokenizer.tokenize(i)
    my_words = i.split(" ")
    # print(i)
    for word in my_words:
        # print(word)
        if word in STOP_WORDS:
            print(word)
            i = i.replace(word, "")
            i = i.replace("  ", " ")
    # print(i)
    i = i.lower()
    # print(i)
    return i
def transform(self, X, y=None, **fit_params):
    result = [ViTokenizer.tokenize(text.lower()) for text in X]
    return [
        " ".join([
            token for token in text.split()
            if token not in self.stopwords
        ])
        for text in result
    ]
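# A minimal sketch of how a transformer exposing the transform() above might be
# wired into a scikit-learn pipeline. The class name ViTokenizerTransformer and
# its stopwords argument are assumptions, not taken from the original source.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from pyvi import ViTokenizer


class ViTokenizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords=()):
        self.stopwords = set(stopwords)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None, **fit_params):
        result = [ViTokenizer.tokenize(text.lower()) for text in X]
        return [
            " ".join(tok for tok in text.split() if tok not in self.stopwords)
            for text in result
        ]


pipeline = Pipeline([
    ("vi_tokenize", ViTokenizerTransformer(stopwords=["là", "và"])),
    ("tfidf", TfidfVectorizer()),
])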
def handle(self):
    newData = []
    for v in self.data:
        t = v[self.content].lower()
        if self.html:
            t = BeautifulSoup(t, 'html.parser').get_text()
        # Collapse repeated characters (normalize elongated syllables)
        t = re.sub(r'(\D)\1+', r'\1', t)
        # Word segmentation
        t = ViTokenizer.tokenize(t)
        if self.accented_char:
            t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
        if self.special_char:
            t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
        if self.stopwords:
            t = [word for word in t if word not in self.list_stopword]
        v[self.content] = t
        if v not in newData:
            newData.append(v)
    print(np.array(newData))
def text_postag(text):
    pos_tag = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    dict_tag = {}
    for i in range(len(pos_tag[0])):
        dict_tag[pos_tag[0][i]] = pos_tag[1][i]
    return dict_tag
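# Example call for text_postag. ViPosTagger.postagging returns a pair of
# parallel lists (tokens, tags); note that mapping them into a dict keeps only
# the last tag of a token that occurs more than once.
from pyvi import ViTokenizer, ViPosTagger

print(text_postag("Hà Nội là thủ đô của Việt Nam"))
# e.g. {'Hà_Nội': 'Np', 'là': 'V', 'thủ_đô': 'N', 'của': 'E', 'Việt_Nam': 'Np'}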
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.

    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # Mostly European languages
        words = sent.split()
    return words
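# The Vietnamese branch alone, as a standalone check. In the original script
# `lcode` is a module-level language setting; here it is simply set to 'vi'
# for illustration.
from pyvi import ViTokenizer

lcode = 'vi'
print(word_segment("Tôi yêu Việt Nam"))  # e.g. ['Tôi', 'yêu', 'Việt_Nam']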
def tokenize(self):
    dem = 1
    len1, len2 = (0, 0)
    start = time()
    for folder, subfolder, file in os.walk(self.pathToTxt):
        if len(file) == 0:
            continue
        else:
            for fi in file:
                new_path = os.path.join(folder, fi)
                with open(new_path, 'r') as f:
                    content = f.read()
                content = self.removeCommas(content)
                len1 += len(content)
                pos = ViTokenizer.tokenize(content)
                new_text = self.removeStopWord(pos)
                new_text = ' '.join(new_text)
                len2 += len(new_text)
                path_to_save = 'dataset' + str(dem) + '.txt'
                path_to_dataset = os.path.join(self.dataset_path, path_to_save)
                with open(path_to_dataset, 'w+') as f:
                    f.write(new_text)
                dem += 1
    end_time = time() - start
    print('Done in {}s, with {}% change'.format(end_time, (len2 / len1) * 100))
def get_data(folder_path, mode=None):
    type_data = folder_path.split('/')[-1].split('_')[0].lower()
    if mode is None:
        X = []
        y = []
        dirs = os.listdir(folder_path)
        print(dirs)
        for path in dirs:
            file_paths = os.listdir(os.path.join(folder_path, path))
            for file_path in tqdm(file_paths, desc=path):
                with open(os.path.join(folder_path, path, file_path), 'r', encoding='utf-16') as f:
                    lines = f.readlines()
                lines = ' '.join(lines)
                lines = ViTokenizer.tokenize(lines)
                lines = gensim.utils.simple_preprocess(lines)  # remove symbols
                lines = ' '.join(lines)
                X.append(lines)
                y.append(path)
    elif mode == 'from_file':
        with open('./data/X_' + type_data + '.pkl', 'rb') as f:
            X = pickle.load(f)
        with open('./data/y_' + type_data + '.pkl', 'rb') as f:
            y = pickle.load(f)
    return X, y
def clean_text(text, stopwords, acronyms):
    t = text.lower()
    t = ' '.join(t.split())
    t = BeautifulSoup(t, 'html.parser').get_text()
    # Collapse repeated characters (normalize elongated syllables)
    t = re.sub(r'(\D)\1+', r'\1', t)
    for key in acronyms:
        for value in acronyms[key]:
            if value in t:
                t = t.replace(value, key)
    # Remove diacritics (disabled below)
    # Word segmentation
    t = ViTokenizer.tokenize(t)
    # t = unicodedata2.normalize('NFD', t).encode(
    #     'ascii', 'ignore').decode("utf-8")
    t = [x.strip(settings.SPECIAL_CHARACTER) for x in t.split()]
    t = [word for word in t if word not in stopwords]
    return " ".join(t)
def clean_text(text, stopwords, acronyms):
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    t = text.lower()
    t = BeautifulSoup(t, 'html.parser').get_text()
    t = " ".join([x.strip(settings.SPECIAL_CHARACTER) for x in t.split()])
    print(t)
    # Collapse repeated characters (normalize elongated syllables)
    t = re.sub(r'(\D)\1+', r'\1', t)
    t = " " + t + " "
    for key in acronyms:
        for value in acronyms[key]:
            v = ' ' + value + ' '
            if v in t:
                t = t.replace(v, ' ' + key + ' ')
    # Word segmentation
    t = ViTokenizer.tokenize(t)
    t = [word for word in t.split() if word not in stopwords]
    return " ".join(t)
def pyvi_prc(text):
    tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    result = {}
    for i in range(len(tokens)):
        tokens[i] = tokens[i].replace('_', ' ')
        result[tokens[i]] = tags[i]
    return result
def load_and_clean_data(doc):
    paths = glob.glob("./DataRaw/" + doc + "/*.txt")
    data = []
    for path in paths:
        with open(path, encoding="utf-8") as file:
            text = file.read()
            text_lower = text.lower()
            text_token = ViTokenizer.tokenize(text_lower)
            data.append(text_token)

    stop_words = []
    with open("./Stopword/vietnamese-stopwords.txt", encoding="utf-8") as f:
        text = f.read()
        for word in text.split():
            stop_words.append(word)

    punc_ = list(punctuation)
    stop_word = stop_words + punc_
    sentences = []
    for d in data:
        sent = []
        for word in d.split(" "):
            if word not in stop_word:
                if "_" in word or word.isalpha() is True:
                    sent.append(word)
        sentences.append(" ".join(sent))
    return sentences
def main():
    keyword = input("Enter a search keyword: ")
    keyword_format = "%{}%".format(keyword)
    keyword_tokennize = tach_tu.tokenize(keyword)
    print("Keyword: ", keyword_tokennize)
    bow = str(keyword_tokennize).split(' ')
    print("Search bag of words:", bow)
    word_dict = creat_word_count_dict(keyword_format)
    print("Occurrence counts:", word_dict)
    # Compute TF
    tf = compute_TF(word_dict, bow)
    print("TF result:", tf)
    # Compute IDF
    idf = compute_IDF(word_dict)
    print("IDF result:", idf)
    # Finally, compute TF-IDF: simply multiply the TF and IDF results above.
    tf_idf = compute_TFIDF(tf, idf)
    for key, value in tf_idf.items():
        if key == keyword_tokennize:
            print(key, ":", value)
    # print(sorted(tf_idf.values(), reverse=True))
    print(tf_idf)
    # Plot a chart
    df = pd.DataFrame([tf_idf])
def res_sentence(self, test_sentence):
    test_sentence = ViTokenizer.tokenize(test_sentence)
    test_sentence, pos = ViPosTagger.postagging(test_sentence)
    new_words, pos = self.process(test_sentence, pos)
    X_test = self.sent2features(new_words, pos)
    new_tags = self.crf.predict_single(X_test)
    st1, st2 = [], []
    for i in range(len(new_words)):
        if new_tags[i] == 'O':
            if new_tags[i - 1] != 'O':
                st1.append(new_words[i])
                st2.append('O')
                print(i)
                continue
            else:
                if i == 0:
                    st1.append(new_words[i])
                    st2.append('O')
                else:
                    st1[-1] = st1[-1] + '_' + new_words[i]
        elif new_tags[i][0] == 'B':
            tag = "" + new_tags[i][2:]
            st1.append(new_words[i])
            st2.append(tag.upper())
        elif new_tags[i][0] == 'I':
            st1[-1] = st1[-1] + '_' + new_words[i]
    return st1, st2
def markdown_to_text(markdown_string, parser="html.parser",
                     tags=['pre', 'code', 'a', 'img', 'i']):
    """ Converts a markdown string to plaintext
        https://stackoverflow.com/questions/18453176
    """
    import mistune  # noqa

    # md -> html -> text since BeautifulSoup can extract text cleanly
    markdown = mistune.Markdown()
    html = markdown(markdown_string)
    soup = BeautifulSoup(html, parser)

    # remove code snippets
    text = preprocessing_tags(soup, tags)
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_punctuation(text)
    text = text.replace('\n', ' ')
    text = remove_numeric(text)
    text = remove_multiple_space(text)
    text = text.lower().strip()
    text = ViTokenizer.tokenize(text)
    text = remove_stopwords(text, stopwords=stopwords)
    return text
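# Illustrative call only: preprocessing_tags, remove_links_content,
# remove_emails, remove_punctuation, remove_numeric, remove_multiple_space,
# remove_stopwords and the stopwords list are helpers defined elsewhere in the
# original project, so this sketch just shows the expected input/output shape.
md = "# Tiêu đề\n\nXem thêm tại https://example.com, giá 100 USD."
clean = markdown_to_text(md)
print(clean)  # a lowercased, word-segmented string with links, numbers and stopwords removed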
def segment(tree):
    final_list = []
    for par in tree.findall(".//paragraph"):
        p = ""
        count_line = 0
        for line in par:
            count_line += 1
            t = ''
            for c in line:
                if c.text is None:
                    t += r"!0"
                else:
                    t += c.text
            p = p + " " + t[:-1].strip()
        if len(par) > 1:
            p = ViTokenizer.tokenize(p).replace("_", " ").replace("-", "").replace("+", "")
            list_sentences = segmenter.segment_long(p.strip(), n_window=10)
        else:
            list_sentences = [p.strip()]
        for i in range(len(list_sentences)):
            if len(list_sentences[i]) > 0 and list_sentences[i].strip()[-1] != '.':
                list_sentences[i] = list_sentences[i] + " ."
        final_list.extend(list_sentences)
        # for sent in list_sentences:
        #     print(sent + "\n")
        # print("----------------------------------------------------------------------------------------")
    return final_list
def main():
    loaded_model = pickle.load(
        open('finalized_model_no_pos_chunk_name_process.pkl', 'rb'))
    result = {}
    text = input('Enter some text: \n\n')
    tokenized = ViTokenizer.tokenize(text)
    raw_text = parse_raw_input(tokenized)
    word_featured = [get_features(s) for s in raw_text]
    preds = loaded_model.predict(word_featured)
    temp_sent_list = tokenized.split('.')
    sent_list = []
    for i in range(len(temp_sent_list)):
        if len(temp_sent_list[i]) > 0:
            sent_list.append(temp_sent_list[i].strip())
    print("\n\nResult : \n")
    for i in range(len(sent_list)):
        result = []
        current_sent = sent_list[i]
        current_tag = preds[i]
        tokens = current_sent.split(' ')
        if len(current_tag) > len(tokens):
            tokens.append('.')
        if len(current_sent) > 0:
            for j in range(len(tokens)):
                result.append([tokens[j], current_tag[j]])
        print(str(i) + " : ", end=" ")
        for part in result:
            if part[1] == "O":
                print(part[0], end=" ")
            else:
                # print("<" + part[1] + ">" + part[0] + "</" + part[1] + ">", end=" ")
                print(part[0] + "/" + part[1], end=" ")
        print("\n")
def buildSummary(self, sentences, n):
    sentences = sorted(sentences, key=lambda x: x.getLexRankScore(), reverse=True)
    # Start the summary with the highest-ranked sentence, then keep adding
    # sentences until the token budget n is reached.
    i = 0
    summary = [sentences[i]]
    length_summary = len(
        ViTokenizer.tokenize(sentences[i].getOGwords().strip()).split())
    while length_summary < n:
        i += 1
        summary += [sentences[i]]
        length_summary += len(
            ViTokenizer.tokenize(sentences[i].getOGwords().strip()).split())
    return summary
def BigClassifier(contents):
    input_ = []
    contents = gensim.utils.simple_preprocess(contents)
    contents = ' '.join(contents)
    contents = ViTokenizer.tokenize(contents)
    contents = contents.split()
    result = [word for word in contents if word.lower() not in stop_word]
    contents = ' '.join(result)
    input_.append(contents)
    X_data.append(input_[0])
    tfidf_vect = TfidfVectorizer(analyzer='word', max_features=7000, max_df=0.8, min_df=1)
    tfidf_vect.fit(X_data)
    X_data_tfidf = tfidf_vect.transform(X_data)
    X_test_tfidf = X_data_tfidf[-1]
    X_data_tfidf = X_data_tfidf[0:6000]
    feature = tfidf_vect.get_feature_names()
    encoder = preprocessing.LabelEncoder()
    y_data_n = encoder.fit_transform(y_data)
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(X_data_tfidf, y_data_n)
    test_predictions = classifier.predict(X_test_tfidf)[0]
    return categorize[test_predictions]
def clean_text(text, stopwords, acronyms):
    t = text.lower()
    t = ' '.join(t.split())
    t = BeautifulSoup(t, 'html.parser').get_text()
    # Collapse repeated characters (normalize elongated syllables)
    t = re.sub(r'(\D)\1+', r'\1', t)
    t = " ".join([x.strip(settings.SPECIAL_CHARACTER) for x in t.split()])
    t = ' ' + t + ' '
    for key in acronyms:
        for value in acronyms[key]:
            if value in t:
                t = t.replace(value, key)
    # Word segmentation
    t = ViTokenizer.tokenize(t)
    # Split before filtering stopwords; iterating the string directly would
    # filter characters rather than words.
    t = [word for word in t.split() if word not in stopwords]
    return " ".join(t)
def exec(post, content):
    try:
        if post.source_info.name == 'V' and not post.has_summary:
            content = content.lower().strip()
            sentences = nltk.sent_tokenize(content)
            vocab = w2v.wv.vocab
            X = []
            for sentence in sentences:
                sentence = ViTokenizer.tokenize(sentence)
                words = sentence.split(" ")
                sentence_vec = np.zeros((100))
                for word in words:
                    if word in vocab:
                        sentence_vec += w2v.wv[word]
                X.append(sentence_vec)
            n_clusters = post.sentences_of_summary
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans = kmeans.fit(X)
            avg = []
            for j in range(n_clusters):
                idx = np.where(kmeans.labels_ == j)[0]
                avg.append(np.mean(idx))
            closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
            ordering = sorted(range(n_clusters), key=lambda k: avg[k])
            summary = ' '.join([sentences[closest[idx]] for idx in ordering])
            post.summary = summary
            post.has_summary = True
            post.save()
    except:
        post.has_summary = True
        post.save()
        return 'fail post ' + post.id
def segmentation(self, topic):
    # use collocation first
    temp1 = topic.lower()
    for collo in self._collocation:
        if collo in temp1:
            temp1 = temp1.replace(collo, collo.replace(' ', '_'))
    return ViTokenizer.tokenize(temp1)
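# Illustrative only: the class that owns segmentation() is not shown in the
# original snippet, so this stand-in just supplies a hypothetical _collocation
# list and calls the function above with an explicit instance.
class _DemoSegmenter:
    def __init__(self):
        self._collocation = ['trí tuệ nhân tạo']  # hypothetical multi-word term

print(segmentation(_DemoSegmenter(), "Ứng dụng Trí tuệ nhân tạo trong y tế"))
# the collocation is joined with '_' before ViTokenizer.tokenize runs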
def make_doc(self, text):
    if self.Defaults.use_pyvi:
        try:
            from pyvi import ViTokenizer
        except ImportError:
            msg = ("Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                   "or install it https://pypi.python.org/pypi/pyvi")
            raise ImportError(msg)
        words, spaces = ViTokenizer.spacy_tokenize(text)
        return Doc(self.vocab, words=words, spaces=spaces)
    else:
        words = []
        spaces = []
        for token in self.tokenizer(text):
            words.extend(list(token.text))
            spaces.extend([False] * len(token.text))
            spaces[-1] = bool(token.whitespace_)
        return Doc(self.vocab, words=words, spaces=spaces)
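# ViTokenizer.spacy_tokenize returns parallel lists of words and trailing-space
# flags, the shape spaCy's Doc constructor expects. A quick standalone check
# (output shown as an example, not a guaranteed segmentation):
from pyvi import ViTokenizer

words, spaces = ViTokenizer.spacy_tokenize("Hà Nội là thủ đô của Việt Nam")
print(words)   # e.g. ['Hà Nội', 'là', 'thủ đô', 'của', 'Việt Nam'] -- compounds keep inner spaces
print(spaces)  # list of booleans, one per word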