def review_to_words(review, filename):
    """
    Convert a raw review to a string of meaningful words.
    :param review: raw review text
    :param filename: path to the stop-word list file
    :return: meaningful_words
    """
    # 1. Split into individual words
    # words = review.lower().split()
    words = review.split()
    # 2. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    with open(filename, "r") as f3:
        dict_data = f3.read()
    stopwords = set(dict_data.splitlines())
    # 3. Remove stop words
    meaningful_words = [w for w in words if w not in stopwords]
    # 4. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)
def predict_ex(mes):
    vectorizer = load_model('model/vectorizer.pkl')
    uni_big = load_model('model/uni_big.pkl')
    if uni_big is None:
        training1()
        uni_big = load_model('model/uni_big.pkl')
    print "---------------------------"
    print "Predicting"
    print "---------------------------"
    t0 = time.time()
    mes = unicode(mes, encoding='utf-8')
    test_message = ViTokenizer.tokenize(mes).encode('utf8')
    test_message = clean_str_vn(test_message)
    test_message = list_words(test_message)
    clean_test_reviews = [test_message]
    d2 = {"message": clean_test_reviews}
    test2 = pd.DataFrame(d2)
    test_text2 = test2["message"].values.astype('str')
    test_data_features = vectorizer.transform(test_text2)
    test_data_features = test_data_features.toarray()
    # print test_data_features
    s = uni_big.predict(test_data_features)[0]
    return s
def predict_ex(mes):
    print mes
    vectorizer = load_model('model_balance/vectorizer_tfidf12.pkl')
    clf = load_model('model_balance/tfidf12.pkl')
    clf2 = load_model('model_balance/tfidf_fine12.pkl')
    if clf is None or clf2 is None:
        training1()
        clf = load_model('model_balance/tfidf12.pkl')
        clf2 = load_model('model_balance/tfidf_fine12.pkl')
    mes = unicodedata.normalize("NFC", mes.strip())
    mes = clean_str_vn(mes)
    test_message = ViTokenizer.tokenize(mes).encode('utf8')
    test_message = clean_str_vn(test_message)
    test_message = list_words(test_message)
    clean_test_reviews = [test_message]
    d2 = {"message": clean_test_reviews}
    test2 = pd.DataFrame(d2)
    test_text2 = test2["message"].values.astype('str')
    test_data_features = vectorizer.transform(test_text2)
    test_data_features = test_data_features.toarray()
    # print test_data_features
    s = clf.predict(test_data_features)[0]
    s2 = clf2.predict(test_data_features)[0]
    return s + " " + s2
def predict_ex(mes):
    vectorizer = load_model('model/vectorizer.pkl')
    if vectorizer is None:
        vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.7,
                                     min_df=2, max_features=1000)
    clf = load_model('model/clf.pkl')
    if clf is None:
        training()
        clf = load_model('model/clf.pkl')
    mes = unicode(mes, encoding='utf-8')
    test_message = ViTokenizer.tokenize(mes).encode('utf8')
    test_message = clean_str_vn(test_message)
    test_message = review_to_words(test_message)
    clean_test_reviews = [test_message]
    d2 = {"message": clean_test_reviews}
    test2 = pd.DataFrame(d2)
    test_text2 = test2["message"].values.astype('str')
    test_data_features = vectorizer.transform(test_text2)
    test_data_features = test_data_features.toarray()
    # print test_data_features
    s = clf.predict(test_data_features)
    s2 = np.array(s)
    s3 = str(s2[0])
    return s3
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.

    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # Mostly European languages
        words = sent.split()

    return words
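# Illustrative note (not from the original source): for Vietnamese input the
# function above simply defers to pyvi's ViTokenizer, which joins the syllables
# of multi-syllable words with underscores. A minimal sketch, assuming lcode is
# set to 'vi' and pyvi is installed (newer pyvi versions may instead use
# `from pyvi import ViTokenizer`):
from pyvi.pyvi import ViTokenizer

lcode = 'vi'  # module-level language code read by word_segment
print(word_segment(u"Trường đại học Bách Khoa Hà Nội"))
# roughly: [u'Trường', u'đại_học', u'Bách_Khoa', u'Hà_Nội']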
def load_text(doc):
    dataset = {'target_names': [], 'data': [], 'target': []}
    content = doc.lower()
    rx = re.compile(r"[^\W\d_]+", re.UNICODE)
    content = " ".join(rx.findall(content))
    dataset['data'].append(ViTokenizer.tokenize(content))
    return dataset
def my_tokenize(posts_str):
    posts = posts_str.split('(^-^)')
    tokens = []
    for post in posts:
        post = utils.icons(post)
        tokens.extend(ViTokenizer.tokenize(post).split(' '))
    return tokens
def word_segment(root):
    directory = 'seg/%s' % get_container_folder(root)
    ut.create_folder(directory)
    files = [
        f for f in os.listdir(root)
        if os.path.isfile('%s/%s' % (root, f))
    ]
    total = len(files)
    for index, f in enumerate(files):
        path = '%s/%s' % (root, f)
        content = ut.load_file(path)
        if len(content) >= 3:
            title = content[0].replace('\n', '')
            par = content[2].replace('\n', '')
            title = ViTokenizer.tokenize(unicode(title, 'UTF-8'))
            par = ViTokenizer.tokenize(unicode(par, 'UTF-8'))
            ut.save_file_utf8('%s/%s' % (directory, f), title + '\n' + par)
        ut.update_progress((index + 1) * 1.0 / total)
def tokenize(filename, outname, delimiter="\t"):
    with open(filename, "r", encoding="utf-8") as source, \
            open(outname, "w", encoding="utf-8") as target:
        for i, line in enumerate(source):
            print(i)
            tokens = line.strip().split(delimiter)
            for j in [1, 3, 7, 8]:
                tokens[j] = ViTokenizer.tokenize(tokens[j])
            target.write(delimiter.join(tokens) + "\n")
def vitokenizer(input_text):
    input_text = unicoded(input_text)
    input_text = ViTokenizer.tokenize(input_text)
    # Turn the result into a usable format to pass to Vectorizer(tokenizer=)
    input_text = input_text.split()
    input_text = [x.replace('_', ' ') for x in input_text]
    return input_text
def load_data(filename, dict):
    res = []
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f, open(dict, "w") as f2:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = review_to_words(question, 'datavn/question_stopwords.txt')
            # question = review_add_pos(question, 'datavn/question_stopwords.txt')
            col1.append(label1)
            col2.append(label2)
            col3.append(question)
        ngram = ngrams_array(col3, 2)  # dictionary of words and their occurrence counts
        dict_arr = []  # list of words with frequency < 1
        for x in ngram:
            p = ngram.get(x)
            if p < 1:
                dict_arr.append(x)
                f2.write(x + "\n")
    col4 = []
    for q in col3:
        r1 = []
        r2 = []
        q = review_to_words2(q, dict, 2)  # q is one sentence
        q1 = [' '.join(x) for x in ngrams(q, 1)]  # q1: list of 1-grams
        s1 = ViPosTagger.postagging(
            ViTokenizer.tokenize(unicode(q, encoding='utf-8')))  # POS tagging
        for i1, i2 in zip(s1[0], s1[1]):
            t1 = i1 + "_" + i2
            t1 = t1.encode('utf-8')
            r1.append(t1)
        s2 = ' '.join(i for i in s1[1])  # POS tags of the sentence as a string, e.g. "N V E N"
        q2 = [' '.join(x) for x in ngrams(q, 2)]  # q2: list of word 2-grams
        s22 = [' '.join(x) for x in ngrams(s2, 2)]  # s22: list of POS-tag 2-grams
        q3 = (' '.join(x.replace(' ', '_') for x in q2)).split()
        s3 = (' '.join(x.replace(' ', '_') for x in s22)).split()
        for i1, i2 in zip(q3, s3):
            t2 = i1 + "_" + i2
            r2.append(t2)
        y = r1 + r2
        # z1 = [' '.join(x) for x in y]
        z = ' '.join(y)
        col4.append(z)
        # col4.append(q)
    d = {"label1": col1, "label2": col2, "question": col4}
    train = pd.DataFrame(d)
    return train
def get_entity(sentence):
    entity_list = [line.rstrip('\n') for line in open('data/entity.dat')]
    sentence_words = ViTokenizer.tokenize(sentence).split(' ')
    entity = [
        stemmer.stem(word.lower()) for word in sentence_words
        if word in entity_list
    ]
    return entity
def process_chunk(chunk):
    print "Process chunk of size " + str(len(chunk))
    ret = []
    for line in chunk:
        line = line.strip()
        if len(line) > 0:
            out = ViTokenizer.tokenize(line)
            ret.append(out)
    return ret
def token_data(raw_texts, max_sent_length):
    output_texts = []
    len_texts = []
    token_texts = sent_tokenize(raw_texts)
    for text in token_texts:
        token_text = ViTokenizer.tokenize(text).split()
        len_text = len(token_text)
        len_texts.append(len_text)
        output_texts += [token_text[i:i + max_sent_length]
                         for i in xrange(0, len(token_text), max_sent_length)]
    return output_texts, len_texts
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()

    def fit(self, *_):
        return self

    def transform(self, X, y=None, **fit_params):
        result = X.apply(lambda text: self.tokenizer.tokenize(text))
        return result
def clean_up_sentence(sentence):
    ignore_words = ['?', '!', ',', '.', 'xin_lỗi', 'và', 'ạ']
    sentence_words = w = ViTokenizer.tokenize(sentence).split(' ')
    sentence_words = [
        stemmer.stem(word.lower()) for word in sentence_words
        if word not in ignore_words
    ]
    sentence_words = ngrams(w, 4, [])
    return sentence_words
def review_add_pos(review, filename):
    words = review.split()
    with open(filename, "r") as f3:
        dict_data = f3.read()
    array = dict_data.splitlines()
    meaningful_words = [w for w in words if w not in array]
    b = " ".join(meaningful_words)  # sentence after stop-word removal
    words_list = b.split()
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(b, encoding='utf-8')))  # POS tagging
    a = tup[1]
    c = words_list + a
    return " ".join(c)
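# Illustrative note (not from the original source): ViPosTagger.postagging
# returns a pair of parallel lists, (tokens, tags), which is why tup[1] above
# is the POS-tag sequence that gets appended to the word list. A minimal
# sketch, assuming pyvi is installed:
from pyvi.pyvi import ViTokenizer, ViPosTagger

tokens, tags = ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học bách khoa hà nội"))
# tokens is roughly [u'Trường', u'đại_học', u'bách_khoa', u'hà_nội']
# tags is the parallel list of POS labels, e.g. ['N', 'N', 'N', 'N']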
def load_dataset(folder):
    dataset = {'target_names': [], 'data': [], 'target': []}
    print('loading dataset')
    for root, dirs, files in os.walk(folder, topdown=False):
        position = 0
        for name in dirs:
            subdir = os.path.join(root, name)
            dataset['target_names'].append(name)
            filesPath = get_filepaths(subdir)
            for filePath in filesPath:
                with io.open(filePath, mode="r", encoding="UTF8") as file:
                    content = file.read().lower()
                    rx = re.compile(r"[^\W\d_]+", re.UNICODE)
                    content = " ".join(rx.findall(content))
                    dataset['data'].append(ViTokenizer.tokenize(content))
                    dataset['target'].append(position)
            position += 1
    return dataset
def clean_doc(question):
    question = regex_email(question)
    question = regex_phone_number(question)
    question = regex_link(question)
    if type(question) != unicode:
        question = unicode(question, encoding='utf-8')
    question = accent(question)
    # question = tokenizer.predict(question)  # automatically appends a '.' to the end of the sentence
    question = ViTokenizer.tokenize(question)
    print question
    rm_junk_mark = re.compile(ur'[?,\.\n]')
    normalize_special_mark = re.compile(
        ur'(?P<special_mark>[\.,\(\)\[\]\{\};!?:“”\"\'/])')
    question = normalize_special_mark.sub(u' \g<special_mark> ', question)
    question = rm_junk_mark.sub(u'', question)
    question = re.sub(' +', ' ', question)  # remove multiple spaces in a string
    return question
def nb():
    if request.method == 'GET':
        return render_template('index.html')
    else:
        try:
            document = request.form['document']
            document = ViTokenizer.tokenize(document)
            if document.strip() == '':
                return render_template('index.html',
                                       message='Please enter your document.')
            print(document)
            message = LABELS[nb_model.detect_one(document)]
            print(message)
            return render_template('index.html', message=message,
                                   document=document)
        except Exception as e:
            traceback.print_exc()
            return render_template(
                'index.html',
                message='Check error. See log file for detail.',
                document=document)
def ner_crf(question):
    text = ViPosTagger.postagging(ViTokenizer.tokenize(question))
    detect = []
    ar = []
    for i in range(len(text[0])):
        l = []
        l.append(text[0][i])
        l.append(text[1][i])
        ar.append(tuple(l))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for i in range(len(detect[0])):
        k = detect[0][i][0]
        v = y_detect[0][i]
        kv = []
        kv.append(k)
        kv.append(v)
        pred.append(tuple(kv))
    return pred
def main(argv):
    input_file = 'input.txt'
    output_file = 'output.txt'
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print 'test.py -i <inputfile> -o <outputfile>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            input_file = arg
        elif opt in ("-o", "--ofile"):
            output_file = arg
    if not exists(input_file):
        print "Cannot open", input_file
        return
    content = open(input_file, "r").read()
    content = content.decode("utf-8")
    output = ViTokenizer.tokenize(content)
    output = output.encode("utf-8")
    open(output_file, "w").write(output)
def alt(array):
    files = []
    tfidf = []
    wordDict = []
    newA = []
    tf = []
    q = []
    z = []
    u_neg = []
    u_pos = []
    u_test = []
    count_neg = 0
    count_pos = 0

    # task 1
    path_neg = '/home/rindem/Desktop/bag_of_word_auth/training/negative'
    obj1 = open(path_neg, "r")
    str1 = obj1.read()
    files_neg = str1.split("\n\n")
    # print len(files_neg)
    obj1.close()

    path_pos = '/home/rindem/Desktop/bag_of_word_auth/training/positive'
    obj2 = open(path_pos, "r")
    str2 = obj2.read()
    files_pos = str2.split("\n\n")
    # print len(files_pos)
    obj2.close()

    files.append(array)
    files.extend(files_neg)
    files.extend(files_pos)
    # print len(files)

    for value in range(len(files)):
        decode = files[value].decode('utf-8')
        tmp = ViTokenizer.tokenize(decode)
        split = tmp.split(" ")
        newA.append(split)  # 2-D list holding the tokenized documents

    union = set.union(*(set(value) for value in newA))
    for val in range(len(files)):
        wordDict.append(dict.fromkeys(union, 0))
    for num in range(len(newA)):
        for word in newA[num]:
            wordDict[num][word] += 1

    # tf
    for val in range(len(wordDict)):
        tfBow = computeTF(wordDict[val], newA[val])
        tf.append(tfBow)

    # idf
    idfs = computeIDF(wordDict)

    # tfidf
    for val in tf:
        tfidfBow = computeTFIDF(val, idfs)
        tfidf.append(tfidfBow)

    x_neg = dict.fromkeys(tfidf[0].keys(), 0)
    x_pos = dict.fromkeys(tfidf[0].keys(), 0)
    x_test = tfidf[0]
    longNum = len(newA)
    for num in range(1, ((longNum - 1) / 2) + 1):
        for word in newA[num]:
            x_neg[word] += tfidf[num][word]
    for num in range(((longNum - 1) / 2) + 1, longNum):
        for word in newA[num]:
            x_pos[word] += tfidf[num][word]

    for word, val in x_neg.items():
        u_neg.append(x_neg[word])
    for word, val in x_pos.items():
        u_pos.append(x_pos[word])
    for word, val in x_test.items():
        u_test.append(x_test[word])

    # print "\n"
    # print "Compare test vs neg: ", space(u_test, u_neg)
    tmp = space(u_test, u_neg)
    # print "Compare test vs pos", space(u_test, u_pos)
    # print "\n"
    temp = space(u_test, u_pos)
    if compare(tmp, temp) == tmp:
        return 1
    else:
        return 2
for row in csvReader:
    count = count + 1
    if (count % 2) == 1:
        utf = unicode(row[0], "utf-8")
        allstr = allstr + utf
        arr.append(utf)
    else:
        st = row[0].strip('\n')
        st = st.strip('\r')
        st = st.strip('\n')
        labels.append(st)

# Create diction
allstr = allstr.replace(",", "").replace(".", "")
allstr = ViTokenizer.tokenize(allstr)
allstr = allstr.lower()
diction = allstr.split()
diction = list(set(diction))

# Write diction to file
write(diction, "diction.file")
diction = read("diction.file")
print(len(diction))

data = []

# Predict
pre = pre.replace(",", "").replace(".", "")
pre = ViTokenizer.tokenize(pre)
def segmentation(self):
    return ViTokenizer.tokenize(self.text)
def tokenize(comment):
    text_token = ViTokenizer.tokenize(comment)
    return text_token
from __future__ import unicode_literals
from __future__ import print_function
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from pyvi.pyvi import ViTokenizer, ViPosTagger
import sys

# train raw data
dt1 = u"Hà Nội Phở Cháo Lòng Hà Nội Cháo Trai"
dt2 = u"Hà Nội Bún Chả Phở Ô Mai Lẩu Ếch"
dt3 = u"Phở Bánh Giò Ô Mai"
dt4 = u"Sài Gòn Hủ Tiếu Bánh Bò Phở Bún Nem"
dt5 = u"Hà Nội Hà Nội Bún Chả Hủ Tiếu Nem Gián Cơm Gà Phở"

# VNTokenizer
dt1 = ViTokenizer.tokenize(dt1)
dt2 = ViTokenizer.tokenize(dt2)
dt3 = ViTokenizer.tokenize(dt3)
dt4 = ViTokenizer.tokenize(dt4)
dt5 = ViTokenizer.tokenize(dt5)
print(isinstance(dt1, (str, unicode)))

# Dictionary
arr1 = dt1.split()
arr2 = dt2.split()
arr3 = dt3.split()
arr4 = dt4.split()
arr5 = dt5.split()
arr = arr1 + arr2 + arr3 + arr4
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf


f = open('test/dia chi.txt', 'r')
str1 = f.read()
z = str1.decode('utf-8')

q = open('test/ten.txt', 'r')
str2 = q.read()
print type(str2)
t = str2.decode('utf-8')
print len(t)

x = ViTokenizer.tokenize(z)
x2 = ViTokenizer.tokenize(t)
# x = ViPosTagger.postagging(ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))
y = x.split(" ")
y2 = x2.split(" ")

aList = []
bList = []
for index in range(len(y)):
    tmp = y[index]
    aList.append(tmp)
for index in range(len(aList)):
    print aList[index]
wordDict = []
newA = []
tf = []

path = 'training/negative'
obj2 = open(path, "r")
str1 = obj2.read()
files = str1.split("\n\n")
obj2.close()

for val in files:
    print "//////"
    print val
    print "/////"

for value in range(len(files)):
    decode = files[value].decode('utf-8')
    tmp = ViTokenizer.tokenize(decode)
    split = tmp.split(" ")
    newA.append(split)  # 2-D list holding the tokenized documents

union = set.union(*(set(value) for value in newA))
for val in range(len(files)):
    wordDict.append(dict.fromkeys(union, 0))
for num in range(len(newA)):
    for word in newA[num]:
        wordDict[num][word] += 1

# tf
for val in range(len(wordDict)):
    tfBow = computeTF(wordDict[val], newA[val])
def tokenize_text(sentence: str, format=None):
    tokenized_text = ViTokenizer.tokenize(sentence)
    if format == 'list':
        return [re.sub('_', ' ', w) for w in tokenized_text.split()]
    else:
        return tokenized_text
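# Illustrative usage sketch (not from the original source; the example sentence
# and outputs are approximate): pyvi joins the syllables of multi-syllable
# words with underscores, and format='list' maps them back to spaces.
print(tokenize_text("Trường đại học Bách Khoa Hà Nội"))
# roughly: 'Trường đại_học Bách_Khoa Hà_Nội'

print(tokenize_text("Trường đại học Bách Khoa Hà Nội", format='list'))
# roughly: ['Trường', 'đại học', 'Bách Khoa', 'Hà Nội']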