import re

import jieba
import numpy as np

import text_regularization


def cut(sentence):
    """Segment a sentence into cleaned tokens, replacing digit runs and
    alphanumeric runs with placeholder tokens such as INTEGER_<len>."""
    old_numeric_chars = ["壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    simple_numeric_chars = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
    old_numeric_char_set = set(old_numeric_chars)
    simple_numeric_char_set = set(simple_numeric_chars)
    r_symbols = '[`~!@#$%^&+*()=|{}\':;,\t\n\\[\\]『』「」<>/?《》~!@#¥%……&*()|{}【】‘;:”“’。,、?]'
    r_float = r"-?(\d+)?\.\d+"
    r_alnum = r"^[a-z]+[0-9]+$"

    ## basic replacement
    sentence = text_regularization.extractWords(sentence)
    ## domain replacement
    sentence = sentence.replace('+', '加')
    ## symbol replacement
    sentence = re.sub(r_symbols, ' ', sentence.strip())
    ## word segmentation
    words = jieba.lcut(sentence, cut_all=False)

    ## word filter
    # Note: is_chinese_words is a helper defined elsewhere in this project.
    clean_words = []
    for w in words:
        if w in ('', ' '):
            continue
        if w.isnumeric():  # integer (Arabic or Chinese numerals)
            old_numeric_ratio = np.sum(
                [1 for c in w if c in old_numeric_char_set]) / len(w)
            simple_numeric_ratio = np.sum(
                [1 for c in w if c in simple_numeric_char_set]) / len(w)
            if old_numeric_ratio == 1.0 or simple_numeric_ratio == 1.0:
                clean_words.append('INTEGER_CN_%s' % len(w))
            else:
                clean_words.append('INTEGER_%s' % len(w))
        elif re.match(r_float, w) is not None:  # float
            clean_words.append('FLOAT')
        elif w.isalpha() and not is_chinese_words(w):  # alpha
            clean_words.append(w.lower())
        elif is_chinese_words(w):  # chinese words
            clean_words.append(w)
        elif re.match(r_alnum, w) is not None:  # alpha + num
            if w.lower().startswith('qq'):
                clean_words.append('qq')
                clean_words.append('INTEGER_%s' % (len(w) - 2))
            elif w.lower().startswith('tel'):
                clean_words.append('tel')
                clean_words.append('INTEGER_%s' % (len(w) - 3))
            else:
                clean_words.append('ALNUM_%s' % len(w))
        elif w in ('-', '_'):
            clean_words.append(w)
    return clean_words
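# Minimal usage sketch (not in the original; assumes jieba and the project's
# text_regularization module are importable and is_chinese_words is defined
# as in the project). The made-up message below should yield placeholder
# tokens such as 'qq', 'INTEGER_8' and 'INTEGER_11'.
print(cut("加我qq12345678 电话13812345678 详情咨询"))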
# encoding: UTF-8
import jieba

import text_regularization as tr

filenames = ["ad.txt", "not_ad.txt"]
output = open("fastText_test.txt", "a", encoding="utf-8")
for filename in filenames:
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            # normalize the raw text, then segment it with jieba
            text = tr.extractWords(line)
            word_list = " ".join(jieba.cut(text))
            # one training line per message: segmented text plus a
            # __label__ tag derived from the file name (ad / not_ad)
            output.write(
                word_list.replace("\n", " ") + "\t__label__" + filename[:-4] + "\n")
output.flush()
output.close()
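# Sketch of how the generated file could be consumed (not part of the original
# script; assumes the `fasttext` Python package is installed). fastText treats
# any token prefixed with __label__ on a line as that line's class label, so
# the file written above can be fed to supervised training directly.
import fasttext

model = fasttext.train_supervised(input="fastText_test.txt")
print(model.predict("免费 领取 优惠券"))  # returns (labels, probabilities)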
""" import numpy as np import tensorlayer as tl import sys sys.path.append("../serving/packages") from text_regularization import extractWords wv = tl.files.load_npy_to_any(name='./output/model_word2vec_200.npy') for label in ["pass", "spam"]: embeddings = [] inp = "data/msglog/msg" + label + ".log.seg" outp = "output/sample_" + label f = open(inp, encoding='utf-8') for line in f: line = extractWords(line) words = line.strip().split(' ') text_embedding = np.zeros(200) for word in words: try: text_embedding += wv[word] except KeyError: text_embedding += wv['UNK'] embeddings.append(text_embedding) embeddings = np.asarray(embeddings, dtype=np.float32) if label == "spam": labels = np.zeros(embeddings.shape[0]) elif label == "pass": labels = np.ones(embeddings.shape[0])
def cut_1(sentence):
    """Segment a sentence with jieba after basic text normalization, without
    the token filtering performed by cut()."""
    sentence = text_regularization.extractWords(sentence)
    return jieba.lcut(sentence, cut_all=False)