import snowballstemmer


def StaticalWordFrequently():
    """Count word frequencies in a text file after Snowball stemming."""
    WordFrequently = dict()
    file_object = open("E:\\text.txt", 'r', encoding='UTF-8')
    content = ""
    for line in file_object:
        content += line
    Words = content.split()
    print(f"Total tokens: {len(Words)}")
    Stemmer = snowballstemmer.EnglishStemmer()
    '''
    Before stemming: total tokens 17005207, distinct words 202003
    After stemming:  total tokens 17005207, distinct words 253854
    '''
    for w in Words:
        word = Stemmer.stemWord(word=w)  # word = w  (use this instead to skip stemming)
        if word in WordFrequently:
            WordFrequently[word] += 1
        else:
            WordFrequently[word] = 1
    print(f"Distinct words: {len(WordFrequently)}")
    file_object.close()
    return WordFrequently
import string

import snowballstemmer


def StaticalWordFrequently(train_data):
    """Count word frequencies in a training file after lowercasing, punctuation removal and stemming."""
    WordFrequently = dict()
    file_object = open(train_data, 'r', encoding='UTF-8')
    content = ""
    for line in file_object:
        content += line
    # strip punctuation, lowercase, then split into tokens
    without_punctuation = content.maketrans('', '', string.punctuation)
    cleaned = content.translate(without_punctuation)
    Words = cleaned.lower().split()
    print(f"Total tokens: {len(Words)}")
    Stemmer = snowballstemmer.EnglishStemmer()
    '''
    Before stemming: total tokens 17005207, distinct words 202003
    After stemming:  total tokens 17005207, distinct words 253854
    '''
    for w in Words:
        word = Stemmer.stemWord(word=w)  # word = w  (use this instead to skip stemming)
        if word in WordFrequently:
            WordFrequently[word] += 1
        else:
            WordFrequently[word] = 1
    print(f"Distinct words: {len(WordFrequently)}")
    file_object.close()
    return WordFrequently
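# A minimal usage sketch (not part of the original snippet): "corpus.txt" is an
# assumed sample path, and the top-10 printout is purely illustrative.
if __name__ == "__main__":
    freq = StaticalWordFrequently("corpus.txt")
    top10 = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for stem, count in top10:
        print(stem, count)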
def test_load_dump_and_search_with_stemming(self):
    dump = self.oktavia.dump()
    oktavia = Oktavia()
    oktavia.set_stemmer(snowballstemmer.EnglishStemmer())
    oktavia.load(dump)
    results = oktavia.raw_search(u'baby', stemming=True)
    self.assertEqual(1, len(results))
def setUp(self):
    self.oktavia = Oktavia()
    self.oktavia.set_stemmer(snowballstemmer.EnglishStemmer())
    self.section = self.oktavia.add_section(u'document')
    self.oktavia.add_word(u"stemming baby", stemming=True)
    self.section.set_tail(u"doc1")
    self.oktavia.add_word(u"stemmed babies", stemming=True)
    self.section.set_tail(u"doc2")
    self.oktavia.build()
import snowballstemmer
from nltk.corpus import stopwords
from string import ascii_lowercase


def stemit():
    """Build a stopword set that contains both the raw and the stemmed stopwords."""
    stemmer = snowballstemmer.EnglishStemmer()
    stop = stopwords.words('english')
    stop.extend([
        'may', 'also', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'ten', 'across', 'among', 'beside',
        'however', 'yet', 'within'
    ] + list(ascii_lowercase))
    stoplist = set(stemmer.stemWords(stop))
    stop = set(sorted(stop + list(stoplist)))
    return stop
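# A minimal usage sketch (assumes the NLTK stopword corpus has already been
# downloaded, e.g. via nltk.download('stopwords')).
stop = stemit()
print(len(stop))          # combined count of raw and stemmed stopwords
print('however' in stop)  # the unstemmed words remain in the set as well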
def stemming(self, text_without_punctuation):
    """Stem the text, using a Russian or English Snowball stemmer depending on the language."""
    if self.lang == 2:
        stemmer = snowballstemmer.RussianStemmer()
    else:
        stemmer = snowballstemmer.EnglishStemmer()
    self.stemmed_text = []
    for sentence in text_without_punctuation:
        self.stemmed_text.append(" ".join(
            [stemmer.stemWord(i) for i in sentence.split()]))
    self.remove_stop_words()
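# A standalone sketch (not part of the class above) of the same language
# switch: lang == 2 selects the Russian stemmer, anything else English.
import snowballstemmer


def stem_sentences(sentences, lang=1):
    stemmer = snowballstemmer.RussianStemmer() if lang == 2 else snowballstemmer.EnglishStemmer()
    return [" ".join(stemmer.stemWord(w) for w in s.split()) for s in sentences]


print(stem_sentences(["stemming stemmed stems"]))      # e.g. ['stem stem stem']
print(stem_sentences(["говорили говорила"], lang=2))   # Russian stems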
import pandas as pd
import gensim, os, re, pymongo, itertools, nltk, snowballstemmer
from pymongo import MongoClient
from nltk.corpus import stopwords
from string import ascii_lowercase

# set the location where we'll save our model
savefolder = '/data'

# grab data from database and convert to pandas dataframe
client = MongoClient()
db = client.target_database          # access target database
collection = db.target_collection    # access target collection within the target database
data = pd.DataFrame(list(collection.find()))
# each row is one document; the raw text of the document should be in the 'text_data' column

# initialize stemmer
stemmer = snowballstemmer.EnglishStemmer()

# grab stopword list, extend it a bit, and then turn it into a set for later
stop = stopwords.words('english')
stop.extend([
    'may', 'also', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'nine', 'ten', 'across', 'among', 'beside', 'however',
    'yet', 'within'
] + list(ascii_lowercase))
stoplist = set(stemmer.stemWords(stop))
stop = set(sorted(stop + list(stoplist)))

# remove characters and stoplist words, then generate dictionary of unique words
data['text_data'].replace(
    '[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',
import re
import string

import snowballstemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# print(required_data.sample(5))

# Pre-processing steps
#   Encode labels
#   Text cleaning
#   Tokenizer
label_encoder = LabelEncoder()
required_data['target'] = label_encoder.fit_transform(
    required_data['airline_sentiment'])

text_cleaner = TextCleaner()
required_data['clean_text'] = text_cleaner.transform(required_data['text'])

# segment text into tokens (words)
tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
stemmer = snowballstemmer.EnglishStemmer()  # words are reduced to their root form


def tokenize(s):
    # put spaces around punctuation, split on whitespace, then stem each token
    tokens = tok.sub(r' \1 ', s).split()
    return stemmer.stemWords(tokens)


# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    required_data['clean_text'].values,
    required_data['target'].values,
    test_size=0.25,
    random_state=0)
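# A quick illustrative check of the tokenizer above (the sentence is made up):
print(tokenize("loved the flight, hated the delays!"))
# punctuation is split off and every token is stemmed, e.g. 'hated' -> 'hate'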
import snowballstemmer


def stemming_user_description(row):
    """Stem the user description with the Snowball stemmer matching the user's language."""
    lang = row["user lang"]
    stemmers = {
        "ca": snowballstemmer.SpanishStemmer,
        "eu": snowballstemmer.SpanishStemmer,
        "da": snowballstemmer.DanishStemmer,
        "nl": snowballstemmer.DutchStemmer,
        "en": snowballstemmer.EnglishStemmer,
        "fi": snowballstemmer.EnglishStemmer,
        "fu": snowballstemmer.FinnishStemmer,
        "fr": snowballstemmer.FrenchStemmer,
        "de": snowballstemmer.GermanStemmer,
        "hu": snowballstemmer.HungarianStemmer,
        "it": snowballstemmer.ItalianStemmer,
        "no": snowballstemmer.NorwegianStemmer,
        "pt": snowballstemmer.PortugueseStemmer,
        "ro": snowballstemmer.RomanianStemmer,
        "ru": snowballstemmer.RussianStemmer,
        "es": snowballstemmer.SpanishStemmer,
        # "sv": snowballstemmer.SwedishStemmer,
        "tr": snowballstemmer.TurkishStemmer,
    }
    if lang not in stemmers:
        return None
    stemmer = stemmers[lang]()
    # Catalan/Basque rows stem the stopword-filtered text; every other
    # language stems the cleaned description column.
    column = "removed_stopwords" if lang in ("ca", "eu") else "user description2"
    return ' '.join(stemmer.stemWord(word) for word in row[column].split())
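# A minimal usage sketch: the column names match the function above, but the
# two sample rows are made up for illustration.
import pandas as pd

sample = pd.DataFrame([
    {"user lang": "en", "user description2": "loving travelling and reading",
     "removed_stopwords": "loving travelling reading"},
    {"user lang": "de", "user description2": "reisen lesen kochen",
     "removed_stopwords": "reisen lesen kochen"},
])
sample["stemmed_description"] = sample.apply(stemming_user_description, axis=1)
print(sample["stemmed_description"].tolist())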
import math
import random

import numpy as np
import snowballstemmer


def trainWord2Vector(nVector, HuffmanTree_list, trainData):
    count = 0
    # learning rate and mean error rate
    learn_rate = 0.05
    mean_error = 0.0
    Stemmer = snowballstemmer.EnglishStemmer()
    # rebuild the Huffman tree as a dict so the needed subtree can be looked up quickly
    # dictionary that stores the word vectors
    Word_X = dict()
    Parameter_W = dict()
    HuffmanTree_dict = dict()
    for index in range(len(HuffmanTree_list)):
        node = HuffmanTree_list[index]
        HuffmanTree_dict[node.name] = node
    # stochastic gradient descent
    # number of training samples
    n = len(trainData)
    for index in range(n):
        count += 1
        if count % 5000 == 0:
            print(f"Trained on {count} samples")
        # number of samples still left in the training set
        current_num = len(trainData)
        # pick a training sample at random; once used it is removed and a new one is
        # drawn, which gives the stochastic gradient descent effect
        data_index = random.randint(0, current_num - 1)
        # print(data_index, 'current sample:', trainData[data_index])
        Simply_data = trainData[data_index]
        sum_Vector = np.zeros(nVector)
        middle = int(len(Simply_data) / 2)
        # codeName is the centre word of the training window
        codeName = Stemmer.stemWord(Simply_data[middle])
        del Simply_data[middle]
        # print(Simply_data)
        for ws in Simply_data:
            w = Stemmer.stemWord(ws)
            if w in Word_X:
                sum_Vector += Word_X[w]
            else:
                # initialise a random n-dimensional vector for a new word
                X = (np.random.random(nVector) - 0.5) * 2
                sum_Vector += X
                Word_X[w] = X
        huffmanCode, ParentCode = getHuffmanCode_update(HuffmanTree_dict, codeName)
        # print(huffmanCode, ':::', ParentCode[1:])
        # collect the parameter vectors of the parent nodes, generating random ones if missing
        current_Parameter_W_dict = dict()
        for i in range(1, len(ParentCode)):
            pc = ParentCode[i]
            if pc in Parameter_W:
                current_Parameter_W_dict[pc] = Parameter_W[pc]
            else:
                # initialise the vector between -1 and 1
                W = (np.random.random(nVector) - 0.5) * 2
                Parameter_W[pc] = W
                current_Parameter_W_dict[pc] = W
        e = np.zeros(nVector)
        code_index = 0
        for code in ParentCode[1:]:
            # q is the dot product of the context vector and the node parameters,
            # passed through the sigmoid function to give a probability
            parameter_w = current_Parameter_W_dict[code]
            martix_dot = round(float(np.dot(sum_Vector, parameter_w)), 5)
            if martix_dot >= 20:
                q = 1.0
            elif martix_dot <= -20:
                q = 0.0
            else:
                q = 1 / (1 + math.exp(-martix_dot))
            # the Huffman code bit decides whether this node is the positive or negative class
            code_class = int(huffmanCode[code_index])
            # g is the coefficient in front of the partial derivative
            g = learn_rate * (1 - code_class - q)
            # e accumulates learning rate * partial derivative for the word vectors
            e += g * np.array(parameter_w)
            # gradient descent step on the node parameters
            parameter_w += g * np.array(sum_Vector)
            code_index += 1
            current_Parameter_W_dict[code] = parameter_w
        for ws in Simply_data:
            w = Stemmer.stemWord(ws)
            x_vector = Word_X[w]
            x_vector += e
            Word_X[w] = x_vector
            # print(x_vector)
        del trainData[data_index]
    print("Training finished")
    return Word_X
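# A minimal sketch of how the returned vectors could be inspected after training
# (Word_X is assumed to be the dict returned by trainWord2Vector; most_similar
# is a hypothetical helper, not part of the original training code).
import numpy as np
import snowballstemmer


def most_similar(Word_X, query, topn=5):
    """Return the topn stems whose vectors have the highest cosine similarity to the query word."""
    stemmer = snowballstemmer.EnglishStemmer()
    q = Word_X[stemmer.stemWord(query)]
    scores = {}
    for stem, vec in Word_X.items():
        denom = np.linalg.norm(q) * np.linalg.norm(vec)
        if denom > 0:
            scores[stem] = float(np.dot(q, vec) / denom)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:topn]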