Example #1
import snowballstemmer


def StaticalWordFrequently():
    WordFrequently = dict()
    file_object = open("E:\\text.txt", 'r', encoding='UTF-8')
    content = ""
    for line in file_object:
        content += line
    Words = content.split()
    print(f"Total tokens: {len(Words)}")
    Stemmer = snowballstemmer.EnglishStemmer()
    '''
    Before stemming:
        total tokens: 17005207
        distinct words: 202003
    After stemming:
        total tokens: 17005207
        distinct words: 253854
    '''
    for w in Words:
        word = Stemmer.stemWord(word=w)
        # word = w
        if word in WordFrequently:
            WordFrequently[word] = WordFrequently[word] + 1
        else:
            WordFrequently[word] = 1
    print(f"Distinct words: {len(WordFrequently)}")
    file_object.close()
    return WordFrequently
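A minimal sketch of the small snowballstemmer API that these snippets rely on, using stemWord for a single token and stemWords for a list; the sample words and the exact stems shown in the comments are illustrative only and may vary slightly between snowballstemmer versions.

import snowballstemmer

stemmer = snowballstemmer.EnglishStemmer()
print(stemmer.stemWord("running"))               # single token, e.g. "run"
print(stemmer.stemWords(["babies", "played"]))   # list of tokens, e.g. ["babi", "play"]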
Example #2
import string

import snowballstemmer


def StaticalWordFrequently(train_data):
    WordFrequently = dict()
    file_object = open(train_data, 'r', encoding='UTF-8')
    content = ""
    for line in file_object:
        content += line
    without_punctuation = content.maketrans('', '', string.punctuation)
    cleaned = content.translate(without_punctuation)
    Words = cleaned.lower().split()
    print(f"Total tokens: {len(Words)}")
    Stemmer = snowballstemmer.EnglishStemmer()
    '''
    Before stemming:
        total tokens: 17005207
        distinct words: 202003
    After stemming:
        total tokens: 17005207
        distinct words: 253854
    '''
    for w in Words:
        word = Stemmer.stemWord(word=w)
        # word = w
        if word in WordFrequently:
            WordFrequently[word] = WordFrequently[word] + 1
        else:
            WordFrequently[word] = 1
    print(f"Distinct words: {len(WordFrequently)}")
    file_object.close()
    return WordFrequently
Example #3
 def test_load_dump_and_search_with_stemming(self):
     dump = self.oktavia.dump()
     oktavia = Oktavia()
     oktavia.set_stemmer(snowballstemmer.EnglishStemmer())
     oktavia.load(dump)
     results = oktavia.raw_search(u'baby', stemming=True)
     self.assertEqual(1, len(results))
Example #4
 def setUp(self):
     self.oktavia = Oktavia()
     self.oktavia.set_stemmer(snowballstemmer.EnglishStemmer())
     self.section = self.oktavia.add_section(u'document')
     self.oktavia.add_word(u"stemming baby", stemming=True)
     self.section.set_tail(u"doc1")
     self.oktavia.add_word(u"stemmed babies", stemming=True)
     self.section.set_tail(u"doc2")
     self.oktavia.build()
Example #5
import snowballstemmer
from nltk.corpus import stopwords
from string import ascii_lowercase


def stemit():
    stemmer = snowballstemmer.EnglishStemmer()
    stop = stopwords.words('english')
    stop.extend([
        'may', 'also', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'ten', 'across', 'among', 'beside',
        'however', 'yet', 'within'
    ] + list(ascii_lowercase))
    stoplist = stemmer.stemWords(stop)
    stoplist = set(stoplist)
    # keep both the raw and the stemmed stopwords in the final set
    stop = set(sorted(stop + list(stoplist)))
    return stop
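One possible way to consume the returned set, continuing directly from the snippet above and using a purely hypothetical token list: stem the tokens the same way and drop the ones that land in the stoplist.

stop = stemit()
stemmer = snowballstemmer.EnglishStemmer()
tokens = ['the', 'babies', 'played', 'nine', 'games']   # hypothetical input
kept = [t for t in stemmer.stemWords(tokens) if t not in stop]
# kept would be something like ['babi', 'play', 'game']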
Example #6
    def stemming(self, text_without_punctuation):
        """Stemming text"""

        if self.lang == 2:
            stemmer = snowballstemmer.RussianStemmer()
        else:
            stemmer = snowballstemmer.EnglishStemmer()

        self.stemmed_text = []
        for sentence in text_without_punctuation:
            self.stemmed_text.append(" ".join(
                [stemmer.stemWord(i) for i in sentence.split()]))
        self.remove_stop_words()
Example #7
import pandas as pd
import gensim, os, re, pymongo, itertools, nltk, snowballstemmer
from pymongo import MongoClient
from nltk.corpus import stopwords
from string import ascii_lowercase

# set the location where we'll save our model
savefolder = '/data'

# grab data from database and convert to pandas dataframe
client = MongoClient()
db = client.target_database  # access target database
collection = db.target_collection  # access target collection within the target database
data = pd.DataFrame(
    list(collection.find())
)  # each row is one document; the raw text of the document should be in the 'text_data' column

# initialize stemmer
stemmer = snowballstemmer.EnglishStemmer()

# grab stopword list, extend it a bit, and then turn it into a set for later
stop = stopwords.words('english')
stop.extend([
    'may', 'also', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'nine', 'ten', 'across', 'among', 'beside', 'however',
    'yet', 'within'
] + list(ascii_lowercase))
stoplist = stemmer.stemWords(stop)
stoplist = set(stoplist)
stop = set(sorted(stop + list(stoplist)))

# remove characters and stoplist words, then generate dictionary of unique words
data['text_data'].replace(
    '[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',
Example #8
import re
import string

import snowballstemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# `required_data` (a DataFrame) and `TextCleaner` are defined earlier in the
# original script and are assumed to be available here.

# print(required_data.sample(5))

# Pre-processing steps
# Encode labels
# Text cleaning
# Tokenizer
label_encoder = LabelEncoder()
required_data['target'] = label_encoder.fit_transform(
    required_data['airline_sentiment'])

text_cleaner = TextCleaner()
required_data['clean_text'] = text_cleaner.transform(required_data['text'])

# segment text into tokens (words)
tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
stemmer = snowballstemmer.EnglishStemmer()  # words are reduced to their root forms


def tokenize(s):
    tokens = tok.sub(r' \1 ', s).split()

    return stemmer.stemWords(tokens)


# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(
    required_data['clean_text'].values,
    required_data['target'].values,
    test_size=0.25,
    random_state=0)
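For orientation, a hedged illustration of what the tokenize function above produces: the regex splits punctuation characters off as their own tokens, and every token is then stemmed (the exact stems depend on the snowballstemmer version).

sample = tokenize("delayed flights, again!")
# sample would be roughly ['delay', 'flight', ',', 'again', '!']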
Example #9
import snowballstemmer


def stemming_user_description(row):
    if (row["user lang"] == "ca" or row["user lang"] == "eu"):
        stemmer = snowballstemmer.SpanishStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["removed_stopwords"].split()
        ])
    if (row["user lang"] == "da"):
        stemmer = snowballstemmer.DanishStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "nl"):
        stemmer = snowballstemmer.DutchStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "en" or row["user lang"] == "fi"):
        stemmer = snowballstemmer.EnglishStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "fu"):
        stemmer = snowballstemmer.FinnishStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "fr"):
        stemmer = snowballstemmer.FrenchStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "de"):
        stemmer = snowballstemmer.GermanStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "hu"):
        stemmer = snowballstemmer.HungarianStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "it"):
        stemmer = snowballstemmer.ItalianStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "no"):
        stemmer = snowballstemmer.NorwegianStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "pt"):
        stemmer = snowballstemmer.PortugueseStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "ro"):
        stemmer = snowballstemmer.RomanianStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "ru"):
        stemmer = snowballstemmer.RussianStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    if (row["user lang"] == "es"):
        stemmer = snowballstemmer.SpanishStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
    # if (row["user lang"] == "sv"):
    #     stemmer = snowballstemmer.SwedishStemmer()
    #     return ' '.join([
    #         stemmer.stemWord(word)
    #         for word in row["removed_stopwords"].split()
    #     ])
    if (row["user lang"] == "tr"):
        stemmer = snowballstemmer.TurkishStemmer()
        return ' '.join([
            stemmer.stemWord(word)
            for word in row["user description2"].split()
        ])
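The same dispatch can be expressed more compactly as a lookup table. The sketch below is only an assumption-laden rewrite: it keeps the column names and the language-to-stemmer pairing exactly as in the original above (including its 'en'/'fi' → English and 'fu' → Finnish choices, and the different column read for 'ca'/'eu'), and returns None for unhandled codes just as the if-chain falls through.

import snowballstemmer

# hypothetical lookup table mirroring the branches above
_STEMMERS = {
    'ca': snowballstemmer.SpanishStemmer(),
    'eu': snowballstemmer.SpanishStemmer(),
    'da': snowballstemmer.DanishStemmer(),
    'nl': snowballstemmer.DutchStemmer(),
    'en': snowballstemmer.EnglishStemmer(),
    'fi': snowballstemmer.EnglishStemmer(),
    'fu': snowballstemmer.FinnishStemmer(),
    'fr': snowballstemmer.FrenchStemmer(),
    'de': snowballstemmer.GermanStemmer(),
    'hu': snowballstemmer.HungarianStemmer(),
    'it': snowballstemmer.ItalianStemmer(),
    'no': snowballstemmer.NorwegianStemmer(),
    'pt': snowballstemmer.PortugueseStemmer(),
    'ro': snowballstemmer.RomanianStemmer(),
    'ru': snowballstemmer.RussianStemmer(),
    'es': snowballstemmer.SpanishStemmer(),
    'tr': snowballstemmer.TurkishStemmer(),
}


def stemming_user_description_compact(row):
    lang = row["user lang"]
    stemmer = _STEMMERS.get(lang)
    if stemmer is None:
        return None  # same as falling through the if-chain above
    column = "removed_stopwords" if lang in ("ca", "eu") else "user description2"
    return ' '.join(stemmer.stemWords(row[column].split()))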
Example #10
import math
import random

import numpy as np
import snowballstemmer

# getHuffmanCode_update is assumed to be defined elsewhere in the original project.


def trainWord2Vector(nVector, HuffmanTree_list, trainData):
    count = 0
    # learning rate and mean error
    learn_rate = 0.05
    mean_error = 0.0
    Stemmer = snowballstemmer.EnglishStemmer()
    # convert the Huffman tree into a dict keyed by node name so the needed subtree can be looked up quickly
    # dicts storing the word vectors and the node parameters
    Word_X = dict()
    Parameter_W = dict()
    HuffmanTree_dict = dict()
    for node in HuffmanTree_list:
        HuffmanTree_dict[node.name] = node
    # stochastic gradient descent
    # number of training samples
    n = len(trainData)
    for index in range(n):
        count += 1
        if count % 5000 == 0:
            print(f"Trained on {count} samples")
        # number of samples still left in the training set
        current_num = len(trainData)
        # pick a training sample at random; after the update it is removed and a new one
        # is drawn, which gives the effect of stochastic gradient descent
        data_index = random.randint(0, current_num - 1)
        # print(data_index, 'current sample:', trainData[data_index])
        Simply_data = trainData[data_index]
        sum_Vector = np.zeros(nVector)
        middle = int(len(Simply_data) / 2)
        # codeName is the middle word of the training window
        codeName = Stemmer.stemWord(Simply_data[middle])
        del Simply_data[middle]
        # print(Simply_data)
        for ws in Simply_data:
            w = Stemmer.stemWord(ws)
            if w in Word_X:
                sum_Vector += Word_X[w]
            else:
                # randomly initialize an n-dimensional vector for a new word
                X = (np.random.random(nVector) - 0.5) * 2
                sum_Vector += X
                Word_X[w] = X
        huffmanCode, ParentCode = getHuffmanCode_update(HuffmanTree_dict, codeName)
        # print(huffmanCode, ':::', ParentCode[1:])
        # fetch the parameter vector of each parent node, generating one at random if it does not exist yet
        current_Parameter_W_dict = dict()
        for i in range(1, len(ParentCode)):
            pc = ParentCode[i]
            if pc in Parameter_W:
                current_Parameter_W_dict[pc] = Parameter_W[pc]
            else:
                # initialize the vector between -1 and 1
                W = (np.random.random(nVector) - 0.5) * 2
                Parameter_W[pc] = W
                current_Parameter_W_dict[pc] = W
        #######################################
        e = np.zeros(nVector)
        code_index = 0
        for code in ParentCode[1:]:
            # q is the dot product of the context vector and the node parameters,
            # pushed through the sigmoid to turn it into a probability
            parameter_w = current_Parameter_W_dict[code]
            matrix_dot = round(float(np.dot(sum_Vector, parameter_w)), 5)
            # try:
            #     q = 1 / (1 + math.exp(-matrix_dot))
            # except OverflowError:
            #     print('value too large')
            #     break
            # except ZeroDivisionError:
            #     print('caught a division-by-zero error')
            #     break
            # clamp the sigmoid to avoid overflow for large |matrix_dot|
            if matrix_dot >= 20:
                q = 1.0
            elif matrix_dot <= -20:
                q = 0.0
            else:
                q = 1 / (1 + math.exp(-matrix_dot))
            # use the Huffman code bit to decide whether this node is the positive or negative class
            code_class = int(huffmanCode[code_index])
            # coefficient in front of the partial-derivative term
            g = learn_rate * (1 - code_class - q)
            # e accumulates learning_rate * gradient with respect to the context vector
            e += g * np.array(parameter_w)
            # gradient-descent update of the node parameters
            parameter_w += g * np.array(sum_Vector)
            code_index += 1
            current_Parameter_W_dict[code] = parameter_w
        for ws in Simply_data:
            w = Stemmer.stemWord(ws)
            x_vector = Word_X[w]
            x_vector += e
            Word_X[w] = x_vector
            # print(x_vector)
        del trainData[data_index]
    print("Training finished")
    return Word_X
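A hedged example of consuming the dictionary returned by trainWord2Vector: the helper below and the query words are purely illustrative, and they assume both words actually occurred (after stemming) in the training data.

import numpy as np
import snowballstemmer


def cosine_similarity(word_vectors, w1, w2):
    # look up the stemmed forms, then compare the learned vectors
    stemmer = snowballstemmer.EnglishStemmer()
    a = word_vectors[stemmer.stemWord(w1)]
    b = word_vectors[stemmer.stemWord(w2)]
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# word_vectors = trainWord2Vector(nVector=100, HuffmanTree_list=tree_nodes, trainData=windows)
# print(cosine_similarity(word_vectors, "king", "queen"))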