import numpy as np

# jieba_utils is assumed to be a project-local wrapper around jieba segmentation;
# word_vector_model is assumed to be a trained gensim Word2Vec model loaded
# elsewhere in this module, e.g. Word2Vec.load('word2vec.model').
import jieba_utils


def comment_to_word_vectors(comment):
    """Segment a comment and return the word vectors of the words known to the model."""
    words = []
    sentence_words = jieba_utils.cut(comment)
    for word in sentence_words:
        if word in word_vector_model.wv:
            words.append(word_vector_model.wv[word])
    return words
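# Illustrative sketch (not part of the original source): stack the vectors
# returned by comment_to_word_vectors() into a 2-D numpy array for a
# downstream classifier. Assumes word_vector_model is a gensim Word2Vec
# model, so word_vector_model.wv.vector_size gives the embedding dimension.
def comment_to_matrix(comment):
    vectors = comment_to_word_vectors(comment)
    if not vectors:
        # No segmented word was in the vocabulary; return a single zero row
        # so callers always receive a 2-D array.
        return np.zeros((1, word_vector_model.wv.vector_size))
    return np.array(vectors)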
def test(comment):
    """Debug helper: segment a comment and report which words are missing from the model."""
    sentence_words = jieba_utils.cut(comment)
    print(sentence_words)
    words = []
    for word in sentence_words:
        if word in word_vector_model.wv:
            words.append(word_vector_model.wv[word])
        else:
            print(word, ' is not in the model')
    return words
def comment_to_indices(comment, word_to_index):
    """Convert a comment into a list of word indices; unknown words map to len(word_to_index)."""
    indices = []
    size = len(word_to_index)
    # Materialize the segmentation so it can be both joined for printing and
    # iterated again below, in case jieba_utils.cut returns a generator.
    words = list(jieba_utils.cut(comment))
    print('Comment: {}\n Terms: {}'.format(comment, ','.join(words)))
    for word in words:
        if word in word_to_index:
            indices.append(word_to_index[word])
        else:
            indices.append(size)  # out-of-vocabulary words share the last index
    return indices
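# Illustrative sketch (not part of the original source): pad or truncate the
# index sequences returned by comment_to_indices() to a fixed length, as is
# typically needed before feeding them to an embedding layer. The padding
# index is left as a parameter; len(word_to_index) + 1 (one past the
# out-of-vocabulary index used above) is one reasonable choice.
def pad_indices(indices, max_length, padding_index):
    if len(indices) >= max_length:
        return indices[:max_length]
    return indices + [padding_index] * (max_length - len(indices))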
def comment_to_one_hot(comment, word_index_dict):
    """Convert a comment into a list of one-hot vectors, one per segmented word."""
    data = []
    words = jieba_utils.cut(comment)
    for word in words:
        # One extra slot at the end is reserved for out-of-vocabulary words.
        data_one_hot = np.zeros(len(word_index_dict) + 1)
        if word in word_index_dict:
            index = word_index_dict[word]
            data_one_hot[index] = 1
        else:
            data_one_hot[-1] = 1
        data.append(data_one_hot.tolist())
    return data
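# Illustrative usage (hypothetical input text, not from the original source):
# each row of the returned list is a one-hot vector of length
# len(word_index_dict) + 1, with the last slot marking out-of-vocabulary words.
#
#   one_hot_rows = comment_to_one_hot('這部電影很好看', word_to_index)
#   one_hot_matrix = np.array(one_hot_rows)  # shape: (num_words, len(word_to_index) + 1)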
def build_up_word_list(comments, word_list_file_name=None, output_file_name=None):
    """Load a word list from a file, or build one by segmenting the given comments."""
    if word_list_file_name:
        print('Loading word list from file ' + word_list_file_name)
        with open(word_list_file_name, 'r', encoding='utf-8') as fr:
            word_list = [line.strip() for line in fr.readlines() if len(line.strip()) != 0]
    else:
        wordset = set()
        print('No word list file specified, creating new word list.')
        for comment in comments:
            words = jieba_utils.cut(comment)
            wordset.update(words)
        word_list = list(wordset)
    if output_file_name:
        with open(output_file_name, 'w+', encoding='utf-8') as fw:
            for word in word_list:
                fw.write(word + '\n')
        print('Word list saved to ' + output_file_name + '.')
    return word_list
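# Helper sketch (not part of the original source): build the word -> index
# mapping consumed by comment_to_indices() and comment_to_one_hot() from the
# word list returned by build_up_word_list(). Index len(word_list) is then
# implicitly reserved for out-of-vocabulary words by those functions.
def build_word_to_index(word_list):
    return {word: index for index, word in enumerate(word_list)}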
def comment_to_n_of_words(comment):
    return jieba_utils.cut(comment)