def prepare_raw_news():
    """Read the raw news workbook and append one record per row to ``newsList``.

    The first sheet of the workbook at ``RAW_NEWS_PATH`` is scanned from row 0.
    For every row a three-element list ``[index, time, content]`` is built from
    columns 0, 1 and 2 and appended to the module-level ``newsList``:

    * column 0 is converted with ``int``;
    * column 1 is passed through ``CommonUtil.get_datetime_from_cell``;
    * column 2 is stored as read.

    Nothing is returned; the result is the mutation of ``newsList``.
    """
    logger.info("In Prepare Raw News...")

    workbook = CommonUtil.read_excel(RAW_NEWS_PATH)
    sheet = workbook.sheet_by_index(0)
    row_count = sheet.nrows

    for row in range(row_count):
        index = int(sheet.cell_value(row, 0))
        timestamp = CommonUtil.get_datetime_from_cell(sheet.cell_value(row, 1))
        content = sheet.cell_value(row, 2)
        newsList.append([index, timestamp, content])

    logger.info("Prepare Raw News...Done!")
def _news_vector(word_vec, words, weightWord):
    """Return the length-normalised sum of the word vectors of ``words``.

    Each word's vector is divided by ``len(words)``; words contained in
    ``weightWord`` are additionally multiplied by 10 before being added.
    Words whose lookup (or addition) fails are reported on stdout and skipped.
    """
    # The accumulator is 64-wide to match the 64-dimensional embedding file
    # loaded by the caller.
    vec_array = np.zeros(64, dtype=float)
    length = len(words)
    for each in words:
        try:
            if each in weightWord:
                vec_array += word_vec[each] * 10 / length
            else:
                vec_array += word_vec[each] / length
        except Exception as e:
            # Deliberately best-effort: a failing word (presumably one missing
            # from the embedding vocabulary) must not abort the whole news item.
            print("error:", e)
    return vec_array


def splitwords_word2vec(filename, fname, weightWord):
    """Vectorise every news row of an Excel sheet and append it to a text file.

    For each row of the first sheet in ``fname`` the news text is segmented,
    stop words are removed, and the remaining words are combined into a single
    64-dimensional vector (see ``_news_vector``). One tab-separated line is
    appended to ``filename`` per row: the news time, the 64 vector components
    and the polarity, each followed by a tab, then a newline.

    Args:
        filename: Path of the output text file; opened in ``'a+'`` mode with
            UTF-8 encoding, so existing content is kept.
        fname: Path of the Excel workbook to read. Column 0 holds the news
            time, column 1 the news text and column 3 the polarity.
        weightWord: Container of words whose vectors are weighted by 10.

    Returns:
        None. Output is written to ``filename``; progress is echoed to stdout.
    """
    # The output file is held in a ``with`` block so the handle is flushed and
    # closed even if processing a row raises.
    with open(filename, 'a+', encoding='utf-8') as filetxt:
        model = gensim.models.KeyedVectors.load_word2vec_format(
            '../word2veczzh/news_12g_baidubaike_20g_novel_90g_embedding_64.bin',
            binary=True)
        # NOTE(review): uses ``.wv`` when the loaded object provides it and
        # otherwise the object itself, so the lookup below works either way.
        # Confirm against the gensim version this project pins.
        word_vec = getattr(model, 'wv', model)
        del model

        data = xlrd.open_workbook(fname)
        table = data.sheet_by_index(0)
        nrows = table.nrows

        # Process the news one row at a time.
        for i in range(0, nrows):
            # News time.
            news_time = table.cell(i, 0).value
            newsTime = CommonUtil.get_datetime_from_cell(news_time)
            # News text.
            newsContent = table.cell(i, 1).value
            # Polarity label (the original comment lists -1, 0, 1).
            polarity = int(table.cell(i, 3).value)
            print(newsTime)

            # Segment this news item into words, then drop stop words.
            _, wordsList = ltpmanner.splitwords(newsContent)
            new_wordsList = ltpmanner.stopwords(wordsList)

            vec_array = _news_vector(word_vec, new_wordsList, weightWord)

            # Output row layout: time, vector components, polarity.
            total_array = [newsTime]
            total_array.extend(vec_array.tolist())
            total_array.append(polarity)

            # Write the vectorised row; every field is followed by a tab.
            for each in total_array:
                print(each)
                filetxt.write(str(each) + '\t')
            filetxt.write('\n')