示例#1
0
def prepare_raw_news():
    """Load the raw-news spreadsheet rows into ``newsList``.

    Reads the first sheet of the workbook at ``RAW_NEWS_PATH`` and, for every
    row, appends ``[index, time, content]`` to the ``newsList`` list defined
    outside this function: column 0 converted to ``int``, column 1 passed
    through ``CommonUtil.get_datetime_from_cell``, and column 2 unchanged.
    """
    logger.info("In Prepare Raw News...")
    sheet = CommonUtil.read_excel(RAW_NEWS_PATH).sheet_by_index(0)
    row_count = sheet.nrows
    for row in range(row_count):
        index = int(sheet.cell_value(row, 0))
        timestamp = CommonUtil.get_datetime_from_cell(sheet.cell_value(row, 1))
        content = sheet.cell_value(row, 2)
        newsList.append([index, timestamp, content])
    logger.info("Prepare Raw News...Done!")
示例#2
0
def splitwords_word2vec(filename, fname, weightWord):
    """Vectorize every news row of an Excel sheet and append it to a text file.

    For each row of the first sheet in ``fname`` the news text is segmented
    with ``ltpmanner.splitwords`` and filtered with ``ltpmanner.stopwords``.
    The remaining words are reduced to one 64-dimensional vector: every word
    that has an embedding contributes ``embedding / N`` (or
    ``embedding * 10 / N`` when the word is in ``weightWord``), where ``N`` is
    the number of words left after stop-word filtering. Words without an
    embedding are reported on stdout and skipped, but still count towards
    ``N``.

    One line is appended to ``filename`` per row: the news time, the 64 vector
    components and the polarity, each followed by a tab character.

    Args:
        filename: Path of the output text file; opened in append mode (UTF-8).
        fname: Path of the Excel workbook. Column 0 holds the news time,
            column 1 the news text and column 3 the polarity (converted with
            ``int``).
        weightWord: Container of words whose vectors are multiplied by 10;
            only membership tests (``in``) are performed on it.
    """
    vector_size = 64  # must match the embedding file loaded below

    model = gensim.models.KeyedVectors.load_word2vec_format(
        '../word2veczzh/news_12g_baidubaike_20g_novel_90g_embedding_64.bin',
        binary=True)
    # Older gensim releases expose a deprecated ``wv`` alias on KeyedVectors;
    # newer ones dropped it, in which case the loaded object is used directly.
    word_vec = getattr(model, 'wv', model)
    del model

    # fname = '../files/splitedsentence.xls'
    table = xlrd.open_workbook(fname).sheet_by_index(0)

    # The context manager guarantees the output file is flushed and closed,
    # even if a row fails part-way through.
    with open(filename, 'a+', encoding='utf-8') as filetxt:
        # Process the news sheet one row at a time.
        for i in range(table.nrows):
            # News time.
            newsTime = CommonUtil.get_datetime_from_cell(
                table.cell(i, 0).value)
            # News text.
            newsContent = table.cell(i, 1).value
            # Polarity value (-1, 0 or 1 according to the original comment).
            polarity = int(table.cell(i, 3).value)
            print(newsTime)

            # Word segmentation, then stop-word removal.
            _, wordsList = ltpmanner.splitwords(newsContent)
            new_wordsList = ltpmanner.stopwords(wordsList)

            # Accumulator for this row's vector, starting at zero.
            vec_array = np.zeros(vector_size, dtype=float)
            length = len(new_wordsList)
            for each in new_wordsList:
                try:
                    vector = word_vec[each]
                except KeyError as e:
                    # Word has no embedding: report it and move on.
                    print("error:", e)
                    continue
                if each in weightWord:
                    vector = vector * 10
                vec_array += vector / length

            # Row layout: time, vector components, polarity.
            total_array = [newsTime, *vec_array.tolist(), polarity]

            # Echo every field to stdout, then write the row in one call.
            for each in total_array:
                print(each)
            filetxt.write(
                ''.join(str(each) + '\t' for each in total_array) + '\n')