Пример #1
0
def test_extract_txt(reload=False):
    """Tag every news item found in the ``reports`` text files.

    For each ``*.txt`` file in the ``reports`` directory the contents are
    split into news items by ``format_text``; each item is tagged by a
    ``Predictor`` running in ``"wordfreq"`` mode, and the ``(item, tag)``
    pairs are passed to ``cal_neg_ratio``. The file name and the resulting
    percentage are printed, and the figures are handed to ``write_result``.

    :param reload: when true, call ``save_model`` first to rebuild the
        stored model; otherwise only run the ``od.load*`` initialisers.
    :return: None
    """
    if reload:
        best_vector = "wordfreq"
        best_model = 1  # annotated as "linearLogistic" by the original author
        save_model(best_vector, best_model)
    else:
        od.loadStopwords()
        od.loadEmotionwords()
        od.loadWords(od.stopList)
        od.loadDocument(od.stopList)

    predictor = Predictor()
    predictor.load_model()
    predictor.set_mode(mode="wordfreq")

    # Read the txt files
    reports_dir = "reports"
    # os.listdir returns entries in arbitrary order; sort so that the printed
    # and written results come out in a reproducible sequence.
    for file_name in sorted(os.listdir(reports_dir)):
        # Test the extension itself. Counting occurrences of ".txt" would
        # accept "a.txt.bak" and reject "a.txt.txt".
        if not file_name.lower().endswith('.txt'):
            continue

        # Only the read needs the file handle; release it before predicting.
        with open(os.path.join(reports_dir, file_name), 'r', encoding="utf-8") as f:
            contents = f.read()

        result = []
        for news_item in format_text(contents):
            predictor.set_news(news=news_item)
            predictor.trans_vec()
            tag = predictor()
            result.append((news_item, tag))

        # Compute the trade-friction index
        print(file_name)
        total, neg_nums, percent = cal_neg_ratio(result)
        print("贸易摩擦指数:")
        print(percent)
        write_result(file_name, total, neg_nums, percent)
Пример #2
0
def save_model(best_vector, best_model):
    """Fit ``ml.naiveBayes`` on the stored vectors and pickle the result.

    Runs the ``od.load*`` initialisers, reads the arrays stored under
    ``best_vector`` from ``result/vector/resultX.npz`` and
    ``result/vector/resultY.npz``, passes them through ``od.twoTag``
    (presumably a reduction to two classes -- confirm against ``od``) and
    fits ``ml.naiveBayes`` on the output. The fitted object is pickled to
    ``model/wordfreq_naiveBayes.ml``.

    :param best_vector: key used to select the arrays in both ``.npz`` files
    :param best_model: accepted for call compatibility but not used; the
        model type is fixed to ``ml.naiveBayes`` in this function
    :return: None
    """
    od.loadStopwords()
    od.loadEmotionwords()
    od.loadWords(od.stopList)
    od.loadDocument(od.stopList)

    xpath = os.path.join('result', 'vector', 'resultX.npz')
    ypath = os.path.join('result', 'vector', 'resultY.npz')
    # np.load on an .npz returns an NpzFile that keeps the archive open until
    # it is closed, so pull the arrays out inside a with-block.
    with np.load(xpath) as result_x:
        features = result_x[best_vector]
    with np.load(ypath) as result_y:
        labels = result_y[best_vector]

    new_x, new_y = od.twoTag(features, labels)
    model_saved = ml.naiveBayes(new_x, new_y)

    # NOTE: the file name is fixed, whatever value best_vector has.
    path = os.path.join('model', 'wordfreq_naiveBayes.ml')
    # Create the output directory up front so training is not wasted on a
    # missing-folder error at write time.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(model_saved, f)
    print("Save over")
Пример #3
0
def save_model(best_vector, best_model):
    """Store the best-performing model.

    The choice of model has to be made by hand: this function always fits
    ``ml.linearLogistic`` and always writes ``model/wordfreq_logistic.ml``.

    Runs the ``od.load*`` initialisers, reads the arrays stored under
    ``best_vector`` from ``result/vector/resultX.npz`` and
    ``result/vector/resultY.npz``, passes them through ``od.twoTag`` and
    pickles the model fitted on the output.

    :param best_vector: the best text -> word-vector method; used as the key
        into both ``.npz`` files
    :param best_model: the best machine-learning model; accepted for call
        compatibility but not used by this function
    :return: None
    """
    od.loadStopwords()
    od.loadEmotionwords()
    od.loadWords(od.stopList)
    od.loadDocument(od.stopList)

    xpath = os.path.join('result', 'vector', 'resultX.npz')
    ypath = os.path.join('result', 'vector', 'resultY.npz')
    # An NpzFile holds its archive open until closed; read the needed arrays
    # inside with-blocks so the handles are released promptly.
    with np.load(xpath) as result_x:
        features = result_x[best_vector]
    with np.load(ypath) as result_y:
        labels = result_y[best_vector]

    new_x, new_y = od.twoTag(features, labels)
    model_saved = ml.linearLogistic(new_x, new_y)

    path = os.path.join('model', 'wordfreq_logistic.ml')
    # Make sure the target directory exists before opening the file for writing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(model_saved, f)
    print("Save over")
Пример #4
0
def test(reload=False):
    """Tag one hard-coded news article and print the outcome.

    A ``Predictor`` is created, its model loaded and its mode set to
    ``"wordfreq"``. The sample article is given to it, vectorised with
    ``trans_vec`` and tagged by calling the predictor. The sum of the first
    row of the predictor's ``_vec`` and the resulting tag are printed.

    :param reload: when true, call ``save_model("wordfreq", 1)`` first;
        otherwise only run the ``od.load*`` initialisers.
    :return: None
    """
    if not reload:
        od.loadStopwords()
        od.loadEmotionwords()
        od.loadWords(od.stopList)
        od.loadDocument(od.stopList)
    else:
        # The second argument (1) was annotated as "linearLogistic" by the
        # original author.
        save_model("wordfreq", 1)

    classifier = Predictor()
    classifier.load_model()
    classifier.set_mode(mode="wordfreq")

    # Two sample articles are kept here; only the second one is classified.
    # The first is retained as an alternative input for manual experiments.
    nikkei_brief = "                                                    《经济通通讯社13日专讯》日股早市偏软,日经225指数报18312跌239点。  美元兑日圆疲软,新报108﹒78╱80。(tt)                                                                         "
    hang_seng_recap = "                                                  周二,恒生指数收报20356.24点,跌236.76点,跌幅1.15%;国企指数收报10596.91点,跌148点,跌幅1.38%;大市成交492.76亿港元。美国3月非农就业数据表现疲弱,拖累隔夜欧美股市全线受压。中国3月份CPI同比增长3.6%,令货币政策在短期内放宽预期降低。港股早盘随外围低开两百多点,但是A股在汇金增持内银股刺激下探底回升,对港股起到支持,之后恒指于低位维持窄幅震荡整理,最终跌逾1%。银行股全线走软。四大内银股方面,工商银行跌0.4%,中国银行跌0.64%,建设银行跌0.67%,农业银行跌1.2%;国际金融股方面,汇丰控股跌1.75%,渣打集团跌1.89%。美国就业市场增长放缓及内地通胀反弹,投资者对经济信心下降。中国央行短期内下调存准机会大减,从而利淡大市气氛。预计港股本周将继续在20200至20700点之间震荡。                                                                         "
    article = hang_seng_recap  # the text to be converted

    classifier.set_news(news=article)
    classifier.trans_vec()

    label = classifier()
    vector_total = sum(classifier._vec[0])
    print("算出来的和是", vector_total)
    print("打标的结果是:", label)