def test_extract_txt(reload=False):
    """Tag every news item found in the ``reports`` text files, file by file.

    For each ``.txt`` file in the ``reports`` directory the file content is
    split with ``format_text``, every resulting item is run through the
    predictor, and the ``(item, tag)`` pairs are handed to ``cal_neg_ratio``.
    The returned ``percent`` is printed under the label "贸易摩擦指数"
    (trade friction index) and the three returned values are forwarded to
    ``write_result``.

    :param reload: when true, rebuild and store the model via ``save_model``
        first; otherwise only initialise the ``od`` module data.
    :return: None
    """
    if reload:
        best_vector = "wordfreq"
        best_model = 1  # linearLogistic
        save_model(best_vector, best_model)
    else:
        od.loadStopwords()
        od.loadEmotionwords()
        od.loadWords(od.stopList)
        od.loadDocument(od.stopList)

    # NOTE(review): the original indentation was lost; the predictor setup is
    # placed after the if/else because both branches initialise the `od` data
    # it presumably relies on -- confirm against the intended control flow.
    predictor = Predictor()
    predictor.load_model()
    predictor.set_mode(mode="wordfreq")

    # Read the txt files.
    path = os.path.join("reports")
    for file in os.listdir(path):
        # Test the extension itself: counting ".txt" occurrences would accept
        # names such as "a.txt.bak" and reject "a.txt.txt".
        if not file.lower().endswith('.txt'):
            continue

        with open(os.path.join(path, file), 'r', encoding="utf-8") as f:
            contents = f.read()

        result = []
        for new in format_text(contents):
            predictor.set_news(news=new)
            predictor.trans_vec()
            tag = predictor()
            result.append((new, tag))

        # Compute the trade friction index.
        print(file)
        total, neg_nums, percent = cal_neg_ratio(result)
        print("贸易摩擦指数:")
        print(percent)
        write_result(file, total, neg_nums, percent)
def save_model(best_vector, best_model):
    """Fit a naive-Bayes model on the stored vectors and pickle it to disk.

    NOTE(review): another ``save_model`` is defined later in this file and
    rebinds the name, so this naive-Bayes variant cannot be reached by name
    once the module has been loaded -- confirm whether it should be renamed
    or removed.

    :param best_vector: key used to select the arrays inside
        ``result/vector/resultX.npz`` and ``result/vector/resultY.npz``.
    :param best_model: accepted for interface compatibility; this body does
        not read it and always trains ``ml.naiveBayes``.
    :return: None; prints "Save over" when the model has been written.
    """
    # Initialise the `od` module data (call order kept as in the original).
    od.loadStopwords()
    od.loadEmotionwords()
    od.loadWords(od.stopList)
    od.loadDocument(od.stopList)

    xpath = os.path.join('result', 'vector', 'resultX.npz')
    ypath = os.path.join('result', 'vector', 'resultY.npz')

    # np.load on an .npz archive returns an NpzFile that keeps the file open;
    # pull out the arrays that are needed and close both archives right away.
    with np.load(xpath) as resultX, np.load(ypath) as resultY:
        features = resultX[best_vector]
        labels = resultY[best_vector]

    new_x, new_y = od.twoTag(features, labels)
    model_saved = ml.naiveBayes(new_x, new_y)

    path = os.path.join('model', 'wordfreq_naiveBayes.ml')
    with open(path, 'wb') as f:
        pickle.dump(model_saved, f)
    print("Save over")
def save_model(best_vector, best_model):
    """Store the best-performing model.

    The names have to be specified by hand. The model is fitted with
    ``ml.linearLogistic`` on the arrays selected from the stored vector
    archives and pickled to ``model/wordfreq_logistic.ml``.

    :param best_vector: the best text -> word-vector method; used as the key
        into ``result/vector/resultX.npz`` and ``result/vector/resultY.npz``.
    :param best_model: the best machine-learning model; accepted for
        interface compatibility, but this body does not read it and always
        trains ``ml.linearLogistic``.
    :return: None; prints "Save over" when the model has been written.
    """
    # Initialise the `od` module data (call order kept as in the original).
    od.loadStopwords()
    od.loadEmotionwords()
    od.loadWords(od.stopList)
    od.loadDocument(od.stopList)

    xpath = os.path.join('result', 'vector', 'resultX.npz')
    ypath = os.path.join('result', 'vector', 'resultY.npz')

    # np.load on an .npz archive returns an NpzFile that keeps the file open;
    # pull out the arrays that are needed and close both archives right away.
    with np.load(xpath) as resultX, np.load(ypath) as resultY:
        features = resultX[best_vector]
        labels = resultY[best_vector]

    new_x, new_y = od.twoTag(features, labels)
    model_saved = ml.linearLogistic(new_x, new_y)

    path = os.path.join('model', 'wordfreq_logistic.ml')
    with open(path, 'wb') as f:
        pickle.dump(model_saved, f)
    print("Save over")
def test(reload=False):
    """Run the predictor on one hard-coded news sample and print the outcome.

    Prints the sum of the first row of the predictor's ``_vec`` attribute and
    the tag returned by calling the predictor.

    :param reload: when true, rebuild and store the model via ``save_model``
        first; otherwise only initialise the ``od`` module data.
    :return: None
    """
    if reload:
        best_vector = "wordfreq"
        best_model = 1  # linearLogistic
        save_model(best_vector, best_model)
    else:
        od.loadStopwords()
        od.loadEmotionwords()
        od.loadWords(od.stopList)
        od.loadDocument(od.stopList)

    # NOTE(review): the original indentation was lost; the predictor setup is
    # placed after the if/else because both branches initialise the `od` data
    # it presumably relies on -- confirm against the intended control flow.
    predictor = Predictor()
    predictor.load_model()
    predictor.set_mode(mode="wordfreq")

    # Text to be converted. An earlier sample that was assigned to `news` and
    # overwritten on the very next statement has been dropped as a dead store.
    news = " 周二,恒生指数收报20356.24点,跌236.76点,跌幅1.15%;国企指数收报10596.91点,跌148点,跌幅1.38%;大市成交492.76亿港元。美国3月非农就业数据表现疲弱,拖累隔夜欧美股市全线受压。中国3月份CPI同比增长3.6%,令货币政策在短期内放宽预期降低。港股早盘随外围低开两百多点,但是A股在汇金增持内银股刺激下探底回升,对港股起到支持,之后恒指于低位维持窄幅震荡整理,最终跌逾1%。银行股全线走软。四大内银股方面,工商银行跌0.4%,中国银行跌0.64%,建设银行跌0.67%,农业银行跌1.2%;国际金融股方面,汇丰控股跌1.75%,渣打集团跌1.89%。美国就业市场增长放缓及内地通胀反弹,投资者对经济信心下降。中国央行短期内下调存准机会大减,从而利淡大市气氛。预计港股本周将继续在20200至20700点之间震荡。 "

    predictor.set_news(news=news)
    predictor.trans_vec()
    tag = predictor()

    print("算出来的和是", sum(predictor._vec[0]))
    print("打标的结果是:", tag)