@version: ??
@author: li
@file: event2mysql.py
@time: 2018-12-17 13:53
Save the updated events to MySQL in a fixed format.
"""
import json
import pandas as pd

from src import data_reader
from src.configure import conf
from src.utils import file_util, event_util
from src.utils.log import log_util
from src.utils.engine import data_source

logging = log_util.Logger('event2mysql')

event_save_path = conf.event_save_path
# event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/"

# Load the newest update file from the event directory.
file_new = file_util.find_newest_file(event_save_path)
new_event_units = event_util.load_history_event(event_save_path + file_new)

# Read the latest news id, title, url and timestamp from the database.
total_data = data_reader.get_all_data()
# Set id as the index so that title and url can be looked up by id later.
total_data_df = total_data.set_index('id')

# Arrange each event unit's information into the required format.
result = []
for item in new_event_units:
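# Illustrative sketch only (not part of event2mysql.py): the docstring above says
# the formatted result is written to MySQL, but the loop body is truncated here.
# The DSN, the table name 'event_news' and the column names below are assumptions,
# shown with pandas.DataFrame.to_sql.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:password@localhost:3306/test')  # placeholder DSN
rows = [
    {'event_id': 1, 'news_id': 1001, 'title': 'example title', 'url': 'http://example.com/1001'},
]
pd.DataFrame(rows).to_sql('event_news', con=engine, if_exists='append', index=False)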
""" @version: ?? @author: li @file: xueqiu_dicsuss_batch.py @time: 2019-03-29 14:05 """ import gc import os import glob import pandas as pd from datetime import datetime, timedelta from src.utils import time_util from src.utils.log import log_util from src.parser.xueqiu.discuss_parser import discuss_parser logging = log_util.Logger('xueqiu_discuss_batch') # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 def read_csv(path=None): """ 读取原始csv文件 :param path: :return: """ if path is None: path = '/Users/li/Desktop/sets1' file_list = glob.glob(os.path.join(path, "*.csv")) data_list = [] for f in file_list: data_list.append(pd.read_csv(f, header=0,
import time
import pandas as pd
from tqdm import tqdm

from src.configure import conf
from src.utils.engine.data_source import GetDataEngine
from src.utils import tokenization, data_process, dicts
from src.utils.tokenization import load_stop_words
from src.utils.log import log_util
from src.utils.VSM import tfidf

# TaggededDocument = gensim.models.doc2vec.TaggedDocument
engine_mysql = GetDataEngine("XAVIER")
engine_mysql_test = GetDataEngine("VISIONTEST")
engine_sqlserver = GetDataEngine("DNDS")
logging = log_util.Logger('data_reader_log')

global _stopwords


class DataReader(object):
    def get_news_data(self, data_path=None):
        if data_path is None:
            data_path = '/Users/li/PycharmProjects/event_parser/src/text.txt'
        with open(data_path, 'r') as news:
            data = news.readlines()
        train_data = []
        for i, text in enumerate(data):
            # text_list = text.decode("utf8").split(",")
            text_list = text.split(",")
            train_data.append(text_list)
        return train_data
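# Usage sketch only: get_news_data expects one comma-separated record per line.
# The path and file content below are made up for illustration.
# with open('/tmp/text.txt', 'w') as f:
#     f.write('1001,example news title,2018-12-17\n')
reader = DataReader()
for fields in reader.get_news_data(data_path='/tmp/text.txt'):
    print(fields)  # e.g. ['1001', 'example news title', '2018-12-17\n']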
""" import sys sys.path.append('../') sys.path.append('../../') sys.path.append('../../../') sys.path.append('../../../../') sys.path.append('../../../../../') from src.utils import time_util from src.utils.log import log_util import pandas as pd from sqlalchemy import create_engine from src.data_reader import read_all_data from src.parser.xueqiu.discuss_parser import discuss_parser, format_transform logging = log_util.Logger('discuss_stock_filter_daily') # 数据结构调整 def transform_fuc(id, stock_list): """ 将user_id和stock_list两两组合成tuple的list集合 :param id: str :param stock_list: list :return: """ if len(stock_list) <= 0: pass user_id_list = [id] * len(stock_list) tuple_zip = zip(stock_list, user_id_list) tuple_list = list(tuple_zip)
@author: li
@file: xueqiu_discuss_csv.py
@time: 2019-03-23 14:23
"""
# Read the data from csv files.
import time
import os
import glob
import pandas as pd

from src.utils import time_util
from src.utils.engine import data_source
from src.utils.log import log_util
from src.parser.xueqiu.discuss_parser import discuss_parser

logging = log_util.Logger('xueqiu_discuss_csv')


# Use desc and rdesc as the current user's discussion data; the user id is the
# current user's id, and the discussion id is
def read_csv(path=None):
    """
    Read the raw csv files.
    :param path:
    :return:
    """
    if path is None:
        path = '/Users/li/Desktop/sets1'
    file_list = glob.glob(os.path.join(path, "*.csv"))
    data_list = []
    for f in file_list:
        data_list.append(
    raise

deg_dict = {}  # degree adverbs
senti_dict = {}  # sentiment words
eng_dict = {}  # English or pinyin words
fou_dict = []  # negation words
but_dict = []  # adversative (transition) words
lim_dict = []  # qualifier words
new_dict = []  # new words
zhi_dict = []  # HowNet (zhiwang) lexicon
stock_dict = []  # stock-name words
stock_code_dict = []  # stock codes
jg_dict = []  # institution names
stock_df = []
logging = log_util.Logger('dict_log')


class DictInit(object):
    pass


def load_stock_data():
    dic_path = conf.dic_path
    st_path = dic_path + "/stock_words.txt"
    st_new_path = dic_path + "/stock.csv"
    for st in open(st_path):
        # st = st.decode("utf8")
        code1, st_code = st.split("\t")
        code, stock = st_code.split(",")
        stock_code_dict.append(code.strip("\n"))
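# Illustration only: load_stock_data's parsing implies each line of
# stock_words.txt looks like "<seq>\t<code>,<name>". The sample line below is
# made up to show how the split works.
line = "1\tSH600000,浦发银行\n"
code1, st_code = line.split("\t")
code, stock = st_code.split(",")
print(code, stock.strip("\n"))  # SH600000 浦发银行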
from src.utils import data_process, keywords_extractor
from src.utils.log import log_util
from src.utils.VSM import tfidf

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.algorithm.cluster.singlePass import singlePassCluster

# corpus_train = "/Users/li/PycharmProjects/event_parser/src/text_full_index.txt"
# corpus_train = conf.corpus_train_path
data_process = data_process.DataPressing()
logging = log_util.Logger('event_util', level='info')


def events_list(news_title_list):
    for news in news_title_list:
        print(news)
    pass


def events_effectiveness(cluster_list, news_dict, tfidf_feature, tfidf_transformer):
    """
    Judge whether an event cluster is effective (valid).
    :param tfidf_transformer:
    :param tfidf_feature:
    :param cluster_list:
""" import sys sys.path.append('../') sys.path.append('../../') sys.path.append('../../../') sys.path.append('../../../../') sys.path.append('../../../../../') import pandas as pd import time from sqlalchemy import create_engine from src.utils import time_util from src.utils.log import log_util from src.data_reader import read_all_data logging = log_util.Logger('xueqiu_focus_statistic') def f(row): if row[:2] == 'SH': return str(row[2:]) + '.' + 'XSHG' elif row[:2] == 'SZ': return str(row[2:]) + '.' + 'XSHE' if __name__ == '__main__': pd.set_option('display.max_rows', None, 'display.max_columns', None, "display.max_colwidth", 1000, 'display.width', 1000) # engine_mysql_test = data_source.GetDataEngine("VISIONTEST") # engine_mysql = data_source.GetDataEngine("VISION") engine_mysql_test = create_engine(
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
@version: ??
@author: li
@file: keywords_extractor.py
@time: 2018/11/19 10:19 AM
TextRank-based keyword extraction.
"""
import numpy as np

from src.utils import data_process, dicts, tokenization
from src.utils.log import log_util

logging = log_util.Logger('keywordsExtractor_log')


class TextRank(object):
    def __init__(self, top_k=20, with_weight=False, window=5, alpha=0.85, min_diff=1000):
        """
        :param top_k: return how many top keywords. `None` for all possible words.
        :param with_weight: if True, return a list of (word, weight);
                            if False, return a list of words.
        :param window:
        :param alpha:
        :param min_diff:
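# Orientation sketch only: a generic, self-contained TextRank outline, not this
# class's actual API. It builds a co-occurrence graph over a sliding window and
# runs a damped PageRank-style iteration; textrank_sketch is a hypothetical name.
from collections import defaultdict

def textrank_sketch(words, window=5, alpha=0.85, iters=30):
    # Undirected co-occurrence graph over a sliding window.
    graph = defaultdict(set)
    for i in range(len(words)):
        for j in range(i + 1, min(i + window, len(words))):
            graph[words[i]].add(words[j])
            graph[words[j]].add(words[i])
    # Damped PageRank-style score updates.
    score = {w: 1.0 for w in graph}
    for _ in range(iters):
        score = {w: (1 - alpha) + alpha * sum(score[v] / len(graph[v]) for v in graph[w])
                 for w in graph}
    return sorted(score, key=score.get, reverse=True)

# textrank_sketch('the quick brown fox jumps over the lazy dog the fox'.split())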
@version: ??
@author: li
@file: singlepass_test.py
@time: 2018-12-27 20:38
"""
import sys

import numpy as np

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.utils.log import log_util
from src.utils.VSM import tfidf

logging = log_util.Logger('singlepass_test')

# corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt"
corpus_train_path = conf.corpus_train_path
# tfidf_train, word_dict = tfidf_vector(corpus_train)
# tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train)
corpus_train_dict = tfidf.load_data(corpus_train_path)

# Load the tf-idf VSM.
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path
try:
    tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
    tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)
    logging.logger.info("TF-IDF model load success")
except:
import pickle
import pandas as pd
import sys

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.utils import event_util
from src.utils.log import log_util
from src.utils.VSM import tfidf
from src.data_reader import import_news, import_title, get_event_news

# import logger
logging = log_util.Logger('history_event')

# Load the clusters produced by singlePass clustering.
# clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl'
clustering_path = conf.clustering_save_path
try:
    with open(clustering_path, 'rb') as fr:
        clustering = pickle.load(fr)
    logging.logger.info('load cluster units from: {}'.format(clustering_path))
except IOError as err:
    logging.logger.error(
        'cluster units pickle file load failed: {} and program stopped'.format(clustering_path))
    sys.exit()
# clustering.print_result()
@file: singlepass_run.py
@time: 2018/11/29 8:04 PM
News clustering.
"""
import sys
import time
import pickle

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.utils.log import log_util
from src.utils.VSM import tfidf
from src.algorithm.cluster.singlePass import singlePassCluster

logging = log_util.Logger('singlepass_run')

# corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt"
corpus_train_path = conf.corpus_train_path
# tfidf_train, word_dict = tfidf_vector(corpus_train)
# tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train)
corpus_train_dict = tfidf.load_data(corpus_train_path)

# Load the tf-idf VSM.
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path
try:
    tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
    logging.logger.info("TF-IDF feature load success")
    tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)
    logging.logger.info("TF-IDF transformer load success")
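# Orientation sketch only: a generic single-pass clustering outline, not the
# project's singlePassCluster API (whose interface is not shown here). Each
# vector joins the most similar existing cluster above a threshold, otherwise
# it starts a new cluster; single_pass_sketch is a hypothetical name.
import numpy as np

def single_pass_sketch(vectors, threshold=0.5):
    centroids, members = [], []
    for idx, v in enumerate(vectors):
        v = np.asarray(v, dtype=float)
        v = v / (np.linalg.norm(v) or 1.0)
        # Cosine similarity against each existing (re-normalised) centroid.
        sims = [float(np.dot(v, c / (np.linalg.norm(c) or 1.0))) for c in centroids]
        if sims and max(sims) >= threshold:
            best = int(np.argmax(sims))
            members[best].append(idx)
            centroids[best] = centroids[best] + v  # running, unnormalised centroid
        else:
            centroids.append(v.copy())
            members.append([idx])
    return members

# single_pass_sketch([[1, 0], [0.9, 0.1], [0, 1]])  # -> [[0, 1], [2]]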
Wraps jieba word segmentation: adds user-defined dictionaries, removes stop words, etc.
"""
import codecs
import sys

import jieba.posseg as pseg

from src.utils import data_process, dicts
from src.utils.log import log_util

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import Configure

logging = log_util.Logger('tokenization_log')

stopwords = globals()


def load_stop_words():
    # Prepare the stop-word lexicon and build the stop-word list.
    conf = Configure()
    stop_words_path = conf.stop_words_path
    words_count = dict()
    try:
        stop_word = codecs.open(stop_words_path, 'r', encoding='utf8').readlines()
        stop_words = [w.strip() for w in stop_word]
        logging.logger.info("Stopwords loaded successfully!")
        return stop_words
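# Illustration only: the kind of wrapping this module describes, shown with
# jieba.posseg directly (stand-alone sketch, not this module's actual API;
# cut_and_filter is a hypothetical name).
import jieba.posseg as pseg

def cut_and_filter(text, stop_words):
    # Segment with POS tags and drop stop words and whitespace-only tokens.
    return [(w.word, w.flag) for w in pseg.cut(text)
            if w.word.strip() and w.word not in stop_words]

# cut_and_filter('今天股票市场大涨', load_stop_words())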
import datetime

from src import data_reader
import pandas as pd
from tqdm import tqdm

sys.path.append('../')
sys.path.append('..')
sys.path.append('../../')
from src.utils.log import log_util
from src.configure import conf  # noqa: E402
from src.utils import event_util, file_util, data_process, dicts, tokenization, time_util  # noqa: E402
from src.utils.VSM import tfidf
from src.algorithm.cluster.singlePass import singlePassCluster

logging = log_util.Logger('dynamic_update', level='debug')
logging.logger.info('Event-library dynamic update started at: {}'.format(
    time_util.timestamp_to_time(time.time())))

# Step 1: read the news published after the specified date.
# On the first dynamic update, event_save_path holds the base event file.
latest_event_file = file_util.find_newest_file(conf.event_save_path)
if latest_event_file is None or latest_event_file == 'NULL':
    # Events have never been dynamically updated, so use today's timestamp
    # as the starting time for reading news.
    now = datetime.date.today()
    today_timestamp = int(time.mktime(now.timetuple()))
    today = time_util.timestamp_to_time(today_timestamp)
    # logging.logger.info('Start time for reading news: {}'.format(today))
    # ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp)
else:
    # Use the event library's last update time as the start time for reading news.
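# Reference sketch only: time_util.timestamp_to_time is not shown in this file.
# An equivalent standard-library conversion (a stand-in, not the project's
# implementation; timestamp_to_time_sketch is a hypothetical name):
import time

def timestamp_to_time_sketch(ts):
    # Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' in local time.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))

# timestamp_to_time_sketch(time.time())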