Example #1
"""
@version: ??
@author: li
@file: event2mysql.py
@time: 2018-12-17 13:53
Save the updated events to MySQL in a fixed format.
"""

import json
import pandas as pd
from src import data_reader
from src.configure import conf
from src.utils import file_util, event_util
from src.utils.log import log_util
from src.utils.engine import data_source

logging = log_util.Logger('event2mysql')
event_save_path = conf.event_save_path
# event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/"

# Load the newest updated file from the file directory
file_new = file_util.find_newest_file(event_save_path)
new_event_units = event_util.load_history_event(event_save_path + file_new)

# Read the latest news id, title, url and timestamp from the database
total_data = data_reader.get_all_data()
# Set id as the index so that title and url can be looked up by id later
total_data_df = total_data.set_index('id')

# Arrange the event-unit information into the required format
result = []
for item in new_event_units:
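
# A self-contained sketch (not the project's actual loop body) of the write-back
# pattern this script builds toward: collect one row per event unit and bulk-insert
# the rows with DataFrame.to_sql(). The table name and column names are assumptions;
# `engine` can be any SQLAlchemy engine, e.g. one obtained via data_source.
def save_events_to_mysql(rows, engine, table_name="event_units"):
    """rows: a list of dicts such as {'event_id': ..., 'title': ..., 'url': ...}."""
    df = pd.DataFrame(rows)
    df.to_sql(table_name, con=engine, if_exists="append", index=False)
    return len(df)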
Example #2
"""
@version: ??
@author: li
@file: xueqiu_dicsuss_batch.py
@time: 2019-03-29 14:05
"""
import gc
import os
import glob
import pandas as pd
from datetime import datetime, timedelta
from src.utils import time_util
from src.utils.log import log_util
from src.parser.xueqiu.discuss_parser import discuss_parser

logging = log_util.Logger('xueqiu_discuss_batch')


# Use desc and rdesc as the current user's discussion data; the user id is the current user's id, and the discussion id is
def read_csv(path=None):
    """
    读取原始csv文件
    :param path:
    :return:
    """
    if path is None:
        path = '/Users/li/Desktop/sets1'
    file_list = glob.glob(os.path.join(path, "*.csv"))
    data_list = []
    for f in file_list:
        data_list.append(pd.read_csv(f, header=0,
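
# A self-contained sketch of the read-all-CSVs-and-concatenate pattern the helper
# above follows (the read_csv options and the empty-directory fallback are assumptions):
def read_all_csv(path):
    """Read every *.csv under `path` and concatenate them into a single DataFrame."""
    frames = [pd.read_csv(f, header=0) for f in glob.glob(os.path.join(path, "*.csv"))]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()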
Example #3
import time
import pandas as pd
from tqdm import tqdm
from src.configure import conf
from src.utils.engine.data_source import GetDataEngine
from src.utils import tokenization, data_process, dicts
from src.utils.tokenization import load_stop_words
from src.utils.log import log_util
from src.utils.VSM import tfidf

# TaggededDocument = gensim.models.doc2vec.TaggedDocument

engine_mysql = GetDataEngine("XAVIER")
engine_mysql_test = GetDataEngine("VISIONTEST")
engine_sqlserver = GetDataEngine("DNDS")
logging = log_util.Logger('data_reader_log')
global _stopwords


class DataReader(object):
    def get_news_data(self, data_path=None):
        if data_path is None:
            data_path = '/Users/li/PycharmProjects/event_parser/src/text.txt'
        with open(data_path, 'r') as news:
            data = news.readlines()
        train_data = []
        for i, text in enumerate(data):
            # text_list = text.decode("utf8").split(",")
            text_list = text.split(",")
            train_data.append(text_list)
        return train_data
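
    # Illustrative usage (the record layout is an assumption): each line of the input
    # file is one comma-separated record, so the result is a list of field lists, e.g.
    #   reader = DataReader()
    #   train_data = reader.get_news_data()  # -> [['id1', 'title1', ...], ['id2', ...], ...]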
Example #4
"""
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
sys.path.append('../../../../')
sys.path.append('../../../../../')

from src.utils import time_util
from src.utils.log import log_util
import pandas as pd
from sqlalchemy import create_engine
from src.data_reader import read_all_data
from src.parser.xueqiu.discuss_parser import discuss_parser, format_transform

logging = log_util.Logger('discuss_stock_filter_daily')


# Data structure adjustment
def transform_fuc(id, stock_list):
    """
    将user_id和stock_list两两组合成tuple的list集合
    :param id: str
    :param stock_list: list
    :return:
    """
    if len(stock_list) <= 0:
        pass
    user_id_list = [id] * len(stock_list)
    tuple_zip = zip(stock_list, user_id_list)
    tuple_list = list(tuple_zip)
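    # Worked example of the pairing this helper builds (assuming it returns tuple_list):
    #   transform_fuc('u001', ['SH600000', 'SZ000001'])
    #   -> [('SH600000', 'u001'), ('SZ000001', 'u001')]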
Example #5
"""
@author: li
@file: xueqiu_discuss_csv.py
@time: 2019-03-23 14:23
"""

# Read the data from csv files
import time
import os
import glob
import pandas as pd
from src.utils import time_util
from src.utils.engine import data_source
from src.utils.log import log_util
from src.parser.xueqiu.discuss_parser import discuss_parser

logging = log_util.Logger('xueqiu_discuss_csv')


# Use desc and rdesc as the current user's discussion data; the user id is the current user's id, and the discussion id is
def read_csv(path=None):
    """
    读取原始csv文件
    :param path:
    :return:
    """
    if path is None:
        path = '/Users/li/Desktop/sets1'
    file_list = glob.glob(os.path.join(path, "*.csv"))
    data_list = []
    for f in file_list:
        data_list.append(
Example #6
File: dicts.py  Project: STHSF/EventsParser
    raise

deg_dict = {}  # degree adverbs
senti_dict = {}  # sentiment words
eng_dict = {}  # English or pinyin words
fou_dict = []  # negation words
but_dict = []  # adversative (transition) words
lim_dict = []  # qualifier words
new_dict = []  # new words
zhi_dict = []  # HowNet (知网) words
stock_dict = []  # stock words
stock_code_dict = []  # stock codes
jg_dict = []  # institution names
stock_df = []

logging = log_util.Logger('dict_log')


class DictInit(object):
    pass


def load_stock_data():
    dic_path = conf.dic_path
    st_path = dic_path + "/stock_words.txt"
    st_new_path = dic_path + "/stock.csv"
    for st in open(st_path):
        # st = st.decode("utf8")
        code1, st_code = st.split("\t")
        code, stock = st_code.split(",")
        stock_code_dict.append(code.strip("\n"))
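    # Illustrative line format assumed by the parsing above: "<field>\t<code>,<name>",
    # so for a line like "1\t600000,XX Bank" the code '600000' is appended to stock_code_dict.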
Example #7
from src.utils import data_process, keywords_extractor
from src.utils.log import log_util
from src.utils.VSM import tfidf

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.algorithm.cluster.singlePass import singlePassCluster

# corpus_train = "/Users/li/PycharmProjects/event_parser/src/text_full_index.txt"
# corpus_train = conf.corpus_train_path

data_process = data_process.DataPressing()
logging = log_util.Logger('event_util', level='info')


def events_list(news_title_list):
    for news in news_title_list:
        print(news)
    pass


def events_effectiveness(cluster_list, news_dict, tfidf_feature,
                         tfidf_transformer):
    """
    Check the validity/effectiveness of clustered events.
    :param tfidf_transformer:
    :param tfidf_feature:
    :param cluster_list:
Example #8
"""
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
sys.path.append('../../../../')
sys.path.append('../../../../../')

import pandas as pd
import time
from sqlalchemy import create_engine
from src.utils import time_util
from src.utils.log import log_util
from src.data_reader import read_all_data

logging = log_util.Logger('xueqiu_focus_statistic')


def f(row):
    if row[:2] == 'SH':
        return str(row[2:]) + '.' + 'XSHG'
    elif row[:2] == 'SZ':
        return str(row[2:]) + '.' + 'XSHE'
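
# Worked examples of the mapping above:
#   f('SH600000') -> '600000.XSHG'   (Shanghai)
#   f('SZ000001') -> '000001.XSHE'   (Shenzhen)
# Any other prefix falls through and the function returns None.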


if __name__ == '__main__':
    pd.set_option('display.max_rows', None, 'display.max_columns', None,
                  "display.max_colwidth", 1000, 'display.width', 1000)
    # engine_mysql_test = data_source.GetDataEngine("VISIONTEST")
    # engine_mysql = data_source.GetDataEngine("VISION")
    engine_mysql_test = create_engine(
Example #9
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
@version: ??
@author: li
@file: keywords_extractor.py
@time: 2018/11/19 10:19 AM
Keyword extraction based on TextRank.
"""
import numpy as np

from src.utils import data_process, dicts, tokenization
from src.utils.log import log_util

logging = log_util.Logger('keywordsExtractor_log')


class TextRank(object):
    def __init__(self,
                 top_k=20,
                 with_weight=False,
                 window=5,
                 alpha=0.85,
                 min_diff=1000):
        """
        :param top_k: return how many top keywords. `None` for all possible words.
        :param with_weight: if True, return a list of (word, weight);
                            if False, return a list of words.
        :param window: size of the co-occurrence window used to build the word graph
        :param alpha: damping factor for the PageRank-style score update
        :param min_diff:
Example #10
"""
@version: ??
@author: li
@file: singlepass_test.py
@time: 2018-12-27 20:38
"""

import sys
import numpy as np
sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.utils.log import log_util
from src.utils.VSM import tfidf

logging = log_util.Logger('singlepass_test')
# corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt"
corpus_train_path = conf.corpus_train_path
# tfidf_train, word_dict = tfidf_vector(corpus_train)
# tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train)
corpus_train_dict = tfidf.load_data(corpus_train_path)

# load tf-idf VSM
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path

try:
    tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
    tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)
    logging.logger.info("TF-IDF model load sucess")
except:
Example #11
import pickle
import pandas as pd

import sys
sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')

from src.configure import conf
from src.utils import event_util
from src.utils.log import log_util
from src.utils.VSM import tfidf
from src.data_reader import import_news, import_title, get_event_news

# import logger
logging = log_util.Logger('history_event')
# Load the cluster units produced by singlePass clustering
# clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl'
clustering_path = conf.clustering_save_path
try:
    with open(clustering_path, 'rb') as fr:
        clustering = pickle.load(fr)
        logging.logger.info(
            'load cluster units from: {}'.format(clustering_path))
except IOError as err:
    logging.logger.error(
        'cluster units pickle file load failed: {} and program stopped'.format(
            clustering_path))
    sys.exit()
# clustering.print_result()
Example #12
"""
@file: singlepass_run.py
@time: 2018/11/29 8:04 PM
News clustering.
"""
import sys
import time
import pickle
sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import conf
from src.utils.log import log_util
from src.utils.VSM import tfidf
from src.algorithm.cluster.singlePass import singlePassCluster

logging = log_util.Logger('singlepass_run')
# corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt"
corpus_train_path = conf.corpus_train_path
# tfidf_train, word_dict = tfidf_vector(corpus_train)
# tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train)
corpus_train_dict = tfidf.load_data(corpus_train_path)

# load tf-idf VSM
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path

try:
    tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
    logging.logger.info("TF-IDF feature load success")
    tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)
    logging.logger.info("TF-IDF transformer load success")
Example #13
"""
Calls jieba for word segmentation, adds a user-defined dictionary, wraps it, and removes stopwords, etc.
"""
import codecs
import sys

import jieba.posseg as pseg

from src.utils import data_process, dicts
from src.utils.log import log_util

sys.path.append('..')
sys.path.append('../')
sys.path.append('../../')
from src.configure import Configure

logging = log_util.Logger('tokenization_log')

stopwords = globals()


def load_stop_words():
    # Prepare the stopword corpus and build the stopword list
    conf = Configure()
    stop_words_path = conf.stop_words_path
    words_count = dict()
    try:
        stop_word = codecs.open(stop_words_path, 'r',
                                encoding='utf8').readlines()
        stop_words = [w.strip() for w in stop_word]
        logging.logger.info("Stopwords 导入成功!")
        return stop_words
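
    # Illustrative usage of the helper above (variable names are assumptions):
    #   stop_words = load_stop_words()
    #   tokens = [w for w in tokens if w not in stop_words]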
Example #14
import datetime
from src import data_reader
import pandas as pd
from tqdm import tqdm

sys.path.append('../')
sys.path.append('..')
sys.path.append('../../')

from src.utils.log import log_util
from src.configure import conf  # noqa: E402
from src.utils import event_util, file_util, data_process, dicts, tokenization, time_util  # noqa: E402
from src.utils.VSM import tfidf
from src.algorithm.cluster.singlePass import singlePassCluster

logging = log_util.Logger('dynamic_update', level='debug')
logging.logger.info('Event library dynamic update start time: {}'.format(
    time_util.timestamp_to_time(time.time())))
# Step 1: read news published after the specified date
# On the first dynamic update, what is saved under event_save_path is the event
latest_event_file = file_util.find_newest_file(conf.event_save_path)
if latest_event_file is None or latest_event_file == 'NULL':
    # If the events have never been dynamically updated, then today_timestamp
    # read the current period's time
    now = datetime.date.today()
    today_timestamp = int(time.mktime(now.timetuple()))
    today = time_util.timestamp_to_time(today_timestamp)
    # logging.logger.info('News reading start time: {}'.format(today))
    # ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp)
else:
    # Use the event's last update time as the start/end time for reading news