Example #1
def checkTodayData():
    session = load_session()
    target_date = datetime.now().date()
    logger.info("checking data on {}".format(target_date))
    document_count = session.query(func.count(
        News.article_id)).filter(News.publish_time > target_date).scalar()
    session.close()
    logger.info("checked data on {}".format(target_date))
    return document_count > 0
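Every example here obtains its database handle from a load_session helper that is not shown; a minimal sketch of what it presumably looks like, assuming SQLAlchemy over MySQL (the connection URL and pool settings are placeholders):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Placeholder URL; the real project presumably reads this from configuration.
engine = create_engine("mysql+pymysql://user:password@localhost/news",
                       pool_recycle=3600)
Session = sessionmaker(bind=engine)


def load_session():
    # Hand out a fresh session; callers commit/close it themselves.
    return Session()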
Example #2
def aggregate(days):
    session = load_session()

    target_date = datetime.now().date() + timedelta(days=-days)
    cursor = session.execute(
        "select DATE_FORMAT(publish_time,'%Y-%m-%d') as d, count(article_id) as nums "
        "from news "
        "where DATE_FORMAT(publish_time,'%Y-%m-%d') = '{}' "
        "group by DATE_FORMAT(publish_time,'%Y-%m-%d') "
        "order by d desc ".format(target_date))

    items = cursor.fetchall()
    data = {}
    for item in items:
        data['{}'.format(item[0])] = {'kw_date': [], 'nums': item[1]}

    cursor = session.execute(
        "select kw "
        "from news_summary "
        "where DATE_FORMAT(publish_time,'%Y-%m-%d') = '{}' "
        "order by publish_time desc".format(target_date))
    kws = cursor.fetchall()
    kw_oneday = []
    for kw in kws:
        kw_oneday.append(json.loads(kw[0]))
    # Guard against days with no news rows, which would leave data empty.
    if '{}'.format(target_date) in data:
        data['{}'.format(target_date)]['kw_date'] = kw_oneday

    for key in data.keys():
        news_info = NewsInfo(key, data[key]['kw_date'], data[key]['nums'])
        try:
            session.merge(news_info)
            logger.info("提交汇总")
        except Exception as e:
            logger.error("回滚汇总: {}".format(e))
            session.rollback()

    session.commit()
    session.close()
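aggregate summarizes exactly one day per call, so rolling up a window of recent days is just a loop over offsets; a minimal usage sketch (the 7-day window is an assumption):

# Aggregate each of the last 7 days, one offset at a time.
for d in range(1, 8):
    aggregate(d)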
Example #3
def calculate_target_date(proportion):
    session = load_session()
    cursor = session.execute(
        "select count(publish_date) "
        "from (select distinct DATE_FORMAT(publish_time,'%Y-%m-%d') as publish_date "
        "      from news) as a")
    all_days = cursor.fetchone()[0]
    session.close()
    target_date = datetime.now().date() + timedelta(
        days=-int((1 - proportion) * all_days))
    return target_date
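As a worked example, proportion=0.75 over 100 distinct publish dates gives an offset of int((1 - 0.75) * 100) = 25, so the cutoff lands 25 days before today. Note that int() truncates and binary floats can fall just short of the intended product (int((1 - 0.9) * 200) evaluates to 19, not 20), so round() would be the safer choice here.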
Example #4
def removeData(days):
    session = load_session()
    target_date = datetime.now().date() + timedelta(days=-days)
    try:
        session.execute(
            "delete from news_summary "
            "where DATE_FORMAT(publish_time,'%Y-%m-%d') = '{}'".format(
                target_date))
        session.execute("delete from news_info "
                        "where publish_date = '{}'".format(target_date))
        session.commit()
    except Exception as e:
        logger.error("rolling back cleanup: {}".format(e))
        session.rollback()
    session.close()
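Interpolating target_date into the DELETE statements with format() works because it is a date object, but bound parameters are the more robust idiom; a hedged variant using SQLAlchemy's text() construct:

from sqlalchemy import text

session.execute(
    text("delete from news_summary "
         "where DATE_FORMAT(publish_time,'%Y-%m-%d') = :d"),
    {"d": str(target_date)})
session.execute(
    text("delete from news_info where publish_date = :d"),
    {"d": str(target_date)})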
Example #5
def load_update_raw_documents(days):
    """
        从数据库中加载昨日的生文档
        :return: raw_documents: List[Tuple[str]]
    """
    logger.info("loading update raw documents")
    session = load_session()
    target_date = datetime.now().date() + timedelta(days=-days)
    tomorrow = target_date + timedelta(days=1)
    logger.info("loading data from {} - {}".format(target_date, tomorrow))
    raw_documents = []
    for row in session.query(News.article_id, News.content).filter(
            News.publish_time > target_date, News.publish_time < tomorrow):
        raw_documents.append((row[0], row[1]))
    session.close()
    logger.info("loaded update raw documents")
    return raw_documents
Example #6
def parseDoc(access_token, days):
    logger.info("正在加载数据...")
    target_date = datetime.now().date() + timedelta(days=-days)
    with open("../TopicDiscovery/predict/top/{}.json".format(target_date),
              'r',
              encoding='utf-8') as f:
        doc_topic = json.load(f)
    logger.info(doc_topic)
    logger.info("正在处理数据...按照一定比例提取摘要")
    session = load_session()
    for item in doc_topic:
        for row in session.query(News.content, News.title,
                                 News.publish_time).filter(
                                     News.article_id == item['article_id']):
            content = row[0] if len(row[0]) <= 3000 else row[0][:3000]
            title = row[1]
            publish_time = row[2]
            if len(content) > 2100:
                summary = getSummary(access_token, title, content, 900)
            elif 1500 < len(content) <= 2100:
                # Coerce to int so the requested summary length is a whole
                # character count.
                summary = getSummary(access_token, title, content,
                                     int(len(content) * 0.25 + 375))
            elif 600 < len(content) <= 1500:
                summary = getSummary(access_token, title, content,
                                     int(len(content) * 0.5))
            else:
                summary = getSummary(access_token, title, content, 300)
            new_news_summary = NewsSummary(item['article_id'], title, summary,
                                           item['kw'], publish_time)
            logger.info(
                "[article_id='{}'; title='{}'; summary='{}'; kw='{}'; publish_time='{}']"
                .format(item['article_id'], title, summary, item['kw'],
                        publish_time))
            time.sleep(1)
            try:
                logger.info("提交[article_id={}]到数据库中".format(
                    item['article_id']))
                session.merge(new_news_summary)
                logger.info("提交[article_id={}成功".format(item['article_id']))
            except Exception as e:
                session.rollback()
                logger.error("提交失败,出现异常,回滚。原因:{}".format(e))
    session.commit()
    session.close()
    logger.info("处理数据完成")
Example #7
def load_raw_documents(proportion):
    """
    Load the raw documents from the database.
    :return: raw_corpus: List[Tuple[str]]
    """
    logger.info("loading {}% raw documents from database".format(
        int(proportion * 100)))
    session = load_session()
    # The target date is computed relative to today.
    target_date = calculate_target_date(proportion)
    logger.info("loading data from ~ - {}".format(target_date +
                                                  timedelta(days=-1)))
    raw_corpus = []
    for row in session.query(
            News.article_id,
            News.content).filter(News.publish_time < target_date):
        raw_corpus.append((row[0], row[1]))
    session.close()
    logger.info("loaded raw documents")
    return raw_corpus
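load_raw_documents and load_update_raw_documents pair naturally as a historical-corpus/daily-update split; a minimal usage sketch (the 0.9 proportion and the 1-day offset are assumptions):

# Historical corpus: everything published before the 90% cutoff date.
training_corpus = load_raw_documents(0.9)

# Incremental update: documents published exactly one day ago.
update_documents = load_update_raw_documents(1)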