예제 #1
0
    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition=None,
                        datas=None,
                        callback=''):
        """Configure this exporter for one run and export into MySQL.

        All arguments are stored on the instance, the export/update counters
        are reset to zero, a new ``MysqlDB()`` is created as the target
        connection, and the private ``__export`` method does the actual work.

        Args:
            source_table: Stored as ``self._source_table``.
            aim_table: Stored as ``self._aim_table``. When it differs from the
                value left by the previous call, ``self._is_set_unique_key``
                is reset to ``False``.
            key_map: Stored as ``self._key_map``.
            unique_key: Stored as ``self._unique_key``.
            unique_key_mapping_source_key: Stored as
                ``self._unique_key_mapping_source_key``.
            update_read_status: Stored as ``self._update_read_status``, but
                forced to ``False`` whenever ``datas`` is non-empty.
            condition: Stored as ``self._condition``. ``None`` (the default)
                means ``{'read_status': 0}``.
            datas: Stored as ``self._datas``. ``None`` (the default) means an
                empty list.
            callback: Stored as ``self._callback``.

        Returns:
            Whatever ``self.__export()`` returns.
        """
        # Build the defaults per call: a dict/list literal in the signature is
        # created once and shared, so any later mutation of self._condition or
        # self._datas would leak into every subsequent call and instance.
        if condition is None:
            condition = {'read_status': 0}
        if datas is None:
            datas = []

        # A different target table invalidates the cached unique-key flag.
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        # Explicitly supplied rows disable the read-status update.
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()
        return self.__export()
예제 #2
0
    def __init__(self):
        """Create the database handles and load the spider settings.

        Builds a ``MysqlDB`` and a ``RedisDB`` from the ``mysqldb`` and
        ``redisdb`` config sections, derives the redis key names from the
        configured task root key, and copies the remaining values of the
        ``spider`` config section onto the instance.
        """
        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        # Look the section up once instead of on every attribute below.
        spider_config = config.get('spider')

        self._task_root_key = spider_config.get('redis_task_cache_root_key')

        # Every redis key used here lives under the configured root key.
        root_key = self._task_root_key
        self._account_task_key = root_key + ':z_account_task'
        self._article_task_key = root_key + ':z_article_task'
        self._last_article_publish_time = root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = spider_config.get(
            'ignore_haved_crawl_today_article_account')
        self._monitor_interval = spider_config.get('monitor_interval')
        self._zombie_account_not_publish_article_days = spider_config.get(
            'zombie_account_not_publish_article_days')

        # min/max sleep times share one sub-section; max was previously
        # assigned twice with the same value.
        interval_config = spider_config.get('spider_interval')
        self._spider_interval_min = interval_config.get('min_sleep_time')
        self._spider_interval_max = interval_config.get('max_sleep_time')

        # "start~end" string split into its two parts; a missing or empty
        # setting falls back to "~", which yields ['', ''].
        self._crawl_time_range = (spider_config.get("crawl_time_range")
                                  or "~").split('~')
예제 #3
0
def create_table():
    """Create the spider's MySQL tables when auto-creation is switched on.

    Seven ``CREATE TABLE IF NOT EXISTS`` statements are defined below. They
    are executed, in a fixed order, through ``_create_table`` only when the
    ``auto_create_tables`` entry of the ``mysqldb`` config section is truthy;
    otherwise the function returns without opening a connection.
    """
    # Article list entries, unique per `sn` (and per `title`).
    article_list_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_list` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `title` varchar(255) UNIQUE COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `digest` varchar(2000) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `url` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `source_url` varchar(1000) COLLATE utf8mb4_unicode_ci DEFAULT NULL  ,
      `cover` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `subtype` int(11) DEFAULT NULL,
      `is_multi` int(11) DEFAULT NULL,
      `author` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `copyright_stat` int(11) DEFAULT NULL,
      `duration` int(11) DEFAULT NULL,
      `del_flag` int(11) DEFAULT NULL,
      `type` int(11) DEFAULT NULL,
      `publish_time` datetime DEFAULT NULL,
      `sn` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    # Article crawl tasks; per the column COMMENT, `state` is the crawl state:
    # 0 pending, 2 in progress, 1 finished, -1 failed.
    article_task_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_task` (
      `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
      `sn` varchar(50) DEFAULT NULL,
      `article_url` varchar(255) DEFAULT NULL UNIQUE ,
      `state` int(11) DEFAULT '0' COMMENT '文章抓取状态,0 待抓取 2 抓取中 1 抓取完毕 -1 抓取失败',
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`) USING BTREE,
      KEY `state` (`state`) USING BTREE
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
    '''

    # Per-article counters (reads, likes, comments), unique per `sn`.
    article_dynamic_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_dynamic` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `sn` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `read_num` int(11) DEFAULT NULL,
      `like_num` int(11) DEFAULT NULL,
      `comment_count` int(11) DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    # Comments; per the column COMMENTs, `comment_id` links to the article and
    # `content_id` is the id of the individual comment.
    article_comment_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_comment` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `comment_id` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '与文章关联',
      `nick_name` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `logo_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `content` varchar(2000) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `create_time` datetime DEFAULT NULL,
      `content_id` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '本条评论内容的id',
      `like_num` int(11) DEFAULT NULL,
      `is_top` int(11) DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `content_id` (`content_id`),
      KEY `comment_id` (`comment_id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    # Full article records, unique per `sn` and indexed by `__biz`.
    article_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_article` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `account` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `title` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `author` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `publish_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `digest` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `cover` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `pics_url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
      `content_html` text COLLATE utf8mb4_unicode_ci,
      `source_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `comment_id` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `sn` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`),
      KEY `__biz` (`__biz`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    # Account crawl tasks; per the column COMMENTs, `last_publish_time` supports
    # incremental article collection, `last_spider_time` drives periodic
    # re-scans, and `is_zombie` marks accounts that are no longer checked.
    account_task_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_account_task` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `__biz` varchar(50) DEFAULT NULL,
      `last_publish_time` datetime DEFAULT NULL COMMENT '上次抓取到的文章发布时间,做文章增量采集用',
      `last_spider_time` datetime DEFAULT NULL COMMENT '上次抓取时间,用于同一个公众号每隔一段时间扫描一次',
      `is_zombie` int(11) DEFAULT '0' COMMENT '僵尸号 默认3个月未发布内容为僵尸号,不再检测',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
    '''

    # Account profiles, unique per `__biz`.
    account_ddl = '''
    CREATE TABLE IF NOT EXISTS `wechat_account` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `account` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `head_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `summary` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `qr_code` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `verify` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `__biz` (`__biz`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    # Nothing to do unless auto-creation is enabled in the config.
    if not config.get('mysqldb').get('auto_create_tables'):
        return

    connection = MysqlDB(**config.get('mysqldb'))
    # Executed strictly in this order.
    ddl_statements = (
        article_list_ddl,
        article_task_ddl,
        article_dynamic_ddl,
        article_comment_ddl,
        article_ddl,
        account_task_ddl,
        account_ddl,
    )
    for ddl in ddl_statements:
        _create_table(connection, ddl)
 def export_to_mysql(self):
     """Create the target MySQL connection and run the export.

     Stores a ``MysqlDB`` built with no arguments on ``self._aim_db`` and
     then calls the private ``__export`` method; the result of that call is
     not returned.
     """
     self._aim_db = MysqlDB()
     self.__export()
예제 #5
0
# -*- coding: utf-8 -*-
'''
Created on 2019/5/13 12:44 AM
---------
@summary:
---------
@author:
'''
from db.mysqldb import MysqlDB
import utils.tools as tools
from utils.log import log
from config import config

# Module-level MysqlDB handle built from the 'mysqldb' config section at
# import time; the save_* functions below write through it.
db = MysqlDB(**config.get('mysqldb'))


def save_account(data):
    """Write one account record to the ``wechat_account`` table.

    The record is first logged at debug level as JSON. The insert statement
    is built by ``tools.make_insert_sql`` with ``insert_ignore=True``
    (presumably an ``INSERT IGNORE`` - confirm against that helper) and
    handed to the module-level ``db``.
    """
    log.debug(tools.dumps_json(data))
    db.add(tools.make_insert_sql('wechat_account', data, insert_ignore=True))


def save_article_list(datas: list):
    log.debug(tools.dumps_json(datas))

    sql, articles = tools.make_batch_sql('wechat_article_list', datas)
    db.add_batch(sql, articles)

    # 存文章任务
    article_task = [