import pymysql
from selenium.common.exceptions import WebDriverException

from crawler.config.config_yml import get as config_get


def get_browser(browser=None):
    if browser is None:
        browser = get_web_driver(port=config_get('selenium.chrome.port'))
    # If the browser object is no longer usable, recreate it
    try:
        # Probe liveness by reading the current window handle (i.e. the active tab)
        handle = browser.current_window_handle
    except WebDriverException as e:
        print('Recreating browser, error: {}'.format(e))
        browser = get_web_driver(port=config_get('selenium.chrome.port'))
    return browser
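# get_web_driver is referenced above but not shown in this excerpt. A minimal
# sketch, assuming it attaches Selenium to a Chrome instance already running
# with --remote-debugging-port on the configured port; the behavior here is an
# assumption, not the project's confirmed implementation.
def get_web_driver(port):
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    options = Options()
    # Attach to the existing Chrome DevTools endpoint instead of spawning a
    # fresh browser, so crawls reuse one long-lived session
    options.add_experimental_option('debuggerAddress', '127.0.0.1:{}'.format(port))
    return webdriver.Chrome(options=options)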
def get_mysql_conn():
    return pymysql.connect(host=config_get('mysql.host'),
                           port=config_get('mysql.port'),
                           user=config_get('mysql.user'),
                           password=config_get('mysql.password'),
                           db=config_get('mysql.db'),
                           charset=config_get('mysql.charset'))
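# The consumer module below imports get_conn from this handler, suggesting a
# connection counterpart to get_browser. A minimal sketch, assuming it reuses
# a live connection and falls back to get_mysql_conn(); this is inferred from
# the call sites, not the confirmed implementation.
def get_conn(conn=None):
    if conn is None:
        return get_mysql_conn()
    try:
        # pymysql's ping(reconnect=True) re-opens the socket if the server
        # has dropped the connection
        conn.ping(reconnect=True)
    except pymysql.err.Error:
        conn = get_mysql_conn()
    return conn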
import os
import re
import threading
import time
import traceback

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from crawler.config.config_yml import get as config_get
from crawler.utils import base as base_util

ATTACHMENT_CRAWLER_DIR = config_get('file.attachment.crawler.dir')
INVALID_TITLE = config_get('crawler.column.invalid.title')
INVALID_HREF = config_get('crawler.column.invalid.href')
HTML_CLEAN_REGEXP = config_get('crawler.article.html.clean.regexp')


class ColumnResult:
    """State and output of crawling one column (article-listing) page."""
    column_id = None
    url = None
    xpath_article_title = None  # XPath locating article title links
    xpath_article_page = None   # XPath locating the pagination control
    begin_page = None
    crawled_page = None
    charset = None
    article_list = None
    error = None
    current_page = 0
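# Illustration only: how the WebDriverWait/expected_conditions imports above
# are typically combined with a ColumnResult's XPath to wait for article
# links on a column page. The helper name and its use by crawl_column are
# assumptions for this sketch.
def wait_for_articles(browser, xpath, timeout=10):
    try:
        # Block until at least one element matching the XPath is present,
        # then return all matches
        return WebDriverWait(browser, timeout).until(
            ec.presence_of_all_elements_located((By.XPATH, xpath)))
    except TimeoutException:
        # No articles appeared within the timeout; let the caller record it
        return []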
import os
import sys

from flask import Flask
from kafka import KafkaConsumer

# Add crawler's parent directory to the module search path so that custom
# modules can be imported when running on Linux
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

from crawler.handler.crawl_handler import get_browser, get_conn
from crawler.spider.no_login_web import crawl_column, crawl_article, ColumnResult, ArticleResult
from crawler.utils.unique_id import next_id
from crawler.config.config_yml import get as config_get
from crawler.utils.base import str_exception

BROWSER = get_browser()
CONN = get_conn()
FLASK_HOST = config_get('flask.host')
FLASK_PORT = config_get('flask.port')
KAFKA_BOOTSTRAP_SERVERS = config_get('spring.kafka.bootstrap-servers')
KAFKA_TOPIC = config_get('kafka.topic.crawler')


def crawl_title(column_id):
    print('Start crawling site column data, column_id: {}'.format(column_id))
    browser = get_browser(BROWSER)
    conn = get_conn(CONN)
    cursor = conn.cursor()
    column_update_error = '''
        update `column` set error = %s where column_id = %s
    '''
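# A minimal sketch of how the pieces above are typically wired together: a
# KafkaConsumer polls the crawler topic and hands each column_id to
# crawl_title, while Flask serves on the configured host/port. The message
# payload (a bare UTF-8 column_id) and the consume() helper are assumptions.
import threading

app = Flask(__name__)


def consume():
    consumer = KafkaConsumer(KAFKA_TOPIC,
                             bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)
    for message in consumer:
        # ConsumerRecord.value is raw bytes by default
        crawl_title(message.value.decode('utf-8'))


if __name__ == '__main__':
    # Run the consumer loop alongside the Flask server
    threading.Thread(target=consume, daemon=True).start()
    app.run(host=FLASK_HOST, port=FLASK_PORT)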
import logging
import threading
import time

from crawler.config.config_yml import get as config_get

# Worker and data-center IDs from configuration
WORKER_ID = config_get('id.unique.python.crawler.workerId')
DATA_CENTER_ID = config_get('id.unique.python.crawler.dataCenterId')

# Bit layout of the 64-bit ID
WORKER_ID_BITS = 5
DATACENTER_ID_BITS = 5
SEQUENCE_BITS = 12

# Maximum value of each field
MAX_WORKER_ID = -1 ^ (-1 << WORKER_ID_BITS)  # 2**5 - 1 == 0b11111
MAX_DATACENTER_ID = -1 ^ (-1 << DATACENTER_ID_BITS)

# Shift offsets for assembling the ID
WORKER_ID_SHIFT = SEQUENCE_BITS
DATACENTER_ID_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS
TIMESTAMP_LEFT_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS + DATACENTER_ID_BITS

# Wrap-around mask for the per-millisecond sequence number
SEQUENCE_MASK = -1 ^ (-1 << SEQUENCE_BITS)

# Custom epoch timestamp in milliseconds (the Snowflake "Twitter epoch")
TIME_EPOCH = 1577808000177

logger = logging.getLogger('flask.app')
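# The constants above imply the standard Snowflake layout
# (timestamp | datacenter | worker | sequence). A minimal sketch of next_id
# under that assumption; the project's actual implementation in
# crawler.utils.unique_id may differ. WORKER_ID and DATA_CENTER_ID are
# assumed to be ints within their MAX_* ranges.
_lock = threading.Lock()
_last_timestamp = -1
_sequence = 0


def _timestamp_ms():
    return int(time.time() * 1000)


def next_id():
    global _last_timestamp, _sequence
    with _lock:
        ts = _timestamp_ms()
        if ts == _last_timestamp:
            # Same millisecond: bump the sequence; if the 12-bit field
            # overflows, spin until the next millisecond
            _sequence = (_sequence + 1) & SEQUENCE_MASK
            if _sequence == 0:
                while ts <= _last_timestamp:
                    ts = _timestamp_ms()
        else:
            _sequence = 0
        _last_timestamp = ts
        return ((ts - TIME_EPOCH) << TIMESTAMP_LEFT_SHIFT) | \
               (DATA_CENTER_ID << DATACENTER_ID_SHIFT) | \
               (WORKER_ID << WORKER_ID_SHIFT) | \
               _sequence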