Example #1
from selenium.common.exceptions import WebDriverException

from crawler.config.config_yml import get as config_get

# get_web_driver is defined elsewhere in the crawler package.


def get_browser(browser=None):
    if browser is None:
        browser = get_web_driver(port=config_get('selenium.chrome.port'))
    # If the browser object is no longer usable, recreate it.
    try:
        # Probe the current window handle (i.e. the active tab) as a health check.
        handle = browser.current_window_handle
    except WebDriverException as e:
        print('Recreating browser, error: {}'.format(e))
        browser = get_web_driver(port=config_get('selenium.chrome.port'))
    return browser
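get_browser doubles as a health check: Example #4 below caches one driver at module level and passes it back in before each crawl. A minimal usage sketch of that recreate-on-failure pattern (the crawl function and URL handling are illustrative, not from the original code):

BROWSER = get_browser()  # created once at process start

def crawl(url):
    global BROWSER
    # Returns the cached driver if it still responds, otherwise a fresh one.
    BROWSER = get_browser(BROWSER)
    BROWSER.get(url)
    return BROWSER.page_source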
Example #2
import pymysql

from crawler.config.config_yml import get as config_get


def get_mysql_conn():
    # Build a new MySQL connection from the YAML configuration.
    return pymysql.connect(host=config_get('mysql.host'),
                           port=config_get('mysql.port'),
                           user=config_get('mysql.user'),
                           password=config_get('mysql.password'),
                           db=config_get('mysql.db'),
                           charset=config_get('mysql.charset'))
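A short usage sketch, assuming pymysql's standard cursor API; the query and the `column` table layout are illustrative:

conn = get_mysql_conn()
try:
    with conn.cursor() as cursor:
        # %s placeholders are filled and escaped by pymysql, not by string formatting.
        cursor.execute('select column_id, url from `column` where error is null limit %s', (10,))
        rows = cursor.fetchall()
finally:
    conn.close()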
Example #3
import os
import re
import threading
import time
import traceback

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from crawler.config.config_yml import get as config_get
from crawler.utils import base as base_util

ATTACHMENT_CRAWLER_DIR = config_get('file.attachment.crawler.dir')
INVALID_TITLE = config_get('crawler.column.invalid.title')
INVALID_HREF = config_get('crawler.column.invalid.href')
HTML_CLEAN_REGEXP = config_get('crawler.article.html.clean.regexp')


class ColumnResult:
    """Holds the crawl state and result of one website column (list page)."""
    column_id = None
    url = None
    xpath_article_title = None
    xpath_article_page = None
    begin_page = None
    crawled_page = None
    charset = None
    article_list = None
    error = None
    current_page = 0
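A sketch of how the imports above and the ColumnResult holder could fit together in an explicit-wait flow; the function name, timeout value, and list-item shape are assumptions, not the project's actual crawl loop:

def crawl_one_page(browser, result):
    browser.get(result.url)
    try:
        # Block until at least one title link matching the configured XPath appears.
        WebDriverWait(browser, 10).until(
            ec.presence_of_element_located((By.XPATH, result.xpath_article_title)))
    except TimeoutException:
        result.error = 'timeout waiting for titles on {}'.format(result.url)
        return result
    elements = browser.find_elements(By.XPATH, result.xpath_article_title)
    result.article_list = [(e.text, e.get_attribute('href')) for e in elements]
    result.current_page += 1
    return result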
Example #4
import os
import sys

from flask import Flask
from kafka import KafkaConsumer

# Add crawler's parent directory to the module search path so custom modules can be imported on Linux.
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from crawler.handler.crawl_handler import get_browser, get_conn
from crawler.spider.no_login_web import crawl_column, crawl_article, ColumnResult, ArticleResult
from crawler.utils.unique_id import next_id
from crawler.config.config_yml import get as config_get
from crawler.utils.base import str_exception

BROWSER = get_browser()
CONN = get_conn()
FLASK_HOST = config_get('flask.host')
FLASK_PORT = config_get('flask.port')
KAFKA_BOOTSTRAP_SERVERS = config_get('spring.kafka.bootstrap-servers')
KAFKA_TOPIC = config_get('kafka.topic.crawler')


def crawl_title(column_id):
    print('Start crawling website column data, column_id: {}'.format(column_id))
    browser = get_browser(BROWSER)
    conn = get_conn(CONN)
    cursor = conn.cursor()
    column_update_error = '''
        update `column` set
        error = %s
        where column_id = %s
        '''
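crawl_title is truncated here. As a hedged sketch of how the %s-parameterized template above is typically executed with pymysql (the crawl_column call signature and the error flow are assumptions, not the original continuation):

try:
    column_result = crawl_column(column_id)  # call signature assumed from the import above
except Exception as e:
    # pymysql fills the %s placeholders positionally and escapes both values.
    cursor.execute(column_update_error, (str_exception(e), column_id))
    conn.commit()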
Example #5
import time
import logging
import threading
from crawler.config.config_yml import get as config_get

# Worker and data-center IDs from configuration
WORKER_ID = config_get('id.unique.python.crawler.workerId')
DATA_CENTER_ID = config_get('id.unique.python.crawler.dataCenterId')

# Bit allocation within the 64-bit ID
WORKER_ID_BITS = 5
DATACENTER_ID_BITS = 5
SEQUENCE_BITS = 12

# Maximum value for each field
MAX_WORKER_ID = -1 ^ (-1 << WORKER_ID_BITS)  # 2**5-1 0b11111
MAX_DATACENTER_ID = -1 ^ (-1 << DATACENTER_ID_BITS)

# Bit-shift offsets
WORKER_ID_SHIFT = SEQUENCE_BITS
DATACENTER_ID_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS
TIMESTAMP_LEFT_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS + DATACENTER_ID_BITS

# Sequence rollover mask
SEQUENCE_MASK = -1 ^ (-1 << SEQUENCE_BITS)

# Base epoch timestamp (Snowflake-style "Twitter epoch")
TIME_EPOCH = 1577808000177

logger = logging.getLogger('flask.app')
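Example #5 stops at the constants. Below is a minimal sketch of how a Snowflake-style 64-bit ID is usually composed from them; it reuses the constants and imports above, but the function and helper names are illustrative, and this is the textbook algorithm rather than necessarily the project's next_id implementation (clock-rollback handling is omitted):

_lock = threading.Lock()
_last_timestamp = -1
_sequence = 0


def _current_millis():
    return int(time.time() * 1000)


def next_id_sketch():
    """Layout: sign bit | 41-bit timestamp delta | 5-bit data center | 5-bit worker | 12-bit sequence."""
    global _last_timestamp, _sequence
    with _lock:
        timestamp = _current_millis()
        if timestamp == _last_timestamp:
            # Same millisecond: advance the sequence, wrapping with the mask.
            _sequence = (_sequence + 1) & SEQUENCE_MASK
            if _sequence == 0:
                # 4096 IDs already issued in this millisecond: spin until the clock moves on.
                while timestamp <= _last_timestamp:
                    timestamp = _current_millis()
        else:
            _sequence = 0
        _last_timestamp = timestamp
        return ((timestamp - TIME_EPOCH) << TIMESTAMP_LEFT_SHIFT) \
            | (DATA_CENTER_ID << DATACENTER_ID_SHIFT) \
            | (WORKER_ID << WORKER_ID_SHIFT) \
            | _sequence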