def __init__(self, headers=None, num_retries=3, proxies=None, delay=2, timeout=30):
    """Store the request settings and build the helper objects.

    Args:
        headers: Stored unchanged on ``self.headers``; defaults to ``None``.
        num_retries: Stored unchanged on ``self.num_retries``; defaults to 3.
        proxies: Stored unchanged on ``self.proxies``; defaults to ``None``.
        delay: Passed straight to ``Throttle`` to build ``self.throttle``;
            defaults to 2 (presumably seconds -- confirm against Throttle).
        timeout: Stored unchanged on ``self.timeout``; defaults to 30
            (presumably seconds -- confirm against the code that reads it).
    """
    # Plain settings are kept exactly as supplied by the caller.
    self.headers, self.num_retries, self.proxies = headers, num_retries, proxies

    # The throttle is created from ``delay`` only; the raw delay value is
    # not kept on the instance.
    self.throttle = Throttle(delay)

    self.timeout = timeout

    # NOTE(review): the attribute is spelled ``loger`` (single "g"). The
    # name is kept as-is because other code may read ``self.loger``.
    self.loger = log_func.wang_log()
#!/usr/bin/env Python # -*- coding:utf-8 -*- # 获取信息 + 保存为csv import json import re import time from urllib.parse import urlparse from datetime import datetime, timedelta import csv import requests from requests.exceptions import RequestException from bs4 import BeautifulSoup import log_func log = log_func.wang_log() class Throttle: """阀门类,对相同域名的访问添加延迟时间,避免访问过快 """ def __init__(self, delay): # 延迟时间,避免访问过快 self.delay = delay # 用字典保存访问某域名的时间 self.domains = {} def wait(self, url): """对访问过的域名添加延迟时间 """ domain = urlparse(url).netloc
def __init__(self):
    """Record the current working directory and obtain a logger.

    Sets ``self.base_dir`` to the process's working directory at the
    moment of construction (it is not refreshed afterwards) and
    ``self.log`` to the object returned by ``log_func.wang_log()``.
    """
    # Snapshot the working directory first, then attach it to the instance.
    working_dir = os.getcwd()
    self.base_dir = working_dir

    # Logger comes from the project's ``log_func`` helper module.
    self.log = log_func.wang_log()