Пример #1
0
 def __init__(self, headers=None, num_retries=3, proxies=None, delay=2,
              timeout=30):
     """Record the caller-supplied settings and build the helper objects.

     Args:
         headers: Stored unchanged as ``self.headers``.
         num_retries: Stored unchanged as ``self.num_retries``.
         proxies: Stored unchanged as ``self.proxies``.
         delay: Passed to ``Throttle`` to build ``self.throttle``.
         timeout: Stored unchanged as ``self.timeout``.
     """
     # NOTE(review): these look like HTTP request settings -- confirm
     # against the methods that consume them.
     self.headers, self.num_retries, self.proxies = (
         headers, num_retries, proxies)
     # Helpers are created in the same order as before: throttle first,
     # then (after storing the timeout) the object from log_func.
     self.throttle = Throttle(delay)
     self.timeout = timeout
     # The attribute name ``loger`` (sic) is kept so existing references
     # to it keep working.
     self.loger = log_func.wang_log()
Пример #2
0
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Fetch information and save it as CSV

import json
import re
import time
from urllib.parse import urlparse
from datetime import datetime, timedelta
import csv
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import log_func

# Module-level object returned by the project's log_func.wang_log() helper,
# created once at import time (presumably a logger -- confirm in log_func).
log = log_func.wang_log()


class Throttle:
    """阀门类,对相同域名的访问添加延迟时间,避免访问过快
    """
    def __init__(self, delay):
        # 延迟时间,避免访问过快
        self.delay = delay
        # 用字典保存访问某域名的时间
        self.domains = {}

    def wait(self, url):
        """对访问过的域名添加延迟时间
        """
        domain = urlparse(url).netloc
Пример #3
0
 def __init__(self):
     """Remember the current working directory and create the log object."""
     # The working directory is captured once, at construction time; later
     # changes of directory do not update ``base_dir``.
     current_dir = os.getcwd()
     self.base_dir = current_dir
     # Object returned by the project's log_func helper (presumably a
     # logger -- confirm in log_func).
     self.log = log_func.wang_log()