def __init__(self, crawler):
    super(RandUserAgent, self).__init__()
    self.ua = UserAgent()
    # Read RANDOM_UA_TYPE from settings; defaults to "random", can be customized in settings
    self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
    self.conn = RedisClient('cookies', 'tendata')
def __init__(self, website):
    self.website = website
    self.conn = RedisClient('accounts', website)
from flask import g


def get_conn():
    # cache one RedisClient per Flask application context
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
def __init__(self):
    self.db = RedisClient()
""" 调度模块 """ # 调度模块的类 import time import multiprocessing from db import RedisClient from getter import proxy_func_list from verify import verify_thread_pool from api import app from config import GETTER_PROXY, VERIFY_PROXY client = RedisClient() class Schedule: # 1. 调度获取代理模块 def getter_proxy(self): while True: for func in proxy_func_list: proxies = func() for proxy in proxies: print('--代理写入数据库--', proxy) client.add(proxy) time.sleep(GETTER_PROXY) # 每五分钟爬取一次代理进行入库 # 2. 调度验证代理模块 def verify_proxy(self): while True: verify_thread_pool() time.sleep(VERIFY_PROXY)
def __init__(self):
    self.db = RedisClient()
    self.crawl = Crawl_ip()
    self.test = Test_ip()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import time
import random

import requests
from lxml import etree
from fake_useragent import UserAgent

from db import RedisClient

redis = RedisClient()
ua = UserAgent()


class ProxyMetaclass(type):
    """Collects every crawl_* method of a class into __CrawlFunc__ at class-creation time."""

    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        # getattr is safer and clearer than eval("self.{}()".format(callback))
        for proxy in getattr(self, callback)():
            proxies.append(proxy)
        return proxies
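# A hedged sketch of how a concrete crawler plugs into the metaclass above.
# The source URL and the parsing details are hypothetical; the point is that
# any method whose name contains 'crawl_' is auto-registered in __CrawlFunc__
# and can then be invoked by name through get_proxies():
class MyCrawler(Crawler):
    def crawl_example(self):
        # hypothetical source page; yields proxies as 'host:port' strings
        resp = requests.get('http://example.com/free-proxies',
                            headers={'User-Agent': ua.random}, timeout=10)
        html = etree.HTML(resp.text)
        for row in html.xpath('//table//tr'):
            ip = ''.join(row.xpath('./td[1]/text()'))
            port = ''.join(row.xpath('./td[2]/text()'))
            if ip and port:
                yield '{}:{}'.format(ip, port)


crawler = MyCrawler()
for name in crawler.__CrawlFunc__:  # ['crawl_example']
    for proxy in crawler.get_proxies(name):
        redis.add(proxy)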
from flask import g


def get_conn(website):
    if not hasattr(g, 'redis'):
        g.redis = RedisClient('cookies', website)
    return g.redis
def __init__(self): self.redis = RedisClient() self.headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" } self.test_url = ""
def __init__(self, website='tianyancha'):
    """Initialize the database client and the cookie crawler."""
    self.website = website
    self.redis = RedisClient('accounts', self.website)
    self.crawler = Crawler()
    # points at the same 'accounts' store as self.redis, kept under a second name
    self.accounts_db = RedisClient('accounts', self.website)
def __init__(self):
    self.client = RedisClient()
    self.kuaidai = KuaidaiProcuration()
def __init__(self):
    self.redis = RedisClient()
    self.headers = {
        'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
def __init__(self):
    self.redis = RedisClient()
    # 'is_exsist' is spelled this way in the ProxyPool model, so it is kept as-is
    self.pool = ProxyPool.objects.filter(is_exsist=True)
def __init__(self):
    self.csrf_token_db = RedisClient('Csrf', 'Token')
def __init__(self): """初始化数据库类和代理爬虫类""" self.redis = RedisClient() self.crawler = Crawler()
from db import RedisClient

conn = RedisClient()


def set(proxy):
    result = conn.add(proxy)
    print(proxy)
    print('Added successfully' if result else 'Failed to add')


def scan():
    print('Enter proxies one per line; type "exit" to stop')
    while True:
        proxy = input()
        if proxy == 'exit':
            break
        set(proxy)


if __name__ == '__main__':
    scan()
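# db.py itself is not shown in any of these excerpts, yet they all rely on the
# same small RedisClient surface: add(), random(), pop(), count(), and an
# optional (type, website) key pair. A minimal sketch of such a client,
# assuming redis-py and a plain Redis set; the key layout and the class name
# are assumptions, not the original implementation:
import redis


class RedisClientSketch:
    def __init__(self, key_type='proxies', website='default',
                 host='localhost', port=6379, password=None):
        self.db = redis.StrictRedis(host=host, port=port, password=password,
                                    decode_responses=True)
        self.name = '{}:{}'.format(key_type, website)  # e.g. 'cookies:tendata'

    def add(self, proxy):
        # sadd returns the number of new members (0 if already stored)
        return self.db.sadd(self.name, proxy)

    def random(self):
        return self.db.srandmember(self.name)

    def pop(self):
        return self.db.spop(self.name)

    def count(self):
        return self.db.scard(self.name)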
from flask import g


def get_db():
    if 'db' not in g:
        g.db = RedisClient()
    return g.db
from flask import request


def get():
    # require a simple query-string token before handing out a real proxy
    if request.args.get('m') == 'mina998':
        return RedisClient().pop()
    return '111.111.111.111:111'
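# get_db(), get() and count() in these excerpts read like Flask view helpers,
# but none of the snippets show the app wiring. A minimal sketch, assuming
# Flask and the RedisClientSketch defined earlier; the route paths are
# assumptions:
from flask import Flask, g

app = Flask(__name__)


def get_db():
    if 'db' not in g:
        g.db = RedisClientSketch()
    return g.db


@app.route('/random')
def random_proxy():
    return get_db().random() or ''


@app.route('/count')
def proxy_count():
    return str(get_db().count())


if __name__ == '__main__':
    app.run()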
def __init__(self, username, password):
    self.username = username
    self.password = password
    self.db = RedisClient()
    self.request = requests.Session()
    self.request.headers.update(Headers)
def count():
    return '%s' % RedisClient().count()
def __init__(self):
    self.download = Download()
    self.db = MysqlClient()
    self.redisClient = RedisClient()
def get_proxy():
    r = RedisClient()
    proxy = r.random()
    return proxy
# Tail of a method elided from this excerpt:
#     return self.parse(self.make_req(self.url))

from concurrent import futures

from task import get_task
from db import RedisClient


def single(asin):
    # Bianti and coll (a MongoDB collection) are defined elsewhere in this project
    data = Bianti(asin).single()
    if data is not None:
        print(data)
        coll.insert(data)
    else:
        print(asin, ': the product may no longer exist')
    # time.sleep(random.randint(1, 3))


r = RedisClient()
queue_len = int(r.queue_len)


def download_many(cc_list):
    print('download_many')
    workers = min(20, len(cc_list))
    with futures.ThreadPoolExecutor(workers) as executor:
        executor.map(get_task, cc_list)


# hand get_task one reference to single() per queued item
download_many([single for x in range(queue_len)])
def count():
    r = RedisClient()
    return str(r.count())
def __init__(self, website='default'):
    self.website = website
    self.cookies_db = RedisClient('cookies', self.website)
    self.accounts_db = RedisClient('accounts', self.website)
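# The paired RedisClient('cookies', site) / RedisClient('accounts', site)
# constructors above suggest one keyspace per data type per website. A hedged
# sketch of a hash-backed variant and the typical generator loop over such a
# pair; every name below is an assumption, including the omitted login step:
import redis


class AccountStoreSketch:
    """Hypothetical store: one Redis hash per (type, website) pair."""

    def __init__(self, key_type, website, **kwargs):
        self.db = redis.StrictRedis(decode_responses=True, **kwargs)
        self.name = '{}:{}'.format(key_type, website)

    def set(self, account, value):
        return self.db.hset(self.name, account, value)

    def get(self, account):
        return self.db.hget(self.name, account)

    def usernames(self):
        return self.db.hkeys(self.name)


def generate_cookies(website='default'):
    accounts = AccountStoreSketch('accounts', website)
    cookies = AccountStoreSketch('cookies', website)
    for username in accounts.usernames():
        if not cookies.get(username):
            password = accounts.get(username)
            # a site-specific login(username, password) would produce the
            # cookies to store; omitted here because it varies per website
            pass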
def redis_client():
    client = RedisClient()
    choice = client.random()
    return choice
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self, website='tianyancha'): """初始化数据库管理对象""" self.website = website self.redis = RedisClient('accounts', self.website)
def __init__(self):
    self.redis = RedisClient()
def __init__(self):
    self.rules = rules
    self.redis = RedisClient()
    self.spider_running = True
    self.ctx = None