class ProxyRefreshSchedule(ProxyManager):
    """Scheduled refresh: drain the raw queue, promoting proxies that validate."""

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def valid_proxy(self):
        """
        Pop proxies off raw_proxy_queue one by one; every proxy that passes
        validUsefulProxy() is copied into useful_proxy_queue.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        candidate = self.db.pop()
        self.log.info('%s start valid proxy' % time.ctime())
        while candidate:
            if validUsefulProxy(candidate):
                # usable -> promote into the useful queue
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(candidate)
                self.log.debug('proxy: %s validation passes' % candidate)
            else:
                self.log.debug('proxy: %s validation fail' % candidate)
            # switch back to the raw table and fetch the next one
            self.db.changeTable(self.raw_proxy_queue)
            candidate = self.db.pop()
        self.log.info('%s valid proxy complete' % time.ctime())
class ProxyRefreshSchedule(ProxyManager):
    """Scheduled refresh: validate raw proxies and move usable ones to the useful pool."""

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate every proxy in raw_proxy_queue; the ones that pass are
        stored into useful_proxy_queue.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        item = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
        # Snapshot of proxies already in the useful pool, refreshed each
        # round, so known-good proxies are not re-validated.
        known = self.getAll()
        while item:
            candidate = item.get('proxy')
            if isinstance(candidate, bytes):
                # Py3 compatibility: the store may hand back bytes
                candidate = candidate.decode('utf8')
            is_new = candidate not in known
            if is_new and validUsefulProxy(candidate):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(candidate)
                self.log.info('ProxyRefreshSchedule: %s validation pass' % candidate)
            else:
                self.log.info('ProxyRefreshSchedule: %s validation fail' % candidate)
            self.db.changeTable(self.raw_proxy_queue)
            item = self.db.pop()
            known = self.getAll()
        self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
class ProxyCheck(ProxyManager, Thread):
    """Background thread that periodically revalidates the useful-proxy pool."""

    def __init__(self):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        self.log = LogHandler('proxy_check')

    def _recheck(self, proxy, counter):
        # One validation round for a single proxy item.
        if validUsefulProxy(proxy):
            # passed: bump the counter back up, but only while it sits below 1
            if counter and int(counter) < 1:
                self.db.put(proxy, num=int(counter) + 1)
            else:
                self.db.put(proxy)
            self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            return
        self.log.info('ProxyCheck: {} validation fail'.format(proxy))
        # failed: delete once the counter has sunk to FAIL_COUNT, else decrement
        if counter and int(counter) <= FAIL_COUNT:
            self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
            self.db.delete(proxy)
        else:
            self.db.put(proxy, num=int(counter) - 1)

    def run(self):
        self.db.changeTable(self.useful_proxy_queue)
        while True:
            item = self.db.pop()
            # drain the table, then sleep five minutes before the next sweep
            while item:
                self._recheck(item.get('proxy'), item.get('value', 1))
                item = self.db.pop()
            sleep(60 * 5)
class ProxyCheck(ProxyManager, Thread):
    """Worker thread: revalidate useful proxies pulled from a shared queue."""

    def __init__(self, queue, item_dict):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        # No file handler: multiple threads writing one log file clash.
        self.log = LogHandler('proxy_check', file=False)
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        self.db.changeTable(self.useful_proxy_queue)
        while self.queue.qsize():
            proxy = self.queue.get()
            count = self.item_dict[proxy]
            if validUsefulProxy(proxy):
                # success: forgive one recorded failure (counter moves toward 0)
                if count and int(count) > 0:
                    self.db.put(proxy, num=int(count) - 1)
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                # failure: delete once the counter would reach FAIL_COUNT,
                # otherwise record one more failure
                if count and int(count) + 1 >= FAIL_COUNT:
                    self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=int(count) + 1)
            self.queue.task_done()
class ProxyValidSchedule(ProxyManager):
    """Endless sweep over the useful pool, evicting proxies that stop working."""

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('valid_schedule')

    def __validProxy__(self):
        """
        Validate every proxy currently held in useful_proxy_queue; failures
        are deleted from the pool. Runs forever.
        :return:
        """
        while True:
            self.db.changeTable(self.useful_proxy_queue)
            for proxy in self.db.getAll():
                if not validUsefulProxy(proxy):
                    # no longer usable -> evict from the pool
                    self.db.delete(proxy)
                    self.log.info('proxy: {} validation fail'.format(proxy))
                else:
                    self.log.debug('proxy: {} validation pass'.format(proxy))
            self.log.info(u'代理验证程序运行正常')

    def main(self):
        self.__validProxy__()
def __init__(self):
    # Inherit the db handle and queue names from ProxyManager.
    ProxyManager.__init__(self)
    # Dedicated logger for the refresh schedule.
    self.log = LogHandler('refresh_schedule')
def testLogHandler():
    """Smoke-test LogHandler: log under an initial name, then rename twice."""
    log = LogHandler('test')
    log.info('this is a log from test')
    # Rename the handler twice and log once under each new name.
    for new_name in ('test1', 'test2'):
        log.resetName(name=new_name)
        log.info('this is a log from %s' % new_name)
class ProxyManager(object):
    """
    ProxyManager — facade over the proxy store (raw and useful queues).
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter
        :return:
        """
        for getter_name in self.config.proxy_getter_functions:
            fetched = set()
            # collect raw proxies from this getter function
            for candidate in getattr(GetFreeProxy, getter_name.strip())():
                if not candidate:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(func=getter_name, proxy=candidate))
                fetched.add(candidate.strip())
            # store the batch, skipping proxies already verified as useful
            for candidate in fetched:
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(candidate):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(candidate)

    def get(self):
        """
        return a useful proxy
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # Py2 keys() is already a list; Py3 needs materialising for choice().
        keys = list(item_dict.keys()) if EnvUtil.PY3 else item_dict.keys()
        return random.choice(keys)

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        # Report the size of both queues.
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.getNumber()
        return {'raw_proxy': raw_total, 'useful_proxy': useful_total}
class ProxyManager(object):
    """
    ProxyManager — facade over the raw/useful proxy stores.
    """

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter/getFreeProxy.py
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for getter_name in config.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=getter_name))
                for fetched in getattr(GetFreeProxy, getter_name.strip())():
                    # Store directly; the hash-backed table dedupes for us.
                    fetched = fetched.strip()
                    if fetched and verifyProxyFormat(fetched):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=getter_name, proxy=fetched))
                        self.db.put(fetched)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=getter_name, proxy=fetched))
            except Exception as e:
                # A broken getter must not stop the other getters.
                self.log.error(
                    "{func}: fetch proxy fail".format(func=getter_name))
                continue

    def get(self):
        """
        return a useful proxy
        :return:
        """
        sample_size = 20
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # Draw 20 random picks and dedupe: returns a small random sample.
        keys = list(item_dict.keys()) if EnvUtil.PY3 else item_dict.keys()
        picks = [random.choice(keys) for _ in range(sample_size)]
        return list(set(picks))

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        # Report the size of both queues.
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.getNumber()
        return {'raw_proxy': raw_total, 'useful_proxy': useful_total}
from requests.adapters import HTTPAdapter from bs4 import BeautifulSoup from Crypto.Cipher import AES from prettytable import PrettyTable sys.path.append('Util') from Util.Downloader import Downloader from Util.LogHandler import LogHandler BASE_URL = 'http://music.163.com/' _session = requests.Session() COMMENT_THRESHOLD = 10000 PAGE_LIMIT = 20 log = LogHandler('myspider', file=False) size = 100 local_List = [] def create_thread(myList): threads = [] for song in myList: thread = threading.Thread(target=get_comments_by_api, args=(song[1], song[0], song[2])) threads.append(thread) print("len: %s" % len(threads)) for thread in threads:
Description : tool function Author : JHao date: 2016/11/25 ------------------------------------------------- Change Activity: 2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree ------------------------------------------------- """ import requests from requests.packages.urllib3.exceptions import InsecureRequestWarning requests.packages.urllib3.disable_warnings(InsecureRequestWarning) from Util.LogHandler import LogHandler logger = LogHandler(__name__) def getHTMLText(url, headers={'user': '******'}): try: response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() response.encoding = response.apparent_encoding return response.text except: return # return response.status_code # noinspection PyPep8Naming def robustCrawl(func):
def __init__(self):
    # Initialise both bases explicitly: ProxyManager state and Thread machinery.
    ProxyManager.__init__(self)
    Thread.__init__(self)
    # Dedicated logger for this checker.
    self.log = LogHandler('ProxyCheck')
class ProxyRefreshSchedule(ProxyManager):
    """Async validation of the raw_proxy pool (aiohttp-based refresh)."""

    def __init__(self, mode):
        # NOTE(review): base __init__ takes a mode here; presumably it also
        # sets self.mode, which validProxy() reads — confirm in ProxyManager.
        ProxyManager.__init__(self, mode)
        self.refresh_log = LogHandler('refresh_schedule')
        # No file handler: concurrent writers to one log file clash.
        self.log = LogHandler('proxy_check', file=False)
        self.queue = Queue()
        self.proxy_item = None   # populated by validProxy(): all raw items
        self.item_dict = None    # same object as proxy_item (alias)
        self.timeout = 15        # per-request timeout in seconds

    async def callback(self, proxy, count):
        # Invoked on a successful check: decrement the counter in the
        # useful_proxy table (only while it is above 0).
        self.db.changeTable(self.useful_proxy_queue)
        if count and int(count) > 0:
            self.db.put(proxy, num=int(count) - 1)
        else:
            pass

    async def _verify(self, proxy, count, semaphore):
        # Probe the proxy against httpbin under a concurrency semaphore;
        # any error (timeout, connect failure) is treated as a failed check.
        async with semaphore:
            async with aiohttp.ClientSession() as request:
                try:
                    async with request.get("https://httpbin.org/ip",
                                           proxy=f"http://{proxy}",
                                           timeout=self.timeout,
                                           verify_ssl=False) as r:
                        print(f"Raw Check {proxy}")
                        if r.status == 200:
                            await self.callback(proxy, count)
                except Exception as e:
                    # best-effort: a dead proxy simply does not get promoted
                    pass

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue; usable ones are recorded
        into useful_proxy_queue (via callback).
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        self.refresh_log.info(
            'Mode:%s ProxyRefreshSchedule: %s start validProxy' %
            (self.mode, time.ctime()))
        # Snapshot all raw items once to avoid repeated reads.
        self.proxy_item = self.db.getAll()
        self.item_dict = self.proxy_item
        for item in self.proxy_item:
            self.queue.put(item)
        proxies = [i for i in self.proxy_item]
        # Fresh event loop per run (this may be called off the main thread).
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # Cap concurrent in-flight checks at 500.
        semaphore = asyncio.Semaphore(500)
        tasks = []
        for proxy in proxies:
            count = self.item_dict[proxy]
            tasks.append(self._verify(proxy, count, semaphore))
        self.refresh_log.info(
            'Mode:%s ProxyRefreshSchedule: %s start Refresh' %
            (self.mode, time.ctime()))
        try:
            loop.run_until_complete(asyncio.wait(tasks))
            loop.close()
        except Exception as e:
            # e.g. asyncio.wait raises on an empty task list
            print(e)
            pass
        self.db.changeTable(self.raw_proxy_queue)
        self.refresh_log.info('Mode:%s ProxyRefreshSchedule: %s End Refresh' % (self.mode, time.ctime()))
        # NOTE(review): task_done() without a matching get() raises ValueError
        # once the queue accounting is off — verify this call is intended.
        self.queue.task_done()
from Util.LogHandler import LogHandler from Util.MSSQLHelper import MSSQLHelper import aiohttp import asyncio import ssl import json import datetime import uuid from bs4 import BeautifulSoup, Comment from Util.isbn import Isbn mylog = LogHandler('test') mssql = MSSQLHelper('127.0.0.1', 'sa', '123456', 'ResourcesDB') class ConvertISBN(object): def run(self): # isbn= Isbn('978-7-5610-6751-2') # mylog.info(isbn.isbn10) count = mssql.ExecQuery( "select top 1 cip from rescipinfo order by cip desc")[0]["cip"] mylog.info(count) cip = '0' while cip < count: resultList = mssql.ExecQuery( "select top 1000 cip,isbn from rescipinfo where cip>%s and (isbn10 is null or isbn 13 is null) order by cip", (cip, )) try: # mylog.info(resultList) for item in resultList: try:
File Name: utilFunction.py Description : tool function Author : JHao date: 2016/11/25 ------------------------------------------------- Change Activity: 2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree ------------------------------------------------- """ import requests from lxml import etree from Util.LogHandler import LogHandler from Util.WebRequest import WebRequest logger = LogHandler(__name__, stream=False) # noinspection PyPep8Naming def robustCrawl(func): def decorate(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e: logger.info(u"sorry, 抓取出错。错误原因:") logger.info(e) return decorate # noinspection PyPep8Naming
class ProxyManager(object):
    """
    ProxyManager — single entry point for reading and writing the proxy pool.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter
        :return:
        """
        for getter in self.config.proxy_getter_functions:
            batch = set()
            # gather raw proxies produced by this getter
            for item in getattr(GetFreeProxy, getter.strip())():
                if not item:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=getter, proxy=item))
                batch.add(item.strip())
            # persist each one unless it is already in the useful table
            for item in batch:
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(item):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(item)

    def get(self):
        """
        return a useful proxy
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # keys() must be materialised for random.choice on Py3
        candidates = list(item_dict.keys()) if EnvUtil.PY3 else item_dict.keys()
        return random.choice(candidates)

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        # Size of both tables, keyed by table name.
        self.db.changeTable(self.raw_proxy_queue)
        raw_count = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_count = self.db.getNumber()
        return {
            'raw_proxy': raw_count,
            'useful_proxy': useful_count
        }
def testLogHandler():
    """
    test function LogHandler in Util/LogHandler
    :return:
    """
    log = LogHandler('test')
    log.info('this is a log from test')
    # Exercise resetName(): rename twice, logging once per name.
    for idx in (1, 2):
        log.resetName(name='test%d' % idx)
        log.info('this is a log from test%d' % idx)
def __init__(self, queue, item_dict):
    ProxyManager.__init__(self)
    Thread.__init__(self)
    # No file handler: multiple threads writing one log file clash.
    self.log = LogHandler('proxy_check', file=False)
    # Shared work queue of proxies to be checked.
    self.queue = queue
    # Mapping proxy -> current counter value (see run()); presumably the
    # recorded failure count — confirm against the producer.
    self.item_dict = item_dict
def __init__(self):
    # Inherit db handle and queue names from ProxyManager.
    ProxyManager.__init__(self)
    # Dedicated logger for the validation schedule.
    self.log = LogHandler('valid_schedule')
Author : JHao date: 2018/7/10 ------------------------------------------------- Change Activity: 2018/7/10: CheckProxy ------------------------------------------------- """ __author__ = 'JHao' from ProxyGetter.getFreeProxy import GetFreeProxy from Util.utilFunction import verifyProxyFormat from Util.LogHandler import LogHandler log = LogHandler('check_proxy', file=False) class CheckProxy(object): @staticmethod def checkAllGetProxyFunc(): """ 检查getFreeProxy所有代理获取函数运行情况 Returns: None """ import inspect member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction) proxy_count_dict = dict() for func_name, func in member_list:
def __init__(self):
    # Inherit the db handle and queue names from ProxyManager.
    ProxyManager.__init__(self)
    # Dedicated logger for the refresh schedule.
    self.log = LogHandler('refresh_schedule')
def __init__(self):
    # Inherit db handle and queue names from ProxyManager.
    ProxyManager.__init__(self)
    # Dedicated logger for the validation schedule.
    self.log = LogHandler('valid_schedule')
def __init__(self):
    # Backing-store client and project configuration.
    self.db = DbClient()
    self.config = GetConfig()
    # Table names: unverified proxies vs. proxies that passed validation.
    self.raw_proxy_queue = 'raw_proxy'
    self.log = LogHandler('proxy_manager')
    self.useful_proxy_queue = 'useful_proxy'
class ProxyManager(object):
    """
    ProxyManager — thin wrapper around the proxy database tables.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter
        :return:
        """
        for getter in self.config.proxy_getter_functions:
            collected = set()
            # pull raw proxies from this getter function
            for candidate in getattr(GetFreeProxy, getter.strip())():
                if candidate.strip():
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=getter, proxy=candidate))
                    collected.add(candidate.strip())
            # persist the whole batch into the raw queue
            self.db.changeTable(self.raw_proxy_queue)
            for candidate in collected:
                self.db.put(candidate)

    def get(self):
        """
        return a useful proxy
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.get()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()

    def get_status(self):
        # Queue sizes for both tables, keyed by table name.
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.get_status()
        return {'raw_proxy': raw_total, 'useful_proxy': useful_total}
def __init__(self, queue, item_dict):
    ProxyManager.__init__(self)
    Thread.__init__(self)
    # No file handler: multiple threads writing one log file clash.
    self.log = LogHandler('proxy_check', file=False)
    # Shared work queue of proxies to be checked.
    self.queue = queue
    # Mapping proxy -> current counter value; presumably the recorded
    # failure count — confirm against the producer.
    self.item_dict = item_dict
def __init__(self):
    # Backing-store client and project configuration.
    self.db = DbClient()
    self.config = GetConfig()
    # Table names: unverified proxies vs. proxies that passed validation.
    self.raw_proxy_queue = 'raw_proxy'
    self.log = LogHandler('proxy_manager')
    self.useful_proxy_queue = 'useful_proxy'
def __init__(self):
    # Initialise both bases explicitly: ProxyManager state and Thread machinery.
    ProxyManager.__init__(self)
    Thread.__init__(self)
    # Dedicated logger for this checker.
    self.log = LogHandler('proxy_check')
import re
import requests
import pymongo
from bs4 import BeautifulSoup, Comment
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from urllib import request
from urllib.parse import urlparse
from datetime import datetime

# Make the project root importable (this script lives one level below it).
rootPath = os.path.abspath('../')
sys.path.append(rootPath)

from Util.LogHandler import LogHandler
from Util.MSSQLHelper import MSSQLHelper

myapp = LogHandler('test')
# NOTE(review): credentials are hard-coded here — presumably a local dev
# instance; move to configuration before deployment.
mssql = MSSQLHelper('127.0.0.1', 'sa', '123456', 'ResourcesDB')

# Local MongoDB: ResourcesDB.ResDoubanBook with a unique index on "id",
# so duplicate books are rejected at insert time.
myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
dblist = myclient.list_database_names()
mydb = myclient["ResourcesDB"]
resDoubanBook = mydb["ResDoubanBook"]
resDoubanBook.create_index([("id", 1)], unique=True)

# Shared HTTP session and browser-like headers for the crawler requests.
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}