Exemplo n.º 1
0
class ProxyRefreshSchedule(ProxyManager):
    """
    Scheduled proxy refresh.

    Drains the raw proxy queue and copies every proxy that passes
    validUsefulProxy into the useful proxy queue.
    """

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def _next_raw_proxy(self):
        """
        Switch the db client to the raw proxy queue and pop one entry.
        :return: whatever self.db.pop() returns (falsy when the queue is drained)
        """
        self.db.changeTable(self.raw_proxy_queue)
        return self.db.pop()

    def valid_proxy(self):
        """
        Pop raw proxies one by one until the raw queue yields a falsy value,
        storing each proxy that validates in the useful proxy queue.
        :return:
        """
        candidate = self._next_raw_proxy()
        self.log.info('%s start valid proxy' % time.ctime())
        while candidate:
            if not validUsefulProxy(candidate):
                self.log.debug('proxy: %s validation fail' % candidate)
            else:
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(candidate)
                self.log.debug('proxy: %s validation passes' % candidate)
            candidate = self._next_raw_proxy()
        self.log.info('%s valid proxy complete' % time.ctime())
Exemplo n.º 2
0
class ProxyRefreshSchedule(ProxyManager):
    """
    Scheduled proxy refresh.
    """

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and put the usable ones into
        useful_proxy_queue.

        A popped proxy is stored only when it is not already returned by
        self.getAll() and validUsefulProxy accepts it; every other proxy is
        logged as a validation failure.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        item = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
        # Snapshot of the proxies already in the useful pool, so they are not validated again.
        known = self.getAll()
        while item:
            candidate = item.get('proxy')
            if isinstance(candidate, bytes):
                # Py3 compatibility: the db may hand back bytes.
                candidate = candidate.decode('utf8')

            already_known = candidate in known
            if already_known or not validUsefulProxy(candidate):
                self.log.info('ProxyRefreshSchedule: %s validation fail' % candidate)
            else:
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(candidate)
                self.log.info('ProxyRefreshSchedule: %s validation pass' % candidate)

            # getAll() leaves the db pointed at the useful queue, so switch back before popping.
            self.db.changeTable(self.raw_proxy_queue)
            item = self.db.pop()
            known = self.getAll()
        self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
Exemplo n.º 3
0
class ProxyCheck(ProxyManager, Thread):
    """
    Background thread that keeps re-validating the proxies stored in the
    useful proxy queue, adjusting a per-proxy counter on every check.
    """

    def __init__(self):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        self.log = LogHandler('proxy_check')

    def _recheck_proxy(self, proxy, counter):
        """
        Validate one popped proxy and write the outcome back to the db.
        :param proxy: proxy taken from the popped item
        :param counter: counter value taken from the popped item
        :return:
        """
        if not validUsefulProxy(proxy):
            self.log.info('ProxyCheck: {} validation fail'.format(proxy))
            # Validation failed: delete once the counter is at or below FAIL_COUNT,
            # otherwise store the proxy again with the counter decreased by 1.
            if counter and int(counter) <= FAIL_COUNT:
                self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                self.db.delete(proxy)
            else:
                self.db.put(proxy, num=int(counter) - 1)
            return

        # Validation passed: increase the counter by 1 while it is below 1,
        # otherwise store the proxy with the db's default counter.
        if counter and int(counter) < 1:
            self.db.put(proxy, num=int(counter) + 1)
        else:
            self.db.put(proxy)
        self.log.info('ProxyCheck: {} validation pass'.format(proxy))

    def run(self):
        """
        Thread body: pop and re-check proxies until pop() returns a falsy
        value, sleep five minutes, and start over. Never returns.
        """
        self.db.changeTable(self.useful_proxy_queue)
        while True:
            entry = self.db.pop()
            while entry:
                self._recheck_proxy(entry.get('proxy'), entry.get('value', 1))
                entry = self.db.pop()
            sleep(60 * 5)
Exemplo n.º 4
0
class ProxyCheck(ProxyManager, Thread):
    """
    Worker thread that re-validates proxies taken from a shared queue.

    Each proxy has a failure counter in ``item_dict``. A successful check
    lowers a positive counter by one. A failed check raises it by one, or
    deletes the proxy once the counter would reach FAIL_COUNT.
    """

    def __init__(self, queue, item_dict):
        """
        :param queue: queue of proxies to check, shared between worker threads
        :param item_dict: mapping of proxy -> current counter value
        """
        ProxyManager.__init__(self)
        Thread.__init__(self)
        # file=False: several threads writing to the same log file at once causes problems.
        self.log = LogHandler('proxy_check', file=False)
        self.queue = queue
        self.item_dict = item_dict

    def _check_proxy(self, proxy):
        """
        Validate one proxy and update or delete its db entry.
        :param proxy: proxy taken from the queue; must be a key of item_dict
        :return:
        """
        count = self.item_dict[proxy]
        if validUsefulProxy(proxy):
            # Validation passed: decrease the counter by 1, never below zero.
            if count and int(count) > 0:
                self.db.put(proxy, num=int(count) - 1)
            self.log.info('ProxyCheck: {} validation pass'.format(proxy))
        else:
            self.log.info('ProxyCheck: {} validation fail'.format(proxy))
            if count and int(count) + 1 >= FAIL_COUNT:
                self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                self.db.delete(proxy)
            else:
                # A missing/empty counter is treated as 0 instead of crashing in int().
                self.db.put(proxy, num=int(count or 0) + 1)

    def run(self):
        """
        Thread body: take proxies from the queue until it is empty.

        A non-blocking get() is used instead of checking qsize() first.
        With several workers another thread can take the last item between
        the size check and a blocking get(), which would hang this thread
        forever.
        """
        # Local import keeps the block usable on both Python 3 and Python 2.
        try:
            from queue import Empty
        except ImportError:
            from Queue import Empty

        self.db.changeTable(self.useful_proxy_queue)
        while True:
            try:
                proxy = self.queue.get(block=False)
            except Empty:
                break
            try:
                self._check_proxy(proxy)
            finally:
                # Always acknowledge the item so queue.join() cannot hang
                # when a check raises.
                self.queue.task_done()
Exemplo n.º 5
0
class ProxyValidSchedule(ProxyManager):
    """
    Continuously re-validates the proxies in the useful proxy queue and
    deletes the ones that no longer pass validation.
    """

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('valid_schedule')

    def __validProxy__(self):
        """
        Validate proxies.

        Loops forever: each pass reads every proxy from the useful proxy
        queue, checks it with validUsefulProxy and deletes it on failure.
        :return: never returns
        """
        while 1:
            self.db.changeTable(self.useful_proxy_queue)
            for each_proxy in self.db.getAll():
                if validUsefulProxy(each_proxy):
                    self.log.debug('proxy: {} validation pass'.format(each_proxy))
                else:
                    self.db.delete(each_proxy)
                    self.log.info('proxy: {} validation fail'.format(each_proxy))
            # Heartbeat, logged once per completed pass. It used to sit after
            # the infinite loop, where it could never execute.
            self.log.info(u'代理验证程序运行正常')

    def main(self):
        """Entry point: run the validation loop (does not return)."""
        self.__validProxy__()
Exemplo n.º 6
0
 def __init__(self):
     """Run ProxyManager's initialiser, then set self.log to a LogHandler named 'refresh_schedule'."""
     ProxyManager.__init__(self)
     self.log = LogHandler('refresh_schedule')
Exemplo n.º 7
0
def testLogHandler():
    """
    Exercise LogHandler: log once under the name 'test', then rename the
    logger to 'test1' and 'test2' and log once under each name.
    :return:
    """
    log = LogHandler('test')
    log.info('this is a log from test')

    renames = (
        ('test1', 'this is a log from test1'),
        ('test2', 'this is a log from test2'),
    )
    for new_name, message in renames:
        log.resetName(name=new_name)
        log.info(message)
Exemplo n.º 8
0
class ProxyManager(object):
    """
    ProxyManager

    Facade over the db client for the two proxy tables: freshly fetched
    proxies go to the raw queue, and the public accessors read the useful
    queue.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Runs every getter named in config.proxy_getter_functions and stores
        each fetched proxy in the raw queue unless it is already present in
        the useful queue.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Strip before testing, so a whitespace-only entry is skipped
                # instead of being stored as an empty string.
                cleaned = proxy.strip() if proxy else ''
                if not cleaned:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
                proxy_set.add(cleaned)

            # store raw proxy
            for proxy in proxy_set:
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(proxy):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: a randomly chosen proxy from the useful queue, or None when it is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # list(...keys()) is indexable on both Python 2 and 3, so no
        # interpreter-version branch is needed.
        return random.choice(list(item_dict.keys()))

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: proxy to remove from the useful queue
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return: list of the proxies in the useful queue (empty list when there are none)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else []

    def getNumber(self):
        """
        Count the entries of both queues.
        :return: dict with the keys 'raw_proxy' and 'useful_proxy'
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue}
Exemplo n.º 9
0
class ProxyManager(object):
    """
    ProxyManager

    Facade over the db client for the two proxy tables: fetched proxies are
    written to the raw queue, and the public accessors read the useful queue.
    """
    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter/getFreeProxy.py

        A getter that raises is logged and skipped; the remaining getters
        still run.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in config.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # Store the proxy directly; no de-duplication is done in
                    # code because the hash structure de-duplicates by itself.
                    # A None/empty entry is reported as a bad proxy instead of
                    # aborting the whole getter with an AttributeError.
                    proxy = proxy.strip() if proxy else ''
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                self.log.error(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                # Record the cause as well, so a failing getter can be diagnosed.
                self.log.error(e)

    def get(self):
        """
        return a batch of useful proxies

        Draws 20 times at random (with replacement) from the useful queue
        and removes duplicates, so the result holds between 1 and 20 proxies.
        :return: list of distinct proxies, or None when the useful queue is empty
        """
        length = 20
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # Build the candidate list once instead of on every draw.
        candidates = list(item_dict.keys())
        picked = set(random.choice(candidates) for _ in range(length))
        return list(picked)

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: proxy to remove from the useful queue
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return: list of the proxies in the useful queue (empty list when there are none)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else []

    def getNumber(self):
        """
        Count the entries of both queues.
        :return: dict with the keys 'raw_proxy' and 'useful_proxy'
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
Exemplo n.º 10
0
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
from Crypto.Cipher import AES
from prettytable import PrettyTable
sys.path.append('Util')
from Util.Downloader import Downloader
from Util.LogHandler import LogHandler

# Root URL of the target site. NOTE(review): presumably the prefix for the
# request URLs built elsewhere in this file - confirm.
BASE_URL = 'http://music.163.com/'
# requests session created once at import time.
_session = requests.Session()

# NOTE(review): presumably the minimum comment count of interest; not used in
# the visible part of the file - confirm against its callers.
COMMENT_THRESHOLD = 10000

# NOTE(review): presumably the page size for paginated requests - confirm.
PAGE_LIMIT = 20

# Logger named 'myspider', created with file=False.
log = LogHandler('myspider', file=False)

# NOTE(review): the purpose of these two module-level values is not visible
# in this part of the file.
size = 100
local_List = []


def create_thread(myList):
    threads = []

    for song in myList:
        thread = threading.Thread(target=get_comments_by_api,
                                  args=(song[1], song[0], song[2]))
        threads.append(thread)

    print("len: %s" % len(threads))
    for thread in threads:
Exemplo n.º 11
0
   Description :  tool function
   Author :       JHao
   date:          2016/11/25
-------------------------------------------------
   Change Activity:
                   2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree
-------------------------------------------------
"""
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

from Util.LogHandler import LogHandler

logger = LogHandler(__name__)


def getHTMLText(url, headers={'user': '******'}):
    """
    Fetch ``url`` with a GET request and return the response body as text.

    The response encoding is set from ``response.apparent_encoding`` before
    the text is decoded. The call is best-effort: any error (connection
    failure, timeout, HTTP error status, ...) yields None instead of raising.

    :param url: address to request
    :param headers: request headers; the default dict is shared between calls
        and is only read here, never modified
    :return: decoded response text, or None when the request failed
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate and the program can be stopped.
        return None


# noinspection PyPep8Naming
def robustCrawl(func):
Exemplo n.º 12
0
 def __init__(self):
     """Run the ProxyManager and Thread initialisers, then set self.log to a LogHandler named 'ProxyCheck'."""
     ProxyManager.__init__(self)
     Thread.__init__(self)
     self.log = LogHandler('ProxyCheck')
Exemplo n.º 13
0
class ProxyRefreshSchedule(ProxyManager):
    """
    Checks the proxies stored in raw_proxy.

    All raw proxies are probed concurrently with aiohttp. A proxy that
    answers with HTTP 200 is handed to ``callback``, which writes it to the
    useful proxy queue.
    """
    def __init__(self, mode):
        ProxyManager.__init__(self, mode)
        self.refresh_log = LogHandler('refresh_schedule')
        self.log = LogHandler('proxy_check', file=False)
        self.queue = Queue()
        self.proxy_item = None
        self.item_dict = None
        self.timeout = 15

    async def callback(self, proxy, count):
        """
        Store a proxy that passed the check in the useful proxy queue.
        :param proxy: proxy that answered successfully
        :param count: counter value read from the raw queue for this proxy
        """
        self.db.changeTable(self.useful_proxy_queue)
        # Validation passed: decrease the counter by 1. Nothing is written
        # when the counter is missing or not positive.
        if count and int(count) > 0:
            self.db.put(proxy, num=int(count) - 1)

    async def _verify(self, proxy, count, semaphore):
        """
        Probe one proxy by requesting https://httpbin.org/ip through it.
        :param proxy: proxy to probe, used as ``http://{proxy}``
        :param count: counter value passed on to ``callback``
        :param semaphore: limits how many probes run at the same time
        """
        async with semaphore:
            async with aiohttp.ClientSession() as request:
                try:
                    async with request.get("https://httpbin.org/ip",
                                           proxy=f"http://{proxy}",
                                           timeout=self.timeout,
                                           verify_ssl=False) as r:
                        print(f"Raw Check {proxy}")
                        if r.status == 200:
                            await self.callback(proxy, count)
                except Exception:
                    # Deliberate best-effort: any failure simply means the
                    # proxy did not pass the check.
                    pass

    async def _verify_all(self, proxies):
        """Probe every proxy in ``proxies``, at most 500 at a time."""
        # The semaphore is created inside the running loop so that it is
        # bound to this loop on every supported Python version.
        semaphore = asyncio.Semaphore(500)
        await asyncio.gather(*(
            self._verify(proxy, self.item_dict[proxy], semaphore)
            for proxy in proxies))

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and put the usable ones into
        useful_proxy_queue.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        self.refresh_log.info(
            'Mode:%s ProxyRefreshSchedule: %s start validProxy' %
            (self.mode, time.ctime()))
        # Snapshot of the raw proxies and their counters.
        self.proxy_item = self.db.getAll()
        self.item_dict = self.proxy_item
        proxies = list(self.proxy_item) if self.proxy_item else []
        for item in proxies:
            self.queue.put(item)
        self.refresh_log.info(
            'Mode:%s ProxyRefreshSchedule: %s start Refresh' %
            (self.mode, time.ctime()))
        # With an empty raw queue there is nothing to await. The old code
        # raised here, both in asyncio.wait and in queue.task_done().
        if proxies:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                # A single coroutine that gathers all probes is run.
                # asyncio.wait() rejects bare coroutine objects on Python 3.11+.
                loop.run_until_complete(self._verify_all(proxies))
            except Exception as e:
                print(e)
            finally:
                # Close the loop on failure too, so it is not leaked.
                loop.close()
        self.db.changeTable(self.raw_proxy_queue)
        self.refresh_log.info('Mode:%s ProxyRefreshSchedule: %s End Refresh' %
                              (self.mode, time.ctime()))
        if proxies:
            # Only valid after at least one put(); otherwise Queue raises ValueError.
            self.queue.task_done()
Exemplo n.º 14
0
from Util.LogHandler import LogHandler
from Util.MSSQLHelper import MSSQLHelper
import aiohttp
import asyncio
import ssl
import json
import datetime
import uuid
from bs4 import BeautifulSoup, Comment
from Util.isbn import Isbn

# Module-level logger named 'test'.
mylog = LogHandler('test')
# NOTE(review): the connection arguments are hard-coded literals. They look
# like host / user / password / database name - confirm against MSSQLHelper
# and consider loading them from configuration instead.
mssql = MSSQLHelper('127.0.0.1', 'sa', '123456', 'ResourcesDB')


class ConvertISBN(object):
    def run(self):
        # isbn= Isbn('978-7-5610-6751-2')
        # mylog.info(isbn.isbn10)
        count = mssql.ExecQuery(
            "select top 1 cip from rescipinfo order by cip desc")[0]["cip"]
        mylog.info(count)
        cip = '0'
        while cip < count:
            resultList = mssql.ExecQuery(
                "select top 1000 cip,isbn from rescipinfo where cip>%s and (isbn10 is null or isbn 13 is null) order by cip",
                (cip, ))
            try:
                # mylog.info(resultList)
                for item in resultList:
                    try:
Exemplo n.º 15
0
   File Name:     utilFunction.py
   Description :  tool function
   Author :       JHao
   date:          2016/11/25
-------------------------------------------------
   Change Activity:
                   2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree
-------------------------------------------------
"""
import requests
from lxml import etree

from Util.LogHandler import LogHandler
from Util.WebRequest import WebRequest

logger = LogHandler(__name__, stream=False)


# noinspection PyPep8Naming
def robustCrawl(func):
    """
    Decorator that makes a crawl function fail-soft.

    The wrapped function is called with the original arguments. If it raises
    an Exception, the error is logged and the wrapper returns None instead of
    propagating it.

    :param func: function to protect
    :return: wrapper that keeps the name and docstring of ``func``
    """
    # Local import: the block stays self-contained without touching the
    # file's import section.
    from functools import wraps

    @wraps(func)
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            logger.info(u"sorry, 抓取出错。错误原因:")
            logger.info(e)
            return None

    return decorate


# noinspection PyPep8Naming
Exemplo n.º 16
0
class ProxyManager(object):
    """
    ProxyManager

    Facade over the db client for the two proxy tables: freshly fetched
    proxies go to the raw queue, and the public accessors read the useful
    queue.
    """
    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Runs every getter named in config.proxy_getter_functions and stores
        each fetched proxy in the raw queue unless it is already present in
        the useful queue.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Strip before testing, so a whitespace-only entry is skipped
                # instead of being stored as an empty string.
                cleaned = proxy.strip() if proxy else ''
                if not cleaned:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=proxyGetter, proxy=proxy))
                proxy_set.add(cleaned)

            # store raw proxy
            for proxy in proxy_set:
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(proxy):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: a randomly chosen proxy from the useful queue, or None when it is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # list(...keys()) is indexable on both Python 2 and 3, so no
        # interpreter-version branch is needed.
        return random.choice(list(item_dict.keys()))

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: proxy to remove from the useful queue
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return: list of the proxies in the useful queue (empty list when there are none)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else []

    def getNumber(self):
        """
        Count the entries of both queues.
        :return: dict with the keys 'raw_proxy' and 'useful_proxy'
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
Exemplo n.º 17
0
def testLogHandler():
    """
    Exercise LogHandler from Util/LogHandler: write one record under the
    initial name, then rename the logger twice and write one record after
    each rename.
    :return:
    """
    steps = [
        (None, 'this is a log from test'),
        ('test1', 'this is a log from test1'),
        ('test2', 'this is a log from test2'),
    ]
    log = LogHandler('test')
    for new_name, message in steps:
        # The first record is written under the name given to the constructor.
        if new_name is not None:
            log.resetName(name=new_name)
        log.info(message)
Exemplo n.º 18
0
 def __init__(self, queue, item_dict):
     """
     Run the ProxyManager and Thread initialisers and keep the given arguments.
     :param queue: stored unchanged as self.queue
     :param item_dict: stored unchanged as self.item_dict
     """
     ProxyManager.__init__(self)
     Thread.__init__(self)
     self.log = LogHandler('proxy_check', file=False)  # several threads writing to the same log file at once causes problems
     self.queue = queue
     self.item_dict = item_dict
Exemplo n.º 19
0
 def __init__(self):
     """Run ProxyManager's initialiser, then set self.log to a LogHandler named 'valid_schedule'."""
     ProxyManager.__init__(self)
     self.log = LogHandler('valid_schedule')
Exemplo n.º 20
0
   Author :        JHao
   date:          2018/7/10
-------------------------------------------------
   Change Activity:
                   2018/7/10: CheckProxy
-------------------------------------------------
"""
__author__ = 'JHao'

from ProxyGetter.getFreeProxy import GetFreeProxy
from Util.utilFunction import verifyProxyFormat


from Util.LogHandler import LogHandler

log = LogHandler('check_proxy', file=False)


class CheckProxy(object):

    @staticmethod
    def checkAllGetProxyFunc():
        """
        检查getFreeProxy所有代理获取函数运行情况
        Returns:
            None
        """
        import inspect
        member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction)
        proxy_count_dict = dict()
        for func_name, func in member_list:
Exemplo n.º 21
0
 def __init__(self):
     """Run ProxyManager's initialiser, then set self.log to a LogHandler named 'refresh_schedule'."""
     ProxyManager.__init__(self)
     self.log = LogHandler('refresh_schedule')
Exemplo n.º 22
0
 def __init__(self):
     """Run ProxyManager's initialiser, then set self.log to a LogHandler named 'valid_schedule'."""
     ProxyManager.__init__(self)
     self.log = LogHandler('valid_schedule')
Exemplo n.º 23
0
 def __init__(self):
     """
     Create the DbClient and GetConfig instances and the 'proxy_manager'
     logger, and record the names of the raw and useful proxy queues.
     """
     self.db = DbClient()
     self.config = GetConfig()
     self.raw_proxy_queue = 'raw_proxy'
     self.log = LogHandler('proxy_manager')
     self.useful_proxy_queue = 'useful_proxy'
Exemplo n.º 24
0
class ProxyManager(object):
    """
    ProxyManager

    Thin layer over the db client: fetched proxies are written to the raw
    queue, while get/delete/getAll operate on the useful queue.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        Run every configured getter of GetFreeProxy and store the distinct,
        non-blank proxies it yields in the raw queue.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            unique = set()
            # fetch raw proxy
            fetch = getattr(GetFreeProxy, proxyGetter.strip())
            for candidate in fetch():
                cleaned = candidate.strip()
                if not cleaned:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=candidate))
                unique.add(cleaned)

            # store raw proxy
            self.db.changeTable(self.raw_proxy_queue)
            for cleaned in unique:
                self.db.put(cleaned)

    def get(self):
        """
        Return one proxy from the useful queue.
        :return: whatever self.db.get() returns for the useful queue
        """
        self.db.changeTable(self.useful_proxy_queue)
        chosen = self.db.get()
        return chosen

    def delete(self, proxy):
        """
        Remove a proxy from the useful queue.
        :param proxy: proxy to delete
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        Return every proxy in the useful queue.
        :return: whatever self.db.getAll() returns for the useful queue
        """
        self.db.changeTable(self.useful_proxy_queue)
        everything = self.db.getAll()
        return everything

    def get_status(self):
        """
        Query the db status of both queues, raw queue first.
        :return: dict with the keys 'raw_proxy' and 'useful_proxy'
        """
        status = {}
        tables = (
            ('raw_proxy', self.raw_proxy_queue),
            ('useful_proxy', self.useful_proxy_queue),
        )
        for key, table in tables:
            self.db.changeTable(table)
            status[key] = self.db.get_status()
        return status
Exemplo n.º 25
0
 def __init__(self, queue, item_dict):
     """
     Run the ProxyManager and Thread initialisers and keep the given arguments.
     :param queue: stored unchanged as self.queue
     :param item_dict: stored unchanged as self.item_dict
     """
     ProxyManager.__init__(self)
     Thread.__init__(self)
     self.log = LogHandler('proxy_check', file=False)  # several threads writing to the same log file at once causes problems
     self.queue = queue
     self.item_dict = item_dict
Exemplo n.º 26
0
 def __init__(self):
     """
     Create the DbClient and GetConfig instances and the 'proxy_manager'
     logger, and record the names of the raw and useful proxy queues.
     """
     self.db = DbClient()
     self.config = GetConfig()
     self.raw_proxy_queue = 'raw_proxy'
     self.log = LogHandler('proxy_manager')
     self.useful_proxy_queue = 'useful_proxy'
Exemplo n.º 27
0
 def __init__(self):
     """Run the ProxyManager and Thread initialisers, then set self.log to a LogHandler named 'proxy_check'."""
     ProxyManager.__init__(self)
     Thread.__init__(self)
     self.log = LogHandler('proxy_check')
Exemplo n.º 28
0
import re
import requests
import pymongo
from bs4 import BeautifulSoup, Comment
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from urllib import request
from urllib.parse import urlparse
from datetime import datetime

rootPath = os.path.abspath('../')
sys.path.append(rootPath)
from Util.LogHandler import LogHandler
from Util.MSSQLHelper import MSSQLHelper

# Module-level logger named 'test'.
myapp = LogHandler('test')
# NOTE(review): the connection arguments are hard-coded literals. They look
# like host / user / password / database name - confirm against MSSQLHelper
# and consider loading them from configuration instead.
mssql = MSSQLHelper('127.0.0.1', 'sa', '123456', 'ResourcesDB')

# MongoDB client for the server at 127.0.0.1:27017.
myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
# Lists the database names at import time; the result is not used in the
# visible part of the file.
dblist = myclient.list_database_names()
mydb = myclient["ResourcesDB"]
resDoubanBook = mydb["ResDoubanBook"]
# Unique index on the "id" field (direction 1), created at import time.
resDoubanBook.create_index([("id", 1)], unique=True)

# HTTP session plus header values that imitate a desktop Chrome browser.
# NOTE(review): how the headers are attached to requests is not visible here.
session = requests.Session()
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}