Example #1
class DoFetchProxy(ProxyManager):
    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('fetch_proxy')

    def main(self):
        self.log.info("***** start fetch proxy *****")
        self.fetch()
        self.log.info("***** finish fetch proxy *****")
Example #2
class Check(ProxyManager, Thread):
    def __init__(self, queue, thread_name):
        ProxyManager.__init__(self)
        Thread.__init__(self, name=thread_name)
        self.queue = queue
        self.log = LogHandler('init_proxy_check')

    def run(self):
        self.log.info('Init Proxy Check - {} : start'.format(self.name))
        while True:
            try:
                proxy_key = self.queue.get(block=False)
            except Empty:
                self.log.info('Init Proxy Check - {} : end'.format(self.name))
                break

            proxy_obj = Proxy.newProxyFromJson(proxy_key)
            proxy_obj, status = check_proxy_useful(proxy_obj)

            if status:
                self.log.info(
                    'Init Proxy Check - {}: {} validation pass'.format(
                        self.name, proxy_obj.proxy))
                self.client.put(proxy_obj)
            else:
                self.log.info(
                    'Init Proxy Check - {}: {} validation fail'.format(
                        self.name, proxy_obj.proxy))
                self.client.delete(proxy_obj.proxy)
            self.queue.task_done()
Example #3
class ProxyValidater:
    """
     验证useful_proxy_queue中的代理,将不可用的移出
    """
    def __init__(self):
        self._pm = ProxyManager()
        self.queue = Queue()
        self.proxy_list = None
        self.proxy_dict = dict()
        self.log = LogHandler('proxy_validater')

    def _valid_proxy(self, threads=50):
        """
        验证useful_proxy代理
        :param threads: 线程数
        :return:
        """
        thread_list = list()
        for index in range(threads):
            thread_list.append(ValidateProxy(self.queue, self.proxy_dict))

        for thread in thread_list:
            thread.daemon = True
            thread.start()

        for thread in thread_list:
            thread.join()

    def put_queue(self):
        self._pm.db.change_table(self._pm.useful_proxy_queue)
        self.proxy_list = self._pm.db.get_all()
        for proxy in self.proxy_list:
            self.queue.put(proxy)
            self.proxy_dict[proxy] = 0

    def main(self):
        self.put_queue()
        while True:
            if not self.queue.empty():
                self.log.info("Start valid useful proxy")
                self._valid_proxy()
            else:
                self.log.info('Valid Complete! sleep 600 sec.')
                time.sleep(600)
                self.put_queue()
class ValidateProxy(Thread):
    """
    多线程验证useful_proxy
    """
    def __init__(self, queue, item_dict):
        self._pm = ProxyManager()
        super().__init__()
        self.log = LogHandler('validate_proxy', file=False)  # several threads writing to the same log file causes problems
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        self._pm.db.change_table(self._pm.useful_proxy_queue)
        while self.queue.qsize():
            proxy = self.queue.get()
            if valid_useful_proxy(proxy):
                # validation passed, remove it from the fail-count dict
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
                del self.item_dict[proxy]
            else:
                # validation failed, increment its fail count
                self.item_dict[proxy] += 1
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if self.item_dict[proxy] >= FAIL_COUNT:
                    # too many failures, remove it from the fail-count dict and the database
                    self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
                    del self.item_dict[proxy]
                    self._pm.db.delete(proxy)
                else:
                    # not over the limit yet, put it back into the queue
                    self.queue.put(proxy)
            self.queue.task_done()
Example #5
def run_schedule():
    start_init_proxy()
    start_proxy_check()

    schedule_log = LogHandler('schedule_log')
    schedule = BlockingScheduler(logger=schedule_log)

    schedule.add_job(start_init_proxy,
                     'interval',
                     minutes=GETTER_CYCLE,
                     id="start_init_proxy",
                     name="抓取代理初始化验证")
    schedule.add_job(start_proxy_check,
                     'interval',
                     minutes=TESTER_CYCLE,
                     id="start_proxy_check",
                     name="代理可用性定时复核")

    schedule.start()
Example #6
 def __init__(self):
     ProxyManager.__init__(self)
     self.log = LogHandler('fetch_proxy')
Example #7
class ProxyManager(object):
    def __init__(self):
        self.client = db.DBclient()
        self.log = LogHandler('proxy_manager')

    def fetch(self):
        proxy_set = set()
        self.log.info(u'Proxy fetch: start')
        get_function = GetFunctions()
        for proxy_get in get_function.proxy_get_functions:
            self.log.info('Get Proxy - {}: start'.format(proxy_get))
            try:
                for proxy in getattr(GetFreeProxy, proxy_get.strip())():
                    proxy = proxy.strip()

                    if not proxy or not verifyProxyFormat(proxy):
                        self.log.error('Get Proxy - {}: {} error'.format(
                            proxy_get, proxy))
                        continue
                    elif proxy in proxy_set:
                        self.log.info('Get Proxy - {}: {} is exist'.format(
                            proxy_get, proxy))
                        continue
                    else:
                        self.log.info('Get Proxy - {}: {} success'.format(
                            proxy_get, proxy))
                        self.client.put(Proxy(proxy, source=proxy_get))
                        proxy_set.add(proxy)

            except Exception as e:
                self.log.error('Get Proxy - {}: error'.format(proxy_get))
                self.log.error(str(e))

    def get(self):
        proxy_list = self.client.getAll()
        if proxy_list:
            proxy = random.choice(proxy_list)
            return Proxy.newProxyFromJson(proxy)
        else:
            return None

    def getAll(self):
        proxy_list = self.client.getAll()
        return [Proxy.newProxyFromJson(_) for _ in proxy_list]

    def getCount(self):
        proxy_counts = self.client.getCount()
        return proxy_counts

    def delete(self, proxy_key):
        self.client.delete(proxy_key)
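
Example #7 is the core class that the other snippets wrap. Below is a short, hypothetical driver (not taken from the listing) that only exercises the methods shown above; the name proxy_manager is a local choice.

# Hypothetical driver for the ProxyManager in Example #7; it calls only the
# methods defined above (fetch, getCount, get, delete).
if __name__ == '__main__':
    proxy_manager = ProxyManager()
    proxy_manager.fetch()                    # pull proxies from every source into the DB
    print('stored proxies:', proxy_manager.getCount())

    proxy = proxy_manager.get()              # a random proxy, or None when the pool is empty
    if proxy is not None:
        print('picked:', proxy.proxy)
        proxy_manager.delete(proxy.proxy)    # remove it again by its key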
Example #8
import random
# path of the current file
pwd = os.getcwd()
project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..")
sys.path.append(project_path)
print(project_path)
from proxy.proxy_valid import ValidIp
from api.rest_api import RestApi
from util.util_function import CheckDir, DownloadFile, WriteInfo

from util.log_handler import LogHandler

from util.config import GetConfig

# log = LogHandler('read_csv')
log = LogHandler('new_0')

api = RestApi()

configs = GetConfig()

# proxies = ValidIp(True,'http://www.jiayuan.com')

proxies = ValidIp(True, 'http://www.jiayuan.com')

print(proxies)

url_address = 'http://www.jiayuan.com/'

# path of the current file
Example #9
import random
# path of the current file
pwd = os.getcwd()
project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..")
sys.path.append(project_path)
print(project_path)
from proxy.proxy import ValidIp
from api.rest_api import RestApi
from util.util_function import CheckDir, DownloadFile, WriteInfo

from util.log_handler import LogHandler

from util.config import GetConfig

# log = LogHandler('read_csv')
log = LogHandler('search_user_photos')

api = RestApi()

configs = GetConfig()

# proxies = ValidIp("local",'http://www.jiayuan.com')

proxies = ValidIp("local", 'http://www.jiayuan.com')

print(proxies)

url_address = 'http://www.jiayuan.com/'

# path of the current file
Example #10
class ProxyRefresher:
    """
    代理定时刷新
    """
    def __init__(self):
        self._pm = ProxyManager()
        self.log = LogHandler('proxy_refresher')

    def fetch_all_proxy(self):
        """
        fetch proxy into Db by ProxyGetter/get_free_proxy.py
        :return:
        """
        for proxyGetter in config.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                # for proxy in getattr(GetFreeProxy, proxyGetter.strip())(self.get()):
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())(None):
                    # store the proxy directly; no need to deduplicate in code, the hash structure deduplicates on its own
                    proxy = proxy.strip()
                    if proxy and verify_proxy_format(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self._pm.db.change_table(self._pm.raw_proxy_queue)
                        self._pm.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                self.log.error("{func}: fetch proxy fail, {e}".format(
                    func=proxyGetter, e=e))
                continue

    def validate_raw_proxy(self):
        """
        Validate the proxies in raw_proxy_queue and move the usable ones into useful_proxy_queue
        :return:
        """
        self._pm.db.change_table(self._pm.raw_proxy_queue)
        raw_proxy = self._pm.db.pop()
        self.log.info('ProxyRefresher: %s start validProxy' % time.ctime())
        # compute the remaining proxies up front to avoid redundant checks
        remaining_proxies = self._pm.get_all()
        while raw_proxy:
            if (raw_proxy not in remaining_proxies
                    and valid_useful_proxy(raw_proxy)):
                self._pm.db.change_table(self._pm.useful_proxy_queue)
                self._pm.db.put(raw_proxy)
                self.log.info('ProxyRefresher: %s validation pass' % raw_proxy)
            else:
                self.log.info('ProxyRefresher: %s validation fail' % raw_proxy)
            self._pm.db.change_table(self._pm.raw_proxy_queue)
            raw_proxy = self._pm.db.pop()
            remaining_proxies = self._pm.get_all()
        self.log.info('ProxyRefresher: %s validProxy complete' % time.ctime())
Example #11
 def __init__(self):
     self._pm = ProxyManager()
     self.queue = Queue()
     self.proxy_list = None
     self.proxy_dict = dict()
     self.log = LogHandler('proxy_validater')
Example #12
def testLogHandler():
    """
    test function LogHandler  in Util/LogHandler
    :return:
    """
    log = LogHandler('test')
    log.error('this is a log from test')

    log.resetName(name='test1')
    log.warning('this is a log from test1')

    log.resetName(name='test2')
    log.info('this is a log from test2')
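
Every snippet in this listing imports LogHandler from util.log_handler, but the class itself never appears here. The sketch below is an assumption about what such a wrapper could look like, built only on the calls the examples make (a name argument, a file flag, resetName, and the standard info/warning/error methods); it is not the project's actual implementation.

import logging
import sys


# Hypothetical stand-in for util.log_handler.LogHandler, inferred from usage above.
class LogHandler(logging.Logger):
    def __init__(self, name, level=logging.DEBUG, file=True):
        super().__init__(name, level)
        formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s')
        # always log to stdout
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        self.addHandler(stream_handler)
        # optionally log to a per-name file; file=False avoids several threads
        # writing to the same file, as noted in Example #3
        if file:
            file_handler = logging.FileHandler('{}.log'.format(name))
            file_handler.setFormatter(formatter)
            self.addHandler(file_handler)

    def resetName(self, name):
        # rename the logger in place, as testLogHandler() above expects
        self.name = name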
Example #13
 def __init__(self, queue, item_dict):
     self._pm = ProxyManager()
     super().__init__()
     self.log = LogHandler('validate_proxy', file=False)  # several threads writing to the same log file causes problems
     self.queue = queue
     self.item_dict = item_dict
Example #14
 def __init__(self):
     self.client = db.DBclient()
     self.log = LogHandler('proxy_manager')
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
import time, os, sys
from lxml import etree
from contextlib import closing

from util.log_handler import LogHandler
from util.web_request import WebRequest

sys.path.append('..')

log = LogHandler('photo')

# # path of the current file
# pwd = os.getcwd()
# # parent directory of the current file
# father_path=os.path.abspath(os.path.dirname(pwd)+os.path.sep+".")
# # two levels above the current file
# grader_father=os.path.abspath(os.path.dirname(pwd)+os.path.sep+"..")


# noinspection PyPep8Naming
def robustCrawl(func):
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
Example #16
 def __init__(self):
     self._pm = ProxyManager()
     self.log = LogHandler('proxy_refresher')
Example #17
# path of the current file
pwd = os.getcwd()
project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..")
sys.path.append(project_path)

from util.config import GetConfig

configs = GetConfig()

from proxy.proxy_valid import ValidIp
from api.rest_api import RestApi
from util.util_function import CheckDir, DownloadFile, WriteInfo

from util.log_handler import LogHandler

log = LogHandler('read_csv')

api = RestApi()

# proxies = ValidIp('1','http://www.jiayuan.com')
proxies = ValidIp(True, 'http://www.jiayuan.com')

# path of the current file

# csv_path = project_path+'\logs\csv\\'
csv_path = project_path + '/logs/csv/'

# output directory
out_dir = './download'

Example #18
from util.log_handler import LogHandler

from util.config import GetConfig

configs = GetConfig()

host = 'ws://' + str(configs.host_ip) + ':' + str(configs.host_port) + "/cable"

try:
    import thread
except ImportError:
    import _thread as thread
import time

logger = LogHandler('web_socket')
logger.info('this is a log from web_socket')


def on_message(ws, message):
    data = json.loads(message)
    print(data['type'])

    if data['type'] == 'ping':
        print(data['type'])

    else:

        logger.info(data)

Example #19
# -*- coding: utf-8 -*-
'''
-----------------------------------
    FileName:     check_proxy
    Description:  validate proxy format
    Author:       瓦都尅
    Date:         2019/10/30
-----------------------------------
'''
import re

from proxy.get_free_proxyip import GetFreeProxy
from util.log_handler import LogHandler

log = LogHandler('check_proxy', file=False)


def verifyProxyFormat(proxy):
    """
    Check the proxy format
    """
    verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
    _proxy = re.findall(verify_regex, proxy)
    return len(_proxy) == 1 and _proxy[0] == proxy
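

# A quick usage sketch for verifyProxyFormat; the regex above only accepts
# bare ip:port strings:
#   verifyProxyFormat('127.0.0.1:8080')       -> True
#   verifyProxyFormat('127.0.0.1')            -> False  (no port)
#   verifyProxyFormat('proxy.example.com:80') -> False  (hostnames are rejected)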


class CheckProxy(object):
    @staticmethod
    def checkAllGetProxyFunc():
        """
        Check that every proxy-fetching function in get_free_proxyip runs correctly
Example #20
# from tomorrow import threads
import random
# path of the current file
pwd = os.getcwd()
project_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + "..")
sys.path.append(project_path)
print(project_path)
from api.rest_api import RestApi
from util.util_function import CheckDir, DownloadFile, WriteInfo

from util.log_handler import LogHandler

from util.config import GetConfig

# log = LogHandler('read_csv')
log = LogHandler('test_uid')

api = RestApi()


def get_uid(data):
    try:
        r = api.get_uid(data)
        return (json.loads(r)["data"])
    except Exception as e:
        log.error("api request fail: %s", format(e))


while True:
    data = {'need': 20000, 'remark': "im test"}
Example #21
# print(project_path)

from login import GetUserCookie
from proxy.proxy_valid import ValidIp
from api.rest_api import RestApi
from util.util_function import CheckDir, DownloadFile, WriteInfo
from util.log_handler import LogHandler
from util.config import GetConfig

# change: name of the current crawler
app = "uid5"

# change: number of uids requested per call
req_nums = 200

log = LogHandler(app)

# initialization
api = RestApi()
configs = GetConfig()

url_address = 'http://www.jiayuan.com/'

# path of the current file
csv_path = project_path + '\logs\csv\\'

# output directory
out_dir = './download.new'

# cookie = GetUserCookie()
Example #22
# coding: utf-8

import json, random, sys
import requests

sys.path.append('..')

from util.config import GetConfig
from util.log_handler import LogHandler

configs = GetConfig()

log = LogHandler('proxy')

# local proxy pool
proxy_local_host = configs.proxy_local

# online proxy pool: https://github.com/jhao104/proxy_pool
proxy_online_host = configs.proxy_online


#
# 1. Only one method needs to be called; both the local and the online pool work
#

# fetch an IP from the local proxy pool
def GetLocalIp():

	r = requests.get(proxy_local_host)
	ip_ports = json.loads(r.text)
	num = random.randint(0,10)
Example #23
 def __init__(self, queue, thread_name):
     ProxyManager.__init__(self)
     Thread.__init__(self, name=thread_name)
     self.queue = queue
     self.log = LogHandler('init_proxy_check')
Example #24
# coding: utf-8

import requests
import json, sys
import random
from proxy.proxy import NewProxyIp

sys.path.append('..')

from util.log_handler import LogHandler

log = LogHandler('proxy')
loger = LogHandler('proxy_ok')

#
# 2. After a proxy is obtained, check whether it can access the target site
#

# get an IP: NewProxyIp() defaults to fetching online, NewProxyIp("1") fetches from the local pool
def GenNewIp(local):
	proxy = NewProxyIp(local)
	return proxy

# check whether the IP address can reach the site
# ValidIp('1', 'http://www.jiayuan.com')
def ValidIp(local=True, valid_host='http://httpbin.org/ip'):
	# call the IP-fetching method
	proxy = GenNewIp(local)
	# print(proxy)

	retry_count = 20