Пример #1
0
    def __init__(self,
                 database=None,
                 url_prefix=None,
                 fetcher=None,
                 checker=None):
        """Set up the storage wrapper, fetcher and checker for this object.

        Args:
            database: Optional settings object exposing ``host``, ``port``,
                ``db`` and ``password``. When falsy, a wrapper for
                127.0.0.1:6379 (db 0) is created instead.
            url_prefix: Target URL; the literal ``"default"`` is stored when
                it is falsy.
            fetcher: Optional ready-made fetcher. A supplied fetcher has
                ``backup_provider()`` called on it; otherwise a new
                ``Fetcher()`` is built.
            checker: Optional checker; ``Checker()`` is used when falsy.
        """
        if database:
            self.database = RedisWrapper(database.host, database.port,
                                         database.db, database.password)
        else:
            self.database = RedisWrapper("127.0.0.1", 6379, 0)

        # Fixed prefix labels kept on the instance.
        self._origin_prefix = 'origin_proxy'
        self._useful_prefix = 'useful_proxy'
        self._hundred_prefix = 'hundred_proxy'
        self._current_prefix = 'current_proxy'

        self._url_prefix = url_prefix if url_prefix else "default"

        if fetcher:  # refresher
            self._fetcher = fetcher
            fetcher.backup_provider()
            log.info("REFRESH FETCHER BACKUP PROVIDER {0}".format(
                str(fetcher)))
        else:  # validater
            self._fetcher = Fetcher()

        self._checker = checker if checker else Checker()

        self.log = log
Пример #2
0
class ProxyPipe(object):
    """Bundle a fetcher, database settings and target URL, then run workers.

    ``start`` launches two child processes: one running ``validater_run``
    and one running ``refresher_run``.
    """

    def __init__(self, database=None, fetcher=None, url_prefix=None):
        """Store the configuration, creating a ``Fetcher()`` if none is given.

        Args:
            database: Passed through unchanged to both worker processes.
            fetcher: Fetcher to use; a falsy value is replaced by a new
                ``Fetcher()``.
            url_prefix: Passed through unchanged to both worker processes.
        """
        self._fetcher = fetcher if fetcher else Fetcher()
        self._database = database
        self._url_prefix = url_prefix

    def set_fetcher(self, provider_list):
        """Hand ``provider_list`` to the fetcher and return ``self`` for chaining."""
        self._fetcher.set_provider(provider_list)
        return self

    def start(self):
        """Start the validater and refresher processes and wait for both."""
        workers = (
            Process(target=validater_run,
                    args=(self._url_prefix, self._database)),
            Process(target=refresher_run,
                    args=(self._url_prefix, self._fetcher, self._database)),
        )

        # Launch one after another, pausing a second after each start.
        for worker in workers:
            worker.start()
            time.sleep(1)

        # Block until every worker has exited.
        for worker in workers:
            worker.join()
Пример #3
0
    def __init__(self, database=None, fetcher=None, url_prefix=None):
        """Remember the database settings, target URL and fetcher.

        A falsy ``fetcher`` is replaced by a freshly constructed
        ``Fetcher()``; ``database`` and ``url_prefix`` are stored as given.
        """
        self._fetcher = fetcher if fetcher else Fetcher()
        self._database = database
        self._url_prefix = url_prefix
Пример #4
0
    def __init__(self, database=None, url_prefix=None, fetcher=None, checker=None):
        """Set up the storage wrapper, fetcher and checker for this object.

        Args:
            database: Optional settings object exposing ``host``, ``port``
                and ``db``. When falsy, a wrapper for 127.0.0.1:6379 (db 0)
                is created instead.
            url_prefix: Target URL; ``"default"`` is stored when falsy.
            fetcher: Optional fetcher; ``Fetcher()`` is used when falsy.
            checker: Optional checker; ``Checker()`` is used when falsy.
        """
        if database:
            self.database = RedisWrapper(database.host, database.port, database.db)
        else:
            self.database = RedisWrapper("127.0.0.1", 6379, 0)

        # Fixed prefix labels kept on the instance.
        self._origin_prefix = 'origin_proxy'
        self._useful_prefix = 'useful_proxy'

        self._url_prefix = url_prefix if url_prefix else "default"
        self._fetcher = fetcher if fetcher else Fetcher()
        self._checker = checker if checker else Checker()

        self.log = log
Пример #5
0
def main_run():
    """Build a pipe targeting baidu.com with two providers and start it."""
    redis_config = RedisConfig("127.0.0.1", 21009)
    pipe = ProxyPipe(
        url_prefix="https://www.baidu.com",
        fetcher=Fetcher(use_default=False),
        database=redis_config,
        checker=CheckerBaidu(),
    )
    # Each call returns the object used for the next step of the chain.
    pipe = pipe.set_fetcher([KuaiProvider()])
    pipe = pipe.add_fetcher([XiciProvider()])
    pipe.start()
Пример #6
0
    def __init__(self,
                 database=None,
                 fetcher=None,
                 url_prefix=None,
                 checker=None,
                 validater_thread_num=30,
                 refresher_thread_num=30):
        """Store the pipe configuration, filling in default collaborators.

        Args:
            database: Stored unchanged as ``_database``.
            fetcher: Fetcher to use; ``Fetcher()`` is built when falsy.
            url_prefix: Stored unchanged as ``_url_prefix``.
            checker: Checker to use; ``Checker()`` is built when falsy.
            validater_thread_num: Stored unchanged (default 30).
            refresher_thread_num: Stored unchanged (default 30).
        """
        self._fetcher = fetcher if fetcher else Fetcher()
        self._checker = checker if checker else Checker()

        self._database = database
        self._url_prefix = url_prefix

        self.validater_thread_num = validater_thread_num
        self.refresher_thread_num = refresher_thread_num
Пример #7
0
    def __init__(self, url_prefix=None, fetcher=None):
        """Connect to the local store and record the target and fetcher.

        Args:
            url_prefix: Target URL; ``"default"`` is stored when falsy.
            fetcher: Optional fetcher; ``Fetcher()`` is used when falsy.
        """
        self.database = RedisWrapper("127.0.0.1", 6379)

        # Fixed prefix labels kept on the instance.
        self._origin_prefix = 'origin_proxy'
        self._useful_prefix = 'useful_proxy'

        self._url_prefix = url_prefix if url_prefix else "default"
        self._fetcher = fetcher if fetcher else Fetcher()

        self.log = log
Пример #8
0
class Manager(object):
    """Fetch proxies from providers and store/query them per target site.

    Storage names are built as ``spoon:<netloc>:<prefix>`` where ``netloc``
    comes from ``url_prefix`` (or is the literal ``"default"``).
    """

    def __init__(self,
                 database=None,
                 url_prefix=None,
                 fetcher=None,
                 checker=None):
        """Set up the storage wrapper, fetcher and checker.

        Args:
            database: Optional settings object exposing ``host``, ``port``,
                ``db`` and ``password``. When falsy, a wrapper for
                127.0.0.1:6379 (db 0) is created instead.
            url_prefix: Target URL; ``"default"`` is stored when falsy.
            fetcher: Optional ready-made fetcher. A supplied fetcher has
                ``backup_provider()`` called on it; otherwise a new
                ``Fetcher()`` is built.
            checker: Optional checker; ``Checker()`` is used when falsy.
        """
        self.log = log

        if not database:
            self.database = RedisWrapper("127.0.0.1", 6379, 0)
        else:
            self.database = RedisWrapper(database.host, database.port,
                                         database.db, database.password)

        self._origin_prefix = 'origin_proxy'
        self._useful_prefix = 'useful_proxy'
        self._hundred_prefix = 'hundred_proxy'
        self._current_prefix = 'current_proxy'

        if not url_prefix:
            self._url_prefix = "default"
        else:
            self._url_prefix = url_prefix

        # NOTE(review): ``refresh`` calls ``len()`` on the fetcher, so a
        # fetcher holding zero providers is probably falsy here and would be
        # replaced by a default ``Fetcher()`` - confirm this is intended.
        if not fetcher:  # validater
            self._fetcher = Fetcher()
        else:  # refresher
            self._fetcher = fetcher
            self._fetcher.backup_provider()
            self.log.info("REFRESH FETCHER BACKUP PROVIDER {0}".format(
                str(self._fetcher)))

        if not checker:
            self._checker = Checker()
        else:
            self._checker = checker

    def get_netloc(self):
        """Return the network location of the target URL, or ``"default"``."""
        if self._url_prefix == "default":
            return "default"
        return urlparse(self._url_prefix).netloc

    def generate_name(self, prefix):
        """Return the storage name ``spoon:<netloc>:<prefix>``."""
        return ":".join(["spoon", self.get_netloc(), prefix])

    def refresh_condition(self):
        """Decide whether a refresh should run.

        Returns:
            ``True`` when the useful collection holds at most 100 entries,
            or when at least 20% of its entries have a score of 95 or more;
            ``False`` otherwise.
        """
        stored = self.get_all_kv_from(self.generate_name(self._useful_prefix))
        # Values arrive as UTF-8 encoded integers; only the scores matter.
        scores = [
            int(raw_score.decode('utf-8'))
            for (_key, raw_score) in stored.items()
        ]

        all_length = len(scores)
        if all_length <= 100:
            return True

        count_length = len([score for score in scores if score >= 95])
        return count_length / all_length >= 0.2

    def refresh(self):
        """Pull proxies from every provider and store the new ones.

        Does nothing when ``refresh_condition`` is false. Providers are
        restored first when fewer than six are available. A provider whose
        ``getter()`` raises is logged and removed from the fetcher at the
        end; proxies it yielded before failing are still stored.
        """
        target = self.get_netloc()
        self.log.info("REFRESH START WITH {0} TARGET {1}".format(
            str(self._fetcher), target))
        if not self.refresh_condition():
            self.log.info("REFRESH DID NOT MEET CONDITION. TARGET{0}".format(
                target))
            return

        if len(self._fetcher) < 6:
            self._fetcher.restore_provider()
            self.log.info(
                "REFRESH FETCHER FAILED: NO ENOUGH PROVIDER, RESTORE PROVIDERS TO {0} for TARGET {1}"
                .format(str(self._fetcher), target))

        origin_name = self.generate_name(self._origin_prefix)
        stored_proxies = set()
        provider_to_be_removed_index = []
        for index in range(len(self._fetcher)):
            provider = self._fetcher.get_provider(index)
            provider_name = provider.__class__.__name__
            fresh_proxies = set()
            try:
                for raw_proxy in provider.getter():
                    proxy = raw_proxy.strip()
                    if not proxy:
                        continue
                    self.log.info(
                        "REFRESH FETCHER: TARGET {0} PROVIDER {1} PROXY {2}"
                        .format(target, provider_name, proxy))
                    if proxy not in stored_proxies:
                        fresh_proxies.add(proxy)
            except Exception as e:
                # Best effort: keep going with the remaining providers and
                # drop the failing one once the loop is done.
                provider_to_be_removed_index.append(index)
                self.log.error(
                    "REFRESH FETCHER FAILED: PROVIDER {0} WILL BE REMOVED ERROR {1}"
                    .format(provider_name, e))

            # Persist after each provider so results survive a slow or
            # failing later provider, but write every proxy only once rather
            # than re-writing everything collected so far.
            for proxy in fresh_proxies:
                self.database.set_value("spoon:proxy_stale", proxy,
                                        time.time())
                self.database.put(origin_name, proxy)
            stored_proxies |= fresh_proxies

        self.log.info("REFRESH FETCHER DELETE {0}. TARGET {1}".format(
            provider_to_be_removed_index, target))
        self._fetcher.remove_provider(provider_to_be_removed_index)

    def get(self):
        """Return ``database.get`` for this target's useful collection."""
        return self.database.get(self.generate_name(self._useful_prefix))

    def set_value(self, key, value):
        """Store ``value`` under ``key`` in this target's useful collection."""
        return self.database.set_value(self.generate_name(self._useful_prefix),
                                       key, value)

    def delete(self, proxy):
        """Delete ``proxy`` from this target's useful collection."""
        self.database.delete(self.generate_name(self._useful_prefix), proxy)

    def get_all(self):
        """Return ``database.get_all`` for this target's useful collection."""
        return self.database.get_all(self.generate_name(self._useful_prefix))

    def get_status(self):
        """Return the database status of the origin and useful collections.

        Returns:
            A dict with keys ``'origin_proxy'`` and ``'useful_proxy'``.
        """
        total_origin_proxy = self.database.get_status(
            self.generate_name(self._origin_prefix))
        total_useful_queue = self.database.get_status(
            self.generate_name(self._useful_prefix))
        return {
            'origin_proxy': total_origin_proxy,
            'useful_proxy': total_useful_queue
        }

    # For spoon_web
    def get_keys(self):
        """Return every database key decoded from UTF-8 bytes to ``str``."""
        return [key.decode("utf-8") for key in self.database.get_keys()]

    def get_from(self, target):
        """Return ``database.get`` for the explicitly named ``target``."""
        return self.database.get(target)

    def get_all_from(self, target):
        """Return ``database.get_all`` for the explicitly named ``target``."""
        return self.database.get_all(target)

    def get_all_kv_from(self, target):
        """Return ``database.get_all_kv`` for the explicitly named ``target``."""
        return self.database.get_all_kv(target)

    def get_range_from(self, target):
        """Return ``database.zrange(target, 0, -1)``, i.e. the full range."""
        return self.database.zrange(target, 0, -1)

    def scan_kv_from(self, target, cursor):
        """Return ``database.scan_kv`` for ``target`` starting at ``cursor``."""
        return self.database.scan_kv(target, cursor)
Пример #9
0
def main_run():
    """Build a pipe targeting google.com backed by a custom store and start it."""
    redis_config = RedisConfig("127.0.0.1", 21009)
    pipe = ProxyPipe(
        url_prefix="https://www.google.com",
        fetcher=Fetcher(use_default=False),
        database=redis_config,
    )
    # ``set_fetcher`` returns the object that is then started.
    pipe = pipe.set_fetcher([UsProvider()])
    pipe.start()
Пример #10
0
def main_run():
    """Build a pipe targeting google.com with one provider and start it."""
    pipe = ProxyPipe(
        url_prefix="https://www.google.com",
        fetcher=Fetcher(use_default=False),
    )
    # ``set_fetcher`` returns the object that is then started.
    pipe = pipe.set_fetcher([UsProvider()])
    pipe.start()
Пример #11
0
    def __init__(self, fetcher=None, url_prefix=None):
        """Store the fetcher and target URL.

        Args:
            fetcher: Fetcher to use; a falsy value is replaced by a new
                ``Fetcher()``.
            url_prefix: Stored unchanged as ``_url_prefix``.
        """
        # A caller-supplied fetcher must be stored as well; otherwise
        # ``self._fetcher`` would not exist whenever one is passed in.
        if not fetcher:
            self._fetcher = Fetcher()
        else:
            self._fetcher = fetcher

        self._url_prefix = url_prefix