class BaseTest(unittest.TestCase):
    """Checks for ``GetProxy`` and the proxy database helper functions.

    ``setUp`` stores a new ``GetProxy`` instance on ``self.pl`` before each
    test. The inherited ``unittest.TestCase`` constructor is used as-is; no
    ``__init__`` override is needed.

    The one-line docstrings of the test methods are kept verbatim because
    ``unittest`` reports them as the test descriptions.
    """

    def setUp(self):
        # Build a new proxy source for every test method.
        self.pl = GetProxy()

    def test_T1(self):
        """Check if database is empty"""
        # NOTE(review): the assertion passes when the helper returns a truthy
        # value, yet the failure message reads "Empty database!" -- confirm
        # what check_if_database_is_empty() actually reports.
        self.assertTrue(check_if_database_is_empty(), "Empty database!")

    def test_T2(self):
        """Get one proxy"""
        proxy = self.pl.get_proxy()
        self.assertTrue(proxy.startswith("http"))

    def test_T3(self):
        """Get many proxy"""
        self.assertTrue(get_many_proxy(15, self.pl))

    def test_T4(self):
        """Testing if database contains only unique proxy addresses"""
        self.assertTrue(check_unique_addresses(),
                        "Not all addresses are unique.")

    def test_https(self):
        # Documented with a comment rather than a docstring so the test's
        # description in unittest output stays as it was.
        proxy = self.pl.get_https_proxy()
        self.assertTrue(check_https(proxy), "Not https")

    def test_proxy(self):
        """Generate proxy and test output"""
        proxy = self.pl.get_proxy()
        self.assertTrue(check_proxy(proxy), "%s doesn't match" % proxy)
def main():
    """Re-check proxies served by a ``GetProxy`` built on ``ProxyItemsDropDB``.

    Runs forever. Each proxy item is passed to ``valid_proxy``:

    * a truthy result is upserted into ``ProxyItemsDB`` and the item is then
      removed from ``ProxyItemsDropDB``;
    * otherwise the item is removed from ``ProxyItemsDropDB`` and upserted
      into ``ProxyItemsDropForeverDB``.
    """
    source = GetProxy(ProxyItemsDropDB)
    while True:
        candidate = source.get_proxy()
        validated = valid_proxy(candidate)

        if not validated:
            # Validation failed: move the item to the permanent drop store.
            ProxyItemsDropDB.remove_proxy_item(candidate)
            ProxyItemsDropForeverDB.upsert_proxy_item(candidate)
            continue

        # Validation succeeded: store the validated value, then clear the
        # original item from the drop store.
        ProxyItemsDB.upsert_proxy_item(validated)
        ProxyItemsDropDB.remove_proxy_item(candidate)
class Spider(object):
    """Crawler that fetches company details through a rotating HTTP proxy.

    NOTE(review): this excerpt is incomplete -- ``get_company`` and the
    handler of the outer ``try`` in ``run`` are not visible here.
    """

    def __init__(self, num):
        """Store ``num`` and create the client, proxy and search-key helpers."""
        self.num = num
        self._client = SiteClient()
        # NOTE(review): the meaning of the (2, num) arguments is defined by
        # GetProxy, which is not visible here.
        self._getProxy = GetProxy(2, num)
        self._getSearchKey = GetSearchKey()
        pass

    def _refresh_proxy(self):
        """Fetch a new proxy and rebuild ``self._client`` around it."""
        # get_proxy() is unpacked into three values; proxy_type is not used.
        self._proxy_ip, self._proxy_port, proxy_type = self._getProxy.get_proxy(
        )
        http_proxy = "http://%s:%s" % (self._proxy_ip, self._proxy_port)
        # Only the "http" scheme is mapped to the proxy.
        proxies = {"http": http_proxy}
        logging.info("++++++++proxies: %s++++++++++++" % proxies)
        self._client = SiteClient(proxies)
        pass

    def run(self):
        """Process search keys in an endless loop.

        For each ``reg_bus_ent_id`` obtained from the search-key helper:
        skip it when ``QyxybaicLevel2DB.get_one`` returns a truthy value,
        otherwise merge the result of ``get_company`` into a dict keyed by
        ``reg_bus_ent_id`` and upsert it. ``NeedrefreshProxyError``,
        ``ErrorStatusCode`` and ``HttpClientError`` each trigger a proxy
        refresh, after which the loop continues.
        """
        try:
            # cur = QyxybaicDB.get_all()
            self._refresh_proxy()
            # is_need_refresh_proxy = 5
            while True:
                try:
                    # if is_need_refresh_proxy > 0:
                    #     is_need_refresh_proxy -= 1
                    # else:
                    #     self._refresh_proxy()
                    #     is_need_refresh_proxy = 3
                    reg_bus_ent_id = self._getSearchKey.get_reg_bus_ent_id()
                    # reg_bus_ent_id = item['reg_bus_ent_id']
                    logging.info("-------------%s--------------" % reg_bus_ent_id)
                    # Skip ids that already have a stored record.
                    if QyxybaicLevel2DB.get_one(reg_bus_ent_id):
                        logging.info(
                            "----------------is have-------------------")
                        continue
                    company = {"reg_bus_ent_id": reg_bus_ent_id}
                    company_info = self.get_company(reg_bus_ent_id)
                    company.update(company_info)
                    QyxybaicLevel2DB.upsert_company_detail_level_2(company)
                # All three handlers do the same thing: switch to a new proxy
                # and move on to the next loop iteration.
                except NeedrefreshProxyError, err:
                    self._refresh_proxy()
                    continue
                except ErrorStatusCode, err:
                    self._refresh_proxy()
                    continue
                except HttpClientError, err:
                    self._refresh_proxy()
                    continue
        # NOTE(review): the excerpt ends here; the except/finally clause that
        # closes the outer ``try`` is not visible.
def start(args):
    """Initialise storage and launch the three proxy-maintenance threads.

    Args:
        args: Object with a ``database`` attribute; its value is passed to
            ``init`` and to the ``GetProxy``, ``ValidateOrigin`` and
            ``TestAvailable`` constructors.

    The threads are started in the order fetch, validate, test. They are
    not joined, so the function returns right after starting them.
    """
    db_type = args.database
    init(db_type)

    fetcher = GetProxy(db_type)
    validator = ValidateOrigin(db_type)
    tester = TestAvailable(db_type)

    # (target, thread name) pairs, listed in start order.
    jobs = [
        # Periodically fetch IPs from the websites.
        (fetcher.cycle_get, "thread-get-ip"),
        # Periodically test for usable proxies.
        (validator.cycle_validate, "thread-validate-ip"),
        # Periodically check and drop proxies that no longer work.
        (tester.cycle_test, "Thread-test-ip"),
    ]

    workers = [
        threading.Thread(target=target, name=name) for target, name in jobs
    ]
    for worker in workers:
        worker.start()
def __init__(self):
    """Create the site client, the proxy source and the search-key source."""
    self._client = SiteClient()
    # NOTE(review): the meaning of the (3, 0) arguments is defined by
    # GetProxy, which is not visible here.
    self._getProxy = GetProxy(3, 0)
    self._getSearchKey = GetSearchKey()
def __init__(self, num):
    """Store ``num`` and create the client, proxy and search-key helpers.

    Args:
        num: Kept on ``self.num`` and forwarded as the second argument to
            ``GetProxy``.
    """
    self.num = num
    self._client = SiteClient()
    # NOTE(review): the meaning of the leading 2 is defined by GetProxy,
    # which is not visible here.
    self._getProxy = GetProxy(2, num)
    self._getSearchKey = GetSearchKey()
def setUp(self):
    """Store a new ``GetProxy`` instance on ``self.pl``."""
    self.pl = GetProxy()
def __init__(self):
    """Create the proxy and search-key sources; no client is built yet."""
    # The client starts out unset.
    self._client = None
    # NOTE(review): the meaning of the (4, 0) arguments is defined by
    # GetProxy, which is not visible here.
    self._getProxy = GetProxy(4, 0)
    self._getSearchKey = GetSearchKey()
def get():
    """Return one randomly chosen entry of ``useful_proxys``."""
    # NOTE(review): random.choice raises IndexError on an empty sequence, so
    # this fails while useful_proxys is empty -- confirm that is acceptable.
    return random.choice(list(useful_proxys))


@app.route('/get_all')
def get_all():
    """Return all entries of ``useful_proxys`` as a list."""
    # NOTE(review): whether a list is an accepted view return value depends
    # on the web framework and its version -- verify for the one in use.
    return list(useful_proxys)


@app.route('/get_num')
def get_num():
    """Return the number of entries in ``useful_proxys`` as text.

    The count is converted to ``str`` because a bare ``int`` is not an
    accepted return value for a routed view (this assumes ``app`` is a
    Flask-style application, which the ``route``/``run`` calls suggest).
    """
    return str(len(useful_proxys))


if __name__ == '__main__':
    logger.info("开始爬取代理")
    proxys = GetProxy()
    proxys.get()  # fetch the raw, not yet filtered IP proxies
    logger.info("代理爬取完毕,一共爬取到{}条".format(len(origin_proxys)))
    # Next, validate the proxies; the ones that pass are saved to a text
    # file afterwards.
    logger.info("接下来开始验证")
    proxy_check()
    logger.info("验证结束,有用的代理有{}条".format(len(useful_proxys)))
    logger.info(useful_proxys)
    # One proxy per line.
    with open("proxys.txt", 'w') as f:
        f.writelines(proxy + '\n' for proxy in useful_proxys)
    # NOTE(review): debug=True is meant for development only -- confirm this
    # entry point is never used for a production deployment.
    app.run(debug=True)
def __init__(self, num):
    """Store ``num`` and create the proxy and search-key sources.

    Args:
        num: Kept on ``self.num`` and forwarded as the second argument to
            ``GetProxy``.
    """
    self.num = num
    # The client starts out unset.
    self._client = None
    # NOTE(review): the meaning of the leading 1 is defined by GetProxy,
    # which is not visible here.
    self._getProxy = GetProxy(1, num)
    self._getSearchKey = GetSearchKey()
""" Method should return List[Proxy] and it will use in GetProxy.get_proxies """ pass save_as_method = "txt" # based on attribute name save_as_{save_as_method} def save_as_txt(self, proxy_list: ProxyList) -> None: # save method will pass ProxyList pass # get_proxy = ExtendedGetProxy(timeout=10, check_duplicates=False) # Will use get_from and check_proxy methods # Default Usage get_proxy = GetProxy(use_tqdm=True, check_duplicates=True, timeout=10, limit=5) proxy_list = get_proxy.list # Returns ProxyList print(proxy_list.length) proxy_list.all # Returns List[Proxy] proxy_list.filter( ) # country_code: list=None, ssl_support :bool=None, google_passed:bool=None, use_limit :int=0 proxy_list.get().to_dict( ) # country_code: list=None, ssl_support :bool=None, google_passed:bool=None, use_limit :int=0 print(proxy_list.first.to_dict()) print(proxy_list.last_used.to_dict()) print(proxy_list.next.to_dict()) get_proxy.save() # To save