def firefox_ver(self, value):
    '''
    Update the stored Firefox version bounds from a dict.

    Input that does not have the expected format is ignored silently and
    the current bounds are left untouched.

    :param value: dict that may carry the keys 'min' and/or 'max'; each key
        is applied only when its value passes the 'int' type check
    :return: None
    '''
    is_type = helper.match_expect_type

    if not is_type(value, 'dict'):
        return

    if 'min' in value:
        lower = value['min']
        if is_type(lower, 'int'):
            self._firefox_ver['min'] = lower

    if 'max' in value:
        upper = value['max']
        if is_type(upper, 'int'):
            # The versions are later produced with range(), which excludes
            # its upper bound, so one is added here:
            # range(74, 75) => [74]
            self._firefox_ver['max'] = upper + 1
def storage_type(self, value):
    '''
    Store the storage type after checking it against StorageType.

    :param value: candidate value passed to gbh_helper.enum_set_check
    :return: None; the current setting is kept when the check yields None
    '''
    checked = gbh_helper.enum_set_check(
        value=value, enum_type=gfp_self_enum.StorageType)
    if checked is not None:
        self._storage_type = checked
def proxy_type(self, value):
    '''
    Store the proxy type after checking it against ProxyType.

    :param value: candidate value passed to gbh_helper.enum_set_check
    :return: None; the current setting is kept when the check yields None
    '''
    checked = gbh_helper.enum_set_check(
        value=value, enum_type=gfp_self_enum.ProxyType)
    if checked is not None:
        self._proxy_type = checked
def country(self, value):
    '''
    Store the country selection after checking it against Country.

    The check is invoked with replace=False.

    :param value: candidate value passed to gbh_helper.enum_set_check
    :return: None; the current setting is kept when the check yields None
    '''
    checked = gbh_helper.enum_set_check(
        value=value, enum_type=gfp_self_enum.Country, replace=False)
    if checked is not None:
        self._country = checked
def get_chrome_ver(setting, url, if_need_proxy, proxies):
    '''
    Collect Chrome version strings listed on a download page.

    :param setting: settings instance; its chrome_max_release_year limits
        how old a listed release may be
    :param url: URL of the page that lists Chrome versions
    :param if_need_proxy: whether the request has to go through a proxy
    :param proxies: proxy to use when one is needed
    :return: set of version strings
    '''
    versions = set()
    this_year = datetime.date.today().year

    response = helper.send_request_get_response(
        url=url,
        if_use_proxy=if_need_proxy,
        proxies=proxies,
        header=self_constant.HEADER)
    rows = response.html.find(
        'div.download_content>ul.fix>'
        'li[class!=divide-line]', first=False)

    for row in rows:
        title_links = row.find('span.version_title>a')
        release_dates = row.find('span.release_date')

        # The first <li> is the table heading and has no version link.
        if not title_links:
            continue

        # The release date text starts with the year, separated by '-'.
        release_year = int(release_dates[0].text.split('-')[0])
        age_in_years = this_year - release_year + 1
        if age_in_years <= setting.chrome_max_release_year:
            # The version is the fourth '_'-separated field of the link text.
            versions.add(title_links[0].text.split('_')[3])

    return versions
def raw_site(self, value):
    '''
    Store the selected sites and rebuild the derived site list.

    :param value: candidate value checked against SupportedWeb
    :return: None; nothing changes when the check yields None
    '''
    checked = gbh_helper.enum_set_check(
        value=value, enum_type=gfp_self_enum.SupportedWeb)
    if checked is None:
        return

    self._raw_site = checked
    # The site list depends on the raw selection, so regenerate it.
    self._site = self._generate_site(
        enumset_site=self._raw_site,
        enumset_protocol=self._protocol,
        int_site_max_page_no=self._site_max_page_no)
def chrome_max_release_year(self, value):
    '''
    Store the maximum age, in years, of Chrome releases to consider.

    The value is accepted only when it passes the 'int' type check and lies
    in the range 0..CHROME_MAX_RELEASE_YEAR (both ends included). Any other
    input is ignored silently.

    :param value: candidate number of years
    :return: None
    '''
    # NOTE(review): the original comment describes the upper limit as
    # "current year - 2008"; confirm where CHROME_MAX_RELEASE_YEAR is defined.
    if (helper.match_expect_type(value, 'int')
            and 0 <= value <= CHROME_MAX_RELEASE_YEAR):
        self._chrome_max_release_year = value
def site_max_page_no(self, value):
    '''
    Store the maximum page number per site and rebuild the site list.

    :param value: integer in the range 1..10 (both ends included)
    :return: None
    :raises ValueError: when value fails the 'int' type check or is outside
        the accepted range
    '''
    if not gbh_helper.match_expect_type(value, 'int'):
        raise ValueError('site_max_page_no的值必须是整数')
    if value < 1 or value > 10:
        # The message used to claim an upper limit of 9 although 10 is
        # accepted by the check above; it now states the enforced range.
        raise ValueError('site_max_page_no的值必须在1到10之间')

    # The stored value is one larger than the requested page count; the
    # original comment explains that the URLs are generated with a list
    # expression, so the +1 makes the setting behave as the user expects.
    self._site_max_page_no = value + 1
    self._site = self._generate_site(
        enumset_site=self._raw_site,
        enumset_protocol=self._protocol,
        int_site_max_page_no=self._site_max_page_no)
def time_interval_in_seconds(self, old_date_time, new_date_time):
    '''
    Return the number of seconds between two points in time.

    Each argument may be a datetime.datetime or a string in the format
    '%Y-%m-%d %H:%M:%S'.

    :param old_date_time: start of the interval
    :param new_date_time: end of the interval
    :return: int, new_date_time minus old_date_time in whole seconds
    :raises ValueError: when an argument is neither a datetime nor a string
        (strptime also raises ValueError for a string in another format)
    '''

    def _as_datetime(moment, error_text):
        # Pass datetimes through, parse strings, reject everything else.
        if helper.match_expect_type(moment, 'datetime.datetime'):
            return moment
        if helper.match_expect_type(moment, 'str'):
            return datetime.datetime.strptime(moment, '%Y-%m-%d %H:%M:%S')
        raise ValueError(error_text)

    start = _as_datetime(old_date_time, 'old_date_time的格式不正确')
    end = _as_datetime(new_date_time, 'new_date_time的格式不正确')
    return int((end - start).total_seconds())
def getPrefilterFunction(webenum):
    '''
    Look up the prefilter function that belongs to a SupportedWeb member.

    :param webenum: member of the SupportedWeb enum
    :return: the matching prefilter function, or None when webenum fails the
        'SupportedWeb' type check or no function is registered for it
    '''
    if not gbh_helper.match_expect_type(webenum, 'SupportedWeb'):
        return None

    sites = gfp_self_enum.SupportedWeb
    by_site_name = {
        sites.Xici.name: prefilter.pre_filter_xicidaili,
        sites.Kuai.name: prefilter.pre_filter_kuaidaili,
        sites.Proxylist.name: prefilter.pre_filter_proxy_list,
        sites.Hidemy.name: prefilter.pre_filter_hidemy,
    }
    return by_site_name.get(webenum.name)
def getExtractDataFunction(webenum):
    '''
    Look up the page-extraction function for a SupportedWeb member.

    :param webenum: member of the SupportedWeb enum
    :return: the matching extraction function from gen_proxy_from_page, or
        None when webenum fails the 'SupportedWeb' type check or no function
        is registered for it
    '''
    if not gbh_helper.match_expect_type(webenum, 'SupportedWeb'):
        return None

    sites = gfp_self_enum.SupportedWeb
    by_site_name = {
        sites.Xici.name: gen_proxy_from_page.extra_data_from_page_xicidaili,
        sites.Kuai.name: gen_proxy_from_page.extra_data_from_page_kuaidaili,
        sites.Proxylist.name:
            gen_proxy_from_page.extra_data_from_page_proxylist,
        sites.Hidemy.name: gen_proxy_from_page.extra_data_from_page_hidemy,
    }
    return by_site_name.get(webenum.name)
def validate_single_proxy(self, single_proxy, url, final_result):
    '''
    Check whether one proxy works for a URL and collect it when it does.

    :param single_proxy: dict describing one proxy; the keys 'ip' and 'port'
        are read here
    :param url: URL the proxy is tested against
    :param final_result: list that receives single_proxy when the proxy is
        usable; per the original note, this lets coroutine callers collect
        results without relying on the return value
    :return: True when the proxy is usable, otherwise False
    '''
    ip, port = single_proxy['ip'], single_proxy['port']
    address = '%s:%s' % (ip, port)
    proxy = {'http': address, 'https': address}

    print('开始检测代理%s:%s对网站%s是否有效' % (ip, port, url))

    usable = gbh_helper.detect_if_proxy_usable(proxies=proxy, url=url)
    if not usable:
        print('代理 %s 无效' % proxy['http'])
        return False

    print('代理 %s 有效' % proxy['http'])
    final_result.append(single_proxy)
    return True
def check_if_site_need_proxy(self):
    '''
    Record on every site entry whether reaching it requires a proxy.

    The first URL of each entry is used for the check and the result is
    written to the entry's 'need_proxy' key.

    :return: None
    '''
    for site_entry in self._site:
        probe_url = site_entry['urls'][0]
        site_entry['need_proxy'] = gbh_helper.detect_if_need_proxy(probe_url)
def gen_header(setting, url, num=None):
    '''
    Build request headers with generated User-Agent strings.

    :param setting: settings instance; browser_type selects which browsers
        may be used, firefox_header_no_ua / chrome_header_no_ua provide the
        remaining header fields
    :param url: URL from which the Host header is derived
    :param num: number of headers wanted; None asks for as many as the
        User-Agent generators provide
    :return: list of header dicts
    '''
    browsers = setting.browser_type
    firefox_enabled = self_enum.BrowserType.FireFox in browsers
    chrome_enabled = self_enum.BrowserType.Chrome in browsers

    # A generator may hand back None instead of a list (generate_chrome_ua
    # historically did so on error); "or []" keeps the += below from
    # failing with a TypeError in that case.
    ua = []
    if num is None:
        # No limit: take everything both generators offer.
        if firefox_enabled:
            ua += gen_ua.generate_firefox_ua(setting=setting) or []
        if chrome_enabled:
            ua += gen_ua.generate_chrome_ua(setting=setting) or []
    elif num == 1:
        # A single header: prefer Firefox, fall back to Chrome.
        if firefox_enabled:
            ua += gen_ua.generate_firefox_ua(setting=setting, num=1) or []
        elif chrome_enabled:
            ua += gen_ua.generate_chrome_ua(setting=setting, num=1) or []
    else:
        # Several headers: Firefox first, then top up with Chrome.
        if firefox_enabled:
            ua += gen_ua.generate_firefox_ua(setting=setting, num=num) or []
        missing = num - len(ua)
        if missing > 0 and chrome_enabled:
            # Ask only for what is still missing; requesting num again
            # could return more headers than the caller asked for.
            ua += gen_ua.generate_chrome_ua(
                setting=setting, num=missing) or []

    host = gbh_helper.extract_host_from_url(url)

    header = []
    for single_ua in ua:
        if 'Firefox' in single_ua:
            base = setting.firefox_header_no_ua
        elif 'Chrome' in single_ua:
            base = setting.chrome_header_no_ua
        else:
            # User-Agent strings of any other browser are skipped.
            continue
        header.append({**base, 'User-Agent': single_ua, 'Host': host})
    return header
def generate_chrome_ua(setting, num=None):
    '''
    Generate Chrome User-Agent strings for Windows.

    :param setting: settings instance; proxies, os_type and WIN_VER are read
        here, and the instance is passed on to the helpers that build the
        version URLs and fetch the versions
    :param num: number of User-Agent strings wanted; None returns all
        combinations. num == 1 returns a fixed User-Agent without any
        network access
    :return: list of User-Agent strings; an empty list when the version
        URLs cannot be generated
    :raises Exception: when a proxy is required but none is configured or
        none of the configured proxies works
    '''
    # A single User-Agent needs no version lookup at all.
    if num == 1:
        return [
            'Mozilla/5.0 (Windows NT 6.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        ]

    try:
        version_url = generate_chrome_url_base_on_type(setting)
    except ValueError as e:
        print(e)
        # Return an empty list rather than None so that callers can
        # concatenate or iterate over the result unconditionally.
        return []

    # Find out whether the download site is reachable directly and, if it
    # is not, pick the first configured proxy that works.
    if_need_proxy = helper.detect_if_need_proxy(self_constant.CHROME_BASE_URL)
    valid_proxies = None
    if if_need_proxy:
        if setting.proxies is None:
            raise Exception(
                'setting没有设置任何代理,无法连接到'
                'https://www.chromedownloads.net获得chrome版本')
        for single_proxies in setting.proxies:
            if helper.detect_if_proxy_usable(
                    proxies=single_proxies,
                    url=self_constant.CHROME_BASE_URL):
                valid_proxies = single_proxies
                break
        if valid_proxies is None:
            raise Exception('尝试了所有代理,都无法连接https://www.chromedownloads.net')

    # Merge the versions found on every listing page.
    chrome_ver = set()
    for single_url in version_url:
        chrome_ver |= get_chrome_ver(url=single_url,
                                     setting=setting,
                                     if_need_proxy=if_need_proxy,
                                     proxies=valid_proxies)

    if self_enum.OsType.All in setting.os_type:
        os_bit = {'Win32; x32', 'Win64; x64'}
    else:
        os_bit = set()
        if self_enum.OsType.Win32 in setting.os_type:
            os_bit.add('Win32; x32')
        if self_enum.OsType.Win64 in setting.os_type:
            os_bit.add('Win64; x64')

    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which would embed the source indentation in the result.
    chrome_ua = [
        'Mozilla/5.0 (%s; %s) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/%s Safari/537.36'
        % (winver, osbit, chromever)
        for osbit in os_bit
        for winver in setting.WIN_VER
        for chromever in chrome_ver
    ]

    if num is not None and len(chrome_ua) > num:
        return random.sample(chrome_ua, num)
    return chrome_ua
def chrome_type(self, value):
    '''
    Store the Chrome type selection after checking it against ChromeType.

    :param value: candidate value passed to helper.enum_set_check
    :return: None; the current setting is kept when the check yields None
    '''
    checked = helper.enum_set_check(value, self_enum.ChromeType)
    if checked is not None:
        self._chrome_type = checked
def os_type(self, value):
    '''
    Store the OS type selection after checking it against OsType.

    :param value: candidate value passed to helper.enum_set_check
    :return: None; the current setting is kept when the check yields None
    '''
    checked = helper.enum_set_check(value, self_enum.OsType)
    if checked is not None:
        self._os_type = checked
def browser_type(self, value):
    '''
    Store the browser selection after checking it against BrowserType.

    :param value: candidate value passed to helper.enum_set_check
    :return: None; the current setting is kept when the check yields None
    '''
    checked = helper.enum_set_check(value, self_enum.BrowserType)
    if checked is not None:
        self._browser_type = checked
def valid_time_in_db(self, value):
    '''
    Store the validity period for entries kept in the database.

    :param value: integer between 300 and 86400 * 5 (both ends included);
        presumably a number of seconds, i.e. five minutes to five days
    :return: None
    :raises ValueError: when value fails the 'int' type check or is outside
        the accepted range
    '''
    lower_bound = 300
    upper_bound = 86400 * 5

    if not gbh_helper.match_expect_type(value, 'int'):
        raise ValueError('valid_time_in_db的值必须是整数')
    if not (lower_bound <= value <= upper_bound):
        raise ValueError('valid_time_in_db的值必须在300到86400×5之间')

    self._valid_time_in_db = value