def javlib_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javlibrary listing page.

    The site base URL is fetched through ``return_config_string`` on every
    call and joined with ``page_template`` once ``page_num`` and
    ``url_parameter`` have been substituted into it.

    Args:
        page_template: URL path template, formatted with the ``page_num`` and
            ``url_parameter`` keyword arguments.
        page_num: Page number substituted into the template.
        url_parameter: Extra value substituted into the template.
        config: Not used by this function.

    Returns:
        A ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car``; the n-th match of each XPath goes into the n-th dict.
        ``max_page`` is the result of ``find_max_page`` applied to the
        "last page" link, or ``page_num`` when the page has no such link.
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()',
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    lib_url = javlib_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url, behind_cloudflare=True).content
    root = etree.HTML(res)

    # Results are matched up purely by position in each XPath result list.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link on this page
        max_page = page_num

    return jav_objs_raw, max_page
def javbus_magnet_search(car: str):
    """Collect the magnet links javbus lists for ``car``.

    Two requests are made: the detail page at ``<base url><car>`` is fetched
    to read the ``gid`` value out of its inline script, then the ajax
    endpoint is queried with that ``gid`` and the detail page as referer.

    Args:
        car: Identifier appended to the configured javbus base URL.

    Returns:
        A ``defaultlist`` of dicts with the keys ``magnet``, ``title``,
        ``size`` and ``size_sort``. The first three hold the stripped XPath
        results; ``size_sort`` is ``parsed_size_to_int`` of the stripped size.

    Raises:
        AttributeError: If the detail page has no ``var gid = ...;``
            statement, because ``re.search`` then returns ``None``.
    """
    base_url = return_config_string(['其他设置', 'javbus网址'])
    gid_pattern = r'.*?var gid = (\d*);.*?'
    field_xpaths = {
        'magnet': '//tr/td[position()=1]/a[1]/@href',
        'title': '//tr/td[position()=1]/a[1]/text()',
        'size': '//tr/td[position()=2]/a[1]/text()',
    }

    # Step 1: the detail page carries the gid needed by the ajax endpoint.
    detail_url = (base_url + '{car}').format(car=car)
    detail_html = return_get_res(detail_url).text
    gid = re.search(gid_pattern, detail_html).group(1)

    # Step 2: the ajax endpoint returns the magnet table rows.
    ajax_url = (base_url + 'ajax/uncledatoolsbyajax.php?gid={gid}&uc=0').format(gid=gid)
    ajax_body = return_get_res(ajax_url, headers={'referer': detail_url}).content
    tree = etree.HTML(ajax_body)

    magnets = defaultlist(dict)
    for field, expression in field_xpaths.items():
        for position, raw_text in enumerate(tree.xpath(expression)):
            # strip() with no argument removes tabs, CR, LF and spaces at once
            cleaned = raw_text.strip()
            entry = magnets[position]
            entry[field] = cleaned
            if field == 'size':
                entry['size_sort'] = parsed_size_to_int(cleaned)

    return magnets
def javdb_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javdb listing page.

    Args:
        page_template: URL path template, formatted with the ``page_num`` and
            ``url_parameter`` keyword arguments.
        page_num: Page number substituted into the template.
        url_parameter: Extra value substituted into the template.
        config: Not used by this function.

    Returns:
        A ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car`` (``javid`` and ``car`` come from the same XPath); the
        n-th match of each XPath goes into the n-th dict. ``max_page`` is the
        text of the last matching pagination link exactly as XPath returns it
        (not converted to ``int``), or ``page_num`` when there is no such
        link or its text is empty.
    """
    xpath_dict = {
        'title': '//a[@class="box"]/div[@class="video-title"]/text()',
        'javid': '//a[@class="box"]/div[@class="uid"]/text()',
        'img': '//div[@class="item-image fix-scale-cover"]/img/@data-src',
        'car': '//a[@class="box"]/div[@class="uid"]/text()',
    }
    xpath_max_page = '//ul[@class="pagination-list"]/li/a[@class="pagination-link"][last()]/text()'

    # the base URL is hard-coded here, it is not read from the ini file
    javdb_url = 'https://javdb4.com/'
    set_url = javdb_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    # not really behind cloudflare but may prevent python scrape
    res = return_post_res(set_url, cookies={'over18': "1"}, behind_cloudflare=True).content
    root = etree.HTML(res.decode('utf-8'))

    # Results are matched up purely by position in each XPath result list.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    try:
        max_page = root.xpath(xpath_max_page)[-1]
    except IndexError:
        # no pagination links on this page
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
def javbus_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javbus listing page.

    The site base URL is fetched through ``return_config_string`` on every
    call and joined with ``page_template`` once ``page_num`` and
    ``url_parameter`` have been substituted into it.

    Args:
        page_template: URL path template, formatted with the ``page_num`` and
            ``url_parameter`` keyword arguments.
        page_num: Page number substituted into the template.
        url_parameter: Extra value substituted into the template.
        config: Not used by this function.

    Returns:
        A ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car`` (``javid`` and ``car`` come from the same XPath); the
        n-th match of each XPath goes into the n-th dict. ``max_page`` is the
        text of the second-to-last pagination link exactly as XPath returns
        it (not converted to ``int``), or ``page_num`` when fewer than two
        links exist or that text is empty.
    """
    xpath_dict = {
        'title': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@title',
        'javid': '//div[@class="photo-info"]/span/date[1]/text()',
        'img': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@src',
        'car': '//div[@class="photo-info"]/span/date[1]/text()',
    }
    xpath_max_page = '//ul[@class="pagination pagination-lg"]/li/a/text()'

    # force to get url from ini file each time
    javbus_url = return_config_string(['其他设置', 'javbus网址'])
    set_url = javbus_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # Results are matched up purely by position in each XPath result list.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    try:
        # NOTE(review): presumably the very last link is a "next page"
        # control, hence the second-to-last entry -- confirm against the site.
        max_page = root.xpath(xpath_max_page)[-2]
    except IndexError:
        # fewer than two pagination links on this page
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
def jav321_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one jav321 listing page.

    Args:
        page_template: URL path template, formatted with the ``page_num`` and
            ``url_parameter`` keyword arguments.
        page_num: Page number substituted into the template.
        url_parameter: Extra value substituted into the template.
        config: Not used by this function.

    Returns:
        A ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car``; the n-th match of each XPath goes into the n-th dict.
        ``car`` is normalised to ``<pre>-<digits>`` (see the inline comments)
        and falls back to the last space-separated token of the link text
        when ``DEFAULT_FILENAME_PATTERN`` does not match. ``max_page`` is
        ``int(page_num) + 1`` when the page has a "next" pager link and
        ``page_num`` can be converted to ``int``; otherwise it is
        ``page_num`` unchanged.
    """
    xpath_dict = {
        'title': '//div[@class="thumbnail"]/a/text()',
        'javid': '//div[@class="thumbnail"]/a/@href',  # raw href, not post-processed here
        'img': '//div[@class="thumbnail"]/a/img/@src',
        'car': '//div[@class="thumbnail"]/a/text()',  # extracted from the link text below
    }
    xpath_max_page = '//ul[@class="pager"]/li[@class="next"]/a/text()'
    max_page = page_num  # default value

    # the base URL is hard-coded here, it is not read from the ini file
    jav_url = 'https://www.jav321.com/'
    set_url = jav_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # Results are matched up purely by position in each XPath result list.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            if k == 'car':
                # the car is taken from the last space-separated token
                _preprocess = _value.split(' ')[-1]
                try:
                    name_group = re.search(DEFAULT_FILENAME_PATTERN, _preprocess)
                    name_digits = name_group.group('digit')

                    # only keep leading zeros up to a width of 3 digits:
                    # 045 stays, 0830 > 830, 1130 stays, 0002 > 002, 005 stays
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits))
                    name_digits = name_digits.rjust(3, '0')

                    _value = name_group.group('pre') + '-' + name_digits
                except AttributeError as e:
                    # re.search returned None: keep the unparsed token
                    print(f'cannot extract standard car format from {_preprocess} due to {e}')
                    _value = _preprocess

            jav_objs_raw[_i][k] = _value

    # This site only exposes a "next" link, so the best available estimate
    # of the last page is "current page + 1" whenever that link is present.
    if root.xpath(xpath_max_page):
        try:
            max_page = int(max_page) + 1
        except (TypeError, ValueError):
            # page_num is not integer-like; leave max_page as page_num
            pass

    return jav_objs_raw, max_page
def jav777_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one jav777 listing page.

    Args:
        page_template: URL path template, formatted with the ``page_num`` and
            ``url_parameter`` keyword arguments.
        page_num: Page number substituted into the template.
        url_parameter: Extra value substituted into the template.
        config: Not used by this function.

    Returns:
        A ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car``; the n-th match of each XPath goes into the n-th dict.
        ``car`` is derived from the post title: a leading ``(HD)`` is
        removed, then the title is normalised to ``<pre>-<digits>``; when
        ``DEFAULT_FILENAME_PATTERN`` does not match, the title without the
        ``(HD)`` prefix is kept. ``max_page`` is the text of the
        second-to-last link inside ``<center>`` exactly as XPath returns it
        (not converted to ``int``), or ``page_num`` when there is no such
        link or its text is empty.
    """
    xpath_dict = {
        'title': '//h2[@class="post-title"]/a/@title',
        'javid': '//div[@class="post-container"]/div/@id',
        'img': '//div[@class="featured-media"]/a/img/@src',
        'car': '//h2[@class="post-title"]/a/@title',
    }
    xpath_max_page = '//center/a[position() = (last()-1)]/text()'
    hd_prefix = '(HD)'

    # the base URL comes from the JAV777_URL constant
    jav777_url = JAV777_URL
    set_url = jav777_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # Results are matched up purely by position in each XPath result list.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            if k == 'car':
                # Remove the literal "(HD)" prefix only. str.lstrip('(HD)')
                # treats its argument as a character set and would also eat
                # leading 'H'/'D' letters of the code ("(HD)HND-1" -> "ND-1").
                if _value.startswith(hd_prefix):
                    _value = _value[len(hd_prefix):]

                try:
                    name_group = re.search(DEFAULT_FILENAME_PATTERN, _value)
                    name_digits = name_group.group('digit')

                    # only keep leading zeros up to a width of 3 digits:
                    # 045 stays, 0830 > 830, 1130 stays, 0002 > 002, 005 stays
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits))
                    name_digits = name_digits.rjust(3, '0')

                    _value = name_group.group('pre') + '-' + name_digits
                except AttributeError as e:
                    # re.search returned None: keep the title text instead of
                    # aborting the whole page because of one odd entry
                    print(f'cannot extract standard car format from {_value} due to {e}')

            jav_objs_raw[_i][k] = _value

    try:
        max_page = root.xpath(xpath_max_page)[0]
    except IndexError:
        # no pagination links on this page
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
def javlib_set_page(page_prefix: str, page_num: int, config=None) -> tuple:
    """Scrape one javlibrary listing page addressed by prefix and page number.

    The site base URL is fetched through ``return_config_string`` on every
    call; the requested URL is ``<base url><page_prefix><page_num>``.

    Args:
        page_prefix: URL path placed between the base URL and the page number.
        page_num: Page number appended to the URL.
        config: Mapping whose ``'proxies'`` and ``'cookies'`` entries are
            passed to the request. When ``None``, a deep copy of
            ``DEFAULT_JAVLIB_CONFIG`` is used.

    Returns:
        A ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car``; the n-th match of each XPath goes into the n-th dict.
        ``max_page`` is the result of ``find_max_page`` applied to the
        "last page" link, or ``page_num`` when the page has no such link.
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()',
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    # fill missing parameters; copy so the shared default is never mutated
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)

    lib_url = javlib_url + page_prefix + str(page_num)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url, proxies=config['proxies'], cookies=config['cookies']).content
    root = etree.HTML(res)

    # Results are matched up purely by position in each XPath result list.
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link on this page
        max_page = page_num

    return jav_objs_raw, max_page