def return_get_res(url, cookies=None, proxies=None, headers=None, encoding='utf-8', behind_cloudflare=False):
    """GET *url* and return the ``requests`` Response with ``res.encoding`` forced to *encoding*.

    Proxy settings are read from the ini file unless *proxies* is passed in
    (explicit argument wins). When *behind_cloudflare* is true the request is
    routed through ``cloudflare_get`` instead of plain ``requests.get``.

    Fix: the original used a mutable default argument ``cookies={}``; a
    ``None`` sentinel avoids the shared-default pitfall and matches the
    sibling helpers (``return_post_res`` etc.).
    """
    # preserve the original behavior of passing an empty dict downstream
    if cookies is None:
        cookies = {}
    if not headers:
        headers = DEFAULT_HEADERS
    # read settings from ini file
    use_proxy = return_config_string(['代理', '是否使用代理?'])
    # prioritize passed in proxies
    if use_proxy == '是' and not proxies:
        proxies = return_config_string(['代理', '代理IP及端口'])
    if behind_cloudflare:
        res = cloudflare_get(url, cookies=cookies, proxies=proxies)
    else:
        res = requests.get(url, headers=headers, cookies=cookies, proxies=proxies)
    res.encoding = encoding
    return res
def return_post_res(url, data=None, cookies=None, proxies=None, headers=None, encoding='utf-8', behind_cloudflare=False):
    """POST *data* to *url* and return the Response with encoding forced to *encoding*.

    Proxy settings are read from the ini file unless *proxies* is passed in.

    Fix: ``javlib_set_page`` in this module calls
    ``return_post_res(lib_url, behind_cloudflare=True)``, which the original
    signature rejected with a TypeError. Accept the keyword (default False,
    so existing callers are unaffected) and route through cloudscraper the
    same way ``return_html_text`` does. The dead ``else: pass`` branch is
    also removed.
    """
    if not headers:
        headers = DEFAULT_HEADERS
    # read settings from ini file
    use_proxy = return_config_string(['代理', '是否使用代理?'])
    # prioritize passed in proxies
    if use_proxy == '是' and not proxies:
        proxies = return_config_string(['代理', '代理IP及端口'])
    if behind_cloudflare:
        # cloudscraper is already used elsewhere in this module for
        # cloudflare-protected pages
        res = cloudscraper.create_scraper().post(url, data, headers=headers, cookies=cookies, proxies=proxies)
    else:
        res = requests.post(url, data, headers=headers, cookies=cookies, proxies=proxies)
    res.encoding = encoding
    return res
def return_html_text(url, cookies=None, proxies=None, encoding='utf-8'):
    """Fetch *url* with optional cookies/proxies and return the page text.

    Proxy settings come from the ini file unless *proxies* is given.

    NOTE(review): a later definition of the same name in this module shadows
    this one, so this version is dead code at import time.
    """
    # ini-file proxy toggle; an explicitly passed proxy always wins
    use_proxy = return_config_string(['代理', '是否使用代理?'])
    if use_proxy == '是' and not proxies:
        proxies = return_config_string(['代理', '代理IP及端口'])
    response = requests.get(url, cookies=cookies, proxies=proxies)
    response.encoding = encoding
    return response.text
def return_html_text(url, cookies=None, proxies=None, encoding='utf-8', behind_cloudflare=False):
    """Fetch *url* and return the decoded page text.

    Proxy settings come from the ini file unless *proxies* is given; when
    *behind_cloudflare* is true the request goes through a cloudscraper
    session instead of plain requests.
    """
    # ini-file proxy toggle; an explicitly passed proxy always wins
    use_proxy = return_config_string(['代理', '是否使用代理?'])
    if use_proxy == '是' and not proxies:
        proxies = return_config_string(['代理', '代理IP及端口'])
    # pick the transport, then issue one GET either way
    getter = cloudscraper.create_scraper().get if behind_cloudflare else requests.get
    page = getter(url, cookies=cookies, proxies=proxies)
    page.encoding = encoding
    return page.text
def send_emby_images(image_folder_path):
    """Upload local actress portraits to an emby server.

    For every actress known to emby, looks for ``<name>.jpg`` (preferred)
    or ``<name>.png`` under *image_folder_path* and posts it via the emby
    API. Yields one JSON line per actress plus a final summary line.

    Raises:
        Exception: when *image_folder_path* does not exist.

    Fix: log-message typo ``doen't`` -> ``doesn't``; the duplicated
    jpg/png branches are collapsed into one extension loop.
    """
    # init
    num = 0
    up_num = 0
    if not os.path.exists(image_folder_path):
        print('current path: {}'.format(os.getcwd()))
        raise Exception('{} image folder doesn\'t exist, please specify correct path'.format(image_folder_path))
    emby_url = return_config_string(["emby专用", "网址"])
    api_key = return_config_string(["emby专用", "api id"])
    # try correct emby url with /
    if not emby_url.endswith('/'):
        emby_url += '/'
    try:
        for actress in list_emby_actress(emby_url, api_key):
            num += 1
            if num % 500 == 0:
                print('have processed', num, '个actress')
            actress_name = actress['Name']
            actress_id = actress['Id']
            res_info = {'log': f'processed 女优:{actress_name}, ID:{actress_id}'}
            # look for a local portrait, jpg preferred over png
            file_path = None
            for ext in ('jpg', 'png'):
                candidate = os.path.join(image_folder_path, f'{actress_name}.{ext}')
                if os.path.isfile(candidate):
                    file_path = candidate
                    break
            if file_path:
                up_num += post_image_to_actress(actress_id, file_path, emby_url, api_key)
            else:
                res_info = {'log': f'{actress_name} image file doesn\'t exist'}
            print(res_info)
            yield json.dumps(res_info)+'\n'
    except requests.exceptions.ConnectionError:
        print('emby服务端无法访问,请检查:', emby_url, '\n')
    except Exception as err:
        traceback.print_exc()
        print('发生未知错误,请截图给作者:', emby_url, err)
    print(f'成功upload {up_num} 个女优头像!')
    yield json.dumps({'log': f'成功upload {up_num} 个女优头像!'})+'\n'
def javlib_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javlibrary listing page.

    *page_template* is formatted with ``page_num``/``url_parameter`` and
    appended to the javlibrary base url from the ini file.

    Returns:
        (jav_objs_raw, max_page): a defaultlist of per-video dicts
        (title/javid/img/car) and the highest page number found.

    Fix: return annotation corrected from ``dict`` to ``tuple`` — the
    function returns a 2-tuple.
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'
    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])
    lib_url = javlib_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {lib_url}')
    # NOTE(review): requires return_post_res to accept behind_cloudflare;
    # the helper defined earlier in this file does not — confirm which
    # version is live.
    res = return_post_res(lib_url, behind_cloudflare=True).content
    root = etree.HTML(res)
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            jav_objs_raw[_i].update({k: _value})
    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link means we are already on the only/last page
        max_page = page_num
    return jav_objs_raw, max_page
def javbus_magnet_search(car: str):
    """Return magnet-link candidates for *car* scraped from javbus.

    Fetches the video page to extract the ``gid``, then queries the magnet
    ajax endpoint. Each element of the returned defaultlist is a dict with
    'magnet', 'title', 'size', and a sortable integer 'size_sort'.

    Fix: the chained ``.strip('\\t').strip('\\r').strip('\\n').strip()`` is
    equivalent to a single ``.strip()`` (which removes all leading/trailing
    whitespace); the cleaned value is also computed once instead of twice.
    """
    jav_url = return_config_string(['其他设置', 'javbus网址'])
    gid_match = r'.*?var gid = (\d*);.*?'
    magnet_xpath = {
        'magnet': '//tr/td[position()=1]/a[1]/@href',
        'title': '//tr/td[position()=1]/a[1]/text()',
        'size': '//tr/td[position()=2]/a[1]/text()'
    }
    main_url_template = jav_url+'{car}'
    magnet_url_template = jav_url+'ajax/uncledatoolsbyajax.php?gid={gid}&uc=0'
    res = return_get_res(main_url_template.format(car=car)).text
    gid = re.search(gid_match, res).groups()[0]
    # the ajax endpoint requires the main video page as referer
    res = return_get_res(magnet_url_template.format(gid=gid), headers={'referer': main_url_template.format(car=car)}).content
    root = etree.HTML(res)
    magnets = defaultlist(dict)
    for k, v in magnet_xpath.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            cleaned = _value.strip()
            magnets[_i].update({k: cleaned})
            if k == 'size':
                magnets[_i].update({'size_sort': parsed_size_to_int(cleaned)})
    return magnets
def javbus_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javbus listing page.

    *page_template* is formatted with ``page_num``/``url_parameter`` and
    appended to the javbus base url from the ini file.

    Returns:
        (jav_objs_raw, max_page): a defaultlist of per-video dicts and the
        highest page number found (falls back to *page_num*).

    Fixes: bare ``except:`` narrowed to ``IndexError`` (the only failure
    mode of the ``[-2]`` lookup — a bare except would also hide real bugs);
    return annotation corrected from ``dict`` to ``tuple``.
    """
    xpath_dict = {
        'title': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@title',
        'javid': '//div[@class="photo-info"]/span/date[1]/text()',
        'img': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@src',
        'car': '//div[@class="photo-info"]/span/date[1]/text()'
    }
    xpath_max_page = '//ul[@class="pagination pagination-lg"]/li/a/text()'
    # force to get url from ini file each time
    javbus_url = return_config_string(['其他设置', 'javbus网址'])
    set_url = javbus_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')
    res = return_post_res(set_url).content
    root = etree.HTML(res)
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            jav_objs_raw[_i].update({k: _value})
    try:
        # second-to-last pagination entry holds the last page number
        max_page = root.xpath(xpath_max_page)[-2]
    except IndexError:
        max_page = page_num
    if not max_page:
        max_page = page_num
    return jav_objs_raw, max_page
def read_local_ini():
    """Flask view: expose the local ini configuration as JSON.

    With a 'filter_dict' query argument (a Python-literal dict mapping
    result key -> ini lookup path), only those entries are resolved;
    lookup failures are collected under 'error'. Without it, the whole
    parsed ini file is returned.
    """
    raw_filter = request.args.get('filter_dict')
    if not raw_filter:
        # convert returned obj to dict format
        return jsonify({'local_config': load_ini_file()._sections})
    resolved = {}
    errors = []
    # literal_eval only parses Python literals, so the query string
    # cannot execute code here
    for key, ini_path in literal_eval(raw_filter).items():
        try:
            resolved[key] = return_config_string(ini_path)
        except IniNotFoundException as exc:
            errors.append(str(exc))
    return jsonify({'local_config': resolved, 'error': errors})
def __init__(self, *args, **kwargs):
    """Configure javlibrary-specific xpaths and the base url."""
    super(JavLibraryScraper, self).__init__(*args, **kwargs)
    self.source = 'javlibrary'
    # single-valued fields scraped from a video page
    search_field = {
        'title': '//title/text()',
        'studio': '//tr[td="制作商:"]/td[2]/span/a/text()',
        'premiered': '//tr[td="发行日期:"]/td[2]/text()',
        # 'year' is derived from the release date elsewhere
        'length': '//tr[td="长度:"]/td[2]/span/text()',
        'director': '//tr[td="导演:"]/td[2]/text()',
        'image': '//img[@id="video_jacket_img"]/@src',
        'score': '//span[@class="score"]/text()',
    }
    # multi-valued fields (lists)
    search_list_field = {
        # 'plot' has no good source on javlibrary
        'all_actress': '//span[@class="star"]/a/text()',
        'genres': '//span[@class="genre"]/a/text()',
    }
    self.xpath_dict = {
        'search_field': search_field,
        'search_list_field': search_list_field,
    }
    self.jav_url = return_config_string(['其他设置', 'javlibrary网址'])
def __init__(self, *args, **kwargs):
    """Configure javbus-specific xpaths and the base url.

    Fix: ``self.source`` was set to ``'javlibrary'`` — a copy-paste
    leftover from JavLibraryScraper; everything else here (base url,
    xpaths) is javbus. NOTE(review): if any caller dispatches on
    ``source == 'javlibrary'`` for this class, confirm before shipping.
    """
    super(JavBusScraper, self).__init__(*args, **kwargs)
    self.source = 'javbus'
    self.xpath_dict = {
        # single-valued fields scraped from a video page
        'search_field': {
            'title': '//a[@class="bigImage"]/img/@title',
            'studio': '//p[span="製作商:"]/a/text()',
            'premiered': '//p[span="發行日期:"]/text()',
            # 'year' is derived from the release date elsewhere
            'length': '//p[span="長度:"]/text()',
            'director': '//p[span="導演:"]/a/text()',
            'image': '//a[@class="bigImage"]/img/@src',
            # 'score' has no good source on javbus
        },
        # multi-valued fields (lists)
        'search_list_field': {
            # 'plot' has no good source
            'all_actress': '//span[@class="genre" and @onmouseover]/a/text()',
            'genres': '//span[@class="genre"]/a[contains(@href, "genre")]/text()'
        },
    }
    self.jav_url = return_config_string(['其他设置', 'javbus网址'])
def javlib_set_page(page_prefix: str, page_num: int, config=None) -> tuple:
    """Scrape one javlibrary listing page (*page_prefix* + *page_num*).

    Returns:
        (jav_objs_raw, max_page): a defaultlist of per-video dicts
        (title/javid/img/car) and the highest page number found.

    Fixes: ``config == None`` replaced with the idiomatic ``config is
    None``; return annotation corrected from ``dict`` to ``tuple``.
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'
    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])
    # fill missing parameters
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)
    lib_url = javlib_url + page_prefix + str(page_num)
    print(f'accessing {lib_url}')
    res = return_post_res(lib_url, proxies=config['proxies'], cookies=config['cookies']).content
    root = etree.HTML(res)
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            jav_objs_raw[_i].update({k: _value})
    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link means we are already on the only/last page
        max_page = page_num
    return jav_objs_raw, max_page
def parse_javlib(jav_obj: dict, config=None) -> dict:
    """Fill *jav_obj* with metadata scraped from javlibrary.

    Searches javlibrary by ``jav_obj['car']``, follows the first hit when
    the search lands on a multi-result page, then extracts the configured
    single- and multi-valued fields into *jav_obj* (mutated in place and
    returned).

    Raises:
        JAVNotFoundException: when the search returns no results.

    Fixes: ``config == None`` -> ``config is None``; dead ``if True and``
    toggle removed; the two leftover ``import ipdb; ipdb.set_trace()``
    debugger traps and the bare ``except`` are replaced with graceful
    fallbacks (keep the raw title / the cn-only actress name).
    """
    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])
    # fill missing parameters
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)
    # perform search first
    lib_search_url = javlib_url + 'vl_searchbyid.php?keyword=' + jav_obj['car']
    print(f'accessing {lib_search_url}')
    jav_html = return_html_text(lib_search_url, proxies=config['proxies'], cookies=config['cookies'])
    # The search result page is usually the video page itself, but it can
    # also be a multi-result listing.
    if jav_obj['car'].startswith('T28'):
        # special filter for T28
        title_re = re.search(r'<title>(T28-\d{1,5}.+?) - JAVLibrary</title>', jav_html)
        # update title re
        config['search_field'][
            'title'] = r'<title>(T28-\d{1,5}.+?) - JAVLibrary</title>'
    else:
        title_re = re.search(
            r'<title>([a-zA-Z]{1,6}-\d{1,5}.+?) - JAVLibrary</title>', jav_html)
    # Case 1: the title matched — we landed directly on the video page.
    if title_re:
        pass
    # Case 2/3: multi-result listing or no result at all.
    else:
        # look for result links; pattern targets the first listed video
        search_results = re.findall(r'v=javli(.+?)" title=".+?-\d+?[a-z]? ', jav_html)
        if search_results:
            # take the first hit only (e.g. avop-00127bod variants are ignored)
            result_first_url = javlib_url + '?v=javli' + search_results[0]
            jav_html = return_html_text(result_first_url, proxies=config['proxies'], cookies=config['cookies'])
        else:
            # Case 3: nothing found for this car
            raise JAVNotFoundException('{} cannot be found in javlib'.format(
                jav_obj['car']))
    print('>>正在处理:', jav_obj['car'])
    # process standard fields
    # we can use update here since each field only allows one value
    jav_obj.update(re_parse_html(config['search_field'], jav_html))
    # process list fields
    for k, v in config['search_list_field'].items():
        for each_v in re.findall(v, jav_html):
            jav_obj.setdefault(k, []).append(each_v)
    # get rid of car in title
    if 'title' in jav_obj:
        # NOTE(review): the separator in the original pattern was a literal
        # space-like character garbled in transit; \s matches space or
        # newline between car and title — confirm against live pages.
        title_re = re.search(r'(.+?)\s(.+)', jav_obj['title'])
        if title_re:
            jav_obj['title'] = title_re.group(2)
        # otherwise keep the raw title (was: drop into ipdb debugger)
    # process score to make it more realistic
    if 'score' in jav_obj:
        score = (float(jav_obj['score']) - 4) * 5 / 3
        if score >= 0:
            score = '%.1f' % score
            jav_obj['score'] = str(float(score) * 10)
        # NOTE(review): negative rescaled scores are left untouched —
        # confirm this matches the original (collapsed) indentation intent.
    # extra processing for actress names for japanese name
    jav_obj['all_actress'] = []
    actress_jav_ids = r'<a href=\"vl_star\.php\?s=(.+?)\" rel=\"tag\">(.+?)</a>'
    if 'ja' not in javlib_url:
        # read from ini file
        javlib_url_jp = javlib_url.replace('cn', 'ja')
        for act_id_re in re.findall(actress_jav_ids, jav_html):
            if len(act_id_re) != 2:
                print(f'skipping {act_id_re}, not enough info')
                continue
            ind_url_jp = javlib_url_jp + f'vl_star.php?s={act_id_re[0]}'
            print(f'requesting {ind_url_jp} for jp name')
            jp_html_text = return_html_text(ind_url_jp)
            act_name = act_id_re[1]
            # compare jp with cn name; merge jp into cn name when they differ
            jp_name_match = re.search(r'<div class="boxtitle">(.+?)のビデオ</div>', jp_html_text)
            if jp_name_match:
                jp_name = jp_name_match.group(1)
                if jp_name != act_name:
                    act_name = '{}[{}]'.format(act_name, jp_name)
            # else: keep the cn name as-is (was: drop into ipdb debugger)
            jav_obj['all_actress'].append(act_name)
    # NOTE(review): when the base url is already the 'ja' site, all_actress
    # stays [] here, discarding the list-field parse above — confirm intent.
    # force set year if not detected
    if not jav_obj.get('year'):
        jav_obj['year'] = 'unknown'
    return jav_obj
def send_emby_images(self, image_folder_path=None):
    """Upload actress portraits to an emby server, yielding JSON progress lines.

    For every actress returned by ``self.actress_yielder``:
    1. optionally skip if she already has images (unless ``self.replace``);
    2. try a local ``<name>.jpg`` / ``<name>.png`` under *image_folder_path*;
    3. otherwise scrape an image url via WarashiScraper, memoized in
       ``self.walked_actress``.
    A final summary line lists successes and failures.

    Args:
        image_folder_path: optional directory of local portrait files.

    Yields:
        str: one JSON document per actress plus a final summary, each
        terminated with a newline.
    """
    # init
    num = 0
    up_num = []        # names successfully uploaded
    failed_names = []  # names whose image could not be found
    emby_url = return_config_string(["emby专用", "网址"], config=self.config)
    api_key = return_config_string(["emby专用", "api id"], config=self.config)
    image_scraper = WarashiScraper()
    # try correct emby url with /
    if not emby_url.endswith('/'):
        emby_url += '/'
    try:
        for actress in self.actress_yielder(emby_url, req_site='emby'):
            num += 1
            if num % 500 == 0:
                print('have processed', num, '个actress')
            actress_name = actress['Name']
            actress_id = actress['Id']
            # names like "cn_name[jp_name]" carry an alternate search term
            # in the brackets
            actress_formatter = r'(.+?)\[(.+?)\]'
            actress_groups = re.search(actress_formatter, actress_name)
            if actress_groups and len(actress_groups.groups()) == 2:
                search_term = actress_groups.groups()[1]
                print(f'use {search_term} for search')
            else:
                search_term = None
            # unless replacing, skip actresses that already have images
            if not self.replace and actress.get('ImageTags', {}) != {}:
                res_info = {
                    'log':
                    f'skipping 女优:{actress_name}, already has existing images'
                }
                yield json.dumps(res_info, ensure_ascii=False) + '\n'
                continue
            has_local_image = False
            if image_folder_path:
                # prefer a local file, jpg before png
                if os.path.isfile(
                        os.path.join(image_folder_path,
                                     f'{actress_name}.jpg')):
                    file_path = os.path.join(image_folder_path,
                                             f'{actress_name}.jpg')
                    self.post_image_to_actress(actress_id, file_path,
                                               emby_url, api_key)
                    up_num.append(actress_name)
                    has_local_image = True
                elif os.path.isfile(
                        os.path.join(image_folder_path,
                                     f'{actress_name}.png')):
                    file_path = os.path.join(image_folder_path,
                                             f'{actress_name}.png')
                    self.post_image_to_actress(actress_id, file_path,
                                               emby_url, api_key)
                    up_num.append(actress_name)
                    has_local_image = True
            if not has_local_image:
                try:
                    # fall back to a scraped image url, cached per name in
                    # self.walked_actress
                    if not self.walked_actress.get(
                            search_term or actress_name, ''):
                        image_url = image_scraper.return_image_by_name(
                            search_term or actress_name)
                        self.walked_actress[search_term
                                            or actress_name] = image_url
                    else:
                        image_url = self.walked_actress[search_term
                                                        or actress_name]
                    self.post_image_to_actress(actress_id, image_url,
                                               emby_url, api_key)
                    up_num.append(actress_name)
                except ActorNotFoundException as e:
                    # scraper could not find this actress; record and move on
                    res_info = {'log': str(e)}
                    failed_names.append(actress_name)
                    yield json.dumps(res_info, ensure_ascii=False) + '\n'
                    continue
                except Exception as e:
                    # best-effort: surface the traceback to the client and
                    # keep going with the next actress
                    traceback_str = traceback.format_exc()
                    yield json.dumps(traceback_str, ensure_ascii=False) + '\n'
                    continue
            res_info = {
                'log': f'processed 女优:{actress_name}, ID:{actress_id}'
            }
            yield json.dumps(res_info, ensure_ascii=False) + '\n'
    except requests.exceptions.ConnectionError:
        print('emby服务端无法访问,请检查:', emby_url)
    except Exception as err:
        traceback.print_exc()
        print('发生未知错误,请截图给作者:', emby_url, err)
    print(f'成功upload {len(up_num)} 个女优头像!')
    yield json.dumps(
        {
            'log':
            f'成功upload {len(up_num)} 个女优头像!succeeded on {up_num} \n failed on {failed_names}'
        },
        ensure_ascii=False) + '\n'
def setup_credentials(self):
    """Return a dict holding the emby api key read from the ini config."""
    return {
        'api_key': return_config_string(["emby专用", "api id"],
                                        config=self.config)
    }