def get_subtitles(self, video_name, sub_num=5):
    print(prefix + ' Searching ZIMUZU...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(ZimuzuDownloader.search_url.format(keyword),
                  headers=Downloader.header, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        tab_text = bs_obj.find('div', {'class': 'article-tab'}).text
        tab_text = tab_text.encode('utf8') if py == 2 else tab_text
        if '字幕(0)' not in tab_text:
            for one_box in bs_obj.find_all('div', {'class': 'search-item'}):
                sub_name = ZimuzuDownloader.choice_prefix + \
                    one_box.find('strong', {'class': 'list_title'}).text
                sub_name = sub_name.encode('utf8') if py == 2 else sub_name
                if info_dict['type'] == 'movie' and '美剧字幕' in sub_name:
                    continue
                a = one_box.find('a')
                text = a.text.encode('utf8') if py == 2 else a.text
                sub_url = ZimuzuDownloader.site_url + a.attrs['href']
                type_score = 0
                type_score += ('英文' in text) * 1
                type_score += ('繁体' in text) * 2
                type_score += ('简体' in text) * 4
                type_score += ('中英' in text) * 8
                sub_dict[sub_name] = {
                    'lan': type_score,
                    'link': sub_url,
                    'session': None
                }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    # the first candidate is not bilingual, re-sort by language score
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]['lan'], reverse=True))
    return sub_dict
def get_subtitles(self, video_name, sub_num=5):
    print(PREFIX + " Searching ZIMUZU...", end="\r")
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = " ".join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(
            ZimuzuDownloader.search_url.format(keyword),
            headers=Downloader.header,
            timeout=10,
        )
        bs_obj = BeautifulSoup(r.text, "html.parser")
        tab_text = bs_obj.find("div", {"class": "article-tab"}).text
        if "字幕(0)" not in tab_text:
            for one_box in bs_obj.find_all("div", {"class": "search-item"}):
                sub_name = (
                    ZimuzuDownloader.choice_prefix
                    + one_box.find("strong", {"class": "list_title"}).text
                )
                if info_dict["type"] == "movie" and "美剧字幕" in sub_name:
                    continue
                a = one_box.find("a")
                text = a.text
                sub_url = ZimuzuDownloader.site_url + a.attrs["href"]
                type_score = 0
                type_score += ("英文" in text) * 1
                type_score += ("繁体" in text) * 2
                type_score += ("简体" in text) * 4
                type_score += ("中英" in text) * 8
                sub_dict[sub_name] = {
                    "lan": type_score,
                    "link": sub_url,
                    "session": None,
                }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], "")
            keywords.pop(-1)
            continue
        break
    # the first candidate is not bilingual, re-sort by language score
    if len(sub_dict.items()) > 0 and list(sub_dict.items())[0][1]["lan"] < 8:
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]["lan"], reverse=True)
        )
    return sub_dict
def get_subtitles(self, keywords, sub_num=5):
    print(prefix + ' Searching ZIMUZU...', end='\r')
    keywords = list(keywords)
    keyword = ''
    for one in keywords:
        keyword += (one + ' ')
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(self.search_url.format(keyword),
                  headers=self.headers, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        tab_text = bs_obj.find('div', {'class': 'article-tab'}).text
        tab_text = tab_text.encode('utf8') if py == 2 else tab_text
        if '字幕(0)' not in tab_text:
            for one_box in bs_obj.find_all('div', {'class': 'search-item'}):
                sub_name = '[ZMZ]' + one_box.find('p').find('font').text
                sub_name = sub_name.encode('utf8') if py == 2 else sub_name
                a = one_box.find('a')
                text = a.text.encode('utf8') if py == 2 else a.text
                sub_url = self.site_url + a.attrs['href']
                type_score = 0
                type_score += ('英文' in text) * 1
                type_score += ('繁体' in text) * 2
                type_score += ('简体' in text) * 4
                type_score += ('中英' in text) * 8
                sub_dict[sub_name] = {'lan': type_score, 'link': sub_url}
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    # the first candidate is not bilingual, re-sort by language score
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]['lan'], reverse=True))
    return sub_dict
def get_subtitles(self, keywords, sub_num=5):
    """
    Take a keyword list and return an ordered dict.

    keywords: keyword list, sorted by importance in descending order
    sub_num:  number of subtitle results, 5 by default

    Returns a subtitle dict, sorted by language score in descending order:
        {'subtitle name': {'lan': language score, 'link': subtitle url}}
    Language score: +1 for English, +2 for Traditional Chinese,
    +4 for Simplified Chinese, +8 for bilingual.
    """
    print('├ Searching SUBHD...', end='\r')
    keywords = list(keywords)
    keyword = ''
    for one in keywords:
        keyword += (one + ' ')
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(self.search_url + keyword, headers=self.headers)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        if '总共 0 条' not in bs_obj.find('small').text:
            for one_box in bs_obj.find_all('div', {'class': 'box'}):
                a = one_box.find('div', {'class': 'd_title'}).find('a')
                sub_url = self.site_url + a.attrs['href']
                sub_name = '[SUBHD]' + a.text
                if '/ar1/' in a.attrs['href']:
                    type_score = 0
                    type_score += ('英文' in one_box.text) * 1
                    type_score += ('繁体' in one_box.text) * 2
                    type_score += ('简体' in one_box.text) * 4
                    type_score += ('双语' in one_box.text) * 8
                    sub_dict[sub_name] = {'lan': type_score, 'link': sub_url}
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if len(sub_dict.items()) > 0 and list(sub_dict.items())[0][1]['lan'] < 8:
        # the first candidate is not bilingual, re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]['lan'], reverse=True))
    return sub_dict
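# A note on the 'lan' score used throughout these downloaders: it is a bitmask
# (English=1, Traditional=2, Simplified=4, bilingual=8), so the languages in a
# result can be recovered with bitwise tests. A minimal sketch; the helper name
# decode_lan_score is illustrative, not part of the project:
def decode_lan_score(score):
    """Return the set of language tags encoded in a 'lan' score."""
    flags = {1: 'English', 2: 'Traditional Chinese',
             4: 'Simplified Chinese', 8: 'bilingual'}
    return {name for bit, name in flags.items() if score & bit}

# e.g. decode_lan_score(12) == {'Simplified Chinese', 'bilingual'}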
def get_subtitles(self, keywords, sub_num=5):
    print(prefix + " Searching ZIMUZU...", end="\r")
    keywords = list(keywords)
    keyword = ""
    for one in keywords:
        keyword += one + " "
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(self.search_url.format(keyword),
                  headers=self.headers, timeout=10)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        tab_text = bs_obj.find("div", {"class": "article-tab"}).text
        if "字幕(0)" not in tab_text:
            for one_box in bs_obj.find_all("div", {"class": "search-item"}):
                sub_name = "[ZMZ]" + one_box.find("p").find("font").text
                a = one_box.find("a")
                text = a.text
                sub_url = self.site_url + a.attrs["href"]
                sub_dict[sub_name] = {
                    "lan": get_type_score(text),
                    "link": sub_url,
                    "version": one_box.find("font", "f4").text,
                }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], "")
            keywords.pop(-1)
            continue
        break
    # the first candidate is not bilingual, re-sort by language score
    if len(sub_dict.items()) > 0 and list(sub_dict.items())[0][1]["lan"] < 8:
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]["lan"], reverse=True)
        )
    return sub_dict
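# Several versions above call get_type_score(), which is not defined in this
# section. A minimal sketch reconstructed from the inline scoring used by the
# other versions; the exact keyword list of the real helper is an assumption:
def get_type_score(text):
    """Score a result title by the languages it advertises."""
    type_score = 0
    type_score += ("英文" in text) * 1   # English
    type_score += ("繁体" in text) * 2   # Traditional Chinese
    type_score += ("简体" in text) * 4   # Simplified Chinese
    type_score += ("双语" in text) * 8   # bilingual
    return type_score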
def get_path_name(self, args):
    """
    Take the input video name or path and build a dict describing each
    video's location and whether a subtitle already exists.

    video_dict: {'path': path, 'have_subtitle': sub_exists}
    """
    mix_str = args.replace('"', '')
    video_dict = order_dict()
    if os.path.isdir(mix_str):  # a directory
        for root, dirs, files in os.walk(mix_str):
            for one_name in files:
                suffix = os.path.splitext(one_name)[1]
                # check whether the suffix is a video format
                if suffix not in self.video_format_list:
                    continue
                v_name_no_format = os.path.splitext(one_name)[0]
                sub_exists = max(
                    list(
                        map(
                            lambda sub_type:
                            int(v_name_no_format + sub_type in files),
                            self.sub_format_list
                        )
                    )
                )
                video_dict[one_name] = {'path': root,
                                        'have_subtitle': sub_exists}
    elif os.path.isabs(mix_str):  # an absolute path to one video
        v_path, v_name = os.path.split(mix_str)
        v_name_no_format = os.path.splitext(v_name)[0]
        sub_exists = max(
            list(
                map(
                    lambda sub_type: os.path.exists(
                        os.path.join(v_path, v_name_no_format + sub_type)
                    ),
                    self.sub_format_list
                )
            )
        )
        video_dict[v_name] = {'path': os.path.dirname(mix_str),
                              'have_subtitle': sub_exists}
    else:  # a bare video name without a path
        video_dict[mix_str] = {'path': os.getcwd(), 'have_subtitle': 0}
    return video_dict
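# The sub_exists computation above reduces to: does any '<video stem> +
# <subtitle suffix>' file sit next to the video? A standalone sketch of the
# same check; the default suffix list here is an assumption:
import os

def subtitle_exists(video_path, sub_format_list=('.ass', '.srt', '.sub')):
    """Return True when a same-named subtitle file exists beside the video."""
    stem = os.path.splitext(video_path)[0]
    return any(os.path.exists(stem + ext) for ext in sub_format_list)

# e.g. subtitle_exists('/videos/foo.mkv') is True when /videos/foo.ass exists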
def start(self):
    all_video_dict = self.get_path_name(self.arg_name, self.sub_store_path)

    for one_video, video_info in all_video_dict.items():
        self.s_error = ''  # reset the error records
        self.f_error = ''

        try:
            print('\n' + prefix + ' ' + one_video)  # print the current video and its path
            print(prefix + ' ' + video_info['path'] + '\n' + prefix)

            if video_info['have_subtitle'] and not self.over:
                print(prefix
                      + " subtitle already exists, add '-o' to replace it.")
                continue

            sub_dict = order_dict()
            for i, downloader in enumerate(self.downloader):
                try:
                    sub_dict.update(
                        downloader.get_subtitles(one_video,
                                                 sub_num=self.sub_num))
                except ValueError as e:
                    if str(e) == 'Zimuku搜索结果出现未知结构页面':
                        print(prefix + ' warn: ' + str(e))
                    else:
                        raise e
                except (exceptions.Timeout, exceptions.ConnectionError):
                    print(prefix + ' connect timeout, search next site.')
                    if i < (len(self.downloader) - 1):
                        continue
                    else:
                        print(prefix + ' PLEASE CHECK YOUR NETWORK STATUS')
                        sys.exit(0)
                if len(sub_dict) >= self.sub_num:
                    break

            if len(sub_dict) == 0:
                self.s_error += 'no search results. '
                continue

            extract_sub_names = []
            # walk through the subtitle archives until one yields a guessed subtitle
            while not extract_sub_names and len(sub_dict) > 0:
                exit, sub_choices = self.choose_subtitle(sub_dict)
                if exit:
                    break
                for i, choice in enumerate(sub_choices):
                    sub_choice, link, session = choice
                    sub_dict.pop(sub_choice)
                    try:
                        if i == 0:
                            error, n_extract_sub_names = self.process_archive(
                                one_video, video_info, sub_choice, link,
                                session)
                        else:
                            error, n_extract_sub_names = self.process_archive(
                                one_video, video_info, sub_choice, link,
                                session, rename=False, delete=False)
                        if error:
                            print(prefix + ' error: ' + error)
                            print(prefix)
                            continue
                        elif not n_extract_sub_names:
                            print(prefix
                                  + ' no matched subtitle in this archive')
                            continue
                        else:
                            extract_sub_names += n_extract_sub_names
                    except TypeError:
                        print(format_exc())
                        continue
                    except rarfile.BadRarFile as e:
                        # TypeError is already handled above, so only the
                        # broken-archive case remains here
                        print(prefix + ' Error:' + str(e))
                        continue

        except rarfile.RarCannotExec:
            self.s_error += 'Unrar not installed?'
        except AttributeError:
            self.s_error += 'unknown error. try again.'
            self.f_error += format_exc()
        except Exception as e:
            self.s_error += str(e) + '. '
            self.f_error += format_exc()
        finally:
            if ('extract_sub_names' in dir()
                    and not extract_sub_names
                    and len(sub_dict) == 0):
                # automatic mode: none of the archives yielded a guessed subtitle
                self.s_error += " failed to guess one subtitle,"
                self.s_error += " use '-q' to try query mode."
            if self.s_error and not self.debug:
                self.s_error += " add --debug to get more info of the error"

            if self.s_error:
                self.failed_list.append({
                    'name': one_video,
                    'path': video_info['path'],
                    'error': self.s_error,
                    'trace_back': self.f_error
                })
                print(prefix + ' error:' + self.s_error)

    if len(self.failed_list):
        print('\n===============================', end='')
        print('FAILED LIST===============================\n')
        for i, one in enumerate(self.failed_list):
            print('%2s. name: %s' % (i + 1, one['name']))
            print('%3s path: %s' % ('', one['path']))
            print('%3s info: %s' % ('', one['error']))
            if self.debug:
                print('%3s TRACE_BACK: %s' % ('', one['trace_back']))

    print('\ntotal: %s success: %s fail: %s\n' % (
        len(all_video_dict),
        len(all_video_dict) - len(self.failed_list),
        len(self.failed_list)))

    return {
        'total': len(all_video_dict),
        'success': len(all_video_dict) - len(self.failed_list),
        'fail': len(self.failed_list),
        'fail_videos': self.failed_list
    }
def get_subtitles(self, keywords, sub_num=10):
    print(prefix + ' Searching ZIMUKU...', end='\r')
    keywords = list(keywords)
    keyword = ' '.join(keywords)
    info = guessit(keyword)
    keywords.pop(0)
    keywords.insert(0, info['title'])
    if info.get('season'):
        season = str(info['season']).zfill(2)
        keywords.insert(1, 's' + season)
    sub_dict = order_dict()
    s = requests.session()
    s.headers.update(self.headers)

    while True:  # search with the current keyword
        r = s.get(self.search_url + keyword, timeout=10)
        if py == 2:
            html = r.text.encode('utf8')
        else:
            html = r.text
        if '搜索不到相关字幕' not in html:
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            if bs_obj.find('div', {'class': 'item'}):
                # combined search result page
                for item in bs_obj.find_all('div', {'class': 'item'}):
                    title_boxes = item.find(
                        'div', {'class': 'title'}).find_all('p')
                    title_box = title_boxes[0]
                    sub_title_box = title_boxes[1]
                    if py == 2:
                        item_title = title_box.text.encode('utf8')
                        item_sub_title = sub_title_box.text.encode('utf8')
                    else:
                        item_title = title_box.text
                        item_sub_title = sub_title_box.text
                    item_info = guessit(item_title)
                    if info.get('year') and item_info.get('year'):
                        if info['year'] != item_info['year']:
                            # mismatched year, skip
                            continue
                    item_titles = [
                        item_info.get('title', '').lower(),
                        item_info.get('alternative_title', '').lower()
                    ] + item_sub_title.lower().strip().split(',')
                    title_included = sum([
                        1 for _ in item_sub_title
                        if info['title'].lower() not in _
                    ])
                    if title_included == 0:
                        # title extracted by guessit does not match;
                        # check word overlap before skipping
                        item_title_split = \
                            [one.split() for one in item_titles]
                        info_title_split = info['title'].lower().split()
                        sum1 = sum([1 for _ in info_title_split
                                    if _ in item_title_split[0]])
                        sum2 = sum([1 for _ in info_title_split
                                    if _ in item_title_split[1]])
                        if not (sum1 / len(info_title_split) >= 0.5
                                or sum2 / len(info_title_split) >= 0.5):
                            # mismatched title, skip
                            continue
                    for a in item.find_all('td', {'class': 'first'})[:3]:
                        a = a.a
                        a_link = self.site_url + a.attrs['href']
                        if py == 2:
                            a_title = a.text.encode('utf8')
                        else:
                            a_title = a.text
                        a_title = '[ZIMUKU]' + a_title
                        sub_dict[a_title] = {'type': 'default',
                                             'link': a_link}
            elif bs_obj.find('div', {'class': 'persub'}):
                # shooter-style subtitle page
                for persub in bs_obj.find_all('div', {'class': 'persub'}):
                    if py == 2:
                        a_title = persub.h1.text.encode('utf8')
                    else:
                        a_title = persub.h1.text
                    a_link = self.site_url + persub.h1.a.attrs['href']
                    a_title = '[ZIMUKU]' + a_title
                    sub_dict[a_title] = {'type': 'shooter', 'link': a_link}
            else:
                raise ValueError('Zimuku搜索结果出现未知结构页面')
        if len(sub_dict) >= sub_num:
            del keywords[:]
            break
        if len(keywords) > 1:
            keyword = keyword.replace(keywords[-1], '').strip()
            keywords.pop(-1)
            continue
        break

    for sub_name, sub_info in sub_dict.items():
        if sub_info['type'] == 'default':
            # subtitle detail page from the combined search
            r = s.get(sub_info['link'], timeout=60)
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            lang_box = bs_obj.find('ul', {'class': 'subinfo'}).find('li')
            type_score = 0
            for lang in lang_box.find_all('img'):
                if 'uk' in lang.attrs['src']:
                    type_score += 1
                elif 'hongkong' in lang.attrs['src']:
                    type_score += 2
                elif 'china' in lang.attrs['src']:
                    type_score += 4
                elif 'jollyroger' in lang.attrs['src']:
                    type_score += 8
            sub_info['lan'] = type_score
            download_link = bs_obj.find('a', {'id': 'down1'}).attrs['href']
            download_link = urljoin(self.site_url, download_link)
            r = s.get(download_link, timeout=60)
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            download_link = bs_obj.find('a', {'rel': 'nofollow'})
            download_link = download_link.attrs['href']
            download_link = urljoin(self.site_url, download_link)
            sub_info['link'] = download_link
        else:
            # shooter-style subtitle page
            r = s.get(sub_info['link'], timeout=60)
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            lang_box = bs_obj.find('ul', {'class': 'subinfo'}).find('li')
            type_score = 0
            if py == 2:
                text = lang_box.text.encode('utf8')
            else:
                text = lang_box.text
            if '英' in text:
                type_score += 1
            elif '繁' in text:
                type_score += 2
            elif '简' in text:
                type_score += 4
            elif '双语' in text:
                type_score += 8
            sub_info['lan'] = type_score
            download_link = bs_obj.find('a', {'id': 'down1'}).attrs['href']
            sub_info['link'] = download_link
        backup_session = requests.session()
        backup_session.headers.update(s.headers)
        backup_session.headers['Referer'] = sub_info['link']
        backup_session.cookies.update(s.cookies)
        sub_info['session'] = backup_session
    return sub_dict
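# The backup_session built above pins the Referer to the subtitle page and
# copies the search cookies, so a later download can reuse it. A usage sketch;
# fetch_with_backup_session is illustrative, the project's real download path
# lives elsewhere:
def fetch_with_backup_session(sub_info, timeout=60):
    """Download subtitle bytes with the Referer-pinned session saved above."""
    r = sub_info['session'].get(sub_info['link'], timeout=timeout)
    return r.content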
def start(self):
    all_video_dict = self.get_path_name(self.arg_name)

    for one_video, video_info in all_video_dict.items():
        self.s_error = ''  # reset the error records
        self.f_error = ''

        try:
            keywords, info_dict = self.sort_keyword(one_video)
            print('\n' + prefix + ' ' + one_video)  # print the current video and its path
            print(prefix + ' ' + video_info['path'] + '\n' + prefix)

            if video_info['have_subtitle'] and not self.over:
                print(prefix
                      + " subtitle already exists, add '-o' to replace it.")
                continue

            sub_dict = order_dict()
            for i, downloader in enumerate(self.downloader):
                try:
                    sub_dict.update(
                        downloader.get_subtitles(tuple(keywords)))
                except (exceptions.Timeout, exceptions.ConnectionError):
                    print(prefix + ' connect timeout, search next site.')
                    if i < (len(self.downloader) - 1):
                        continue
                    else:
                        print(prefix + ' PLEASE CHECK YOUR NETWORK STATUS')
                        sys.exit(0)
                if len(sub_dict) >= self.sub_num:
                    break

            if len(sub_dict) == 0:
                self.s_error += 'no search results. '
                continue

            extract_sub_name = None
            # walk through the subtitle archives until one yields a guessed subtitle
            while not extract_sub_name and len(sub_dict) > 0:
                sub_choice, link = self.choose_subtitle(sub_dict)
                sub_dict.pop(sub_choice)

                if py == 2:
                    encoding = chardet.detect(sub_choice)['encoding']
                    if isinstance(sub_choice, str):
                        sub_choice = sub_choice.decode(encoding)
                    try:
                        sub_choice = sub_choice.encode(
                            GetSubtitles.output_encode)
                    except Exception:
                        if isinstance(sub_choice, str):
                            sub_choice = sub_choice.encode(encoding)
                        sub_choice = sub_choice.decode('utf8')
                        sub_choice = sub_choice.encode(
                            GetSubtitles.output_encode)

                if self.query:
                    print(prefix + ' ')
                if '[ZMZ]' in sub_choice:
                    datatype, sub_data_bytes = self.zimuzu.download_file(
                        sub_choice, link)
                elif '[SUBHD]' in sub_choice:
                    datatype, sub_data_bytes, msg = self.subhd.\
                        download_file(sub_choice, link)
                    if msg == 'false':
                        print(prefix + ' error: '
                              'download too frequently '
                              'with subhd downloader, '
                              'please change to other downloaders')
                        return
                elif '[ZIMUKU]' in sub_choice:
                    datatype, sub_data_bytes = self.zimuku.download_file(
                        sub_choice, link)

                if datatype in self.support_file_list:
                    # get the guessed subtitle name: query mode always
                    # returns one, automatic mode returns None when there
                    # is no guess
                    try:
                        extract_sub_name = self.extract_subtitle(
                            one_video, video_info['path'], datatype,
                            sub_data_bytes, info_dict, self.single)
                    except rarfile.BadRarFile:
                        continue
                    if extract_sub_name:
                        extract_sub_name = extract_sub_name.split('/')[-1]
                        try:
                            # zipfile: historical ZIP filename encoding,
                            # try cp437 first
                            extract_sub_name = extract_sub_name.\
                                encode('cp437').decode('gbk')
                        except Exception:
                            pass
                        try:
                            if py == 2:
                                if isinstance(extract_sub_name, str):
                                    encoding = chardet.\
                                        detect(extract_sub_name)
                                    encoding = encoding['encoding']
                                    if 'ISO' in encoding:
                                        encoding = 'gbk'
                                    extract_sub_name = extract_sub_name.\
                                        decode(encoding)
                                    extract_sub_name = extract_sub_name.\
                                        encode(GetSubtitles.output_encode)
                                else:
                                    extract_sub_name = extract_sub_name.\
                                        encode(GetSubtitles.output_encode)
                            print(prefix + ' ' + extract_sub_name + '\n')
                        except UnicodeDecodeError:
                            print(prefix + ' '
                                  + extract_sub_name.encode('gbk') + '\n')
                elif self.query:
                    # query mode: the downloaded archive type is unsupported
                    print(prefix
                          + ' unsupported file type %s' % datatype[1:])

        except rarfile.RarCannotExec:
            self.s_error += 'Unrar not installed?'
        except AttributeError:
            self.s_error += 'unknown error. try again.'
            self.f_error += format_exc()
        except Exception as e:
            self.s_error += str(e) + '. '
            self.f_error += format_exc()
        finally:
            if ('extract_sub_name' in dir()
                    and not extract_sub_name
                    and len(sub_dict) == 0):
                # automatic mode: none of the archives yielded a guessed subtitle
                self.s_error += " failed to guess one subtitle,"
                self.s_error += " use '-q' to try query mode."
            if self.s_error and not self.debug:
                self.s_error += " add --debug to get more info of the error"

            if self.s_error:
                self.failed_list.append({
                    'name': one_video,
                    'path': video_info['path'],
                    'error': self.s_error,
                    'trace_back': self.f_error
                })
                print(prefix + ' error:' + self.s_error)

    if len(self.failed_list):
        print('\n===============================', end='')
        print('FAILED LIST===============================\n')
        for i, one in enumerate(self.failed_list):
            print('%2s. name: %s' % (i + 1, one['name']))
            print('%3s path: %s' % ('', one['path']))
            print('%3s info: %s' % ('', one['error']))
            if self.debug:
                print('%3s TRACE_BACK: %s' % ('', one['trace_back']))

    print('\ntotal: %s success: %s fail: %s\n' % (
        len(all_video_dict),
        len(all_video_dict) - len(self.failed_list),
        len(self.failed_list)))
def get_subtitles(self, video_name, sub_num=5):
    print(prefix + ' Searching SUBHD...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(SubHDDownloader.search_url + keyword,
                  headers=Downloader.header, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        try:
            small_text = bs_obj.find('small').text
        except AttributeError:
            char_error = 'The URI you submitted has disallowed characters'
            if char_error in bs_obj.text:
                print(prefix + ' [SUBHD ERROR] ' + char_error
                      + ': ' + keyword)
                return sub_dict
            # the search page showed a captcha button, wait and retry
            time.sleep(2)
            continue
        if "总共 0 条" not in small_text:
            results = bs_obj.find_all(
                "div", class_="mb-4 bg-white rounded shadow-sm")
            for one_box in results:
                if info_dict['type'] == 'movie' \
                        and not one_box.find(
                            'div',
                            class_="px-1 rounded-sm bg-danger text-white"):
                    continue
                a = one_box.find('div', class_="f12 pt-1").find('a')
                sub_url = SubHDDownloader.site_url + a.attrs['href']
                sub_name = SubHDDownloader.choice_prefix + a.text
                text = one_box.text
                if '/a' in a.attrs['href']:
                    type_score = 0
                    type_score += ('英文' in text) * 1
                    type_score += ('繁体' in text) * 2
                    type_score += ('简体' in text) * 4
                    type_score += ('双语' in text) * 8
                    sub_dict[sub_name] = {
                        'lan': type_score,
                        'link': sub_url,
                        'session': None
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # the first candidate is not bilingual, re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]['lan'],
                   reverse=True))
    return sub_dict
def get_subtitles(self, video_name, sub_num=5):
    print(prefix + ' Searching SUBHD...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(SubHDDownloader.search_url + keyword,
                  headers=Downloader.header, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        try:
            if py == 2:
                small_text = bs_obj.find('small').text.encode('utf8')
            else:
                small_text = bs_obj.find('small').text
        except AttributeError:
            char_error = 'The URI you submitted has disallowed characters'
            if char_error in bs_obj.text:
                print(prefix + ' [SUBHD ERROR] ' + char_error
                      + ': ' + keyword)
                return sub_dict
            # the search page showed a captcha button, wait and retry
            time.sleep(2)
            continue
        if '总共 0 条' not in small_text:
            for one_box in bs_obj.find_all('div', {'class': 'box'}):
                if info_dict['type'] == 'movie' \
                        and not one_box.find('div', {'class': 'movielist'}):
                    continue
                a = one_box.find('div', {'class': 'd_title'}).find('a')
                sub_url = SubHDDownloader.site_url + a.attrs['href']
                sub_name = SubHDDownloader.choice_prefix + a.text.encode('utf8') if py == 2 \
                    else SubHDDownloader.choice_prefix + a.text
                if py == 2:
                    text = one_box.text.encode('utf8')
                else:
                    text = one_box.text
                if '/ar' in a.attrs['href']:
                    type_score = 0
                    type_score += ('英文' in text) * 1
                    type_score += ('繁体' in text) * 2
                    type_score += ('简体' in text) * 4
                    type_score += ('双语' in text) * 8
                    # no_dot_text = text.replace('.', ' ').lower()
                    # for qkeyword in keywords:
                    #     if no_dot_text.find(qkeyword.strip().lower()) != -1:
                    #         type_score += 3
                    sub_dict[sub_name] = {
                        'lan': type_score,
                        'link': sub_url,
                        'session': None
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # the first candidate is not bilingual, re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]['lan'],
                   reverse=True))
    return sub_dict
def get_subtitles(self, keywords, sub_num=10):
    print(prefix + " Searching ZIMUKU...", end="\r")
    keywords = list(keywords)
    keyword = " ".join(keywords)
    info = guessit(keyword)
    keywords.pop(0)
    keywords.insert(0, info["title"])
    if info.get("season"):
        season = str(info["season"]).zfill(2)
        keywords.insert(1, "s" + season)
    sub_dict = order_dict()
    s = requests.session()
    s.headers.update(self.headers)

    while True:  # search with the current keyword
        r = s.get(self.search_url + keyword, timeout=10)
        html = r.text
        if "搜索不到相关字幕" not in html:
            bs_obj = BeautifulSoup(r.text, "html.parser")
            if bs_obj.find("div", {"class": "item"}):
                # combined search result page
                for item in bs_obj.find_all("div", {"class": "item"}):
                    title_boxes = item.find("div", {
                        "class": "title"
                    }).find_all("p")
                    title_box = title_boxes[0]
                    sub_title_box = title_boxes[1]
                    item_title = title_box.text
                    item_sub_title = sub_title_box.text
                    item_info = guessit(item_title)
                    if info.get("year") and item_info.get("year"):
                        if info["year"] != item_info["year"]:
                            # mismatched year, skip
                            continue
                    item_titles = [
                        item_info.get("title", "").lower(),
                        item_info.get("alternative_title", "").lower(),
                    ] + item_sub_title.lower().strip().split(",")
                    title_included = sum([
                        1 for _ in item_sub_title
                        if info["title"].lower() not in _
                    ])
                    if title_included == 0:
                        # title extracted by guessit does not match;
                        # check word overlap before skipping
                        item_title_split = [
                            one.split() for one in item_titles
                        ]
                        info_title_split = info["title"].lower().split()
                        sum1 = sum([
                            1 for _ in info_title_split
                            if _ in item_title_split[0]
                        ])
                        sum2 = sum([
                            1 for _ in info_title_split
                            if _ in item_title_split[1]
                        ])
                        if not (sum1 / len(info_title_split) >= 0.5
                                or sum2 / len(info_title_split) >= 0.5):
                            # mismatched title, skip
                            continue
                    for a in item.find_all("td", {"class": "first"})[:3]:
                        a = a.a
                        a_link = self.site_url + a.attrs["href"]
                        a_title = a.text
                        a_title = "[ZIMUKU]" + a_title
                        sub_dict[a_title] = {
                            "type": "default",
                            "link": a_link
                        }
            elif bs_obj.find("div", {"class": "persub"}):
                # shooter-style subtitle page
                for persub in bs_obj.find_all("div", {"class": "persub"}):
                    a_title = persub.h1.text
                    a_link = self.site_url + persub.h1.a.attrs["href"]
                    a_title = "[ZIMUKU]" + a_title
                    sub_dict[a_title] = {"type": "shooter", "link": a_link}
            else:
                raise ValueError("Zimuku搜索结果出现未知结构页面")
        if len(sub_dict) >= sub_num:
            del keywords[:]
            break
        if len(keywords) > 1:
            keyword = keyword.replace(keywords[-1], "").strip()
            keywords.pop(-1)
            continue
        break

    for sub_name, sub_info in sub_dict.items():
        if sub_info["type"] == "default":
            # subtitle detail page from the combined search
            r = s.get(sub_info["link"], timeout=60)
            bs_obj = BeautifulSoup(r.text, "html.parser")
            lang_box = bs_obj.find("ul", {"class": "subinfo"}).find("li")
            type_score = 0
            for lang in lang_box.find_all("img"):
                if "uk" in lang.attrs["src"]:
                    type_score += 1
                elif "hongkong" in lang.attrs["src"]:
                    type_score += 2
                elif "china" in lang.attrs["src"]:
                    type_score += 4
                elif "jollyroger" in lang.attrs["src"]:
                    type_score += 8
            sub_info["lan"] = type_score
            download_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
            download_link = urljoin(self.site_url, download_link)
            r = s.get(download_link, timeout=60)
            bs_obj = BeautifulSoup(r.text, "html.parser")
            download_link = bs_obj.find("a", {"rel": "nofollow"})
            download_link = download_link.attrs["href"]
            download_link = urljoin(self.site_url, download_link)
            sub_info["link"] = download_link
        else:
            # shooter-style subtitle page
            r = s.get(sub_info["link"], timeout=60)
            bs_obj = BeautifulSoup(r.text, "html.parser")
            lang_box = bs_obj.find("ul", {"class": "subinfo"}).find("li")
            text = lang_box.text
            sub_info["lan"] = get_type_score(text)
            download_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
            sub_info["link"] = download_link
        backup_session = requests.session()
        backup_session.headers.update(s.headers)
        backup_session.headers["Referer"] = sub_info["link"]
        backup_session.cookies.update(s.cookies)
        sub_info["session"] = backup_session
    return sub_dict
def start(self):
    all_video_dict = self.get_path_name(self.arg_name)

    for one_video, video_info in all_video_dict.items():
        self.s_error = ''  # reset the error records
        self.f_error = ''

        print('\n├ ' + one_video)  # print the current video and its path
        print('├ ' + video_info['path'] + '\n├')

        if video_info['have_subtitle'] and not self.over:
            print("├ subtitle already exists, add '-o' to replace it.")
            continue

        try:
            keywords, info_dict = self.sort_keyword(one_video)
            sub_dict = order_dict()
            for downloader in self.downloader:
                sub_dict.update(
                    downloader.get_subtitles(tuple(keywords),
                                             sub_num=self.sub_num))
                if len(sub_dict) >= self.sub_num:
                    break

            if len(sub_dict) == 0:
                self.s_error += 'no search results'
                continue

            extract_sub_name = None
            # walk through the subtitle archives until one yields a guessed subtitle
            while not extract_sub_name and len(sub_dict) > 0:
                sub_choice = self.choose_subtitle(sub_dict)
                if self.query:
                    print('├ ')
                if '[ZMZ]' in sub_choice:
                    datatype, sub_data_bytes = self.zimuzu.download_file(
                        sub_choice, sub_dict[sub_choice]['link'])
                elif '[SUBHD]' in sub_choice:
                    datatype, sub_data_bytes = self.subhd.download_file(
                        sub_choice, sub_dict[sub_choice]['link'])
                if datatype in self.support_file_list:
                    # get the guessed subtitle name: query mode always
                    # returns one, automatic mode returns None when there
                    # is no guess
                    extract_sub_name = self.extract_subtitle(
                        one_video, video_info['path'], datatype,
                        sub_data_bytes, info_dict)
                    if extract_sub_name:
                        print('├ ' + extract_sub_name + '\n')
                elif self.query:
                    # query mode: the downloaded archive type is unsupported
                    print('├ unsupported file type %s' % datatype[1:])
                sub_dict.pop(sub_choice)

        except (exceptions.Timeout, exceptions.ConnectionError):
            # note: 'except A or B' only ever catches A, so catch a tuple
            self.s_error += 'connect failed, check network status.'
        except rarfile.RarCannotExec:
            self.s_error += 'Unrar not installed?'
        except AttributeError:
            self.s_error += 'unknown error. try again.'
            self.f_error += format_exc()
        except Exception as e:
            self.s_error += str(e) + '. '
            self.f_error += format_exc()
        finally:
            if ('extract_sub_name' in dir()
                    and not extract_sub_name
                    and len(sub_dict) == 0):
                # automatic mode: none of the archives yielded a guessed subtitle
                self.s_error += (" failed to guess one subtitle,"
                                 " use '-q' to try query mode.")
            if self.s_error and not self.debug:
                self.s_error += " add --debug to get more info of the error"

            if self.s_error:
                self.failed_list.append({
                    'name': one_video,
                    'path': video_info['path'],
                    'error': self.s_error,
                    'trace_back': self.f_error
                })
                print('├ error:' + self.s_error)

    if len(self.failed_list):
        print('\n===============================FAILED LIST'
              '===============================\n')
        for i, one in enumerate(self.failed_list):
            print('%2s. name: %s' % (i + 1, one['name']))
            print('%3s path: %s' % ('', one['path']))
            print('%3s info: %s' % ('', one['error']))
            if self.debug:
                print('%3s TRACE_BACK: %s' % ('', one['trace_back']))

    print('\ntotal: %s success: %s fail: %s\n' % (
        len(all_video_dict),
        len(all_video_dict) - len(self.failed_list),
        len(self.failed_list)))
def get_subtitles(self, keywords, sub_num=5):
    """
    Take a keyword list and return an ordered dict.

    keywords: keyword list, sorted by importance in descending order
    sub_num:  number of subtitle results, 5 by default

    Returns a subtitle dict, sorted by language score in descending order:
        {'subtitle name': {'lan': language score, 'link': subtitle url}}
    Language score: +1 for English, +2 for Traditional Chinese,
    +4 for Simplified Chinese, +8 for bilingual.
    """
    print(prefix + " Searching SUBHD...", end="\r")
    keywords = list(keywords)
    keyword = ""
    for one in keywords:
        keyword += one + " "
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(self.search_url + keyword,
                  headers=self.headers, timeout=10)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        try:
            small_text = bs_obj.find("small").text
        except AttributeError:
            char_error = "The URI you submitted has disallowed characters"
            if char_error in bs_obj.text:
                print(prefix + " [SUBHD ERROR] " + char_error
                      + ": " + keyword)
                return None
            # the search page showed a captcha button, wait and retry
            time.sleep(2)
            continue
        if "总共 0 条" not in small_text:
            for one_box in bs_obj.find_all("div", {"class": "box"}):
                a = one_box.find("div", {"class": "d_title"}).find("a")
                sub_url = self.site_url + a.attrs["href"]
                sub_name = "[SUBHD]" + a.text
                text = one_box.text
                if "/ar" in a.attrs["href"]:
                    sub_dict[sub_name] = {
                        "lan": get_type_score(text),
                        "link": sub_url,
                        "version": a.attrs["title"],
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], "")
            keywords.pop(-1)
            continue
        break
    if len(sub_dict.items()) > 0 and list(
            sub_dict.items())[0][1]["lan"] < 8:
        # the first candidate is not bilingual, re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]["lan"],
                   reverse=True))
    return sub_dict
def get_path_name(self, args, args1):
    """
    Take the input video name or path plus an optional subtitle store path
    and build a dict describing each video's location and whether a
    subtitle already exists.

    video_dict: {'path': path, 'have_subtitle': sub_exists}
    """
    mix_str = args.replace('"', '')
    if args1:
        store_path = args1.replace('"', '')
    else:
        store_path = ''
    store_path_files = []
    if not os.path.isdir(store_path):
        print('no valid path specified, '
              'download sub file to video file location.')
        store_path = ''
    else:
        for root, dirs, files in os.walk(store_path):
            store_path_files.extend(files)
    video_dict = order_dict()
    if os.path.isdir(mix_str):  # a directory
        for root, dirs, files in os.walk(mix_str):
            for one_name in files:
                suffix = os.path.splitext(one_name)[1]
                # check whether the suffix is a video format
                if suffix not in self.video_format_list:
                    continue
                v_name_no_format = os.path.splitext(one_name)[0]
                sub_exists = max(
                    list(
                        map(
                            lambda sub_type: int(
                                v_name_no_format + sub_type
                                in files + store_path_files
                                or v_name_no_format + '.zh' + sub_type
                                in files + store_path_files),
                            self.sub_format_list)))
                video_dict[one_name] = {
                    'path': next(item
                                 for item in [store_path,
                                              os.path.abspath(root)]
                                 if item != ''),
                    'have_subtitle': sub_exists
                }
    elif os.path.isabs(mix_str):  # an absolute path to one video
        v_path, v_name = os.path.split(mix_str)
        v_name_no_format = os.path.splitext(v_name)[0]
        if os.path.isdir(store_path):
            s_path = os.path.abspath(store_path)
        else:
            s_path = v_path
        sub_exists = max(
            list(
                map(
                    lambda sub_type: os.path.exists(
                        os.path.join(s_path, v_name_no_format + sub_type)),
                    self.sub_format_list)))
        video_dict[v_name] = {'path': s_path, 'have_subtitle': sub_exists}
    else:  # a bare video name without a path
        if not os.path.isdir(store_path):
            video_dict[mix_str] = {'path': os.getcwd(), 'have_subtitle': 0}
        else:
            video_dict[mix_str] = {
                'path': os.path.abspath(store_path),
                'have_subtitle': 0
            }
    return video_dict
def get_subtitles(self, video_name, sub_num=5):
    print("Searching SUBHD...", end="\r")
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = " ".join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    s.headers.update(Downloader.header)
    while True:  # query with the current keyword
        r = s.get(
            SubHDDownloader.search_url + keyword,
            timeout=10,
        )
        bs_obj = BeautifulSoup(r.text, "html.parser")
        try:
            small_text = bs_obj.find("small").text
        except AttributeError:
            char_error = "The URI you submitted has disallowed characters"
            if char_error in bs_obj.text:
                print("[SUBHD ERROR] " + char_error + ": " + keyword)
                return sub_dict
            # the search page showed a captcha button, wait and retry
            time.sleep(2)
            continue
        if "总共 0 条" not in small_text:
            results = bs_obj.find_all(
                "div", class_="mb-4 bg-white rounded shadow-sm")
            for one_box in results:
                if info_dict["type"] == "movie" and not one_box.find(
                        "div",
                        class_="px-1 rounded-sm bg-danger text-white"):
                    continue
                a = one_box.find("div", class_="f12 pt-1").find("a")
                sub_url = SubHDDownloader.site_url + a.attrs["href"]
                sub_name = SubHDDownloader.choice_prefix + a.text
                text = one_box.text
                if "/a" in a.attrs["href"]:
                    type_score = 0
                    type_score += ("英文" in text) * 1
                    type_score += ("繁体" in text) * 2
                    type_score += ("简体" in text) * 4
                    type_score += ("双语" in text) * 8
                    sub_dict[sub_name] = {
                        "lan": type_score,
                        "link": sub_url,
                        "session": None,
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], "")
            keywords.pop(-1)
            continue
        break
    if len(sub_dict.items()) > 0 and list(
            sub_dict.items())[0][1]["lan"] < 8:
        # the first candidate is not bilingual, re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]["lan"],
                   reverse=True))
    return sub_dict
def get_subtitles(self, keywords, sub_num=5):
    """
    Take a keyword list and return an ordered dict.

    keywords: keyword list, sorted by importance in descending order
    sub_num:  number of subtitle results, 5 by default

    Returns a subtitle dict, sorted by language score in descending order:
        {'subtitle name': {'lan': language score, 'link': subtitle url}}
    Language score: +1 for English, +2 for Traditional Chinese,
    +4 for Simplified Chinese, +8 for bilingual.
    """
    print(prefix + ' Searching SUBHD...', end='\r')
    keywords = list(keywords)
    keyword = ''
    for one in keywords:
        keyword += (one + ' ')
    sub_dict = order_dict()
    s = requests.session()
    while True:  # query with the current keyword
        r = s.get(self.search_url + keyword,
                  headers=self.headers, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        try:
            if py == 2:
                small_text = bs_obj.find('small').text.encode('utf8')
            else:
                small_text = bs_obj.find('small').text
        except AttributeError:
            char_error = 'The URI you submitted has disallowed characters'
            if char_error in bs_obj.text:
                print(prefix + ' [SUBHD ERROR] ' + char_error
                      + ': ' + keyword)
            return None
        if '总共 0 条' not in small_text:
            for one_box in bs_obj.find_all('div', {'class': 'box'}):
                a = one_box.find('div', {'class': 'd_title'}).find('a')
                sub_url = self.site_url + a.attrs['href']
                sub_name = '[SUBHD]' + a.text.encode('utf8') if py == 2 \
                    else '[SUBHD]' + a.text
                if py == 2:
                    text = one_box.text.encode('utf8')
                else:
                    text = one_box.text
                if '/ar' in a.attrs['href']:
                    type_score = 0
                    type_score += ('英文' in text) * 1
                    type_score += ('繁体' in text) * 2
                    type_score += ('简体' in text) * 4
                    type_score += ('双语' in text) * 8
                    sub_dict[sub_name] = {
                        'lan': type_score,
                        'link': sub_url,
                        'ref': self.search_url + keyword
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # hit the subtitle limit, clear keywords
                    break
        if len(keywords) > 1:  # not enough results, drop the last keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # the first candidate is not bilingual, re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]['lan'],
                   reverse=True))
    return sub_dict