def _parse_subtitles_page(self, video, response, language):
    subtitles = []

    page_soup = ParserBeautifulSoup(
        response.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
    title_soups = page_soup.find_all("div", {'id': 'menu_detalle_buscador'})
    body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})

    for subtitle in range(0, len(title_soups)):
        title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]

        # title
        title = title_soup.find("a").text.replace("Subtitulos de ", "")
        page_link = title_soup.find("a")["href"]

        # description
        description = body_soup.find("div", {'id': 'buscador_detalle_sub'}).text
        description = description.replace(",", " ").lower()

        # uploader
        uploader = body_soup.find("a", {'class': 'link1'}).text

        subtitle = self.subtitle_class(language, video, page_link, title,
                                       description, uploader)

        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def _parse_subtitles_page(self, video, response, language): subtitles = [] page_soup = ParserBeautifulSoup( response.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) title_soups = page_soup.find_all("div", {"id": "menu_detalle_buscador"}) body_soups = page_soup.find_all("div", {"id": "buscador_detalle"}) episode = isinstance(video, Episode) for subtitle in range(0, len(title_soups)): title_soup, body_soup = title_soups[subtitle], body_soups[subtitle] # title title = _clean_title(title_soup.find("a").text) # Forced subtitles are not supported if title.lower().rstrip().endswith(("forzado", "forzados")): logger.debug("Skipping forced subtitles: %s", title) continue # Check movie title (if the video is a movie) if not episode and not _check_movie(video, title): continue # Data datos = body_soup.find("div", { "id": "buscador_detalle_sub_datos" }).text # Ignore multi-disc and non-srt subtitles if not any(item in datos for item in ("Cds:</b> 1", "SubRip")): continue spain = "/pais/7.gif" in datos language = Language.fromalpha2("es") if spain else Language( "spa", "MX") # description sub_details = body_soup.find("div", { "id": "buscador_detalle_sub" }).text description = sub_details.replace(",", " ") # uploader uploader = body_soup.find("a", {"class": "link1"}).text download_url = _get_download_url(body_soup) page_link = title_soup.find("a")["href"] subtitle = self.subtitle_class(language, video, page_link, title, description, uploader, download_url) logger.debug("Found subtitle %r", subtitle) subtitles.append(subtitle) return subtitles
def _parse_subtitles_page(self, video, response, language): subtitles = [] page_soup = ParserBeautifulSoup( response.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) title_soups = page_soup.find_all("div", {"id": "menu_detalle_buscador"}) body_soups = page_soup.find_all("div", {"id": "buscador_detalle"}) for subtitle in range(0, len(title_soups)): title_soup, body_soup = title_soups[subtitle], body_soups[subtitle] # title title = self._clean_title(title_soup.find("a").text) # discard subtitles if a year between parenthesis is present in title and doesn't match the one provided # in video object if re.match(r'(\(\d{4}\))', title): if video.year and str(video.year) not in title: continue # Data datos = body_soup.find("div", { "id": "buscador_detalle_sub_datos" }).text # Ignore multi-disc and non-srt subtitles if not any(item in datos for item in ("Cds:</b> 1", "SubRip")): continue spain = "/pais/7.gif" in datos language = Language.fromalpha2("es") if spain else Language( "spa", "MX") # description sub_details = body_soup.find("div", { "id": "buscador_detalle_sub" }).text description = sub_details.replace(",", " ").lower() # uploader uploader = body_soup.find("a", {"class": "link1"}).text page_link = title_soup.find("a")["href"] subtitle = self.subtitle_class(language, video, page_link, title, description, uploader) logger.debug("Found subtitle %r", subtitle) subtitles.append(subtitle) return subtitles
def query(self, series, season, episode, year=None, video=None):
    # Search for s01e03 instead of s1e3
    seasona = "%02d" % season
    episodea = "%02d" % episode
    # normalize the series name first, then build the query string
    # (assigning the normalized name to seriesa and immediately overwriting it
    # discarded the normalization)
    series = fix_inconsistent_naming(series)
    seriesa = series.replace(' ', '+')

    # get the episode page
    logger.info('Getting the page for episode %s', episode)
    url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad=" + str(seasona) + \
        "&resz=" + str(episodea) + "&nyelvtipus=%25&x=24&y=8"
    logger.info('Url %s', url)
    r = self.session.get(url, timeout=10).content

    soup = ParserBeautifulSoup(r, ['lxml'])

    subtitles = []
    for num, temp in enumerate(soup.find_all("table")):
        if "this.style.backgroundImage='url(css/over2.jpg)" in str(temp) and "css/infooldal.png" in str(temp):
            logger.debug("Found valid table (%d index)", num)
            subtitles += self._loop_over_table(temp, season, episode, video)

    return subtitles
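# fix_inconsistent_naming comes from the provider's utilities and is not shown here.
# A minimal sketch of the idea, with a hypothetical mapping entry (the real table may
# differ): series titles the site indexes under another spelling are swapped before
# the query string is built.
def fix_inconsistent_naming(title):
    naming_map = {"DC's Legends of Tomorrow": "Legends of Tomorrow"}  # illustrative only
    return naming_map.get(title, title)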
def query(self, series, season, episode, year=None):
    # get the show id
    show_id = self.get_show_id(series, year)
    if show_id is None:
        logger.error('No show id found for %s (%r)', series, year)
        return []

    # get the episode url
    episode_url = self.get_episode_url(show_id, series, season, episode, year)
    if episode_url is None:
        logger.error('No episode url found for %s, season %d, episode %d', series, season, episode)
        return []

    # get the page of the episode of the show
    r = self.session.get(episode_url, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get episode title
    title_pattern = re.compile('Subt.+tulos de {}(.+){}x{:02d} - (.+)'.format(series, season, episode).lower())
    title = title_pattern.search(soup.select('#cabecera-subtitulo')[0].get_text().strip().lower()).group(2)

    # loop over subtitle rows
    subtitles = []
    for sub in soup.find_all('div', attrs={'id': re.compile('version([0-9]+)')}):
        # read the subtitle's release group
        release = sanitize_release_group(release_pattern.search(sub.find('p', class_='title-sub')
                                                                .contents[2]).group(1))
        for html_language in sub.select('ul.sslist'):
            language = Language.fromtusubtitulo(html_language.find_next('b').get_text().strip())
            hearing_impaired = False

            # map the Spanish-Latino language to plain Spanish and set hearing_impaired = True:
            # if both Spanish and Spanish-Latino subtitles exist for the same episode, the plain
            # Spanish subtitle scores higher and takes priority.
            if language == Language('spa', 'MX'):
                language = Language('spa')
                hearing_impaired = True

            # ignore incomplete subtitles
            status = sanitize(html_language.find_next('li', class_=re.compile('li-estado')).get_text())
            if status != 'completado':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # get the most updated version of the subtitle; fall back to the original version
            html_status = html_language.select('a[href^="updated/"]')
            if len(html_status) == 0:
                html_status = html_language.select('a[href^="original/"]')

            subtitle_url = self.server_url + html_status[0]['href']
            subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season,
                                           episode, title, year, release, subtitle_url)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    # get the episode url
    episode_url = self._search_url_titles(series, season, episode, year)
    if episode_url is None:
        logger.info(
            f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}"
        )
        return []

    r = self.session.get(episode_url, headers={"Referer": self.server_url}, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

    # get episode title
    title_pattern = re.compile("{}(.+){}x{:02d}- (.+)".format(series, season, episode).lower())
    title = title_pattern.search(soup.select("#episode_title")[0].get_text().strip().lower()).group(2)

    subtitles = []
    for sub in soup.find_all("div", attrs={"id": "progress_buttons_row"}):
        # read the language
        language = Language.fromsubtitulamos(
            sub.find_previous("div", class_="subtitle_language").get_text().strip())
        hearing_impaired = False

        # map the Spanish-Latino language to plain Spanish and set hearing_impaired = True:
        # if both Spanish and Spanish-Latino subtitles exist for the same episode, the plain
        # Spanish subtitle scores higher and takes priority.
        if language == Language("spa", "MX"):
            language = Language("spa")
            hearing_impaired = True

        # read the subtitle's release group
        release = sub.find_next("div", class_="version_name").get_text().strip()

        # ignore incomplete subtitles
        status = sub.find_next("div", class_="subtitle_buttons").contents[1]
        if status.name != "a":
            logger.debug("Ignoring unfinished subtitle in [%s]", language)
            continue

        # read the subtitle url
        subtitle_url = self.server_url + status["href"][1:]
        subtitle = SubtitulamosSubtitle(language, hearing_impaired, episode_url, series, season,
                                        episode, title, year, release, subtitle_url)
        logger.debug("Found subtitle %r", subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    # get the episode url
    episode_url = self._search_url_titles(series, season, episode, year)
    if episode_url is None:
        logger.error('No episode url found for %s, season %d, episode %d', series, season, episode)
        return []

    r = self.session.get(episode_url, headers={'Referer': self.server_url}, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get episode title
    title_pattern = re.compile('{}(.+){}x{:02d}- (.+)'.format(series, season, episode).lower())
    title = title_pattern.search(soup.select('#episode_title')[0].get_text().strip().lower()).group(2)

    subtitles = []
    for sub in soup.find_all('div', attrs={'id': 'progress_buttons_row'}):
        # read the language
        language = Language.fromsubtitulamos(
            sub.find_previous('div', class_='subtitle_language').get_text().strip())
        hearing_impaired = False

        # map the Spanish-Latino language to plain Spanish and set hearing_impaired = True:
        # if both Spanish and Spanish-Latino subtitles exist for the same episode, the plain
        # Spanish subtitle scores higher and takes priority.
        if language == Language('spa', 'MX'):
            language = Language('spa')
            hearing_impaired = True

        # read the subtitle's release group
        release = sub.find_next('div', class_='version_name').get_text().strip()

        # ignore incomplete subtitles
        status = sub.find_next('div', class_='subtitle_buttons').contents[1]
        if status.name != 'a':
            logger.debug('Ignoring unfinished subtitle in [%s]', language)
            continue

        # read the subtitle url
        subtitle_url = self.server_url + status['href'][1:]
        subtitle = SubtitulamosSubtitle(language, hearing_impaired, episode_url, series, season,
                                        episode, title, year, release, subtitle_url)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None): # get the show id show_id = self.get_show_id(series, year) if show_id is None: logger.error("No show id found for %s (%r)", series, year) return [] # get the episode url episode_url = self.get_episode_url(show_id, series, season, episode, year) if episode_url is None: logger.info(f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}") return [] # get the page of the episode of the show r = self.session.get(episode_url, timeout=10) r.raise_for_status() soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"]) # get episode title title_pattern = re.compile("Subt.+tulos de {}(.+){}x{:02d} - (.+)".format(series, season, episode).lower()) title = title_pattern.search(soup.select("#cabecera-subtitulo")[0].get_text().strip().lower()).group(2) # loop over subtitle rows subtitles = [] for sub in soup.find_all("div", attrs={"id": re.compile("version([0-9]+)")}): # read the release subtitle release = sanitize_release_group(release_pattern.search(sub.find("p", class_="title-sub").contents[2]).group(1)) for html_language in sub.select("ul.sslist"): language = Language.fromtusubtitulo(html_language.find_next("b").get_text().strip()) hearing_impaired = False # modify spanish latino subtitle language to only spanish and set hearing_impaired = True # because if exists spanish and spanish latino subtitle for the same episode, the score will be # higher with spanish subtitle. Spanish subtitle takes priority. if language == Language("spa", "MX"): language = Language("spa") hearing_impaired = True # ignore incomplete subtitles status = sanitize(html_language.find_next("li", class_=re.compile("li-estado")).get_text()) if status != "completado": logger.debug("Ignoring subtitle with status %s", status) continue # get the most updated version of the subtitle and if it doesn't exist get the original version html_status = html_language.select('a[href^="updated/"]') if len(html_status) == 0: html_status = html_language.select('a[href^="original/"]') subtitle_url = self.server_url + html_status[0]["href"] subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season, episode, title, year, release, subtitle_url) logger.debug("Found subtitle %r", subtitle) subtitles.append(subtitle) return subtitles
def _get_download_link(self, subtitle): response = self.session.get(subtitle.page_link, timeout=20) self._check_response(response) try: page_soup = ParserBeautifulSoup( response.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) links_soup = page_soup.find_all("a", {"class": "detalle_link"}) for link_soup in links_soup: if link_soup["href"].startswith("bajar"): return self.server_url + link_soup["href"] links_soup = page_soup.find_all("a", {"class": "link1"}) for link_soup in links_soup: if "bajar.php" in link_soup["href"]: return link_soup["href"] except Exception as e: raise APIThrottled(f"Error parsing download link: {e}") raise APIThrottled("Download link not found")
def _get_download_link(self, subtitle):
    response = self.session.get(subtitle.page_link, timeout=20)
    self._check_response(response)

    try:
        page_soup = ParserBeautifulSoup(
            response.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])

        links_soup = page_soup.find_all("a", {'class': 'detalle_link'})
        for link_soup in links_soup:
            if link_soup['href'].startswith('bajar'):
                return self.server_url + link_soup['href']

        links_soup = page_soup.find_all("a", {'class': 'link1'})
        for link_soup in links_soup:
            if "bajar.php" in link_soup['href']:
                return link_soup['href']
    except Exception as e:
        raise APIThrottled('Error parsing download link: ' + str(e))

    raise APIThrottled('Download link not found')
def _parse_subtitles_page(self, video, response, language): subtitles = [] page_soup = ParserBeautifulSoup( response.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) title_soups = page_soup.find_all("div", {"id": "menu_detalle_buscador"}) body_soups = page_soup.find_all("div", {"id": "buscador_detalle"}) for subtitle in range(0, len(title_soups)): title_soup, body_soup = title_soups[subtitle], body_soups[subtitle] # title title = title_soup.find("a").text.replace("Subtitulos de ", "") # filter by year if video.year and str(video.year) not in title: continue page_link = title_soup.find("a")["href"] # description description = body_soup.find("div", { "id": "buscador_detalle_sub" }).text description = description.replace(",", " ").lower() # uploader uploader = body_soup.find("a", {"class": "link1"}).text subtitle = self.subtitle_class(language, video, page_link, title, description, uploader) logger.debug("Found subtitle %r", subtitle) subtitles.append(subtitle) return subtitles
def query(self, keyword, season=None, episode=None, year=None): params = keyword if season: params += ".S{season:02d}".format(season=season) elif year: params += " {:4d}".format(year) logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type( self.search_url).format(params) r = self.session.get(search_link, timeout=30) r.raise_for_status() if not r.content: logger.debug("No data returned from provider") return [] soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) # non-shooter result page if soup.find("div", {"class": "item"}): logger.debug("enter a non-shooter page") for item in soup.find_all("div", {"class": "item"}): title_a = item.find("p", class_="tt clearfix").find("a") subs_year = re.findall(r"\d{4}", title_a.text) or None if season: title = title_a.text season_cn1 = re.search("第(.*)季", title) if not season_cn1: season_cn1 = "一" else: season_cn1 = season_cn1.group(1).strip() season_cn2 = num_to_cn(str(season)) if season_cn1 != season_cn2: continue episode_link = self.server_url + title_a.attrs["href"] new_subs = self._parse_episode_page(episode_link, subs_year) subtitles += new_subs # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles
def find_imdb_id(self, sub_id):
    """Find the IMDb id on the subtitle's detail page, or return None."""
    url = self.server_url + "index.php?tipus=adatlap&azon=a_" + sub_id
    # url = https://www.feliratok.info/index.php?tipus=adatlap&azon=a_1518600916
    logger.info('Get IMDB id from URL %s', url)
    r = self.session.get(url, timeout=10).content

    soup = ParserBeautifulSoup(r, ['lxml'])
    links = soup.find_all("a")

    for value in links:
        if "imdb.com" in str(value):
            # <a alt="iMDB" href="http://www.imdb.com/title/tt2357547/" target="_blank">
            #     <img alt="iMDB" src="img/adatlap/imdb.png"/></a>
            imdb_id = re.findall(r'(?<=www\.imdb\.com/title/).*(?=/")', str(value))[0]
            return imdb_id

    return None
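# Expected behaviour, based on the sample URL and anchor shown in the comments above
# (illustrative values only):
#   self.find_imdb_id("1518600916")  # -> "tt2357547"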
def query(self, keyword, season=None, episode=None, year=None):
    query = keyword
    if season and episode:
        query += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
    elif year:
        query += ' {:4d}'.format(year)

    params = {
        'buscar': query,  # search string
        'accion': 5,      # action search
        'oxdown': 1,      # order by downloads descending
        'pg': 1           # page 1
    }

    logger.debug('Searching subtitles %r', query)
    subtitles = []
    language = self.language_list[0]
    search_link = self.server_url + 'index.php'
    while True:
        r = self.session.get(search_link, params=params, timeout=10)
        r.raise_for_status()

        if not r.content:
            logger.debug('No data returned from provider')
            return []

        page_soup = ParserBeautifulSoup(
            r.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
        title_soups = page_soup.find_all("div", {'id': 'menu_detalle_buscador'})
        body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})
        if len(title_soups) != len(body_soups):
            logger.debug('Error in provider data')
            return []

        for subtitle in range(0, len(title_soups)):
            title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]

            # title
            title = title_soup.find("a").text.replace("Subtitulo de ", "")
            page_link = title_soup.find("a")["href"].replace('http://', 'https://')

            # body
            description = body_soup.find("div", {'id': 'buscador_detalle_sub'}).text
            download_link = body_soup.find("div", {'id': 'buscador_detalle_sub_datos'}) \
                .find("a", {'target': 'new'})["href"].replace('http://', 'https://')

            subtitle = self.subtitle_class(language, page_link, download_link,
                                           description, title)

            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        if len(title_soups) >= 20:
            params['pg'] += 1  # search next page
            time.sleep(self.multi_result_throttle)
        else:
            break

    return subtitles
def process_subs(self, series, video, url):
    subtitles = []
    logger.info('URL for subtitles %s', url)
    r = self.session.get(url, timeout=10).content

    soup = ParserBeautifulSoup(r, ['lxml'])
    tables = soup.find_all("table")
    tables = tables[0].find_all("tr")
    i = 0
    series_imdb_id = None
    for table in tables:
        if "vilagit" in str(table) and i > 1:
            try:
                sub_hun_name = table.findAll("div", {"class": "magyar"})[0]
                if isinstance(video, Episode):
                    if "vad)" not in str(sub_hun_name):
                        # <div class="magyar">A holnap legendái - 3x11</div>
                        sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?= -)', str(sub_hun_name))[0]
                    else:
                        # <div class="magyar">A pletykafészek (3. évad)</div>
                        sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?= \()', str(sub_hun_name))[0]
                if isinstance(video, Movie):
                    sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?=</div)', str(sub_hun_name))[0]
            except IndexError:
                sub_hun_name = ""

            asked_for_episode = None
            sub_season = None
            sub_episode = None
            sub_english = table.findAll("div", {"class": "eredeti"})
            if isinstance(video, Episode):
                asked_for_episode = video.episode
                if "Season" not in str(sub_english):
                    # [<div class="eredeti">DC's Legends of Tomorrow - 3x11 - Here I Go Again
                    # (HDTV-AFG, HDTV-RMX, 720p-SVA, 720p-PSA </div>]
                    sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?= -)', str(sub_english))[0]
                    sub_season = int((re.findall(r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[0]).strip())
                    sub_episode = int((re.findall(r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[1]).strip())
                else:
                    # [<div class="eredeti">Gossip Girl (Season 3) (DVDRip-REWARD)</div>]
                    sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?=\(Season)', str(sub_english))[0]
                    sub_season = int(re.findall(r"(?<=Season )\d+(?=\))", str(sub_english))[0])
                    sub_episode = int(video.episode)
            if isinstance(video, Movie):
                sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?=\()', str(sub_english))[0]

            sub_version = (str(sub_english).split('(')[len(str(sub_english).split('(')) - 1]).split(')')[0]

            # <small>Angol</small>
            lang = table.findAll("small")[0]
            sub_language = self.get_language(re.findall(r"(?<=<small>).*(?=</small>)", str(lang))[0])

            # <a href="/index.php?action=letolt&fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.
            # English.C.orig.Addic7ed.com.srt&felirat=1519162191">
            link = str(table.findAll("a")[len(table.findAll("a")) - 1]).replace("amp;", "")
            sub_downloadlink = self.server_url + re.findall(r'(?<=href="/).*(?=">)', link)[0]
            sub_id = re.findall(r"(?<=felirat\=).*(?=\"\>)", link)[0]
            sub_year = video.year
            sub_releases = [s.strip() for s in sub_version.split(',')]

            # For episodes we open the series page, so all subtitles share the series' imdb_id;
            # no need to look it up for every row.
            if isinstance(video, Episode) and series_imdb_id is not None:
                sub_imdb_id = series_imdb_id
            else:
                sub_imdb_id = self.find_imdb_id(sub_id)
                series_imdb_id = sub_imdb_id

            subtitle = SuperSubtitlesSubtitle(sub_language, sub_downloadlink, sub_id,
                                              sub_english_name.strip(), sub_season, sub_episode,
                                              sub_version, sub_releases, sub_year, sub_imdb_id,
                                              asked_for_episode,
                                              asked_for_release_group=video.release_group)
            subtitles.append(subtitle)
        i = i + 1
    return subtitles
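# get_language maps the Hungarian language labels used by the site (e.g. "Angol", as in
# the <small>Angol</small> example above) to Language objects. A sketch under that
# assumption (hypothetical table; the real helper likely covers more languages):
def get_language(self, text):
    language_map = {
        'Angol': Language('eng'),   # English
        'Magyar': Language('hun'),  # Hungarian
    }
    return language_map.get(text.strip())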
def query(self, keyword, season=None, episode=None, year=None): params = keyword if season: params += ".S{season:02d}".format(season=season) elif year: params += " {:4d}".format(year) logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type( self.search_url).format(params) r = self.session.get(search_link, timeout=30) r.raise_for_status() if not r.content: logger.debug("No data returned from provider") return [] html = r.content.decode("utf-8", "ignore") # parse window location pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url" parts = re.findall(pattern, html) redirect_url = search_link while parts: parts.reverse() redirect_url = urljoin(self.server_url, "".join(parts)) r = self.session.get(redirect_url, timeout=30) html = r.content.decode("utf-8", "ignore") parts = re.findall(pattern, html) logger.debug("search url located: " + redirect_url) soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) # non-shooter result page if soup.find("div", {"class": "item"}): logger.debug("enter a non-shooter page") for item in soup.find_all("div", {"class": "item"}): title_a = item.find("p", class_="tt clearfix").find("a") subs_year = year if season: # episode year in zimuku is the season's year not show's year actual_subs_year = re.findall(r"\d{4}", title_a.text) or None if actual_subs_year: subs_year = int(actual_subs_year[0]) - season + 1 title = title_a.text season_cn1 = re.search("第(.*)季", title) if not season_cn1: season_cn1 = "一" else: season_cn1 = season_cn1.group(1).strip() season_cn2 = num_to_cn(str(season)) if season_cn1 != season_cn2: continue episode_link = self.server_url + title_a.attrs["href"] new_subs = self._parse_episode_page(episode_link, subs_year) subtitles += new_subs # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles
def query(self, keyword, season=None, episode=None, year=None, video=None):
    params = keyword
    if season and episode:
        params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
    elif year:
        params += '&ARok={:4d}'.format(year)

    logger.debug('Searching subtitles %r', params)
    subtitles = []
    if season and episode:
        search_link = self.server_url + text_type(self.search_url_series).format(params)
    elif year:
        search_link = self.server_url + text_type(self.search_url_movies).format(params)

    r = self.session.get(search_link, timeout=30)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    # soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
    # for entity in soup.select('table .main_table > tbody > tr'):
    # for entity in soup.find_all("table", class_="main_table"):
    #     moviename = entity.text
    #     entity_url = self.server_url + entity['href']
    #     logger.debug(entity_url)
    #     r = self.session.get(entity_url, timeout=30)
    #     r.raise_for_status()
    #     logger.debug('looking into ' + entity_url)

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                               ['lxml', 'html.parser']).find("table", class_="main_table")

    # loop over subtitle cells
    if soup:
        subs = soup.find_all("tr", class_="row1")
        subs += soup.find_all("tr", class_="row2")
        for sub in subs:
            # work with text, not utf-8-encoded bytes: mixing bytes into '%s'
            # formatting and b'CZ' membership tests breaks under Python 3
            page_link = '%s%s' % (self.server_url, sub.a.get('href'))
            title = sub.find_all('td')[0:1]
            title = [x.text for x in title]
            version = sub.find(class_="fixedTip")
            if version is None:
                version = ""
            else:
                version = version['title']
            try:
                langs = [x.text for x in sub.find_all('td')[6:7]]
            except Exception:
                langs = ['CZ']
            name = '%s (%s)' % (version, langs)

            if 'CZ' in langs:
                language = Language('ces')
            elif 'SK' in langs:
                language = Language('slk')
            else:
                # skip rows with an unrecognised language flag instead of
                # leaving `language` unbound
                continue

            # read the item
            # subtitle = self.subtitle_class(language, page_link, year, version, page_link.replace("detail", "dld"))
            download_link = sub.find('a', class_='titulkydownloadajax')
            download_link = self.download_url + download_link.get('href')

            subtitle = self.subtitle_class(
                language, page_link, season, episode, version, download_link, year, title,
                asked_for_release_group=video.release_group,
                asked_for_episode=episode)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        soup.decompose()
        soup = None

    return subtitles
def query(self, series, season, episode, year=None, video=None):
    # Search for s01e03 instead of s1e3
    seasona = "%02d" % season
    episodea = "%02d" % episode
    series = fix_inconsistent_naming(series)
    seriesa = series.replace(' ', '+')

    # get the episode page
    logger.info('Getting the page for episode %s', episode)
    url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad=" + str(seasona) + \
        "&resz=" + str(episodea) + "&nyelvtipus=%25&x=24&y=8"
    logger.info('Url %s', url)
    r = self.session.get(url, timeout=10).content

    i = 0
    soup = ParserBeautifulSoup(r, ['lxml'])
    table = soup.find_all("table")[9]

    subtitles = []
    # loop over subtitle rows
    for row in table.find_all("tr"):
        i = i + 1
        if "this.style.backgroundImage='url(css/over2.jpg)" in str(row) and i > 5:
            datas = row.find_all("td")

            # Currently subliminal does not use these params, but they may come in handy later:
            # hungarian_name = re.split(r's(\d{1,2})', datas[1].find_all('b')[0].getText())[0]
            # Translator of subtitle
            # sub_translator = datas[3].getText()
            # Posting date of subtitle
            # sub_date = datas[4].getText()

            sub_year = sub_english_name = sub_version = None
            # Handle the case when '(' is in the subtitle title
            if datas[1].getText().count('(') == 1:
                sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText())[3]
            if datas[1].getText().count('(') == 2:
                sub_year = re.findall(r"(?<=\()(\d{4})(?=\))", datas[1].getText().strip())[0]
                sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText().split('(')[0])[0]

            if not sub_english_name:
                continue

            sub_season = int((re.findall(r's(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                             .lstrip('0'))
            sub_episode = int((re.findall(r'e(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                              .lstrip('0'))

            if sub_season == season and sub_episode == episode:
                sub_language = self.get_language(datas[2].find_all('img')[0]['src'].split('/')[1])
                sub_downloadlink = datas[6].find_all('a')[1]['href']
                sub_id = sub_downloadlink.split('=')[1].split('.')[0]

                if datas[1].getText().count('(') == 1:
                    sub_version = datas[1].getText().split('(')[1].split(')')[0]
                if datas[1].getText().count('(') == 2:
                    sub_version = datas[1].getText().split('(')[2].split(')')[0]

                # One subtitle can be used for several releases
                sub_releases = [s.strip() for s in sub_version.split(',')]
                subtitle = self.subtitle_class(sub_language, sub_downloadlink, sub_id,
                                               sub_english_name.strip(), sub_season, sub_episode,
                                               sub_version, sub_releases, sub_year,
                                               asked_for_release_group=video.release_group,
                                               asked_for_episode=episode)
                logger.debug('Found subtitle: %r', subtitle)
                subtitles.append(subtitle)

    return subtitles
def process_subs(self, languages, video, url):
    subtitles = []
    logger.info('URL for subtitles %s', url)
    r = self.session.get(url, timeout=10).content

    soup = ParserBeautifulSoup(r, ['lxml'])
    tables = soup.find_all("table")
    tables = tables[0].find_all("tr")
    i = 0
    series_imdb_id = None
    for table in tables:
        if "vilagit" in str(table) and i > 1:
            asked_for_episode = None
            sub_season = None
            sub_episode = None
            sub_english = table.findAll("div", {"class": "eredeti"})
            sub_english_name = None
            if isinstance(video, Episode):
                asked_for_episode = video.episode
                if "Season" not in str(sub_english):
                    # [<div class="eredeti">DC's Legends of Tomorrow - 3x11 - Here I Go Again
                    # (HDTV-AFG, HDTV-RMX, 720p-SVA, 720p-PSA </div>]
                    sub_english_name = re.search(r'(?<=<div class="eredeti">).*?(?= -)', str(sub_english))
                    sub_english_name = sub_english_name.group() if sub_english_name else ''
                    sub_season = re.search(r"(?<=- ).*?(?= - )", str(sub_english))
                    sub_season = sub_season.group() if sub_season else ''
                    sub_season = int((sub_season.split('x')[0]).strip())
                    sub_episode = re.search(r"(?<=- ).*?(?= - )", str(sub_english))
                    sub_episode = sub_episode.group() if sub_episode else ''
                    sub_episode = int((sub_episode.split('x')[1]).strip())
                else:
                    # [<div class="eredeti">Gossip Girl (Season 3) (DVDRip-REWARD)</div>]
                    sub_english_name = re.search(r'(?<=<div class="eredeti">).*?(?=\(Season)', str(sub_english))
                    sub_english_name = sub_english_name.group() if sub_english_name else ''
                    sub_season = re.search(r"(?<=Season )\d+(?=\))", str(sub_english))
                    sub_season = int(sub_season.group()) if sub_season else None
                    sub_episode = int(video.episode)
            if isinstance(video, Movie):
                sub_english_name = re.search(r'(?<=<div class="eredeti">).*?(?=</div>)', str(sub_english))
                sub_english_name = sub_english_name.group() if sub_english_name else ''
                sub_english_name = sub_english_name.split(' (')[0]

            sub_version = 'n/a'
            if len(str(sub_english).split('(')) > 1:
                sub_version = (str(sub_english).split('(')[len(str(sub_english).split('(')) - 1]).split(')')[0]

            # <small>Angol</small>
            lang = table.find("small")
            sub_language = re.search(r"(?<=<small>).*(?=</small>)", str(lang))
            sub_language = sub_language.group() if sub_language else ''
            sub_language = self.get_language(sub_language)

            # <a href="/index.php?action=letolt&fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.
            # English.C.orig.Addic7ed.com.srt&felirat=1519162191">
            link = str(table.findAll("a")[len(table.findAll("a")) - 1]).replace("amp;", "")
            sub_downloadlink = re.search(r'(?<=href="/).*(?=">)', link)
            sub_downloadlink = sub_downloadlink.group() if sub_downloadlink else ''
            sub_downloadlink = self.server_url + sub_downloadlink
            sub_id = re.search(r"(?<=felirat=).*(?=\">)", link)
            sub_id = sub_id.group() if sub_id else ''
            sub_year = video.year
            sub_releases = [s.strip() for s in sub_version.split(',')]

            uploader = ''
            for item in table.contents[7].contents:
                if isinstance(item, Tag):
                    uploader = item.text.strip('\r\n\t')
                elif isinstance(item, NavigableString):
                    uploader = item.strip('\r\n\t')

            # For episodes we open the series page, so all subtitles share the series' imdb_id;
            # no need to look it up for every row.
            if isinstance(video, Episode) and series_imdb_id is not None:
                sub_imdb_id = series_imdb_id
            else:
                sub_imdb_id = self.find_imdb_id(sub_id)
                series_imdb_id = sub_imdb_id

            subtitle = SuperSubtitlesSubtitle(
                sub_language, sub_downloadlink, sub_id, sub_english_name.strip(), sub_season,
                sub_episode, sub_version, sub_releases, sub_year, sub_imdb_id, uploader,
                asked_for_episode, asked_for_release_group=video.release_group)
            if subtitle.language in languages:
                subtitles.append(subtitle)
        i = i + 1
    return subtitles
def process_subs(self, languages, video, url):
    # this variant handles movies only
    if isinstance(video, Episode):
        return None

    subtitles = []
    logger.info('URL for subtitles %s', url)
    r = self.session.get(url, timeout=10).content

    soup = ParserBeautifulSoup(r, ['lxml'])
    tables = soup.find_all("table")
    tables = tables[0].find_all("tr")
    i = 0
    for table in tables:
        if "vilagit" in str(table) and i > 1:
            asked_for_episode = None
            sub_season = None
            sub_episode = None
            sub_english = table.findAll("div", {"class": "eredeti"})
            sub_english_name = re.search(r'(?<=<div class="eredeti">).*?(?=</div>)', str(sub_english))
            sub_english_name = sub_english_name.group() if sub_english_name else ''
            sub_english_name = sub_english_name.split(' (')[0]
            # undo HTML entity escaping in the title ('&amp;' -> '&')
            sub_english_name = sub_english_name.replace('&amp;', '&')

            sub_version = 'n/a'
            if len(str(sub_english).split('(')) > 1:
                sub_version = (str(sub_english).split('(')[len(str(sub_english).split('(')) - 1]).split(')')[0]

            # <small>Angol</small>
            lang = table.find("small")
            sub_language = re.search(r"(?<=<small>).*(?=</small>)", str(lang))
            sub_language = sub_language.group() if sub_language else ''
            sub_language = self.get_language(sub_language)

            # <a href="/index.php?action=letolt&fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.
            # English.C.orig.Addic7ed.com.srt&felirat=1519162191">
            link = str(table.findAll("a")[len(table.findAll("a")) - 1]).replace("amp;", "")
            sub_downloadlink = re.search(r'(?<=href="/).*(?=">)', link)
            sub_downloadlink = sub_downloadlink.group() if sub_downloadlink else ''
            sub_downloadlink = self.server_url + sub_downloadlink
            sub_id = re.search(r"(?<=felirat=).*(?=\">)", link)
            sub_id = sub_id.group() if sub_id else ''
            sub_year = video.year
            sub_releases = [s.strip() for s in sub_version.split(',')]

            uploader = ''
            for item in table.contents[7].contents:
                if isinstance(item, Tag):
                    uploader = item.text.strip('\r\n\t')
                elif isinstance(item, NavigableString):
                    uploader = item.strip('\r\n\t')

            sub_imdb_id = self.find_imdb_id(sub_id)
            subtitle = SuperSubtitlesSubtitle(
                sub_language, sub_downloadlink, sub_id, sub_english_name.strip(), sub_season,
                sub_episode, sub_version, sub_releases, sub_year, sub_imdb_id, uploader,
                asked_for_episode, asked_for_release_group=video.release_group)
            if subtitle.language in languages:
                subtitles.append(subtitle)
        i = i + 1
    return subtitles