def _search_show_id(self, series, year=None):
    """Look up the show id for *series* (optionally scoped to *year*).

    :param str series: series name to look up.
    :param year: first-air year, if known.
    :type year: int
    :return: the show id when the first suggestion matches, else ``None``.
    :rtype: int
    """
    # quotes break the site's search - replace them with spaces
    cleaned = series.replace('\'', ' ')

    # append the year to the query when one was given
    query = cleaned if year is None else '%s %d' % (cleaned, year)
    params = {'search': query, 'Submit': 'Search'}

    logger.info('Searching show ids with %r', params)
    response = self.session.get(self.server_url + 'srch.php', params=params, timeout=10)
    response.raise_for_status()
    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    # take the first suggestion, if any
    anchors = soup.select('span.titulo > a[href^="/show/"]')
    if not anchors:
        logger.warning('Show id not found: no suggestion')
        return None

    # the suggestion must match the query exactly once sanitized
    if sanitize(anchors[0].i.text.replace('\'', ' ')) != sanitize(query):
        logger.warning('Show id not found: suggestion does not match')
        return None

    found_id = int(anchors[0]['href'][6:])
    logger.debug('Found show id %d', found_id)
    return found_id
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict
    """
    logger.info('Getting show ids')
    response = self.session.get(self.server_url + 'shows.php', timeout=10)
    response.raise_for_status()

    # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
    # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
    # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
    cells = re.findall(show_cells_re, response.content)
    if cells:
        soup = ParserBeautifulSoup(b''.join(cells), ['lxml', 'html.parser'])
    else:
        # If RegEx fails, fall back to original response content and use 'html.parser'
        soup = ParserBeautifulSoup(response.content, ['html.parser'])

    # map sanitized series name -> numeric show id
    ids = {}
    for anchor in soup.select('td.version > h3 > a[href^="/show/"]'):
        ids[sanitize(anchor.text)] = int(anchor['href'][6:])
    logger.debug('Found %d show ids', len(ids))
    return ids
def _search_url_titles(self, title):
    """Search the URL titles by kind for the given `title`.

    :param str title: title to search for.
    :return: the URL titles by kind.
    :rtype: collections.defaultdict
    """
    logger.info('Searching title name for %r', title)
    response = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10)
    response.raise_for_status()

    # a chain of 302s means we landed straight on the subtitles page
    if response.history and all(h.status_code == 302 for h in response.history):
        logger.debug('Redirected to the subtitles page')
        links = [response.url]
    else:
        # otherwise scrape the suggestion links from the search results
        soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])
        links = [a.attrs['href'] for a in soup.select('#processes div.generalWindowTop a')]
        logger.debug('Found %d suggestions', len(links))

    titles_by_kind = defaultdict(list)
    for url in links:
        pieces = url.split('/')
        # .../<kind>/<url-title>/... - group url-titles under their kind
        titles_by_kind[pieces[-3]].append(pieces[-2])

    return titles_by_kind
def query(self, show_id, series, season, episode, year=None):
    """Query subtitles for one episode of show `show_id`.

    :param int show_id: provider id of the show.
    :param str series: series name, forwarded to the subtitle.
    :param int season: season number.
    :param episode: episode number (a list collapses to its minimum).
    :param year: year of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # get the episode ids
    episode_ids = self.get_episode_ids(show_id, season)

    # Provider doesn't store multi episode information
    episode = min(episode) if episode and isinstance(episode, list) else episode

    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
    # FIX: the response status was never checked before parsing; a 4xx/5xx
    # page would silently yield zero subtitles instead of raising
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitles rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item; the language code is embedded in the flag image path
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('h5').text.strip() or None

        subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season,
                                       episode, year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    # free the parse tree eagerly - these pages can be large
    soup.decompose()
    soup = None

    return subtitles
def query(self, series, season, episode, year=None, video=None):
    """Query subtitles for an episode of `series`.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :param video: the video being searched, forwarded to the row parser.
    :return: the subtitles found.
    :rtype: list
    """
    # Search for s01e03 instead of s1e3
    seasona = "%02d" % season
    episodea = "%02d" % episode
    seriesa = fix_inconsistent_naming(series)
    # BUG FIX: the normalized name was previously discarded because the raw
    # `series` was used on this line instead of `seriesa`.
    seriesa = seriesa.replace(' ', '+')

    # get the episode page
    logger.info('Getting the page for episode %s', episode)
    url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad=" + str(seasona) + \
        "&resz=" + str(episodea) + "&nyelvtipus=%25&x=24&y=8"
    logger.info('Url %s', url)
    r = self.session.get(url, timeout=10).content

    soup = ParserBeautifulSoup(r, ['lxml'])

    subtitles = []
    for num, temp in enumerate(soup.find_all("table")):
        # only tables carrying the hover background and the info icon
        # are actual subtitle result tables
        if "this.style.backgroundImage='url(css/over2.jpg)" in str(temp) \
                and "css/infooldal.png" in str(temp):
            logger.debug("Found valid table (%d index)", num)
            subtitles += self._loop_over_table(temp, season, episode, video)

    return subtitles
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `series.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict
    """
    logger.info('Getting show ids')
    response = self.session.get(self.series_url, timeout=10)
    response.raise_for_status()
    # raise_for_status only raises for 4xx/5xx; treat any non-200 as an error too
    if response.status_code != 200:
        logger.error('Error getting show ids')
        raise ProviderError('Error getting show ids')

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    # map sanitized series name -> numeric show id
    ids = {}
    for anchor in soup.select('td > a[href^="/show/"]'):
        ids[sanitize(anchor.get_text())] = int(anchor['href'][6:])
    logger.debug('Found %d show ids', len(ids))
    return ids
def _get_suggestions(self, title):
    """Search the show or movie id from the `title` and `year`.

    :param str title: title of the show.
    :return: the show suggestions found.
    :rtype: list of dict
    """
    logger.info('Searching show ids with %r', title)
    response = self.session.get(self.server_url + self.search_url.format(title),
                                headers={'Referer': self.server_url}, timeout=10)
    response.raise_for_status()

    if not response.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(response.content, ['html.parser'])

    # each <option> of the selector holds one suggestion: value=link, text=title
    suggestions = []
    for option in soup.select('select[name="Mov_sel"] > option[value]'):
        suggestions.append({'link': option.attrs['value'], 'title': option.text})
    logger.debug('Found suggestions: %r', suggestions)
    return suggestions
def _search_movie(self, movie_id):
    """Scrape the subtitle rows for the movie identified by `movie_id`.

    :param movie_id: provider id of the movie.
    :return: list of dicts with ``id``, ``rls`` and ``sub_id`` keys.
    :rtype: list
    """
    url = self.server_url + self.movie_info_url + movie_id
    response = self.session.get(url, timeout=10)
    response.raise_for_status()

    # a nearly-empty body means the provider found nothing
    if len(response.content) < 10:
        logger.debug(
            "Too short content-length in response: [{}]. Treating as No Subtitles Found "
            .format(str(response.content)))
        return []

    found = []
    html = ParserBeautifulSoup(response.content, ["html.parser"])
    for row in html.select("table#subtitlesList tbody > tr"):
        entry = {"id": movie_id}
        for index, column in enumerate(row.find_all("td")):
            if index == 0:
                # first cell holds the release name (first line only)
                entry["rls"] = column.get_text().strip().split("\n")[0]
            if index == 5:
                # sixth cell carries the subtitle id as a data attribute
                entry["sub_id"] = column.find(
                    "a", attrs={"data-subtitle-id": True})["data-subtitle-id"]
        # only keep rows where a subtitle id was actually found
        if 'sub_id' in entry:
            found.append(entry)
    return found
def _parse_episode_page(self, link, year):
    """Parse an episode page into a list of subtitles.

    :param str link: URL of the episode page.
    :param year: year of the video, forwarded to the subtitle.
    :return: the subtitles found.
    :rtype: list
    """
    response = self.session.get(link)
    page = ParserBeautifulSoup(response.content.decode("utf-8", "ignore"), ["html.parser"])
    table_body = page.find("div", class_="subs box clearfix").find("tbody")

    results = []
    for row in table_body.find_all("tr"):
        anchor = row.find("a")
        name = _extract_name(anchor.text)
        # remove ext because it can be an archive type
        name = os.path.splitext(name)[0]

        # default to English; certain flag icons signal a Chinese subtitle
        language = Language("eng")
        for img in row.find("td", class_="tac lang").find_all("img"):
            if ("hongkong" in img.attrs["src"] or "china" in img.attrs["src"]
                    or "jollyroger" in img.attrs["src"]):
                language = Language("zho")
                break

        sub_page_link = urljoin(self.server_url, anchor.attrs["href"])
        # give each subtitle its own session with the episode page as referer
        backup_session = copy.deepcopy(self.session)
        backup_session.headers["Referer"] = link

        results.append(self.subtitle_class(language, sub_page_link, name, backup_session, year))

    return results
def search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param string series: series of the episode.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the show id, if any.
    :rtype: int or None
    """
    logger.info('Searching show id for %r', series)
    response = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10)
    response.raise_for_status()

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    found_id = None
    for anchor in soup.select('div.left li div a[href^="/tvshow-"]'):
        match = link_re.match(anchor.text)
        if not match:
            logger.error('Failed to match %s', anchor.text)
            continue

        if sanitize(match.group('series')).lower() == series.lower():
            # when a year was requested it must agree with the suggestion
            if year is not None and int(match.group('first_year')) != year:
                logger.debug('Year does not match')
                continue
            found_id = int(anchor['href'][8:-5])
            logger.debug('Found show id %d', found_id)
            break

    # free the parse tree eagerly - these pages can be large
    soup.decompose()
    soup = None

    return found_id
def query(self, series, season, episode, year=None):
    """Query subtitles for a single episode of `series`.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # resolve the show id first
    show_id = self.search_show_id(series, year)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year})
        return []

    # map episode numbers to episode ids (with retries)
    episode_ids = self.retry(lambda: self.get_episode_ids(show_id, season))
    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # fetch the episode page (with retries)
    logger.info('Getting the page for episode %d', episode_ids[episode])
    response = self.retry(lambda: self.session.get(
        self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10))
    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    subtitles = []
    for row in soup.select('.subtitlen'):
        # the language code is embedded in the flag image path
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('p', title='release').text.strip() or None

        subtitle = PatchedTVsubtitlesSubtitle(language, page_link, subtitle_id, series, season,
                                              episode, year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    return subtitles
def initialize(self):
    """Create the HTTP session and, when credentials are configured, log in.

    :raises AuthenticationError: when the site rejects the credentials.
    """
    self.session = Session()
    self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    if self.username and self.password:
        logger.info('Logging in')
        payload = {
            '_method': 'POST',
            'data[User][username]': self.username,
            'data[User][password]': self.password
        }
        response = self.session.post(self.server_url + 'login', payload,
                                     allow_redirects=False, timeout=10)
        raise_for_status(response)

        # the site answers 200 with an inline error message on bad credentials
        soup = ParserBeautifulSoup(response.content, ['html.parser'])
        if soup.find('div', {'class': 'alert-error'},
                     string=re.compile(u'Usuário ou senha inválidos')):
            raise AuthenticationError(self.username)

        logger.debug('Logged in')
        self.logged_in = True
def _get_show_ids(self):
    """Fetch the provider's series list and map sanitized names to show ids.

    :return: show id per sanitized series name (empty list on empty response).
    :rtype: dict
    """
    logger.info('Getting show ids')
    response = self.session.get(self.server_url + self.all_series_url, timeout=10)
    response.raise_for_status()

    if not response.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    ids = {}
    for category in soup.findAll('seriesl'):
        # only the TV-series category is of interest
        if category.attrs['category'] == u'Σειρές':
            for show in category.findAll('series'):
                name = show.text
                cleaned = series_sanitize_re.match(name)
                if cleaned:
                    name = cleaned.group(1)
                ids[sanitize(name)] = int(show['srsid'])
            break
    logger.debug('Found %d show ids', len(ids))
    return ids
def _parse_subtitles_page(self, video, response, language):
    """Extract subtitles from a search-results page.

    :param video: the video being searched, forwarded to the subtitle.
    :param response: HTTP response holding the page (iso-8859-1 encoded).
    :param language: language of the listed subtitles.
    :return: the subtitles found.
    :rtype: list
    """
    page_soup = ParserBeautifulSoup(
        response.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
    title_soups = page_soup.find_all("div", {'id': 'menu_detalle_buscador'})
    body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})

    subtitles = []
    # titles and bodies are parallel lists - walk them by index
    for idx in range(0, len(title_soups)):
        title_soup, body_soup = title_soups[idx], body_soups[idx]

        # title
        title = title_soup.find("a").text.replace("Subtitulos de ", "")
        page_link = title_soup.find("a")["href"]

        # description
        description = body_soup.find("div", {'id': 'buscador_detalle_sub'}).text
        description = description.replace(",", " ").lower()

        # uploader
        uploader = body_soup.find("a", {'class': 'link1'}).text

        entry = self.subtitle_class(language, video, page_link, title, description, uploader)
        logger.debug('Found subtitle %r', entry)
        subtitles.append(entry)

    return subtitles
def login(self):
    """Log in to the site; return ``True`` on success, ``False`` otherwise."""
    logger.info('Logging in')

    # fetch the login form first to pick up its hidden fields (sid, tokens, ...)
    self.headers['Referer'] = self.site + '/index.php'
    self.session.headers.update(self.headers.items())
    response = self.session.get(self.loginpage)
    page = ParserBeautifulSoup(response.content, ['lxml'])

    form_data = {}
    for field in page.findAll('input'):
        form_data[field.get('name')] = field.get('value')

    form_data['username'] = self.username
    form_data['password'] = self.password
    form_data['autologin'] = '******'
    form_data['viewonline'] = 'on'

    self.headers['Referer'] = self.loginpage
    self.session.headers.update(self.headers.items())
    self.session.post(self.loginpage, form_data)

    try:
        # a PHPSESSID cookie is the proof the login succeeded
        logger.debug('Got session id %s' %
                     self.session.cookies.get_dict()['PHPSESSID'])
    except KeyError as e:
        logger.error(repr(e))
        logger.error("Didn't get session id, check your credentials")
        return False
    except Exception as e:
        logger.error(repr(e))
        logger.error('uncached error #legendasdivx #AA')
        return False

    return True
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict
    """
    logger.info('Getting show ids')
    response = self.session.get(self.server_url + 'shows.php', timeout=20, cookies=self.cookies)
    response.raise_for_status()

    # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
    # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
    # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
    cells = re.findall(show_cells_re, response.content)
    if cells:
        soup = ParserBeautifulSoup(b''.join(cells), ['lxml', 'html.parser'])
    else:
        # If RegEx fails, fall back to original response content and use 'html.parser'
        soup = ParserBeautifulSoup(response.content, ['html.parser'])

    # map sanitized series name -> numeric show id
    ids = {}
    for anchor in soup.select('td.vr > h3 > a[href^="/show/"]'):
        ids[sanitize(anchor.text)] = int(anchor['href'][6:])
    logger.debug('Found %d show ids', len(ids))
    return ids
def search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param string series: series of the episode.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the show id, if any.
    :rtype: int or None
    """
    # normalize punctuation and case once, and reuse it for all comparisons
    wanted = self.clean_punctuation(series).lower()
    logger.info('Searching show id for %r', wanted)
    response = self.session.post(self.server_url + 'search.php', data={'q': wanted}, timeout=10)
    response.raise_for_status()

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    for anchor in soup.select('div.left li div a[href^="/tvshow-"]'):
        match = link_re.match(self.clean_punctuation(anchor.text))
        if not match:
            logger.error('Failed to match %s', anchor.text)
            continue

        if self.clean_punctuation(match.group('series')).lower() != wanted:
            continue

        # when a year was requested it must agree with the suggestion
        if year is not None and int(match.group('first_year')) != year:
            logger.debug('Year does not match')
            continue

        found_id = int(anchor['href'][8:-5])
        logger.debug('Found show id %d', found_id)
        return found_id

    return None
def _search_url_titles(self, title):
    """Search the URL titles by kind for the given `title`.

    :param str title: title to search for.
    :return: the URL titles by kind.
    :rtype: collections.defaultdict
    """
    logger.info('Searching title name for %r', title)
    r = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10)
    r.raise_for_status()

    # when every hop in the history is a 302, we were redirected straight to
    # the subtitles page, so that page's URL is the single candidate link
    if r.history and all(h.status_code == 302 for h in r.history):
        logger.debug('Redirected to the subtitles page')
        candidate_links = [r.url]
    else:
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        candidate_links = [a.attrs['href']
                           for a in soup.select('#processes div.generalWindowTop a')]
        logger.debug('Found %d suggestions', len(candidate_links))

    url_titles = defaultdict(list)
    for candidate in candidate_links:
        segments = candidate.split('/')
        # .../<kind>/<url-title>/... - group url-titles under their kind
        url_titles[segments[-3]].append(segments[-2])

    return url_titles
def query(self, series, season, episode, year=None):
    """Query subtitles for one episode of `series`.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # resolve the show id
    show_id = self.get_show_id(series, year)
    if show_id is None:
        logger.error('No show id found for %s (%r)', series, year)
        return []

    # resolve the episode page url
    episode_url = self.get_episode_url(show_id, series, season, episode, year)
    if episode_url is None:
        logger.error('No episode url found for %s, season %d, episode %d', series, season, episode)
        return []

    # fetch the episode page
    r = self.session.get(episode_url, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # extract the episode title from the page header
    header_re = re.compile('Subt.+tulos de {}(.+){}x{:02d} - (.+)'.format(series, season, episode).lower())
    title = header_re.search(soup.select('#cabecera-subtitulo')[0].get_text().strip().lower()).group(2)

    subtitles = []
    for version_div in soup.find_all('div', attrs={'id': re.compile('version([0-9]+)')}):
        # release group of this version
        release = sanitize_release_group(
            release_pattern.search(version_div.find('p', class_='title-sub').contents[2]).group(1))

        for html_language in version_div.select('ul.sslist'):
            language = Language.fromtusubtitulo(html_language.find_next('b').get_text().strip())
            hearing_impaired = False

            # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
            # because if exists spanish and spanish latino subtitle for the same episode, the score will be
            # higher with spanish subtitle. Spanish subtitle takes priority.
            if language == Language('spa', 'MX'):
                language = Language('spa')
                hearing_impaired = True

            # ignore incomplete subtitles
            status = sanitize(html_language.find_next('li', class_=re.compile('li-estado')).get_text())
            if status != 'completado':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # prefer the most updated version; fall back to the original one
            html_status = html_language.select('a[href^="updated/"]')
            if len(html_status) == 0:
                html_status = html_language.select('a[href^="original/"]')

            subtitle_url = self.server_url + html_status[0]['href']
            subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season,
                                           episode, title, year, release, subtitle_url)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    """Query subtitles for one episode of `series`.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # resolve the episode page url
    episode_url = self._search_url_titles(series, season, episode, year)
    if episode_url is None:
        logger.info(
            f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}"
        )
        return []

    r = self.session.get(episode_url, headers={"Referer": self.server_url}, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

    # extract the episode title from the page heading
    heading = soup.select("#episode_title")[0].get_text().strip().lower()
    title = re.compile("{}(.+){}x{:02d}- (.+)".format(
        series, season, episode).lower()).search(heading).group(2)

    subtitles = []
    for row in soup.find_all("div", attrs={"id": "progress_buttons_row"}):
        # the language label sits in a preceding sibling div
        language = Language.fromsubtitulamos(
            row.find_previous("div", class_="subtitle_language").get_text().strip())
        hearing_impaired = False

        # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
        # because if exists spanish and spanish latino subtitle for the same episode, the score will be
        # higher with spanish subtitle. Spanish subtitle takes priority.
        if language == Language("spa", "MX"):
            language = Language("spa")
            hearing_impaired = True

        # release name of this version
        release = row.find_next("div", class_="version_name").get_text().strip()

        # ignore incomplete subtitles (no download anchor yet)
        status = row.find_next("div", class_="subtitle_buttons").contents[1]
        if status.name != "a":
            logger.debug("Ignoring subtitle in [%s] not finished", language)
            continue

        # read the subtitle url
        subtitle_url = self.server_url + status["href"][1:]
        subtitle = SubtitulamosSubtitle(language, hearing_impaired, episode_url, series, season,
                                        episode, title, year, release, subtitle_url)
        logger.debug("Found subtitle %r", subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, languages=None, title=None, imdb_id=None, video=None):
    """Query titrari.ro for subtitles matching `imdb_id` / `title`.

    :param languages: candidate languages; the first one is assigned to results.
    :param title: title to search for.
    :param imdb_id: IMDB id to search for.
    :param video: video used to order the results.
    :return: the subtitles found, ordered for `video`.
    :rtype: list
    """
    subtitles = []

    params = self.getQueryParams(imdb_id, title)
    search_response = self.session.get(self.api_url, params=params, timeout=15)
    search_response.raise_for_status()

    if not search_response.content:
        logger.debug('[#### Provider: titrari.ro] No data returned from provider')
        return []

    soup = ParserBeautifulSoup(search_response.content.decode('utf-8', 'ignore'),
                               ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('td[rowspan=\'5\']')
    for index, row in enumerate(rows):
        result_anchor_el = row.select_one('a')

        # Download link
        href = result_anchor_el.get('href')
        download_link = self.api_url + href

        fullTitle = row.parent.find("h1").find("a").text

        # FIX: the bare `except:` clauses below were narrowed to
        # `except Exception` (a bare except also swallows SystemExit /
        # KeyboardInterrupt), and `downloads` / `comments` now get defaults
        # so a parse failure can no longer leave them unbound and crash the
        # subtitle creation with a NameError.

        # Get title
        try:
            title = fullTitle.split("(")[0]
        except Exception:
            logger.error("[#### Provider: titrari.ro] Error parsing title.")

        # Get downloads count
        downloads = 0
        try:
            downloads = int(row.parent.parent.select("span")[index].text[12:])
        except Exception:
            logger.error("[#### Provider: titrari.ro] Error parsing downloads.")

        # Get year
        try:
            year = int(fullTitle.split("(")[1].split(")")[0])
        except Exception:
            year = None
            logger.error("[#### Provider: titrari.ro] Error parsing year.")

        # Get imdbId
        sub_imdb_id = self.getImdbIdFromSubtitle(row)

        comments = ''
        try:
            comments = row.parent.parent.find_all(
                "td", class_=re.compile("comment"))[index * 2 + 1].text
        except Exception:
            logger.error("Error parsing comments.")

        subtitle = self.subtitle_class(next(iter(languages)), download_link, index, None, title,
                                       sub_imdb_id, year, downloads, comments)
        logger.debug('[#### Provider: titrari.ro] Found subtitle %r', str(subtitle))
        subtitles.append(subtitle)

    ordered_subs = self.order(subtitles, video)

    return ordered_subs
def query(self, title):
    """Search the provider for `title` and parse the resulting rows.

    :param str title: title to search for.
    :return: the subtitles found.
    :rtype: list
    """
    payload = {
        'ajax': '1',
        'sSearch': title,
    }
    response = self.session.post(self.search_url, data=payload, timeout=10)
    response.raise_for_status()

    if not response.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(response.content.decode('utf-8', 'ignore'),
                               ['lxml', 'html.parser'])

    subtitles = []
    # loop over subtitle cells
    for row in soup.select('tbody > tr'):
        # title: only the anchor's direct text nodes (skip nested tags)
        title_anchor_el = row.select_one('.title > a')
        text_nodes = [node for node in title_anchor_el if isinstance(node, NavigableString)]
        title = text_nodes[0].strip()

        # year
        year = row.select_one('.year').text.strip('()')

        # download link
        download_link = self.server_url + title_anchor_el.get('href')

        # imdb id, taken from the link in the 4th cell
        imdb_link = row.select_one('td:nth-of-type(4)').select_one('a').get('href')
        imdb_id = imdb_link.split('/')[-2]

        # fps
        fps = row.select_one('.fps').text.strip()

        # additional notes
        notes = row.select_one('.notes').text.strip()

        # page link = download link (there is no seperate subtitle page link)
        subtitle = self.subtitle_class(Language.fromalpha2('lv'), download_link, download_link,
                                       title, year, imdb_id, fps, notes)
        logger.debug('nekur: Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, series, season, year=None, country=None):
    """Query all subtitles of one season of `series`.

    :param str series: series name.
    :param int season: season number.
    :param year: year of the series, if any.
    :param country: country of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # resolve the show id
    show_id = self.get_show_id(series, year, country)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
        return []

    # fetch the season page of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    response = self.session.get(self.server_url + 'show/%d' % show_id,
                                params={'season': season}, timeout=10)
    response.raise_for_status()

    if not response.content:
        # Provider returns a status of 304 Not Modified with an empty content
        # raise_for_status won't raise exception for that status code
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    # the canonical series name and year come from the page header
    match = series_year_re.match(soup.select('#header font')[0].text.strip()[:-10])
    series = match.group('series')
    year = int(match.group('year')) if match.group('year') else None

    subtitles = []
    for row in soup.select('tr.epeven'):
        cells = row('td')

        # ignore incomplete subtitles
        status = cells[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the row's fields
        language = Language.fromaddic7ed(cells[3].text)
        hearing_impaired = bool(cells[6].text)
        page_link = self.server_url + cells[2].a['href'][1:]
        season = int(cells[0].text)
        episode = int(cells[1].text)
        title = cells[2].text
        version = cells[4].text
        download_link = cells[9].a['href'][1:]

        subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season,
                                       episode, title, year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    """Query subtitles for one episode of `series`.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # resolve the episode page url
    episode_url = self._search_url_titles(series, season, episode, year)
    if episode_url is None:
        logger.error('No episode url found for %s, season %d, episode %d',
                     series, season, episode)
        return []

    page = self.session.get(episode_url, headers={'Referer': self.server_url}, timeout=10)
    page.raise_for_status()
    soup = ParserBeautifulSoup(page.content, ['lxml', 'html.parser'])

    # extract the episode title from the page heading
    heading = soup.select('#episode_title')[0].get_text().strip().lower()
    title = re.compile('{}(.+){}x{:02d}- (.+)'.format(
        series, season, episode).lower()).search(heading).group(2)

    subtitles = []
    for row in soup.find_all('div', attrs={'id': 'progress_buttons_row'}):
        # the language label sits in a preceding sibling div
        language = Language.fromsubtitulamos(
            row.find_previous('div', class_='subtitle_language').get_text().strip())
        hearing_impaired = False

        # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
        # because if exists spanish and spanish latino subtitle for the same episode, the score will be
        # higher with spanish subtitle. Spanish subtitle takes priority.
        if language == Language('spa', 'MX'):
            language = Language('spa')
            hearing_impaired = True

        # release name of this version
        release = row.find_next('div', class_='version_name').get_text().strip()

        # ignore incomplete subtitles (no download anchor yet)
        status = row.find_next('div', class_='subtitle_buttons').contents[1]
        if status.name != 'a':
            logger.debug('Ignoring subtitle in [%s] not finished', language)
            continue

        # read the subtitle url
        subtitle_url = self.server_url + status['href'][1:]
        subtitle = SubtitulamosSubtitle(language, hearing_impaired, episode_url, series, season,
                                        episode, title, year, release, subtitle_url)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def download_subtitle(self, subtitle):
    """Download `subtitle`, working through the site's image-button form."""
    if not isinstance(subtitle, Subs4FreeSubtitle):
        return

    # fetch the subtitle page
    logger.info('Downloading subtitle %r', subtitle)
    r = self.session.get(subtitle.download_link,
                         headers={'Referer': subtitle.page_link}, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('Unable to download subtitle. No data returned from provider')
        return

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
    download_element = soup.select_one('input[name="id"]')
    image_element = soup.select_one('input[type="image"]')
    subtitle_id = download_element['value'] if download_element else None
    # the clickable image's dimensions bound the fake click coordinates
    width = int(str(image_element['width']).strip('px')) if image_element else 0
    height = int(str(image_element['height']).strip('px')) if image_element else 0

    if not subtitle_id:
        logger.debug('Unable to download subtitle. No download link found')
        return

    self.apply_anti_block(subtitle)

    # emulate a click somewhere on the image submit button
    download_url = self.server_url + self.download_url
    r = self.session.post(download_url,
                          data={'id': subtitle_id,
                                'x': random.randint(0, width),
                                'y': random.randint(0, height)},
                          headers={'Referer': subtitle.download_link}, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('Unable to download subtitle. No data returned from provider')
        return

    # the payload may be an archive or the subtitle itself
    archive = _get_archive(r.content)
    subtitle_content = _get_subtitle_from_archive(archive) if archive else r.content

    if subtitle_content:
        subtitle.content = fix_line_ending(subtitle_content)
    else:
        logger.debug('Could not extract subtitle from %r', archive)
def query(self, series, season, episode, year=None):
    """Query subtitles for one episode of `series`.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :return: the subtitles found.
    :rtype: list
    """
    # resolve the show id
    show_id = self.get_show_id(series, year)
    if show_id is None:
        logger.error("No show id found for %s (%r)", series, year)
        return []

    # resolve the episode page url
    episode_url = self.get_episode_url(show_id, series, season, episode, year)
    if episode_url is None:
        logger.info(f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}")
        return []

    # fetch the episode page
    r = self.session.get(episode_url, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

    # extract the episode title from the page header
    header_re = re.compile("Subt.+tulos de {}(.+){}x{:02d} - (.+)".format(series, season, episode).lower())
    title = header_re.search(soup.select("#cabecera-subtitulo")[0].get_text().strip().lower()).group(2)

    subtitles = []
    for version_div in soup.find_all("div", attrs={"id": re.compile("version([0-9]+)")}):
        # release group of this version
        release = sanitize_release_group(
            release_pattern.search(version_div.find("p", class_="title-sub").contents[2]).group(1))

        for html_language in version_div.select("ul.sslist"):
            language = Language.fromtusubtitulo(html_language.find_next("b").get_text().strip())
            hearing_impaired = False

            # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
            # because if exists spanish and spanish latino subtitle for the same episode, the score will be
            # higher with spanish subtitle. Spanish subtitle takes priority.
            if language == Language("spa", "MX"):
                language = Language("spa")
                hearing_impaired = True

            # ignore incomplete subtitles
            status = sanitize(html_language.find_next("li", class_=re.compile("li-estado")).get_text())
            if status != "completado":
                logger.debug("Ignoring subtitle with status %s", status)
                continue

            # prefer the most updated version; fall back to the original one
            html_status = html_language.select('a[href^="updated/"]')
            if len(html_status) == 0:
                html_status = html_language.select('a[href^="original/"]')

            subtitle_url = self.server_url + html_status[0]["href"]
            subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season,
                                           episode, title, year, release, subtitle_url)
            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)

    return subtitles
def login(self):
    """Log in to Legendasdivx.pt and cache the session cookies.

    :raises AuthenticationError: when no session id comes back (bad credentials).
    :raises IPAddressBlocked: when the server blocked the client IP.
    :raises TooManyRequests: on any other HTTP error.
    :raises ServiceUnavailable: on any other unexpected error.
    """
    logger.debug('Legendasdivx.pt :: Logging in')
    try:
        # sleep for a 1 second before another request
        sleep(1)
        res = self.session.get(self.loginpage)
        res.raise_for_status()
        bsoup = ParserBeautifulSoup(res.content, ['lxml'])

        _allinputs = bsoup.findAll('input')
        data = {}
        # necessary to set 'sid' for POST request
        for field in _allinputs:
            data[field.get('name')] = field.get('value')

        # sleep for a 1 second before another request
        sleep(1)
        data['username'] = self.username
        data['password'] = self.password
        res = self.session.post(self.loginpage, data)
        res.raise_for_status()

        # make sure we're logged in
        logger.debug('Legendasdivx.pt :: Logged in successfully: PHPSESSID: %s',
                     self.session.cookies.get_dict()['PHPSESSID'])
        cj = self.session.cookies.copy()
        store_cks = ("PHPSESSID", "phpbb3_2z8zs_sid", "phpbb3_2z8zs_k", "phpbb3_2z8zs_u", "lang")
        for cn in iter(self.session.cookies.keys()):
            if cn not in store_cks:
                del cj[cn]

        # store session cookies on cache
        logger.debug("Legendasdivx.pt :: Storing legendasdivx session cookies: %r", cj)
        region.set("legendasdivx_cookies2", cj)

    except KeyError:
        logger.error("Legendasdivx.pt :: Couldn't get session ID, check your credentials")
        raise AuthenticationError(
            "Legendasdivx.pt :: Couldn't get session ID, check your credentials")
    except HTTPError as e:
        if "bloqueado" in res.text.lower():
            logger.error("LegendasDivx.pt :: Your IP is blocked on this server.")
            raise IPAddressBlocked("LegendasDivx.pt :: Your IP is blocked on this server.")
        logger.error("Legendasdivx.pt :: HTTP Error %s", e)
        # FIX: exception constructors do not %-format extra arguments the way
        # logging calls do - the message must be interpolated explicitly,
        # otherwise the raised exception carries an unformatted tuple.
        raise TooManyRequests("Legendasdivx.pt :: HTTP Error %s" % e)
    except Exception as e:
        logger.error("LegendasDivx.pt :: Uncaught error: %r", e)
        raise ServiceUnavailable("LegendasDivx.pt :: Uncaught error: %r" % e)
def _parse_subtitles_page(self, video, response, language):
    """Parse a search-results page into a list of subtitles.

    Each result is split across a title block and a detail block; the two
    lists are walked in lockstep. Forced subtitles, mismatched movie
    titles, multi-disc entries and non-SubRip entries are skipped.
    """
    subtitles = []

    page_soup = ParserBeautifulSoup(
        response.content.decode("utf-8", "ignore"), ["lxml", "html.parser"])
    title_soups = page_soup.find_all("div", {"id": "menu_detalle_buscador"})
    body_soups = page_soup.find_all("div", {"id": "buscador_detalle"})

    episode = isinstance(video, Episode)

    for index, title_soup in enumerate(title_soups):
        body_soup = body_soups[index]

        # title
        title = _clean_title(title_soup.find("a").text)

        # Forced subtitles are not supported
        if title.lower().rstrip().endswith(("forzado", "forzados")):
            logger.debug("Skipping forced subtitles: %s", title)
            continue

        # Check movie title (if the video is a movie)
        if not episode and not _check_movie(video, title):
            continue

        # Data
        datos = body_soup.find("div", {"id": "buscador_detalle_sub_datos"}).text

        # Ignore multi-disc and non-srt subtitles
        if "Cds:</b> 1" not in datos and "SubRip" not in datos:
            continue

        # Spanish (Spain) entries carry the Spain flag icon
        spain = "/pais/7.gif" in datos
        if spain:
            language = Language.fromalpha2("es")
        else:
            language = Language("spa", "MX")

        # description
        sub_details = body_soup.find("div", {"id": "buscador_detalle_sub"}).text
        description = sub_details.replace(",", " ")

        # uploader
        uploader = body_soup.find("a", {"class": "link1"}).text

        download_url = _get_download_url(body_soup)
        page_link = title_soup.find("a")["href"]

        subtitle = self.subtitle_class(language, video, page_link, title,
                                       description, uploader, download_url)

        logger.debug("Found subtitle %r", subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, keyword, season=None, episode=None, year=None):
    """Search the provider for subtitles matching `keyword`.

    Appends ``SxxEyy`` for episodes or the year for movies, then walks
    every result page, following the 'Next' pagination link until none
    remains.
    """
    params = keyword
    if season and episode:
        params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
    elif year:
        params += ' {:4d}'.format(year)

    logger.debug('Searching subtitles %r', params)

    subtitles = []
    search_link = self.server_url + text_type(self.search_url).format(params)

    while True:
        r = self.session.get(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser'])

        # loop over subtitles cells
        for cell in soup.select('td.latest_name > a:nth-of-type(1)'):
            # read the item
            page_link = cell['href']
            subtitle_id = int(page_link.rsplit('/', 2)[1])
            alpha2 = cell.parent.find('img')['src'].split('/')[-1].split('.')[0]
            language = Language.fromalpha2(alpha2)
            version = cell.text.strip() or ""

            subtitle = self.subtitle_class(language, page_link, version,
                                           self.download_url.format(subtitle_id))
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        # follow pagination when a 'Next' search link is present
        for anchor in soup.select('td a'):
            if 'Next' in anchor.text and 'search.php' in anchor['href']:
                search_link = self.server_url + anchor['href']
                break
        else:
            break

    return subtitles
def download_subtitle(self, subtitle):
    """Download `subtitle`, resolving the provider's intermediate page.

    The subtitle page is fetched first to locate either a direct link
    (``a.style55ws``) or a POST form whose ``action`` points at the file.
    The payload may be an archive, in which case the subtitle is
    extracted from it.
    """
    if not isinstance(subtitle, Subs4SeriesSubtitle):
        return

    # fetch the page that hosts the actual download target
    logger.info('Downloading subtitle %r', subtitle)
    r = self.session.get(subtitle.download_link,
                         headers={'Referer': subtitle.page_link},
                         timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('Unable to download subtitle. No data returned from provider')
        return

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
    download_element = soup.select_one('a.style55ws')
    if download_element:
        target = download_element['href']
    else:
        download_element = soup.select_one('form[method="post"]')
        target = download_element['action'] if download_element else None

    if not target:
        logger.debug('Unable to download subtitle. No download link found')
        return

    self.apply_anti_block(subtitle)

    download_url = self.server_url + target
    r = self.session.get(download_url,
                         headers={'Referer': subtitle.download_link},
                         timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('Unable to download subtitle. No data returned from provider')
        return

    archive = _get_archive(r.content)
    subtitle_content = _get_subtitle_from_archive(archive) if archive else r.content

    if subtitle_content:
        subtitle.content = fix_line_ending(subtitle_content)
    else:
        logger.debug('Could not extract subtitle from %r', archive)
def query(self, series, season, year=None, country=None):
    """Query the provider for all completed subtitles of a show season.

    :param str series: series name.
    :param int season: season number.
    :param year: year of the series, if any.
    :param country: country of the series, if any.
    :return: the found subtitles.
    :rtype: list
    """
    # get the show id
    show_id = self.get_show_id(series, year, country)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series,
                     {'year': year, 'country': country})
        return []

    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'show/%d' % show_id,
                         params={'season': season}, timeout=10)
    r.raise_for_status()

    if not r.content:
        # Provider returns a status of 304 Not Modified with an empty content
        # raise_for_status won't raise exception for that status code
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # re-read the canonical series name and year from the page header
    header_text = soup.select('#header font')[0].text.strip()[:-10]
    match = series_year_re.match(header_text)
    series = match.group('series')
    year = int(match.group('year')) if match.group('year') else None

    subtitles = []
    # loop over subtitle rows
    for subtitle_row in soup.select('tr.epeven'):
        columns = subtitle_row('td')

        # ignore incomplete subtitles
        status = columns[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item
        season = int(columns[0].text)
        episode = int(columns[1].text)
        title = columns[2].text
        page_link = self.server_url + columns[2].a['href'][1:]
        language = Language.fromaddic7ed(columns[3].text)
        version = columns[4].text
        hearing_impaired = bool(columns[6].text)
        download_link = columns[9].a['href'][1:]

        subtitle = self.subtitle_class(language, hearing_impaired, page_link, series,
                                       season, episode, title, year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def get_episode_url(self, show_id, series, season, episode, year=None):
    """Get the url best matching show id for `series`, `season`, `episode` and `year`.

    :param int show_id: show id of the series
    :param str series: serie of the episode.
    :param int season: season of the episode.
    :param int episode: number of the episode.
    :param int year: year of the series.
    :return: the episode url, if found.
    :rtype: str
    """
    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    series_sanitized = sanitize(series)
    episode_url = None

    r = self.session.get(self.subtitles_url,
                         params={'show': show_id, 'season': season},
                         timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # acceptable title forms: "<series> <year> <S>x<EE>" and "<series> <S>x<EE>"
    candidates = (
        sanitize('{} {} {}x{:02d}'.format(series_sanitized, year, season, episode)),
        sanitize('{} {}x{:02d}'.format(series_sanitized, season, episode)),
    )

    # loop over episodes rows
    for html_episode in soup.select('td > a[href*="/episodes/"]'):
        title = sanitize(html_episode.get_text())
        if any(candidate in title for candidate in candidates):
            episode_url = 'https://' + html_episode['href'][2:]
            logger.debug('Subtitle found for %s, season: %d, episode: %d. URL: %s',
                         series, season, episode, episode_url)
            break

    return episode_url
def query(self, movie_id, title, year):
    """Query the provider for subtitles of a movie.

    :param movie_id: provider id of the movie, if already known.
    :param str title: movie title, used to search when no id is given.
    :param int year: year of the movie.
    :return: the found subtitles.
    :rtype: list
    """
    # get the season list of the show
    logger.info('Getting the subtitle list of show id %s', movie_id)
    if movie_id:
        page_link = self.server_url + '/' + movie_id
    else:
        page_link = self.server_url + self.search_url.format(' '.join([title, str(year)]))

    r = self.session.get(page_link, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['html.parser'])

    # extract the year displayed on the page, if any
    year = None
    year_element = soup.select_one('td#dates_header > table div')
    if year_element:
        found = year_re.match(str(year_element.contents[2]).strip())
        if found:
            year = int(found.group(1))

    # extract the title displayed on the page, if any
    title_tag = soup.select_one('td#dates_header > table u')
    show_title = str(title_tag.contents[0]).strip() if title_tag else None

    subtitles = []
    # loop over episode rows
    for details in soup.select('.movie-details'):
        # read common info
        version = details.find('span').text
        download_link = self.server_url + details.find('a')['href']
        uploader = details.select_one('.movie-info').find('p').find('a').text
        language_code = details.select_one('.sprite')['class'][1].split('gif')[0]
        language = Language.fromietf(language_code)

        subtitle = self.subtitle_class(language, page_link, show_title, year,
                                       version, download_link, uploader)
        logger.debug('Found subtitle {!r}'.format(subtitle))
        subtitles.append(subtitle)

    return subtitles
def _parse_subtitles_page(self, video, response, language):
    """Parse a search-results page into a list of subtitles.

    Each result is split across a title block and a detail block, walked
    in lockstep. Entries whose parenthesised year contradicts the video's
    year, multi-disc entries and non-SubRip entries are skipped.

    :param video: the video being searched for (its ``year`` filters results).
    :param response: HTTP response of the search page.
    :param language: initial language (overridden per entry from the flag icon).
    :return: the parsed subtitles.
    :rtype: list
    """
    subtitles = []

    page_soup = ParserBeautifulSoup(
        response.content.decode("utf-8", "ignore"), ["lxml", "html.parser"])
    title_soups = page_soup.find_all("div", {"id": "menu_detalle_buscador"})
    body_soups = page_soup.find_all("div", {"id": "buscador_detalle"})

    for subtitle in range(0, len(title_soups)):
        title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]

        # title
        title = self._clean_title(title_soup.find("a").text)

        # discard subtitles if a year between parenthesis is present in title and doesn't match the one provided
        # in video object
        # BUGFIX: re.match only matches at the start of the string, so a year in
        # a title like "Movie (2020)" was never detected and the filter was dead
        # code; re.search finds the pattern anywhere in the title.
        if re.search(r'(\(\d{4}\))', title):
            if video.year and str(video.year) not in title:
                continue

        # Data
        datos = body_soup.find("div", {"id": "buscador_detalle_sub_datos"}).text

        # Ignore multi-disc and non-srt subtitles
        if not any(item in datos for item in ("Cds:</b> 1", "SubRip")):
            continue

        # Spanish (Spain) entries carry the Spain flag icon
        spain = "/pais/7.gif" in datos
        language = Language.fromalpha2("es") if spain else Language("spa", "MX")

        # description
        sub_details = body_soup.find("div", {"id": "buscador_detalle_sub"}).text
        description = sub_details.replace(",", " ").lower()

        # uploader
        uploader = body_soup.find("a", {"class": "link1"}).text

        page_link = title_soup.find("a")["href"]

        subtitle = self.subtitle_class(language, video, page_link, title,
                                       description, uploader)

        logger.debug("Found subtitle %r", subtitle)
        subtitles.append(subtitle)

    return subtitles
def download_subtitle(self, subtitle):
    """Download `subtitle` from the provider, trying each mirror link.

    The subtitle page lists several candidate links; each is fetched in
    turn until one returns a payload larger than 1 KiB. The payload may
    be a rar/zip archive (the subtitle is extracted from it) or a bare
    subtitle file.
    """
    if isinstance(subtitle, ZimukuSubtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('Unable to download subtitle. No data returned from provider')
            return

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser'])
        links = soup.find("div", {"class": "clearfix"}).find_all('a')
        # TODO: add settings for choice
        for down_link in links:
            # BUGFIX: do not .encode('utf-8') the href -- concatenating
            # str and bytes raises TypeError on Python 3.
            url = self.server_url + down_link.get('href')
            r = self.session.get(url,
                                 headers={'Referer': subtitle.download_link},
                                 timeout=30)
            r.raise_for_status()

            # a payload bigger than 1 KiB is assumed to be a real file,
            # not an error/redirect stub
            if len(r.content) > 1024:
                break

        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug('Could not extract subtitle from %r', archive)
def get_episode_ids(self, show_id, season):
    """Get episode ids from the show id and the season.

    :param int show_id: show id.
    :param int season: season of the episode.
    :return: episode ids per episode number.
    :rtype: dict
    """
    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season),
                         timeout=10)
    # BUGFIX: fail fast on HTTP errors instead of silently parsing an error
    # page into an empty episode dict (consistent with the other queries
    # in this provider module, which all call raise_for_status()).
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over episode rows
    episode_ids = {}
    for row in soup.select('table#table5 tr'):
        # skip rows that do not have a link to the episode page
        if not row('a', href=episode_id_re):
            continue

        # extract data from the cells
        cells = row('td')
        episode = int(cells[0].text.split('x')[1])
        episode_id = int(cells[1].a['href'][8:-5])
        episode_ids[episode] = episode_id

    if episode_ids:
        logger.debug('Found episode ids %r', episode_ids)
    else:
        logger.warning('No episode ids found')

    # free the parse tree eagerly; season pages can be large
    soup.decompose()
    soup = None

    return episode_ids
def query(self, series, season, episode, year=None, video=None):
    """Search the provider for subtitles of a given episode.

    :param str series: series name.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :param video: the searched video; ``video.release_group`` is forwarded
        to the subtitle (assumes callers always pass a video object --
        TODO confirm, a ``None`` video would raise AttributeError here).
    :return: the found subtitles.
    :rtype: list
    """
    # Search for s01e03 instead of s1e3
    seasona = "%02d" % season
    episodea = "%02d" % episode
    series = fix_inconsistent_naming(series)
    seriesa = series.replace(' ', '+')

    # get the episode page
    logger.info('Getting the page for episode %s', episode)
    url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad="+str(seasona) + \
          "&resz="+str(episodea)+"&nyelvtipus=%25&x=24&y=8"
    logger.info('Url %s', url)
    r = self.session.get(url, timeout=10).content

    i = 0
    soup = ParserBeautifulSoup(r, ['lxml'])
    # the results listing is the 10th table on the page
    table = soup.find_all("table")[9]

    subtitles = []
    # loop over subtitles rows
    for row in table.find_all("tr"):
        i = i + 1
        # result rows carry a hover background image; the first rows are layout/header
        if "this.style.backgroundImage='url(css/over2.jpg)" in str(row) and i > 5:
            datas = row.find_all("td")

            # Currently subliminal not use these params, but maybe later will come in handy
            # hunagrian_name = re.split('s(\d{1,2})', datas[1].find_all('b')[0].getText())[0]
            # Translator of subtitle
            # sub_translator = datas[3].getText()
            # Posting date of subtitle
            # sub_date = datas[4].getText()

            sub_year = sub_english_name = sub_version = None
            # Handle the case when '(' in subtitle
            # one '(' means the title carries only a release string; two mean
            # "(year)" plus "(release)"
            if datas[1].getText().count('(') == 1:
                sub_english_name = re.split('s(\d{1,2})e(\d{1,2})', datas[1].getText())[3]
            if datas[1].getText().count('(') == 2:
                sub_year = re.findall(r"(?<=\()(\d{4})(?=\))", datas[1].getText().strip())[0]
                sub_english_name = re.split('s(\d{1,2})e(\d{1,2})',
                                            datas[1].getText().split('(')[0])[0]
            if not sub_english_name:
                continue

            # parse season/episode numbers out of the bold "sXXeYY" label
            sub_season = int((re.findall('s(\d{1,2})', datas[1].find_all('b')[0].getText(),
                                         re.VERBOSE)[0])
                             .lstrip('0'))
            sub_episode = int((re.findall('e(\d{1,2})', datas[1].find_all('b')[0].getText(),
                                          re.VERBOSE)[0])
                              .lstrip('0'))

            if sub_season == season and sub_episode == episode:
                # the flag icon path encodes the subtitle language
                sub_language = self.get_language(datas[2].find_all('img')[0]['src'].split('/')[1])
                sub_downloadlink = datas[6].find_all('a')[1]['href']
                sub_id = sub_downloadlink.split('=')[1].split('.')[0]

                # the text between parentheses is the release/version string
                if datas[1].getText().count('(') == 1:
                    sub_version = datas[1].getText().split('(')[1].split(')')[0]
                if datas[1].getText().count('(') == 2:
                    sub_version = datas[1].getText().split('(')[2].split(')')[0]

                # One subtitle can be used for several releases
                sub_releases = [s.strip() for s in sub_version.split(',')]
                subtitle = self.subtitle_class(sub_language, sub_downloadlink, sub_id,
                                               sub_english_name.strip(), sub_season,
                                               sub_episode, sub_version, sub_releases,
                                               sub_year,
                                               asked_for_release_group=video.release_group,
                                               asked_for_episode=episode)
                logger.debug('Found subtitle: %r', subtitle)
                subtitles.append(subtitle)

    return subtitles