def fetch(self, url, params=None, data=None, **request_params):
    """
    Fetch ``url`` via the parent class and return the body as an HTML document.

    The raw response returned by the parent ``fetch`` is stored on
    ``self.response``. When that response reports its encoding as
    'ISO-8859-1', the body is decoded as 'windows-1251' instead.

    :param url: URL handed to the parent ``fetch``
    :param params: passed through to the parent ``fetch``
    :param data: passed through to the parent ``fetch``
    :param request_params: extra keyword arguments passed through unchanged
    :return: result of ``HtmlDocument.from_string`` for the response content
    """
    parent = super(LostFilmScraper, self)
    self.response = parent.fetch(url, params, data, **request_params)
    reported = self.response.encoding
    # NOTE(review): presumably 'ISO-8859-1' is only a fallback value and the
    # pages are really cp1251 -- confirm against the site.
    charset = {'ISO-8859-1': 'windows-1251'}.get(reported, reported)
    return HtmlDocument.from_string(self.response.content, charset)
def _load_proxies(self):
    """
    Download and parse the hideme.ru proxy list.

    Requests ``BASE_URL + "/proxy-list/"``, reads one proxy per row of the
    ``table.pl`` element and resolves every proxy's port by passing the URL
    of its port image to ``_download_port_gif_and_recognize`` on a pool of
    five worker threads.

    :return: list of ``Proxy`` objects whose recognized port is truthy
    :raises ProxyListException: when the HTTP request fails, or when no
        proxy with a port is left after parsing
    """
    self.log.info("Getting hideme.ru proxy list...")
    try:
        response = self.requests_session.get(self.BASE_URL + "/proxy-list/",
                                             params=self._prepare_params(),
                                             headers=self._prepare_headers())
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise ProxyListException("Can't obtain proxies list", cause=e)

    # Text of the anonymity column -> Anonymity level; any other text
    # yields None.
    anonymity_levels = {
        u'Нет': Anonymity.NONE,
        u'Низкая': Anonymity.LOW,
        u'Средняя': Anonymity.AVG,
        u'Высокая': Anonymity.HIGH,
    }

    doc = HtmlDocument.from_string(response.content, response.encoding)
    table = doc.find('table', {'class': 'pl'})
    port_images = []
    proxies = []
    for row in table.find('tr'):
        cells = row.find('td')
        if not cells:
            # Rows without <td> cells carry no proxy data.
            continue
        # Raw string: "\d" is a regex escape, not a string escape.
        img = cells[1].find('img', {'src': r'/images/proxylist_port_\d+.gif'}).attr('src')
        if not img:
            self.log.warn("Can't get image port URL, skipping...")
            continue
        proto = cells[5].text.lower().split(", ")
        try:
            ping = int(cells[4].text.split(" ")[0])
        except (ValueError, KeyError):
            ping = 0
        anon = anonymity_levels.get(cells[6].text)
        # The second argument is a placeholder; ``port`` is assigned below
        # once the port images have been processed.
        proxy = Proxy(cells[0].text, False, cells[2].text, proto, ping, anon)
        port_images.append(self.BASE_URL + img)
        proxies.append(proxy)

    with ThreadPoolExecutor(max_workers=5) as executor:
        ports = executor.map(self._download_port_gif_and_recognize, port_images)
    # ``map`` yields results in input order, so they pair up with ``proxies``.
    for proxy, port in zip(proxies, ports):
        proxy.port = port

    proxies = [p for p in proxies if p.port]
    if not proxies:
        raise ProxyListException("Can't obtain proxies list (hideme.ru structure has changed?)")
    self.log.info("%d proxy(ies) have successfully obtained." % len(proxies))
    self.log.debug(proxies)
    return proxies
def get_files(self, section, media_id, folder_id):
    """
    Get media files by folder ID

    Fetches the ``media_show_page.php`` page for the given folder and hands
    the parsed document to ``_parse_files``.

    :param section: Section (its ``filter_val`` is put into the URL)
    :param media_id: Media ID
    :param folder_id: Folder ID
    :return: whatever ``_parse_files`` returns for the fetched page
    """
    url_args = (self.base_url, section.filter_val, media_id, folder_id)
    url = "%s/media_show_page.php?section=%s&id=%d&cid=%d" % url_args
    fetch_timer = Timer(logger=self.log, name='Fetching URL')
    with fetch_timer:
        html = self.fetch_page(url)
    return self._parse_files(HtmlDocument.from_string(html), section, media_id, folder_id)
def get_folders(self, section, media_id):
    """
    Get media folders by media ID

    Fetches the ``media_show_page.php`` page of the media and builds one
    ``Folder`` per ``table.copy`` element found inside ``table.copies``.
    A folder that fails to parse is logged and skipped; values that cannot
    be mapped (languages, qualities, properties) are logged and counted as
    warnings.

    :param section: Section (its ``filter_val`` is put into the URL)
    :param media_id: Media ID
    :return: list of ``Folder`` objects, empty when the page has no folders
    """

    def read_languages(node, unknown_message):
        # Map the alt text of every flag icon under ``node`` to a Language.
        # Unknown names are logged with ``unknown_message``, kept verbatim
        # and counted. Returns (languages, number_of_unknown_names).
        found = []
        unknown = 0
        for alt in node.find('img', {'class': 'flag'}).attrs('alt'):
            language = Language.find(alt)
            if not language:
                self.log.warn(unknown_message, alt)
                language = alt
                unknown += 1
            found.append(language)
        return found, unknown

    url = "%s/media_show_page.php?section=%s&id=%d" % (self.base_url, section.filter_val, media_id)
    with Timer(logger=self.log, name='Fetching URL'):
        html = self.fetch_page(url)

    folders = []
    warnings = 0
    with Timer(logger=self.log, name='Parsing folders'):
        document = HtmlDocument.from_string(html)
        copies_table = document.find('table', {'class': 'copies'})
        copies = copies_table.find("table", {'class': 'copy'})
        if not copies:
            self.log.warn("No folders found.")
            return []

        for c in copies:
            try:
                # The numeric folder ID follows a 4-character prefix in the
                # element's ``id`` attribute.
                folder_id = int(c.attr('id')[4:])

                title_td = c.find("td", {'class': 'copy_title'})
                flag = fmt = None
                # Every icon's alt text is tried as both a Flag and a
                # Format; the last match of each kind wins.
                for alt in title_td.find('img').attrs('alt'):
                    flag = Flag.find(alt) or flag
                    fmt = Format.find(alt) or fmt
                title = title_td.text

                server_td = c.find("td", {'class': 'server'})
                link = server_td.find("a", {'href': r'/playlist\.php.*?'}).attr('href')
                if link:
                    link = self.base_url + link
                else:
                    self.log.warn('Torrent link is undefined')
                    warnings += 1

                br_td = c.find("td", {'class': 'br'})
                languages = None
                video_quality = audio_quality = None
                embedded_subtitles = external_subtitles = None
                size = 0
                for p in br_td.find('p'):
                    # Split on the FIRST colon only, so a value that itself
                    # contains ':' is kept whole; a missing colon yields an
                    # empty value.
                    name, _, val = p.before_text.partition(":")
                    val = val.lstrip()
                    if name == 'Язык':
                        languages, unknown = read_languages(
                            p, 'Unknown audio language: %s')
                        warnings += unknown
                    elif name == 'Качество звука':
                        audio_quality = AudioQuality.find(val)
                        if not audio_quality:
                            self.log.warn('Unknown audio quality: %s', val)
                            audio_quality = val
                            warnings += 1
                    elif name == 'Качество изображения':
                        video_quality = VideoQuality.find(val)
                        if not video_quality:
                            self.log.warn('Unknown video quality: %s', val)
                            video_quality = val
                            warnings += 1
                    elif name == 'Встроенные субтитры':
                        embedded_subtitles, unknown = read_languages(
                            p, 'Unknown embedded subtitles language: %s')
                        warnings += unknown
                    elif name == 'Внешние или отключаемые субтитры':
                        external_subtitles, unknown = read_languages(
                            p, 'Unknown external subtitles language: %s')
                        warnings += unknown
                    elif name == 'Размер файлов':
                        size = self._parse_size(val)
                        if size is None:
                            self.log.warn("Can't parse size: %s", val)
                            warnings += 1
                    else:
                        self.log.warn("Unknown folder property: %s", name)
                        warnings += 1

                quality = Quality(fmt, video_quality, audio_quality)
                # Files are looked up in the whole copies table, filtered
                # by ``folder_id`` inside ``_parse_files``.
                files = self._parse_files(copies_table, section, media_id, folder_id)
                folder = Folder(folder_id, media_id, title, flag, link, quality, languages, fmt,
                                embedded_subtitles, external_subtitles, size, files, section)
                # ``str.decode`` exists on Python 2 only; it makes escaped
                # non-ASCII characters in the repr readable in the log.
                self.log.debug(repr(folder).decode("unicode-escape"))
                folders.append(folder)
            except Exception as e:
                # Best effort: one broken folder must not abort the rest.
                self.log.exception(e)
                warnings += 1

    self.log.info("Got %d folder(s) successfully, %d warning(s).", len(folders), warnings)
    return folders
def _parse_details(self, html, section, media_id):
    """
    Parse the HTML of a media page into a ``Details`` object.

    Titles, genres, countries, year and ratings are read from the cells of
    ``table.infobar``; dates, studios and people are read from the
    ``p.property`` paragraphs; plot, poster and screenshots are read from
    the ``td.contents`` cell. Unknown genres and countries are kept as
    plain names, and unknown property labels are ignored; each is logged
    and counted as a warning.

    :param html: page markup, passed to ``HtmlDocument.from_string``
    :param section: passed through to ``Details``
    :param media_id: Media ID, passed through to ``Details``
    :return: the populated ``Details`` object
    :raises ScraperError: with code 32003 when the page has no info bar
    """

    def resolve_all(finder, names, unknown_message):
        # Map every name through ``finder``; names it does not know are
        # logged with ``unknown_message``, kept verbatim and counted.
        # Returns (resolved_items, number_of_unknown_names).
        resolved = []
        unknown = 0
        for name in names:
            item = finder(name)
            if not item:
                self.log.warn(unknown_message, name)
                item = name
                unknown += 1
            resolved.append(item)
        return resolved, unknown

    details = None
    warnings = 0
    # Raw string: "\d" and "\." are regex escapes, not string escapes.
    decimal_re = re.compile(r'(\d+\.\d+)')

    with Timer(logger=self.log, name='Parsing'):
        document = HtmlDocument.from_string(html)
        contents = document.find('td', {'class': 'contents'})
        info_bar = contents.find('table', {'class': 'infobar'})
        if not info_bar:
            raise ScraperError(32003, "No media found with ID %d" % media_id)

        info_cols = info_bar.find('td')
        title = info_cols[0].find('span', {'class': 'title'}).text
        original_title = info_cols[0].find('span', {'class': 'subtitle'}).strings

        genres, unknown = resolve_all(Genre.find, info_cols[1].before_text.split(", "),
                                      'Unknown genre: %s')
        warnings += unknown

        # The last comma-separated item of this text is the year; everything
        # before it is a country name.
        temp_list = info_cols[1].after_text.split(", ")
        countries, unknown = resolve_all(Country.find, temp_list[:-1],
                                         'Unknown country: %s')
        warnings += unknown
        year = temp_list[-1]

        studios = []
        creators = []
        actors = []
        voice_artists = []
        release_date = release_date_russia = rating = user_rating = None

        user_rating_match = decimal_re.search(info_cols[2].before_text)
        if user_rating_match:
            user_rating = user_rating_match.group(0)
        rating_match = decimal_re.search(info_cols[2].after_text)
        if rating_match:
            rating = rating_match.group(0)

        for prop in contents.find('p', {'class': 'property'}):
            try:
                label = prop.before_text
                value = prop.find('span')
                if label == 'Дата премьеры:':
                    release_date = value.text
                elif label == 'Дата российской премьеры:':
                    release_date_russia = value.text
                elif label == 'Студия:':
                    studios = [item.text for item in value.find('a')]
                elif label == 'Создатели:':
                    creators = value.find('a').strings
                elif label == 'В ролях:':
                    actors = value.find('a').strings
                elif label == 'Роли озвучивали:':
                    voice_artists = value.find('a').strings
                elif label == 'Серии:':
                    # Recognised label that is deliberately not parsed here.
                    pass
                else:
                    self.log.warn('Unknown description block: %s', label)
                    warnings += 1
            except Exception as e:
                # Best effort: one broken property must not abort parsing.
                self.log.exception(e)
                warnings += 1

        plot = contents.find('div', {'style': 'display:table-cell;.*?'}).text
        poster = contents.find("div", {'class': 'media_pic'}).find("a").attr('href')
        screenshots = contents.find('div', {'id': 'imgsContainer'}).find('a').attrs('href')
        details = Details(title, original_title, countries, year, release_date, release_date_russia,
                          studios, genres, plot, creators, actors, voice_artists, rating,
                          user_rating, poster, media_id, section, screenshots)

    self.log.info("Got details successfully, %d warning(s).", warnings)
    # ``str.decode`` exists on Python 2 only; it makes escaped non-ASCII
    # characters in the repr readable in the log.
    self.log.debug(repr(details).decode("unicode-escape"))
    return details
def search(self, search_filter=None, skip=None):
    """
    Search media

    Requests ``media_page.php`` with the filter's query and cookies, then
    builds one ``Media`` per ``even``/``odd`` row of ``table.zebra``. A row
    that fails to parse is logged and skipped. ``self.has_more`` is set to
    True when the navigation bar text does not end with the text of its
    ``<b>`` element, and to False otherwise.

    :type search_filter: MediaPoiskSearchFilter
    :param search_filter: Use SearchFilter; when omitted, no query or
        cookies are sent and the results carry ``section=None``
    :param skip: How many results to skip (for paging)
    :return: list of ``Media`` objects, empty when nothing was found
    """

    def resolve_all(finder, names, unknown_message):
        # Map every name through ``finder``; names it does not know are
        # logged with ``unknown_message``, kept verbatim and counted.
        # Returns (resolved_items, number_of_unknown_names).
        resolved = []
        unknown = 0
        for name in names:
            item = finder(name)
            if not item:
                self.log.warn(unknown_message, name)
                item = name
                unknown += 1
            resolved.append(item)
        return resolved, unknown

    def clean_rating(text):
        # Normalise to one decimal place; a zero rating becomes None.
        formatted = "%01.01f" % (float(text))
        return None if formatted == '0.0' else formatted

    url = self.base_url + '/media_page.php'
    query = {}
    cookie_jar = None
    if search_filter:
        cookie_jar = search_filter.cookies
        query.update(search_filter.encoded_query)
        self.log.info('Using search filter: %s', search_filter)
    if skip:
        query['skip'] = skip
    if query:
        url += "?" + urllib.urlencode(query)

    with Timer(logger=self.log, name='Fetching URL'):
        html = self.fetch_page(url, cookie_jar=cookie_jar)

    # ``search_filter`` defaults to None, so it must not be dereferenced
    # unconditionally.
    section = search_filter.section if search_filter is not None else None
    results = []
    warnings = 0
    with Timer(logger=self.log, name='Parsing'):
        document = HtmlDocument.from_string(html)
        self.has_more = False
        table = document.find('table', {'class': 'zebra'})
        navbar = table.find('tr', {'class': 'navbar'}).first
        if "Ничего не найдено" in navbar.text:
            self.log.info("No results found.")
            return []
        cur_pages = navbar.find('b').text
        self.has_more = not navbar.text.endswith(cur_pages)

        for row in table.find('tr', {'class': 'even|odd'}):
            try:
                cols = row.find('td')
                title_td = cols[2]
                link = title_td.find('a').attr('href')
                # The media ID is whatever follows the last '=' in the link.
                media_id = int(link.split('=')[-1])
                title = title_td.find('span', {'class': 'title'}).text
                original_title = title_td.find('span', {'class': 'subtitle'}).strings
                year = cols[7].text
                rating = clean_rating(cols[-2].text)
                user_rating = clean_rating(cols[-1].text)
                added_date = cols[1].find('span').attr('title').split(" ")[0]
                added_date = str_to_date(added_date)
                flag = Flag.find(cols[0].find('img').attr('title'))

                qua_td = cols[3]
                fmt_str = qua_td.find('img').attr('alt')
                fmt = Format.find(fmt_str)
                # Test the lookup result, not the always-truthy builtin
                # ``format``.
                if not fmt:
                    self.log.warn('Unknown format: %s', fmt_str)
                    warnings += 1
                # The span title is split on ',' into a video part and an
                # audio part; the value in each part follows ': '.
                qua_str = qua_td.find('span').attr('title').split(",")
                video_qua_str = qua_str[0].split(": ")[1]
                audio_qua_str = qua_str[1].split(": ")[1]
                video_quality = VideoQuality.find(video_qua_str)
                if not video_quality:
                    self.log.warn('Unknown video quality: %s', video_qua_str)
                    warnings += 1
                audio_quality = AudioQuality.find(audio_qua_str)
                if not audio_quality:
                    self.log.warn('Unknown audio quality: %s', audio_qua_str)
                    warnings += 1
                quality = Quality(fmt, video_quality, audio_quality)

                languages, unknown = resolve_all(
                    Language.find,
                    [img.attr('alt') for img in cols[4].find('img')],
                    'Unknown language: %s')
                warnings += unknown
                genres, unknown = resolve_all(
                    Genre.find,
                    [a.text for a in cols[5].find('a')],
                    'Unknown genre: %s')
                warnings += unknown
                countries, unknown = resolve_all(
                    Country.find,
                    [a.text for a in cols[6].find('a')],
                    'Unknown country: %s')
                warnings += unknown

                media = Media(media_id, title, original_title, added_date, flag, quality, genres,
                              languages, countries, year, rating, user_rating, section)
                # ``str.decode`` exists on Python 2 only; it makes escaped
                # non-ASCII characters in the repr readable in the log.
                self.log.debug(repr(media).decode("unicode-escape"))
                results.append(media)
            except Exception as e:
                # Best effort: one broken row must not abort the search.
                self.log.exception(e)
                warnings += 1

    self.log.info("Found %d result(s), %d warning(s).", len(results), warnings)
    return results