def _get_episode_url(self, show_url, video):
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        html = self._http_get(page_url[0], require_debrid=True, cache_limit=1)
        for _attr, post in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')}):
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                match = dom_parser2.parse_dom(post, 'a', req='href')
                if match:
                    url, title = match[0].attrs['href'], match[0].content
                    if not force_title:
                        if scraper_utils.release_check(video, title, require_title=False):
                            return scraper_utils.pathify_url(url)
                    else:
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                                return scraper_utils.pathify_url(url)
        page_url = dom_parser2.parse_dom(html, 'a', {'class': 'nextpostslink'}, req='href')
        if page_url:
            page_url = [page_url[0].attrs['href']]

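# Several of the pagination loops in this listing call a private __too_old(post)
# helper that is not included here. The sketch below is a guess at its contract --
# return True once a post is older than some cutoff so paging can stop. The date
# regex, date format, and FILTER_DAYS constant are assumptions, not the real values.
import datetime
import re

FILTER_DAYS = 30  # assumed cutoff; the real scrapers likely read a setting

def __too_old(self, post):
    # Hypothetical: pull a date out of the post markup and compare it to today.
    match = re.search('datetime="(\d{4}-\d{2}-\d{2})', post)
    if not match:
        return False
    post_date = datetime.datetime.strptime(match.group(1), '%Y-%m-%d').date()
    return (datetime.date.today() - post_date).days > FILTER_DAYS
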
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = '/search/' + urllib.quote_plus(title)
    html = self._http_get(search_url, require_debrid=True, cache_limit=1)
    if video_type == VIDEO_TYPES.TVSHOW:
        seen_urls = {}
        for _attr, post in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')}):
            if CATEGORIES[video_type] not in post:
                continue
            match = re.search('<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
            if match:
                show_url, match_title = match.groups()
                if show_url in seen_urls:
                    continue
                result = {'url': scraper_utils.pathify_url(show_url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
                seen_urls[show_url] = result
                results.append(result)
    elif video_type == VIDEO_TYPES.MOVIE:
        norm_title = scraper_utils.normalize_title(title)
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = [result.content for result in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')})]
        for heading, post in zip(headings, posts):
            if CATEGORIES[video_type] not in post or self.__too_old(post):
                continue
            post_url, post_title = heading
            meta = scraper_utils.parse_movie_link(post_title)
            full_title = '%s [%s] (%sp)' % (meta['title'], meta['extra'], meta['height'])
            match_year = meta['year']
            match_norm_title = scraper_utils.normalize_title(meta['title'])
            if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url(post_url), 'title': scraper_utils.cleanse_title(full_title), 'year': match_year}
                results.append(result)
    return results

def _get_episode_url(self, show_url, video):
    episode_pattern = 'href="([^"]+-s0*%se0*%s(?!\d)[^"]*)' % (video.season, video.episode)
    result = self._default_get_episode_url(show_url, video, episode_pattern)
    if result:
        return result
    url = urlparse.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=2)
    fragment = dom_parser.parse_dom(html, "ul", {"class": "episode_list"})
    if fragment:
        ep_urls = dom_parser.parse_dom(fragment[0], "a", ret="href")
        ep_dates = dom_parser.parse_dom(fragment[0], "span", {"class": "episode_air_d"})
        ep_titles = dom_parser.parse_dom(fragment[0], "span", {"class": "episode_name"})
        force_title = scraper_utils.force_title(video)
        if not force_title and kodi.get_setting("airdate-fallback") == "true" and video.ep_airdate:
            for ep_url, ep_date in zip(ep_urls, ep_dates):
                log_utils.log("Quikr Ep Airdate Matching: %s - %s - %s" % (ep_url, ep_date, video.ep_airdate), log_utils.LOGDEBUG)
                if video.ep_airdate == scraper_utils.to_datetime(ep_date, "%Y-%m-%d").date():
                    return scraper_utils.pathify_url(ep_url)
        if force_title or kodi.get_setting("title-fallback") == "true":
            norm_title = scraper_utils.normalize_title(video.ep_title)
            for ep_url, ep_title in zip(ep_urls, ep_titles):
                ep_title = re.sub("<span>.*?</span>\s*", "", ep_title)
                log_utils.log("Quikr Ep Title Matching: %s - %s - %s" % (ep_url, norm_title, video.ep_title), log_utils.LOGDEBUG)
                if norm_title == scraper_utils.normalize_title(ep_title):
                    return scraper_utils.pathify_url(ep_url)

def search(self, video_type, title, year, season=""): results = [] norm_title = scraper_utils.normalize_title(title) if video_type == VIDEO_TYPES.MOVIE: if year: base_url = urlparse.urljoin(self.base_url, "/Film/") html = self._http_get(base_url, cache_limit=48) for link in self.__parse_directory(html): if year == link["title"]: url = urlparse.urljoin(base_url, link["link"]) for movie in self.__get_files(url, cache_limit=24): match_title, match_year, _height, _extra = scraper_utils.parse_movie_link(movie["link"]) if ( not movie["directory"] and norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year) ): result = {"url": scraper_utils.pathify_url(url), "title": match_title, "year": year} results.append(result) else: base_url = urlparse.urljoin(self.base_url, "/Serial/") html = self._http_get(base_url, cache_limit=48) for link in self.__parse_directory(html): if link["directory"] and norm_title in scraper_utils.normalize_title(link["title"]): url = urlparse.urljoin(base_url, link["link"]) result = {"url": scraper_utils.pathify_url(url), "title": link["title"], "year": ""} results.append(result) return results
def _get_episode_url(self, show_url, video):
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for post in posts:
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                match = re.search('<a\s+href="([^"]+)[^>]+>(.*?)</a>', post)
                if match:
                    url, title = match.groups()
                    if not force_title:
                        if scraper_utils.release_check(video, title, require_title=False):
                            return scraper_utils.pathify_url(url)
                    else:
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                                return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

def _get_episode_url(self, show_url, video):
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = scraper_utils.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = [r.content for r in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')})]
        for heading, post in zip(headings, posts):
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                url, title = heading
                if not force_title:
                    if scraper_utils.release_check(video, title, require_title=False):
                        return scraper_utils.pathify_url(url)
                else:
                    if title_fallback and norm_title:
                        match = re.search('<strong>(.*?)</strong>', post)
                        if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                            return scraper_utils.pathify_url(url)
        page_url = dom_parser2.parse_dom(html, 'a', {'class': 'nextpostslink'}, req='href')
        if page_url:
            page_url = [page_url[0].attrs['href']]

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    if video_type == VIDEO_TYPES.TVSHOW and title:
        test_url = '/tv-show/%s/' % (scraper_utils.to_slug(title))
        test_url = scraper_utils.urljoin(self.base_url, test_url)
        html = self._http_get(test_url, require_debrid=True, cache_limit=24)
        posts = dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')})
        if posts:
            result = {'url': scraper_utils.pathify_url(test_url), 'title': scraper_utils.cleanse_title(title), 'year': ''}
            results.append(result)
    elif video_type == VIDEO_TYPES.MOVIE:
        search_title = re.sub('[^A-Za-z0-9 ]', '', title.lower())
        html = self._http_get(self.base_url, params={'s': search_title}, require_debrid=True, cache_limit=1)
        norm_title = scraper_utils.normalize_title(title)
        for _attrs, post in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')}):
            match = re.search('<h\d+[^>]*>\s*<a\s+href="([^"]+)[^>]*>(.*?)</a>', post)
            if match:
                post_url, post_title = match.groups()
                if '/tv-show/' in post or self.__too_old(post):
                    continue
                post_title = re.sub('<[^>]*>', '', post_title)
                meta = scraper_utils.parse_movie_link(post_title)
                full_title = '%s [%s] (%sp)' % (meta['title'], meta['extra'], meta['height'])
                match_year = meta['year']
                match_norm_title = scraper_utils.normalize_title(meta['title'])
                if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(post_url), 'title': scraper_utils.cleanse_title(full_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    if title:
        html = self._http_get(self.base_url, cache_limit=48)
        norm_title = scraper_utils.normalize_title(title)
        fragment = dom_parser2.parse_dom(html, 'div', {'class': 'container seo'})
        if fragment:
            match_year = ''
            for attrs, match_title in dom_parser2.parse_dom(fragment[0].content, 'a', {'class': 'link'}, req='href'):
                if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(attrs['href']), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
        for _attrs, table in dom_parser2.parse_dom(html, 'table'):
            for _attrs, td in dom_parser2.parse_dom(table, 'td'):
                match_url = dom_parser2.parse_dom(td, 'a', req='href')
                match_title = dom_parser2.parse_dom(td, 'div', {'class': 'searchTVname'})
                match_year = dom_parser2.parse_dom(td, 'span', {'class': 'right'})
                if match_url and match_title:
                    match_url = match_url[0].attrs['href']
                    match_title = match_title[0].content
                    match_year = match_year[0].content if match_year else ''
                    if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                        result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                        results.append(result)
    return results

def _get_episode_url(self, show_url, video):
    url = urlparse.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=2)
    if html:
        force_title = scraper_utils.force_title(video)
        episodes = dom_parser.parse_dom(html, 'div', {'class': '\s*el-item\s*'})
        if not force_title:
            episode_pattern = 'href="([^"]*-[sS]%02d[eE]%02d(?!\d)[^"]*)' % (int(video.season), int(video.episode))
            match = re.search(episode_pattern, html)
            if match:
                return scraper_utils.pathify_url(match.group(1))
            if kodi.get_setting('airdate-fallback') == 'true' and video.ep_airdate:
                airdate_pattern = '%02d-%02d-%d' % (video.ep_airdate.day, video.ep_airdate.month, video.ep_airdate.year)
                for episode in episodes:
                    ep_url = dom_parser.parse_dom(episode, 'a', ret='href')
                    ep_airdate = dom_parser.parse_dom(episode, 'div', {'class': 'date'})
                    if ep_url and ep_airdate:
                        ep_airdate = ep_airdate[0].strip()
                        if airdate_pattern == ep_airdate:
                            return scraper_utils.pathify_url(ep_url[0])
        if (force_title or kodi.get_setting('title-fallback') == 'true') and video.ep_title:
            norm_title = scraper_utils.normalize_title(video.ep_title)
            for episode in episodes:
                ep_url = dom_parser.parse_dom(episode, 'a', ret='href')
                ep_title = dom_parser.parse_dom(episode, 'div', {'class': 'e-name'})
                if ep_url and ep_title and norm_title == scraper_utils.normalize_title(ep_title[0]):
                    return scraper_utils.pathify_url(ep_url[0])

def _get_episode_url(self, show_url, video):
    query = scraper_utils.parse_query(show_url)
    if 'id' in query:
        url = scraper_utils.urljoin(self.base_url, '/api/v2/shows/%s' % (query['id']))
        js_data = self._http_get(url, cache_limit=.5)
        if 'episodes' in js_data:
            force_title = scraper_utils.force_title(video)
            if not force_title:
                for episode in js_data['episodes']:
                    if int(video.season) == int(episode['season']) and int(video.episode) == int(episode['number']):
                        return scraper_utils.pathify_url('?id=%s' % (episode['id']))
                if kodi.get_setting('airdate-fallback') == 'true' and video.ep_airdate:
                    for episode in js_data['episodes']:
                        if 'airdate' in episode:
                            ep_airdate = scraper_utils.to_datetime(episode['airdate'], "%Y-%m-%d").date()
                            if video.ep_airdate == (ep_airdate - datetime.timedelta(days=1)):
                                return scraper_utils.pathify_url('?id=%s' % (episode['id']))
            else:
                logger.log('Skipping S&E matching as title search is forced on: %s' % (video.trakt_id), log_utils.LOGDEBUG)
            if (force_title or kodi.get_setting('title-fallback') == 'true') and video.ep_title:
                norm_title = scraper_utils.normalize_title(video.ep_title)
                for episode in js_data['episodes']:
                    if 'name' in episode and norm_title in scraper_utils.normalize_title(episode['name']):
                        return scraper_utils.pathify_url('?id=%s' % (episode['id']))

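# For orientation, a guess at the /api/v2/shows/<id> payload the loop above walks.
# Only the key names (episodes, season, number, id, airdate, name) come from the
# code; the values are invented for illustration.
sample_js_data = {
    'episodes': [
        {'id': 12345, 'season': 2, 'number': 3,
         'airdate': '2016-03-09', 'name': 'Example Episode'},
    ]
}
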
def _default_get_episode_url(self, html, video, episode_pattern, title_pattern='', airdate_pattern=''):
    logger.log('Default Episode Url: |%s|%s|' % (self.get_name(), video), log_utils.LOGDEBUG)
    if not html:
        return
    try:
        html = html[0].content
    except AttributeError:
        pass
    force_title = scraper_utils.force_title(video)
    if not force_title:
        if episode_pattern:
            match = re.search(episode_pattern, html, re.DOTALL | re.I)
            if match:
                return scraper_utils.pathify_url(match.group(1))
        if kodi.get_setting('airdate-fallback') == 'true' and airdate_pattern and video.ep_airdate:
            airdate_pattern = airdate_pattern.replace('{year}', str(video.ep_airdate.year))
            airdate_pattern = airdate_pattern.replace('{month}', str(video.ep_airdate.month))
            airdate_pattern = airdate_pattern.replace('{p_month}', '%02d' % (video.ep_airdate.month))
            airdate_pattern = airdate_pattern.replace('{month_name}', MONTHS[video.ep_airdate.month - 1])
            airdate_pattern = airdate_pattern.replace('{short_month}', SHORT_MONS[video.ep_airdate.month - 1])
            airdate_pattern = airdate_pattern.replace('{day}', str(video.ep_airdate.day))
            airdate_pattern = airdate_pattern.replace('{p_day}', '%02d' % (video.ep_airdate.day))
            logger.log('Air Date Pattern: %s' % (airdate_pattern), log_utils.LOGDEBUG)
            match = re.search(airdate_pattern, html, re.DOTALL | re.I)
            if match:
                return scraper_utils.pathify_url(match.group(1))
    else:
        logger.log('Skipping S&E matching as title search is forced on: %s' % (video.trakt_id), log_utils.LOGDEBUG)
    if (force_title or kodi.get_setting('title-fallback') == 'true') and video.ep_title and title_pattern:
        norm_title = scraper_utils.normalize_title(video.ep_title)
        for match in re.finditer(title_pattern, html, re.DOTALL | re.I):
            episode = match.groupdict()
            if norm_title == scraper_utils.normalize_title(episode['title']):
                return scraper_utils.pathify_url(episode['url'])

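# A hedged usage sketch for _default_get_episode_url: callers pass regexes whose
# {...} placeholders are substituted from the episode airdate, and a title pattern
# whose named groups must be 'url' and 'title' (see the groupdict() lookup above).
# These pattern strings are illustrative only, not taken from any real scraper.
episode_pattern = 'href="([^"]+-s0*%se0*%s(?!\d)[^"]*)' % (video.season, video.episode)
title_pattern = 'href="(?P<url>[^"]+)"[^>]*>(?P<title>[^<]+)</a>'
airdate_pattern = 'href="([^"]+{year}-{p_month}-{p_day}[^"]*)"'
result = self._default_get_episode_url(html, video, episode_pattern, title_pattern, airdate_pattern)
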
def search(self, video_type, title, year):
    results = []
    norm_title = scraper_utils.normalize_title(title)
    if video_type == VIDEO_TYPES.MOVIE:
        if year:
            base_url = urlparse.urljoin(self.base_url, '/Film/')
            html = self._http_get(base_url, cache_limit=48)
            for link in self.__parse_directory(html):
                if year == link['title']:
                    url = urlparse.urljoin(base_url, link['link'])
                    for movie in self.__get_files(url, cache_limit=24):
                        match_title, match_year, _height, _extra = scraper_utils.parse_movie_link(movie['link'])
                        if not movie['directory'] and norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                            result = {'url': scraper_utils.pathify_url(url), 'title': match_title, 'year': year}
                            results.append(result)
    else:
        base_url = urlparse.urljoin(self.base_url, '/Serial/')
        html = self._http_get(base_url, cache_limit=48)
        for link in self.__parse_directory(html):
            if link['directory'] and norm_title in scraper_utils.normalize_title(link['title']):
                url = urlparse.urljoin(base_url, link['link'])
                result = {'url': scraper_utils.pathify_url(url), 'title': link['title'], 'year': ''}
                results.append(result)
    return results

def _get_episode_url(self, show_url, video):
    sxe = '(\.|_| )S%02dE%02d(\.|_| )' % (int(video.season), int(video.episode))
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    try:
        airdate_pattern = video.ep_airdate.strftime('(\.|_| )%Y(\.|_| )%m(\.|_| )%d(\.|_| )')
    except:
        airdate_pattern = ''
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for post in posts:
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                match = re.search('<a\s+href="([^"]+)[^>]+>(.*?)</a>', post)
                if match:
                    url, title = match.groups()
                    if not force_title:
                        if re.search(sxe, title) or (airdate_pattern and re.search(airdate_pattern, title)):
                            return scraper_utils.pathify_url(url)
                    else:
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                                return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

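# Note: for an episode that aired 2016-03-09, the strftime call above expands to
# the regex (\.|_| )2016(\.|_| )03(\.|_| )09(\.|_| ), so the airdate fallback
# matches release names like Show.2016.03.09.720p.HDTV as well as underscore- or
# space-separated variants.
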
def _get_episode_url(self, show_url, video):
    sxe = '.S%02dE%02d.' % (int(video.season), int(video.episode))
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    try:
        ep_airdate = video.ep_airdate.strftime('.%Y.%m.%d.')
    except:
        ep_airdate = ''
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for heading, post in zip(headings, posts):
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                url, title = heading
                if not force_title:
                    if (sxe in title) or (ep_airdate and ep_airdate in title):
                        return scraper_utils.pathify_url(url)
                else:
                    if title_fallback and norm_title:
                        match = re.search('<strong>(.*?)</strong>', post)
                        if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                            return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

def _get_episode_url(self, show_url, video):
    url = scraper_utils.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=2)
    episode_pattern = 'href="([^"]+-s0*%se0*%s(?!\d)[^"]*)' % (video.season, video.episode)
    parts = dom_parser2.parse_dom(html, 'ul', {'class': 'episode_list'})
    fragment = '\n'.join(part.content for part in parts)
    result = self._default_get_episode_url(fragment, video, episode_pattern)
    if result:
        return result
    ep_urls = [r.attrs['href'] for r in dom_parser2.parse_dom(fragment, 'a', req='href')]
    ep_dates = [r.content for r in dom_parser2.parse_dom(fragment, 'span', {'class': 'episode_air_d'})]
    ep_titles = [r.content for r in dom_parser2.parse_dom(fragment, 'span', {'class': 'episode_name'})]
    force_title = scraper_utils.force_title(video)
    if not force_title and kodi.get_setting('airdate-fallback') == 'true' and video.ep_airdate:
        for ep_url, ep_date in zip(ep_urls, ep_dates):
            logger.log('Quikr Ep Airdate Matching: %s - %s - %s' % (ep_url, ep_date, video.ep_airdate), log_utils.LOGDEBUG)
            if video.ep_airdate == scraper_utils.to_datetime(ep_date, '%Y-%m-%d').date():
                return scraper_utils.pathify_url(ep_url)
    if force_title or kodi.get_setting('title-fallback') == 'true':
        norm_title = scraper_utils.normalize_title(video.ep_title)
        for ep_url, ep_title in zip(ep_urls, ep_titles):
            ep_title = re.sub('<span>.*?</span>\s*', '', ep_title)
            logger.log('Quikr Ep Title Matching: %s - %s - %s' % (ep_url.encode('utf-8'), ep_title.encode('utf-8'), video.ep_title), log_utils.LOGDEBUG)
            if norm_title == scraper_utils.normalize_title(ep_title):
                return scraper_utils.pathify_url(ep_url)

def _get_episode_url(self, season_url, video):
    url = urlparse.urljoin(self.base_url, season_url)
    html = self._http_get(url, cache_limit=2)
    if int(video.episode) == 1:
        return scraper_utils.pathify_url(url)
    else:
        pattern = 'location\.href="([^&]*season-%s[^/]*/%s)"' % (video.season, video.episode)
        match = re.search(pattern, html)
        if match:
            return scraper_utils.pathify_url(match.group(1))

def _get_episode_url(self, show_url, video):
    season_url = show_url + '-season-%s/' % (video.season)
    url = urlparse.urljoin(self.base_url, season_url)
    html = self._http_get(url, allow_redirect=False, cache_limit=.5)
    if html != '/':
        if int(video.episode) == 1:
            return scraper_utils.pathify_url(url)
        else:
            pattern = 'location\.href="([^&]*season-%s/%s)"' % (video.season, video.episode)
            match = re.search(pattern, html)
            if match:
                return scraper_utils.pathify_url(match.group(1))

def search(self, video_type, title, year, season=''):
    # urljoin returns an absolute second argument unchanged, so this pins the ororo.tv domain
    url = urlparse.urljoin(self.base_url, 'http://ororo.tv/en')
    if video_type == VIDEO_TYPES.MOVIE:
        url += '/movies'
    html = self._http_get(url, cache_limit=.25)
    results = []
    norm_title = scraper_utils.normalize_title(title)
    include_paid = kodi.get_setting('%s-include_premium' % (self.get_name())) == 'true'
    for match in re.finditer('''<span class='value'>(\d{4})(.*?)href="([^"]+)[^>]+>([^<]+)''', html, re.DOTALL):
        match_year, middle, url, match_title = match.groups()
        if not include_paid and video_type == VIDEO_TYPES.MOVIE and 'paid accounts' in middle:
            continue
        if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/index.php?menu=search&query=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}
    fragment = dom_parser.parse_dom(html, 'div', {'id': sections[video_type]})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
            match = re.search('title="([^"]+)[^>]+href="([^"]+)', item)
            if match:
                match_title_year, url = match.groups()
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if match_title.startswith('Watch '):
                    match_title = match_title.replace('Watch ', '')
                if match_title.endswith(' Online'):
                    match_title = match_title.replace(' Online', '')
                if not year or not match_year or year == match_year:
                    result = {'title': match_title, 'url': scraper_utils.pathify_url(url), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/movies.php?list=search&search=')
    search_url += urllib.quote_plus(title)
    cookies = {'onlylanguage': 'en', 'lang': 'en'}
    html = self._http_get(search_url, cookies=cookies, cache_limit=.25)
    results = []
    pattern = 'id="tdmovies">\s*<a\s+href="([^"]+)">([^<]+).*?id="f7">(.*?)</TD>'
    for match in re.finditer(pattern, html, re.DOTALL):
        url, title, extra = match.groups('')
        if (video_type == VIDEO_TYPES.MOVIE and '(TVshow)' in title) or (video_type == VIDEO_TYPES.TVSHOW and '(TVshow)' not in title):
            continue
        title = title.replace('(TVshow)', '')
        title = title.strip()
        r = re.search('>(\d{4})<', extra)
        if r:
            match_year = r.group(1)
        else:
            match_year = ''
        if not year or not match_year or year == match_year:
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': match_year}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search/%s.html' % urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    fragment = dom_parser.parse_dom(html, 'div', {'class': 'list-movie'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'movie'}):
            match = re.search('class="movie-name".*?href="([^"]+)[^>]+>([^<]+)', item)
            if match:
                url, match_title = match.groups()
                is_season = re.search('\s+-\s+[Ss](\d+)$', match_title)
                # fixed: original tested the truthy constant VIDEO_TYPES.SEASON instead of comparing video_type
                if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                    match_year = ''
                    if video_type == VIDEO_TYPES.MOVIE:
                        for info_frag in dom_parser.parse_dom(item, 'p', {'class': 'info'}):
                            match = re.search('(\d{4})', info_frag)
                            if match:
                                match_year = match.group(1)
                                break
                        if not match_year:
                            match = re.search('(\d{4})$', url)
                            if match:
                                match_year = match.group(1)
                    else:
                        if season and int(is_season.group(1)) != int(season):
                            continue
                    if not year or not match_year or year == match_year:
                        result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                        results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    if video_type == VIDEO_TYPES.MOVIE:
        is_series = 1
    else:
        is_series = 2
    search_url = urlparse.urljoin(self.base_url, '/advanced-search/?q[title]=%s&q[is_series]=%s&q[year_from]=%s&q[year_to]=%s')
    search_url = search_url % (urllib.quote_plus(title), is_series, year, year)
    results = []
    html = self._http_get(search_url, cache_limit=.25)
    if not re.search('Nothing was found', html):
        for match in re.finditer('class="name">\s*<a\s+title="([^"]+)\s+\((\d{4})\)"\s+href="([^"]+)', html):
            title, year, url = match.groups('')
            if re.search('/season-\d+/episode-\d+', url):
                continue  # exclude episodes
            result = {'url': scraper_utils.pathify_url(url), 'title': title, 'year': year}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    url = urlparse.urljoin(self.base_url, '/tv-series-a-z-list')
    html = self._http_get(url, cache_limit=8)
    results = []
    pattern = '<li>\s*<a.*?href="([^"]+)[^>]*>([^<]+)'
    norm_title = scraper_utils.normalize_title(title)
    for match in re.finditer(pattern, html, re.DOTALL):
        url, match_title_year = match.groups()
        r = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
        if r:
            match_title, match_year = r.groups()
        else:
            match_title = match_title_year
            match_year = ''
        if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    title_strip = [word.decode('utf-8') for word in TITLE_STRIP]
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match_url = re.search('href="([^"]+)', item)
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        if match_url and match_title:
            item_type = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
            if item_type and item_type[0] in SEARCH_EXCLUDE:
                continue
            match_url = match_url.group(1)
            match_title = match_title[0]
            if 'SEZON' in match_title.upper():
                continue
            year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            if year_frag:
                match_year = year_frag[0]
            else:
                match_year = ''
            match_title = ' '.join([word for word in match_title.split() if word.upper() not in title_strip])
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    html = self._http_get(self.base_url, params={'s': title}, cache_limit=1)
    if re.search('Sorry, but nothing matched', html, re.I):
        return results
    fragment = dom_parser2.parse_dom(html, 'ul', {'class': 'listing-videos'})
    if not fragment:
        return results
    for attrs, match_title_year in dom_parser2.parse_dom(fragment[0].content, 'a', req='href'):
        match_url = attrs['href']
        match_title_year = re.sub('</?[^>]*>', '', match_title_year)
        match_title, match_year = scraper_utils.extra_year(match_title_year)
        if not year or not match_year or year == match_year:
            result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
            results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus('%s %s' % (title, year))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match = re.search('href="([^"]+).*?alt="([^"]+)', item, re.DOTALL)
        if match:
            url, match_title_year = match.groups()
            match = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                year_fragment = dom_parser.parse_dom(item, 'span', {'class': 'year'})
                if year_fragment:
                    match_year = year_fragment[0]
                else:
                    match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': match_title, 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/results?q=%s' % urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for result in dom_parser.parse_dom(html, 'div', {'class': 'cell'}):
        match = re.search('class="video_title".*?href="([^"]+)"[^>]*>\s*([^<]+)', result, re.DOTALL)
        if match:
            url, match_title_year = match.groups()
            match = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match = re.search('class="video_quality".*?Year\s*(?:</b>)?\s*:\s*(\d{4})', result, re.DOTALL)
                if match:
                    match_year = match.group(1)
                else:
                    match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': match_title, 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus('%s %s' % (title, year))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    if not re.search('Sorry, but nothing matched', html):
        norm_title = scraper_utils.normalize_title(title)
        for item in dom_parser.parse_dom(html, 'li', {'class': '[^"]*box-shadow[^"]*'}):
            match = re.search('href="([^"]+)"\s+title="([^"]+)', item)
            if match:
                url, match_title_year = match.groups()
                if re.search('S\d{2}E\d{2}', match_title_year):
                    continue  # skip episodes
                if re.search('TV\s*SERIES', match_title_year, re.I):
                    continue  # skip shows
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if (not year or not match_year or year == match_year) and norm_title in scraper_utils.normalize_title(match_title):
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/index.php')
    data = {'subaction': 'search', 'story': title, 'do': 'search'}
    headers = {'Referer': search_url}
    html = self._http_get(search_url, params={'do': 'search'}, data=data, headers=headers, cache_limit=1)
    fragment = dom_parser2.parse_dom(html, 'div', {'id': 'dle-content'})
    if not fragment:
        return results
    for _attrs, item in dom_parser2.parse_dom(fragment[0].content, 'div', {'class': 'short-film'}):
        match = re.search('<h5><a\s+href="([^"]+)[^>]+title="([^"]+)', item)
        if not match:
            continue
        url, match_title = match.groups('')
        result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
        results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/bestmatch-search-%s.html')
    search_title = title.replace(' ', '-')
    search_title = re.sub('[^A-Za-z0-9-]', '', search_title).lower()
    search_url = search_url % (search_title)
    html = self._http_get(search_url, cache_limit=1)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'thumbsTitle'}):
        match = re.search('href="([^"]+)[^>]*>(.*?)</a>', item)
        if match:
            url, match_title_year = match.groups('')
            match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    fragment = dom_parser.parse_dom(html, 'ul', {'class': '[^"]*listing-videos[^"]*'})
    if fragment:
        for match in re.finditer('href="([^"]+)[^>]*>(.*?)</a>', fragment[0]):
            url, match_title_year = match.groups('')
            match_title_year = re.sub('<span>|</span>', '', match_title_year)
            if re.search('S\d{2}E\d{2}', match_title_year):
                continue  # skip episodes
            match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            match_title = match_title.replace('–', '-')
            match_title = match_title.replace('’', "'")
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': match_title, 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match = re.search('href="([^"]+)', item)
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
        if match and match_title:
            url = match.group(1)
            match_title = match_title[0]
            if re.search('\d+\s*x\s*\d+', match_title):
                continue  # exclude episodes
            match = re.search('(.*?)\s+\((\d{4})\)', match_title)
            if match:
                match_title, match_year = match.groups()
            else:
                match_year = ''
            if year_frag:
                match_year = year_frag[0]
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/movie/search/')
    search_url += title
    html = self._http_get(search_url, cache_limit=1)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
        match_url = re.search('href="([^"]+)', item, re.DOTALL)
        match_year = re.search('class="jt-info">(\d{4})<', item)
        is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})
        if match_title and match_url and not is_episodes:
            match_title = match_title[0]
            match_title = re.sub('</?h2>', '', match_title)
            match_title = re.sub('\s+\d{4}$', '', match_title)
            url = urlparse.urljoin(match_url.group(1), 'watching.html')
            match_year = match_year.group(1) if match_year else ''
            if not year or not match_year or year == match_year:
                result = {'title': match_title, 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/index.php?search_keywords=')
    search_url += urllib.quote_plus(title)
    search_url += '&year=' + urllib.quote_plus(str(year))
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        search_url += '&search_section=2'
    else:
        search_url += '&search_section=1'
    results = []
    html = self._http_get(self.base_url, cache_limit=0)
    match = re.search('input type="hidden" name="key" value="([0-9a-f]*)"', html)
    if match:
        key = match.group(1)
        search_url += '&key=' + key
        html = self._http_get(search_url, cache_limit=.25)
        pattern = r'class="index_item.+?href="(.+?)" title="Watch (.+?)"?\(?([0-9]{4})?\)?"?>'
        for match in re.finditer(pattern, html):
            url, title, year = match.groups('')
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': year}
            results.append(result)
    else:
        log_utils.log('Unable to locate PW search key', log_utils.LOGWARNING)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/ajax/search.php')
    timestamp = int(time.time() * 1000)
    query = {'q': title, 'limit': '100', 'timestamp': timestamp, 'verifiedCheck': ''}
    html = self._http_get(search_url, data=query, headers=XHR, cache_limit=1)
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        media_type = 'TV SHOW'
    else:
        media_type = 'MOVIE'
    js_data = scraper_utils.parse_json(html, search_url)
    for item in js_data:
        if item['meta'].upper().startswith(media_type):
            result = {'title': scraper_utils.cleanse_title(item['title']), 'url': scraper_utils.pathify_url(item['permalink']), 'year': ''}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search?keyword=%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    results = []
    match_year = ''
    fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*movie-list[^"]*'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'item'}):
            links = dom_parser.parse_dom(item, 'a', {'class': 'name'}, ret='href')
            titles = dom_parser.parse_dom(item, 'a', {'class': 'name'})
            is_season = dom_parser.parse_dom(item, 'div', {'class': 'status'})
            for match_url, match_title in zip(links, titles):
                if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                    if video_type == VIDEO_TYPES.SEASON:
                        if season and not re.search('\s+%s$' % (season), match_title):
                            continue
                    if not year or not match_year or year == match_year:
                        result = {'title': scraper_utils.cleanse_title(match_title), 'year': '', 'url': scraper_utils.pathify_url(match_url)}
                        results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search-movies/%s.html')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=8)
    results = []
    for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
        match_title = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='title')
        url = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='href')
        if match_title and url:
            match_title, url = match_title[0], url[0]
            is_season = re.search('Season\s+(\d+)$', match_title, re.I)
            # fixed: original tested the truthy constant VIDEO_TYPES.SEASON instead of comparing video_type
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE:
                    match_year = dom_parser.parse_dom(thumb, 'div', {'class': '[^"]*status-year[^"]*'})
                    if match_year:
                        match_year = match_year[0]
                else:
                    if season and int(is_season.group(1)) != int(season):
                        continue
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/?s=%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    results = []
    match_year = ''
    for entry in dom_parser.parse_dom(html, 'header', {'class': 'entry-header'}):
        match = re.search('href="([^"]+)[^>]+>([^<]+)', entry)
        if match:
            match_url, match_title_year = match.groups()
            match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'title': match_title, 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    url = urlparse.urljoin(self.base_url, '/search.html')
    data = {'search': title}
    headers = {'Referer': self.base_url}
    html = self._http_get(url, data=data, headers=headers, cache_limit=2)
    if video_type == VIDEO_TYPES.MOVIE:
        query_type = 'watch-movie-'
    else:
        query_type = 'watch-tvshow-'
    norm_title = scraper_utils.normalize_title(title)
    for item in dom_parser.parse_dom(html, 'a', {'href': '#'}):
        match = re.search('href="(%s[^"]+)' % (query_type), item)
        if match:
            link = match.group(1)
            match_title = self.__make_title(link, query_type)
            match_year = ''
            if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or int(year) == int(match_year)):
                result = {'url': scraper_utils.pathify_url(link), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, SEARCH_URL)
    referer = scraper_utils.urljoin(self.base_url, '/search/?q=%s')
    referer = referer % (urllib.quote_plus(title))
    headers = {'Referer': referer}
    headers.update(XHR)
    params = {'searchTerm': title, 'type': SEARCH_TYPES[video_type], 'limit': 500}
    html = self._http_get(search_url, params=params, headers=headers, auth=False, cache_limit=2)
    js_data = scraper_utils.parse_json(html, search_url)
    if 'results' in js_data:
        for result in js_data['results']:
            match_year = str(result.get('year', ''))
            match_url = result.get('permalink', '')
            match_title = result.get('title', '')
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    if not self.include_paid and video_type != VIDEO_TYPES.MOVIE:
        return []
    search_url = urlparse.urljoin(self.base_url, '/search.php?q=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    if video_type == VIDEO_TYPES.MOVIE:
        pattern = '<i>\s*Movies\s*</i>(.*)'
    else:
        pattern = '<i>\s*TV Series\s*</i>(.*)'
    match = re.search(pattern, html)
    if match:
        container = match.group(1)
        pattern = "href='([^']+)'>([^<]+)\s*</a>\s*(?:\((\d{4})\))?"
        for match in re.finditer(pattern, container):
            url, match_title, match_year = match.groups('')
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/suggest.php')
    headers = {'Referer': self.base_url}
    headers.update(XHR)
    params = {'ajax': 1, 's': title, 'type': 'TVShows'}
    html = self._http_get(search_url, params=params, cache_limit=8)
    for attrs, match_title in dom_parser2.parse_dom(html, 'a', req='href'):
        match_url = attrs['href']
        match_title = re.sub('</?[^>]*>', '', match_title)
        match = re.search('\((\d{4})\)$', match_url)
        if match:
            match_year = match.group(1)
        else:
            match_year = ''
        if not year or not match_year or year == match_year:
            result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, SEARCH_URL)
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*items[^"]*'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'item'}):
            match_url = dom_parser.parse_dom(item, 'a', {'class': 'header'}, ret='href')
            match_title_year = dom_parser.parse_dom(item, 'a', {'class': 'header'})
            if match_url and match_title_year:
                match_url = match_url[0]
                match_title_year = match_title_year[0]
                r = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
                if r:
                    match_title, match_year = r.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': match_title, 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=8)
    for movie in dom_parser.parse_dom(html, 'div', {'class': 'movie'}):
        match = re.search('href="([^"]+)', movie)
        if match:
            match_url = match.group(1)
            match_title_year = dom_parser.parse_dom(movie, 'img', ret='alt')
            if match_title_year:
                match_title_year = match_title_year[0]
                match = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = dom_parser.parse_dom(movie, 'div', {'class': 'year'})
                    try:
                        match_year = match_year[0]
                    except:
                        match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    if video_type == VIDEO_TYPES.TVSHOW:
        url = urlparse.urljoin(self.base_url, '/series/all/')
        html = self._http_get(url, cache_limit=8)
        links = dom_parser.parse_dom(html, 'a', {'class': 'underilne'}, 'href')
        titles = dom_parser.parse_dom(html, 'a', {'class': 'underilne'})
        items = zip(links, titles)
    else:
        url = urlparse.urljoin(self.base_url, '/search?=%s' % urllib.quote_plus(title))
        data = {'q': title, 'go': 'Search'}
        html = self._http_get(url, data=data, cache_limit=8)
        match = re.search('you can search again in (\d+) seconds', html, re.I)
        if match:
            wait = int(match.group(1))
            if wait > self.timeout:
                wait = self.timeout
            time.sleep(wait)
            html = self._http_get(url, data=data, cache_limit=0)
        pattern = 'class="movie_box.*?href="([^"]+).*?<h1>([^<]+)'
        items = re.findall(pattern, html, re.DOTALL)
    norm_title = scraper_utils.normalize_title(title)
    for item in items:
        url, match_title = item
        if norm_title in scraper_utils.normalize_title(match_title):
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    xml_url = urlparse.urljoin(self.base_url, '/series.xml')
    xml = self._http_get(xml_url, cache_limit=24)
    if xml:
        norm_title = scraper_utils.normalize_title(title)
        match_year = ''
        try:
            for element in ET.fromstring(xml).findall('.//dizi'):
                name = element.find('adi')
                if name is not None and norm_title in scraper_utils.normalize_title(name.text):
                    url = element.find('url')
                    if url is not None and (not year or not match_year or year == match_year):
                        result = {'url': scraper_utils.pathify_url(url.text), 'title': name.text, 'year': ''}
                        results.append(result)
        except (ParseError, ExpatError) as e:
            log_utils.log('Dizilab Search Parse Error: %s' % (e), log_utils.LOGWARNING)
    return results

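# A guess at the /series.xml layout the ElementTree loop above expects, inferred
# from the findall('.//dizi'), find('adi'), and find('url') calls ('dizi'/'adi'
# are Turkish for series/name). The surrounding structure is an assumption.
sample_xml = '''
<diziler>
  <dizi>
    <adi>Example Show</adi>
    <url>/dizi/example-show</url>
  </dizi>
</diziler>
'''
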
def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    fragment = dom_parser.parse_dom(html, 'ul', {'class': '[^"]*listing-videos[^"]*'})
    if fragment:
        for match in re.finditer('href="([^"]+)[^>]*>(.*?)</a>', fragment[0]):
            url, match_title_year = match.groups('')
            match_title_year = re.sub('<span>|</span>', '', match_title_year)
            if re.search('S\d{2}E\d{2}', match_title_year):
                continue  # skip episodes
            match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            match_title = match_title.replace('–', '-')
            match_title = match_title.replace('’', "'")
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': match_title, 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    media_type = 'series' if video_type == VIDEO_TYPES.TVSHOW else 'movie'
    search_url = scraper_utils.urljoin(self.base_url, '/typeahead/%s' % (urllib.quote(title)))
    headers = {'Referer': self.base_url}
    headers.update(XHR)
    html = self._http_get(search_url, headers=headers, require_debrid=True, cache_limit=.5)
    for item in scraper_utils.parse_json(html, search_url):
        match_title = item.get('title')
        match_url = item.get('link')
        match_year = ''
        if item.get('type') == media_type and match_title and match_url:
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                results.append(result)
    return results

def __tv_search(self, title, year):
    results = []
    if title:
        norm_title = scraper_utils.normalize_title(title)
        url = '/series/letra/%s/' % (title[0])
        url = urlparse.urljoin(self.base_url, url)
        html = self._http_get(url, cache_limit=48)
        for item in dom_parser.parse_dom(html, 'li', {'class': '[^"]*bpM12[^"]*'}):
            title_frag = dom_parser.parse_dom(item, 'h2')
            year_frag = dom_parser.parse_dom(item, 'div', {'class': '[^"]*sectionDetail[^"]*'})
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            if title_frag and match_url:
                match_url = match_url[0]
                match = re.search('(.*?)<br>', title_frag[0])
                if match:
                    match_title = match.group(1)
                else:
                    match_title = title_frag[0]
                match_year = ''
                if year_frag:
                    match = re.search('(\d{4})', year_frag[0])
                    if match:
                        match_year = match.group(1)
                if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=%s')
    search_url = search_url % (urllib.quote(title))
    html = self._http_get(search_url, cache_limit=1)
    for item in dom_parser.parse_dom(html, 'h3', {'class': 'post-box-title'}):
        match = re.search('href="([^"]+)[^>]*>([^<]+)', item)
        if match:
            match_url, match_title_year = match.groups()
            is_season = re.search('Season\s+(\d+)$', match_title_year, re.I)
            # fixed: original tested the truthy constant VIDEO_TYPES.SEASON instead of comparing video_type
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match_year = ''
                if video_type == VIDEO_TYPES.SEASON:
                    match_title = match_title_year
                    if season and int(is_season.group(1)) != int(season):
                        continue
                else:
                    match = re.search('(.*?)\s+(\d{4})$', match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                        match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = self.base_url
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        search_url += '/tvshow'
    search_url += '/advanced-search.php?search='
    search_url += urllib.quote_plus(title)
    search_url += '&year=' + urllib.quote_plus(str(year))
    search_url += '&advanced_search=Search'
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for element in dom_parser.parse_dom(html, 'div', {'class': 'list_box_title'}):
        match = re.search('href="([^"]+)"\s+title="(?:Watch )?([^"]+)', element)
        if match:
            url, match_title_year = match.groups()
            match = re.search('(.*?)(?:\s+\(?\s*(\d{4})\s*\)?)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/results')
    params = {'q': title}
    referer = search_url + '?' + urllib.urlencode(params)
    headers = {'Referer': referer}
    headers.update(XHR)
    _html = self._http_get(scraper_utils.urljoin(self.base_url, 'av'), headers=headers, method='POST', cache_limit=0)
    cookies = {'begin_referer': referer, 'prounder': 1}
    html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=8)
    # a page still carrying a jquery.js script tag is likely a cached interstitial; retry uncached
    if any('jquery.js' in match.attrs['src'] for match in dom_parser2.parse_dom(html, 'script', req='src')):
        html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=0)
    for _attrs, result in dom_parser2.parse_dom(html, 'div', {'class': 'cell'}):
        title_frag = dom_parser2.parse_dom(result, 'div', {'class': 'video_title'})
        year_frag = dom_parser2.parse_dom(result, 'div', {'class': 'video_quality'})
        if not title_frag:
            continue
        match = dom_parser2.parse_dom(title_frag[0].content, 'a', req='href')
        if not match:
            continue
        match_url = match[0].attrs['href']
        match_title = match[0].content
        try:
            match = re.search('\s+(\d{4})\s+', year_frag[0].content)
            match_year = match.group(1)
        except:
            match_year = ''
        if not year or not match_year or year == match_year:
            result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results

def __movie_search(self, title, year):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search?q=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    norm_title = scraper_utils.normalize_title(title)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title = dom_parser.parse_dom(item, 'img', ret='alt')
        match_year = ''  # no year is available in this markup; dead re-check of match_year removed
        if match_url and match_title:
            match_url = match_url[0]
            match_title = match_title[0]
            if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/bestmatch-fund-movies-%s.html')
    search_title = title.replace(' ', '-')
    search_title = re.sub('[^A-Za-z0-9-]', '', search_title).lower()
    search_url = search_url % (search_title)
    html = self._http_get(search_url, cache_limit=1)
    for _attrs, item in dom_parser2.parse_dom(html, 'div', {'class': 'thumbsTitle'}):
        match = dom_parser2.parse_dom(item, 'a', req='href')
        if not match:
            continue
        match_url, match_title_year = match[0].attrs['href'], match[0].content
        match_title, match_year = scraper_utils.extra_year(match_title_year)
        if not year or not match_year or year == match_year:
            result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/advanced-search/')
    headers = {'Referer': self.base_url}
    params = {'search_query': title, 'orderby': '', 'order': '', 'wpas': 1}
    html = self._http_get(search_url, params=params, headers=headers, cache_limit=8)
    norm_title = scraper_utils.normalize_title(title)
    for _attrs, item in dom_parser2.parse_dom(html, 'div', {'class': 'datos'}):
        match = dom_parser2.parse_dom(item, 'a', req='href')
        if not match:
            continue
        match_url = match[0].attrs['href']
        is_tvshow = '/tvshows/' in match_url
        if (is_tvshow and video_type == VIDEO_TYPES.MOVIE) or (not is_tvshow and video_type == VIDEO_TYPES.TVSHOW):
            continue
        match_title = match[0].content
        match_title, match_year = scraper_utils.extra_year(match_title)
        if scraper_utils.normalize_title(match_title) in norm_title and (not year or not match_year or year == match_year):
            result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    self.__get_token()
    results = []
    search_url = urlparse.urljoin(self.base_url, '/api/v1/caut')
    timestamp = int(time.time() * 1000)
    query = {'q': title, 'limit': '100', 'timestamp': timestamp, 'verifiedCheck': self.__token}
    html = self._http_get(search_url, data=query, headers=XHR, cache_limit=1)
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        media_type = 'TV SHOW'
    else:
        media_type = 'MOVIE'
    for item in scraper_utils.parse_json(html, search_url):
        if item['meta'].upper().startswith(media_type):
            match_year = str(item['year']) if 'year' in item and item['year'] else ''
            if not year or not match_year or year == match_year:
                result = {'title': item['title'], 'url': scraper_utils.pathify_url(item['permalink']), 'year': match_year}
                results.append(result)
    return results

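# A sketch of the JSON the /api/v1/caut loop above appears to expect. The field
# names (meta, title, permalink, year) come straight from the lookups; the values
# are invented for illustration.
sample_response = [
    {'meta': 'TV SHOW / Drama', 'title': 'Example Show',
     'permalink': '/tv-show/example-show/', 'year': 2014},
]
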
def search(self, video_type, title, year, season=""): if not self.include_paid and video_type != VIDEO_TYPES.MOVIE: return [] search_url = urlparse.urljoin(self.base_url, "/search.php") html = self._http_get(search_url, params={"q": title}, cache_limit=0.25) results = [] if video_type == VIDEO_TYPES.MOVIE: pattern = "<i>\s*Movies\s*</i>(.*)" else: pattern = "<i>\s*TV Series\s*</i>(.*)" match = re.search(pattern, html) if match: container = match.group(1) pattern = "href='([^']+)'>([^<]+)\s*</a>\s*(?:\((\d{4})\))?" for match in re.finditer(pattern, container): url, match_title, match_year = match.groups("") if not year or not match_year or year == match_year: result = { "url": scraper_utils.pathify_url(url), "title": scraper_utils.cleanse_title(match_title), "year": match_year, } results.append(result) return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search/%s' % (urllib.quote(title)))
    html = self._http_get(search_url, cache_limit=8)
    fragment = dom_parser2.parse_dom(html, 'div', {'id': 'who-likes'})
    if not fragment:
        return results
    fragment = fragment[0].content
    match_url = dom_parser2.parse_dom(fragment, 'a', req='href')
    match_title_year = dom_parser2.parse_dom(fragment, 'img', req='alt')
    if match_url and match_title_year:
        match_url = match_url[0].attrs['href']
        match_title_year = match_title_year[0].attrs['alt']
        match_title, match_year = scraper_utils.extra_year(match_title_year)
        if not year or not match_year or year == match_year:
            result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    headers = {'Referer': self.base_url}
    params = {'search': title}
    html = self._http_get(self.base_url, params=params, headers=headers, cache_limit=8)
    for _attrs, item in dom_parser2.parse_dom(html, 'div', {'class': 'listCard'}):
        match_title = dom_parser2.parse_dom(item, 'p', {'class': 'extraTitle'})
        match_url = dom_parser2.parse_dom(item, 'a', req='href')
        match_year = dom_parser2.parse_dom(item, 'p', {'class': 'cardYear'})
        if match_url and match_title:
            match_url = match_url[0].attrs['href']
            match_title = match_title[0].content
            match_year = match_year[0].content if match_year else ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    url = scraper_utils.urljoin(self.base_url, '/arsiv')
    html = self._http_get(url, cache_limit=48)
    norm_title = scraper_utils.normalize_title(title)
    fragment = dom_parser2.parse_dom(html, 'div', {'class': 'ts-list-content'})
    if not fragment:
        return results
    items = dom_parser2.parse_dom(fragment[0].content, 'h1', {'class': 'ts-list-name'})
    details = dom_parser2.parse_dom(fragment[0].content, 'ul')
    for item, detail in zip(items, details):
        match = dom_parser2.parse_dom(item.content, 'a', req='href')
        match_year = re.search('<span>(\d{4})</span>', detail.content)
        if not match:
            continue
        match_url = match[0].attrs['href']
        match_title = match[0].content
        match_year = match_year.group(1) if match_year else ''
        if norm_title in scraper_utils.normalize_title(match_title):
            result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results