class DirectDownload_Scraper(scraper.Scraper):
    """Scraper for directdownload.tv (JSON API, login required, direct links)."""
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        # per-scraper settings are keyed by the scraper's display name
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))
        self.username = xbmcaddon.Addon().getSetting('%s-username' % (self.get_name()))
        self.password = xbmcaddon.Addon().getSetting('%s-password' % (self.get_name()))

    @classmethod
    def provides(cls):
        # site only indexes TV episodes
        return frozenset([VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'DirectDownload.tv'

    def resolve_link(self, link):
        # links returned by get_sources are already final hoster urls
        return link

    def format_source_label(self, item):
        return '[%s] (%s) %s' % (item['quality'], item['dd_qual'], item['host'])

    def get_sources(self, video):
        """Return hoster dicts for the episode described by |video|.

        Filters the site's JSON results by SxxExx or airdate substring and by
        the quality list carried in the related-url query string.
        """
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            if html:
                js_result = json.loads(html)
                query = urlparse.parse_qs(urlparse.urlparse(url).query)
                match_quality = Q_ORDER
                if 'quality' in query:
                    temp_quality = re.sub('\s', '', query['quality'][0])
                    match_quality = temp_quality.split(',')
                import urlresolver
                sxe_str = '.S%02dE%02d.' % (int(video.season), int(video.episode))
                # ep_airdate can legitimately be None (get_url checks for that
                # case explicitly) -- don't crash on strftime
                airdate_str = video.ep_airdate.strftime('.%Y.%m.%d.') if video.ep_airdate else None
                for result in js_result:
                    if sxe_str not in result['release'] and (airdate_str is None or airdate_str not in result['release']):
                        continue
                    if result['quality'] in match_quality:
                        for link in result['links']:
                            # skip rar archives; they are not streamable
                            if re.search('\.rar(\.|$)', link['url']):
                                continue
                            # validate url since host validation fails for real-debrid; mark links direct to avoid unusable check
                            if urlresolver.HostedMediaFile(link['url']):
                                hostname = urlparse.urlparse(link['url']).hostname
                                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': link['url'], 'rating': None,
                                          'host': hostname, 'quality': QUALITY_MAP[result['quality']],
                                          'dd_qual': result['quality'], 'direct': True}
                                hosters.append(hoster)
        return hosters

    def get_url(self, video):
        """Return (and cache) the site-relative search url for |video|.

        Tries an SxxExx search first, then falls back to an airdate search;
        picks the highest-quality matching result.
        """
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(),
                                                    video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|'
                          % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            date_match = False
            search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            results = self.search(video.video_type, search_title, '')
            if not results and video.ep_airdate is not None:
                search_title = '%s %s' % (video.title, video.ep_airdate.strftime('%Y.%m.%d'))
                results = self.search(video.video_type, search_title, '')
                date_match = True
            best_q_index = -1
            for result in results:
                # on an airdate search, require the airdate in the release title
                if date_match and video.ep_airdate.strftime('%Y.%m.%d') not in result['title']:
                    continue
                if Q_DICT[result['quality']] > best_q_index:
                    best_q_index = Q_DICT[result['quality']]
                    url = result['url']
            self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        """Add username/password settings to the inherited scraper settings."""
        settings = super(DirectDownload_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(' <setting id="%s-username" type="text" label=" Username" default="" visible="eq(-6,true)"/>' % (name))
        settings.append(' <setting id="%s-password" type="text" label=" Password" option="hidden" default="" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        """Query the site's JSON search endpoint; returns result dicts with a
        quality-tagged relative url per match."""
        search_url = urlparse.urljoin(self.base_url, '/search?query=')
        search_url += title
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        if html:
            js_result = json.loads(html)
            for match in js_result:
                url = search_url + '&quality=%s' % match['quality']
                result = {'url': url.replace(self.base_url, ''), 'title': match['release'],
                          'quality': match['quality'], 'year': ''}
                results.append(result)
        return results

    def _http_get(self, url, data=None, cache_limit=8):
        # return all uncached blank pages if no user or pass
        if not self.username or not self.password:
            return ''
        if 'search?query' in url:
            log_utils.log('Translating Search Url: %s' % (url), xbmc.LOGDEBUG)
            url = self.__translate_search(url)
        html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout,
                                                                    data=data, cache_limit=cache_limit)
        # fake is True when the site returned its logged-out placeholder JSON;
        # None means the response wasn't JSON at all (check LOGOUT marker instead)
        fake = None
        try:
            js_result = json.loads(html)
            fake = False
            fake = js_result[0]['fake']
        except (ValueError, IndexError, KeyError, TypeError):
            pass
        if fake or (fake is None and not re.search(LOGOUT, html)):
            log_utils.log('Logging in for url (%s)' % (url), xbmc.LOGDEBUG)
            self.__login()
            html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout,
                                                                        data=data, cache_limit=0)
        return html

    def __translate_search(self, url):
        # rewrite a generic '/search?query=' url into the site's SEARCH_URL form
        query = urlparse.parse_qs(urlparse.urlparse(url).query)
        quality = re.sub('\s', '', query['quality'][0]) if 'quality' in query else ','.join(Q_ORDER)
        return urlparse.urljoin(self.base_url, (SEARCH_URL % (urllib.quote(query['query'][0]), quality)))

    def __login(self):
        """POST credentials; raise if the LOGOUT marker is absent afterwards."""
        url = self.base_url
        data = {'username': self.username, 'password': self.password, 'Login': '******'}
        html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout,
                                                                    data=data, cache_limit=0)
        if not re.search(LOGOUT, html):
            raise Exception('directdownload.tv login failed')
class Scraper(object):
    """Abstract base class all site scrapers derive from.

    Provides the cached-HTTP machinery, cookie handling, settings plumbing
    and shared quality heuristics; subclasses implement the abstract methods.
    """
    __metaclass__ = abc.ABCMeta
    base_url = BASE_URL

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        self.db_connection = DB_Connection()

    @abstractclassmethod
    def provides(cls):
        """
        Must return a list/set/frozenset of VIDEO_TYPES that are supported by this scraper. Is a class method so that
        instances of the class don't have to be instantiated to determine they are not useful

        * Datatypes set or frozenset are preferred as existence checking is faster with sets
        """
        raise NotImplementedError

    @abstractclassmethod
    def get_name(cls):
        """
        Must return a string that is a name that will be used through out the UI and DB to refer to urls from this
        source. Should be descriptive enough to be recognized but short enough to be presented in the UI
        """
        raise NotImplementedError

    @abc.abstractmethod
    def resolve_link(self, link):
        """
        Must return a string that is a urlresolver resolvable link given a link that this scraper supports

        link: a url fragment associated with this site that can be resolved to a hoster link

        * The purpose is many streaming sites provide the actual hoster link in a separate page from link
          on the video page.
        * This method is called for the user selected source before calling urlresolver on it.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format_source_label(self, item):
        """
        Must return a string that is to be the label to be used for this source in the "Choose Source" dialog

        item: one element of the list that is returned from get_sources for this scraper
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_sources(self, video):
        """
        Must return a list of dictionaries that are potential link to hoster sites (or links to links to hoster sites)
        Each dictionary must contain elements of at least:
            * multi-part: True if this source is one part of a whole
            * class: a reference to an instance of the scraper itself
            * host: the hostname of the hoster
            * url: the url that is a link to a hoster, or a link to a page that this scraper can resolve to a link to a hoster
            * quality: one of the QUALITIES values, or None if unknown; users can sort sources by quality
            * views: count of the views from the site for this source or None is unknown; Users can sort sources by views
            * rating: a value between 0 and 100; 0 being worst, 100 the best, or None if unknown. Users can sort sources by rating.
            * direct: True if url is a direct link to a media file; False if not. If not present; assumption is direct
            * other keys are allowed as needed if they would be useful (e.g. for format_source_label)

        video is an object of type ScraperVideo:
            video_type: one of VIDEO_TYPES for whatever the sources should be for
            title: the title of the tv show or movie
            year: the year of the tv show or movie
            season: only present for tv shows; the season number of the video for which sources are requested
            episode: only present for tv shows; the episode number of the video for which sources are requested
            ep_title: only present for tv shows; the episode title if available
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_url(self, video):
        """
        Must return a url for the site this scraper is associated with that is related to this video.

        video is an object of type ScraperVideo:
            video_type: one of VIDEO_TYPES this url is for (e.g. EPISODE urls might be different than TVSHOW urls)
            title: the title of the tv show or movie
            year: the year of the tv show or movie
            season: only present for season or episode VIDEO_TYPES; the season number for the url being requested
            episode: only present for season or episode VIDEO_TYPES; the episode number for the url being requested
            ep_title: only present for tv shows; the episode title if available

        * Generally speaking, domain should not be included
        """
        raise NotImplementedError

    def _default_get_url(self, video):
        """Shared get_url implementation: resolve the show-level url first,
        then (for episodes) resolve/cache the episode-level url from it."""
        temp_video_type = video.video_type
        if video.video_type == VIDEO_TYPES.EPISODE:
            temp_video_type = VIDEO_TYPES.TVSHOW
        url = None
        result = self.db_connection.get_related_url(temp_video_type, video.title, video.year, self.get_name())
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|'
                          % (temp_video_type, video.title, video.year, self.get_name(), url))
        else:
            results = self.search(temp_video_type, video.title, video.year)
            if results:
                url = results[0]['url']
                self.db_connection.set_related_url(temp_video_type, video.title, video.year, self.get_name(), url)
        if url and video.video_type == VIDEO_TYPES.EPISODE:
            result = self.db_connection.get_related_url(VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(),
                                                        video.season, video.episode)
            if result:
                url = result[0][0]
                log_utils.log('Got local related url: |%s|%s|%s|' % (video, self.get_name(), url))
            else:
                show_url = url
                url = self._get_episode_url(show_url, video)
                if url:
                    self.db_connection.set_related_url(VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(),
                                                       url, video.season, video.episode)
        return url

    @abc.abstractmethod
    def search(self, video_type, title, year):
        """
        Must return a list of results returned from the site associated with this scraper when doing a search using the
        input parameters

        If it does return results, it must be a list of dictionaries. Each dictionary must contain at least the
        following:
            * title: title of the result
            * year: year of the result
            * url: a url fragment that is the url on the site associated with this scraper for this season result item

        video_type: one of the VIDEO_TYPES being searched for. Only tvshows and movies are expected generally
        title: the title being search for
        year: the year being search for

        * Method must be provided, but can raise NotImplementedError if search not available on the site
        """
        raise NotImplementedError

    @classmethod
    def get_settings(cls):
        """
        Returns a list of settings to be used for this scraper. Settings are automatically checked for updates every
        time scrapers are imported. The list returned by each scraper is aggregated into a big settings.xml string, and
        then if it differs from the current settings xml in the Scrapers category the existing settings.xml fragment is
        removed and replaced by the new string
        """
        name = cls.get_name()
        return [
            ' <setting id="%s-enable" type="bool" label="%s Enabled" default="true" visible="true"/>' % (name, name),
            ' <setting id="%s-base_url" type="text" label=" Base Url" default="%s" visible="eq(-1,true)"/>' % (name, cls.base_url),
            ' <setting id="%s-sub_check" type="bool" label=" Include in Page Existence checks?" default="true" visible="eq(-2,true)"/>' % (name),
            ' <setting id="%s_try" type="number" default="0" visible="false"/>' % (name),
            ' <setting id="%s_fail" type="number" default="0" visible="false"/>' % (name),
            ' <setting id="%s_check" type="number" default="0" visible="false"/>' % (name),
        ]

    @classmethod
    def _disable_sub_check(cls, settings):
        # flip the sub_check default off for scrapers that opt out of page checks
        for i in reversed(xrange(len(settings))):
            if 'sub_check' in settings[i]:
                settings[i] = settings[i].replace('default="true"', 'default="false"')
        return settings

    def _cached_http_get(self, url, base_url, timeout, cookies=None, data=None, headers=None, cache_limit=8):
        """Fetch |url| honoring the local page cache; on a miss do a real
        request (with cookies, optional POST data and gzip handling) and cache
        the result. Returns '' on any transport error."""
        if cookies is None: cookies = {}
        if timeout == 0: timeout = None
        if headers is None: headers = {}
        referer = headers['Referer'] if 'Referer' in headers else url
        log_utils.log('Getting Url: %s cookie=|%s| data=|%s| extra headers=|%s|' % (url, cookies, data, headers))
        db_connection = DB_Connection()
        _, html = db_connection.get_cached_url(url, cache_limit)
        if html:
            log_utils.log('Returning cached result for: %s' % (url), xbmc.LOGDEBUG)
            return html

        try:
            cj = self._set_cookies(base_url, cookies)
            if data is not None: data = urllib.urlencode(data, True)
            request = urllib2.Request(url, data=data)
            request.add_header('User-Agent', USER_AGENT)
            request.add_unredirected_header('Host', request.get_host())
            request.add_unredirected_header('Referer', referer)
            for key in headers: request.add_header(key, headers[key])
            response = urllib2.urlopen(request, timeout=timeout)
            cj.save(ignore_discard=True, ignore_expires=True)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = response.read()
        except Exception as e:
            log_utils.log('Error (%s) during scraper http get: %s' % (str(e), url), xbmc.LOGWARNING)
            return ''

        db_connection.cache_url(url, html)
        return html

    def _set_cookies(self, base_url, cookies):
        """Install a per-scraper LWP cookie jar as the global urllib2 opener
        and seed it with |cookies| for the site's domain."""
        domain = urlparse.urlsplit(base_url).hostname
        cookie_file = os.path.join(COOKIEPATH, '%s_cookies.lwp' % (self.get_name()))
        cj = cookielib.LWPCookieJar(cookie_file)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        for key in cookies:
            c = cookielib.Cookie(0, key, cookies[key], port=None, port_specified=False, domain=domain,
                                 domain_specified=True, domain_initial_dot=False, path='/', path_specified=True,
                                 secure=False, expires=None, discard=False, comment=None, comment_url=None, rest={})
            cj.set_cookie(c)
        try:
            cj.load(ignore_discard=True)
        except:
            # best-effort: the cookie file won't exist on first use
            pass
        return cj

    def _do_recaptcha(self, key, tries=None, max_tries=None):
        """Show the recaptcha image for |key| and collect the user's answer
        via the on-screen keyboard; raises if the user enters nothing."""
        challenge_url = CAPTCHA_BASE_URL + '/challenge?k=%s' % (key)
        html = self._cached_http_get(challenge_url, CAPTCHA_BASE_URL, timeout=DEFAULT_TIMEOUT, cache_limit=0)
        match = re.search("challenge\s+\:\s+'([^']+)", html)
        captchaimg = 'http://www.google.com/recaptcha/api/image?c=%s' % (match.group(1))
        img = xbmcgui.ControlImage(450, 0, 400, 130, captchaimg)
        wdlg = xbmcgui.WindowDialog()
        wdlg.addControl(img)
        wdlg.show()
        header = 'Type the words in the image'
        if tries and max_tries:
            header += ' (Try: %s/%s)' % (tries, max_tries)
        kb = xbmc.Keyboard('', header, False)
        kb.doModal()
        solution = ''
        if kb.isConfirmed():
            solution = kb.getText()
            if not solution:
                raise Exception('You must enter text in the image to access video')
        wdlg.close()
        return {'recaptcha_challenge_field': match.group(1), 'recaptcha_response_field': solution}

    def _default_get_episode_url(self, show_url, video, episode_pattern, title_pattern=''):
        """Shared episode-url lookup: try the SxxExx pattern first, then fall
        back to an episode-title match when forced or enabled in settings."""
        log_utils.log('Default Episode Url: |%s|%s|%s|'
                      % (self.base_url, show_url, str(video).decode('utf-8', 'replace')), xbmc.LOGDEBUG)
        url = urlparse.urljoin(self.base_url, show_url)
        html = self._http_get(url, cache_limit=2)
        if html:
            force_title = self._force_title(video)
            if not force_title:
                match = re.search(episode_pattern, html, re.DOTALL)
                if match:
                    url = match.group(1)
                    return url.replace(self.base_url, '')
            else:
                log_utils.log('Skipping S&E matching as title search is forced on: %s' % (video.slug), xbmc.LOGDEBUG)

            if (force_title or xbmcaddon.Addon().getSetting('title-fallback') == 'true') and video.ep_title and title_pattern:
                norm_title = self._normalize_title(video.ep_title)
                for match in re.finditer(title_pattern, html, re.DOTALL | re.I):
                    url, title = match.groups()
                    if norm_title == self._normalize_title(title):
                        return url.replace(self.base_url, '')

    def _force_title(self, video):
        # users can force title-based matching per show via a |-delimited slug list
        slug_str = xbmcaddon.Addon().getSetting('force_title_match')
        slug_list = slug_str.split('|') if slug_str else []
        return video.slug in slug_list

    def _normalize_title(self, title):
        new_title = title.upper()
        new_title = re.sub('\W', '', new_title)
        #log_utils.log('In title: |%s| Out title: |%s|' % (title,new_title), xbmc.LOGDEBUG)
        return new_title

    def _blog_get_quality(self, video, q_str, host):
        """
        Use the q_str to determine the post quality; then use the host to determine host quality
        allow the host to drop the quality but not increase it
        """
        # strip the title/year so e.g. a year in the title can't match a quality
        # keyword (str.replace returns a new string; the result must be assigned)
        q_str = q_str.replace(video.title, '')
        q_str = q_str.replace(str(video.year), '')
        q_str = q_str.upper()

        # Assume movies are low quality, tv shows are high quality
        if video.video_type == VIDEO_TYPES.MOVIE:
            quality = QUALITIES.LOW
        else:
            quality = QUALITIES.HIGH

        post_quality = quality
        for key in Q_LIST:
            if any(q in q_str for q in BLOG_Q_MAP[key]):
                post_quality = key

        host_quality = None
        if host:
            for key in HOST_Q:
                if any(host in hostname for hostname in HOST_Q[key]):
                    host_quality = key

        #log_utils.log('q_str: %s, host: %s, post q: %s, host q: %s' % (q_str, host, post_quality, host_quality), xbmc.LOGDEBUG)
        if host_quality is not None and Q_ORDER[host_quality] < Q_ORDER[post_quality]:
            quality = host_quality
        else:
            quality = post_quality

        return quality
class OneClickWatch_Scraper(scraper.Scraper):
    """Blog-style scraper for OneClickWatch (movies and episodes).

    NOTE(review): a second OneClickWatch_Scraper definition appears later in
    this file and shadows this one at import time -- confirm which copy is
    intended to survive.
    """
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'OneClickWatch'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s/100)' % (item['quality'], item['host'], item['rating'])

    def get_sources(self, video):
        """Scrape the post page for nofollow hoster links; quality is derived
        from the post title via _blog_get_quality."""
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            q_str = ''
            match = re.search('class="title">([^<]+)', html)
            if match:
                q_str = match.group(1)
            pattern = '^<a\s+href="([^"]+)"\s+rel="nofollow"'
            for match in re.finditer(pattern, html, re.M):
                url = match.group(1)
                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': url, 'rating': None, 'direct': False}
                hoster['host'] = urlparse.urlsplit(url).hostname
                hoster['quality'] = self._blog_get_quality(video, q_str, hoster['host'])
                hosters.append(hoster)
        return hosters

    def get_url(self, video):
        """Return (and cache) the post url for |video|, honoring the
        'Automatically Select' setting (most recent vs highest quality)."""
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(),
                                                    video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|'
                          % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                if select == 0:
                    best_result = results[0]
                else:
                    # default to the most recent result so best_result is always
                    # bound even if no title carries a parsable [quality] tag
                    best_result = results[0]
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            #print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                # prefer a strictly better quality; among equal HD
                                # results prefer the first 1080 tag seen
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    #print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        """Add age-filter and auto-select settings to the inherited ones."""
        settings = super(OneClickWatch_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(' <setting id="%s-filter" type="slider" range="0,180" option="int" label=" Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>' % (name))
        settings.append(' <setting id="%s-select" type="enum" label=" Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        """Blog search; filters posts older than the configured day limit and
        normalizes titles to 'Title [extra]' with a separate year field."""
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        pattern = 'class="title"><a href="([^"]+)[^>]+>([^<]+).*?rel="bookmark">([^<]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title, date_str = match.groups('')
            if filter_days:
                try:
                    post_date = datetime.datetime.strptime(date_str, '%B %d, %Y').date()
                except TypeError:
                    # work around the known py2 strptime threading issue
                    post_date = datetime.datetime(*(time.strptime(date_str, '%B %d, %Y')[0:6])).date()
                if today - post_date > filter_days:
                    continue

            match_year = ''
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)
            else:
                match_year = ''
                match = re.search('(.*?)\s*S\d+E\d+\s*(.*)', title)
                if match:
                    title, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)

            if not year or not match_year or year == match_year:
                result = {'url': url.replace(self.base_url, ''), 'title': title, 'year': match_year}
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(OneClickWatch_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, cache_limit=cache_limit)
class OneClickWatch_Scraper(scraper.Scraper):
    """Blog-style scraper for OneClickWatch (movies and episodes).

    This copy adds title-forced episode searching on top of the earlier
    definition in this file, which it shadows.
    """
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'OneClickWatch'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s/100)' % (item['quality'], item['host'], item['rating'])

    def get_sources(self, video):
        """Scrape the post page for nofollow hoster links; quality comes from
        the post title via _blog_get_quality."""
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            q_str = ''
            match = re.search('class="title">([^<]+)', html)
            if match:
                q_str = match.group(1)
            pattern = '^<a\s+href="([^"]+)"\s+rel="nofollow"'
            for match in re.finditer(pattern, html, re.M):
                url = match.group(1)
                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': url, 'rating': None, 'direct': False}
                hoster['host'] = urlparse.urlsplit(url).hostname
                hoster['quality'] = self._blog_get_quality(video, q_str, hoster['host'])
                hosters.append(hoster)
        return hosters

    def get_url(self, video):
        """Return (and cache) the post url for |video|; episode searches use
        SxxExx unless title matching is forced for the show."""
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(),
                                                    video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|'
                          % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                if not self._force_title(video):
                    search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
                else:
                    if not video.ep_title:
                        return None
                    search_title = '%s %s' % (video.title, video.ep_title)
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                if select == 0:
                    best_result = results[0]
                else:
                    # default to the most recent result so best_result is always
                    # bound even if no title carries a parsable [quality] tag
                    best_result = results[0]
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            #print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                # prefer a strictly better quality; among equal HD
                                # results prefer the first 1080 tag seen
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    #print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        """Add age-filter and auto-select settings to the inherited ones."""
        settings = super(OneClickWatch_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(' <setting id="%s-filter" type="slider" range="0,180" option="int" label=" Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>' % (name))
        settings.append(' <setting id="%s-select" type="enum" label=" Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        """Blog search; filters posts older than the configured day limit and
        normalizes titles to 'Title [extra]' with a separate year field."""
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        pattern = 'class="title"><a href="([^"]+)[^>]+>([^<]+).*?rel="bookmark">([^<]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title, date_str = match.groups('')
            if filter_days:
                try:
                    post_date = datetime.datetime.strptime(date_str, '%B %d, %Y').date()
                except TypeError:
                    # work around the known py2 strptime threading issue
                    post_date = datetime.datetime(*(time.strptime(date_str, '%B %d, %Y')[0:6])).date()
                if today - post_date > filter_days:
                    continue

            match_year = ''
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)
            else:
                match_year = ''
                match = re.search('(.*?)\s*S\d+E\d+\s*(.*)', title)
                if match:
                    title, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)

            if not year or not match_year or year == match_year:
                result = {'url': url.replace(self.base_url, ''), 'title': title, 'year': match_year}
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(OneClickWatch_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, cache_limit=cache_limit)
class MyVidLinks_Scraper(scraper.Scraper):
    """Blog-style scraper for MyVideoLinks.eu; the site uses different
    subdomains for movies (movies.) and tv (tv.), handled by __fix_base_url."""
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'MyVideoLinks.eu'

    def resolve_link(self, link):
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s Views) (%s/100)' % (item['quality'], item['host'], item['views'], item['rating'])

    def get_sources(self, video):
        """Scrape the post page; movies carry one link list, episodes carry
        one list per <h4> quality heading."""
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            self.__fix_base_url(video.video_type)
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            views = None
            pattern = '<span[^>]+>(\d+)\s+Views'
            match = re.search(pattern, html)
            if match:
                views = int(match.group(1))
            if video.video_type == VIDEO_TYPES.MOVIE:
                return self.__get_movie_links(video, views, html)
            else:
                return self.__get_episode_links(video, views, html)
        return hosters

    def __get_movie_links(self, video, views, html):
        # the permalink title doubles as the quality string
        pattern = 'rel="bookmark"\s+title="Permanent Link to ([^"]+)'
        match = re.search(pattern, html)
        q_str = ''
        if match:
            q_str = match.group(1)
        return self.__get_links(video, views, html, q_str)

    def __get_episode_links(self, video, views, html):
        # each <h4> heading names the quality for the <ul> of links that follows
        pattern = '<h4>(.*?)</h4>(.*?)</ul>'
        hosters = []
        for match in re.finditer(pattern, html, re.DOTALL):
            q_str, fragment = match.groups()
            hosters += self.__get_links(video, views, fragment, q_str)
        return hosters

    def __get_links(self, video, views, html, q_str):
        pattern = 'li>\s*<a\s+href="(http[^"]+)'
        hosters = []
        for match in re.finditer(pattern, html):
            url = match.group(1)
            hoster = {'multi-part': False, 'class': self, 'views': views, 'url': url, 'rating': None,
                      'quality': None, 'direct': False}
            hoster['host'] = urlparse.urlsplit(url).hostname
            hoster['quality'] = self._blog_get_quality(video, q_str, hoster['host'])
            hosters.append(hoster)
        return hosters

    def __fix_base_url(self, video_type):
        # route to the movies./tv. subdomain appropriate for the video type
        if video_type == VIDEO_TYPES.MOVIE:
            if not self.base_url.startswith('http://movies.'):
                self.base_url = self.base_url.replace('http://', 'http://movies.')
        else:
            if not self.base_url.startswith('http://tv.'):
                self.base_url = self.base_url.replace('http://', 'http://tv.')

    def get_url(self, video):
        """Return (and cache) the post url for |video|, honoring the
        'Automatically Select' setting (most recent vs highest quality)."""
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(),
                                                    video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|'
                          % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                if not self._force_title(video):
                    search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
                else:
                    if not video.ep_title:
                        return None
                    search_title = '%s %s' % (video.title, video.ep_title)
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                # episodes don't tell us the quality on the search screen so just return the 1st result
                if select == 0 or video.video_type == VIDEO_TYPES.EPISODE:
                    best_result = results[0]
                else:
                    # default to the most recent result so best_result is always
                    # bound even if no title carries a parsable [quality] tag
                    best_result = results[0]
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            #print 'result: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                            if Q_ORDER[quality] >= best_qorder:
                                # prefer a strictly better quality; among equal HD
                                # results prefer the first 1080 tag seen
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    #print 'Setting best as: |%s|%s|%s|%s|' % (result, q_str, quality, Q_ORDER[quality])
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        """Add age-filter and auto-select settings to the inherited ones."""
        settings = super(MyVidLinks_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(' <setting id="%s-filter" type="slider" range="0,180" option="int" label=" Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>' % (name))
        settings.append(' <setting id="%s-select" type="enum" label=" Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        """Blog search; filters posts older than the configured day limit
        using the date embedded in the post url."""
        self.__fix_base_url(video_type)
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        pattern = '<h4>\s*<a\s+href="([^"]+)"\s+rel="bookmark"\s+title="([^"]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title = match.groups('')
            if filter_days:
                match = re.search('/(\d{4})/(\d{2})/(\d{2})/', url)
                if match:
                    # use distinct names: unpacking into |year| would clobber the
                    # year parameter and corrupt the year filter for later results
                    post_year, post_month, post_day = match.groups()
                    post_date = datetime.date(int(post_year), int(post_month), int(post_day))
                    if today - post_date > filter_days:
                        continue

            match_year = ''
            title = title.replace('&#8211;', '-')
            title = title.replace('&#8217;', "'")
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', title)
                if match:
                    title, match_year, extra_title = match.groups()
                    title = '%s [%s]' % (title, extra_title)

            if not year or not match_year or year == match_year:
                result = {'url': url.replace(self.base_url, ''), 'title': title, 'year': match_year}
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        return super(MyVidLinks_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, cache_limit=cache_limit)
class Scraper(object):
    """Abstract base class for all site scrapers.

    Subclasses implement the abstract methods below; this class supplies
    shared plumbing: cached HTTP fetching with cookie persistence, captcha
    handling, related-url caching, and helpers for parsing blog-style
    release sites and deriving quality levels.
    """
    __metaclass__ = abc.ABCMeta
    base_url = BASE_URL

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        self.db_connection = DB_Connection()

    @abstractclassmethod
    def provides(cls):
        """Return the set of VIDEO_TYPES this scraper supports.

        Class method so availability can be checked without instantiating.
        A set/frozenset is preferred for fast membership tests.
        """
        raise NotImplementedError

    @abstractclassmethod
    def get_name(cls):
        """Return the short display name used in the UI and DB for this source."""
        raise NotImplementedError

    @abc.abstractmethod
    def resolve_link(self, link):
        """Return a urlresolver-resolvable url for |link|.

        Called on the user-selected source before urlresolver, because many
        sites keep the real hoster link on a separate page.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format_source_label(self, item):
        """Return the label shown for |item| in the "Choose Source" dialog."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_sources(self, video):
        """Return a list of hoster dicts for |video|.

        Each dict must contain at least: multi-part, class, host, url,
        quality (a QUALITIES value or None), views, rating, and optionally
        direct (True when url points straight at a media file). Extra keys
        are allowed (e.g. for format_source_label).
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_url(self, video):
        """Return a site-relative url for |video| (domain generally excluded)."""
        raise NotImplementedError

    @abc.abstractmethod
    def search(self, video_type, title, year):
        """Search the site; return [{'title', 'year', 'url'}, ...].

        Must be provided, but may raise NotImplementedError when the site
        has no search.
        """
        raise NotImplementedError

    @classmethod
    def get_settings(cls):
        """Return this scraper's settings.xml fragment lines.

        Fragments from all scrapers are aggregated; when the aggregate
        differs from the current Scrapers settings block, it is replaced.
        """
        name = cls.get_name()
        return [' <setting id="%s-enable" type="bool" label="%s Enabled" default="true" visible="true"/>' % (name, name),
                ' <setting id="%s-base_url" type="text" label=" Base Url" default="%s" visible="eq(-1,true)"/>' % (name, cls.base_url),
                ' <setting id="%s-sub_check" type="bool" label=" Include in Page Existence checks?" default="true" visible="eq(-2,true)"/>' % (name),
                ' <setting id="%s_try" type="number" default="0" visible="false"/>' % (name),
                ' <setting id="%s_fail" type="number" default="0" visible="false"/>' % (name),
                ' <setting id="%s_check" type="number" default="0" visible="false"/>' % (name),
                ]

    @classmethod
    def _disable_sub_check(cls, settings):
        # Flip the sub_check setting's default to false for scrapers that
        # should not participate in page-existence checks.
        for i in reversed(xrange(len(settings))):
            if 'sub_check' in settings[i]:
                settings[i] = settings[i].replace('default="true"', 'default="false"')
        return settings

    def _default_get_url(self, video):
        """Shared get_url: resolve the show/movie url, then the episode url."""
        temp_video_type = video.video_type
        # episodes are located via their show's url first
        if video.video_type == VIDEO_TYPES.EPISODE: temp_video_type = VIDEO_TYPES.TVSHOW
        url = None
        result = self.db_connection.get_related_url(temp_video_type, video.title, video.year, self.get_name())
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (temp_video_type, video.title, video.year, self.get_name(), url))
        else:
            results = self.search(temp_video_type, video.title, video.year)
            if results:
                # first search hit wins; cache it for next time
                url = results[0]['url']
                self.db_connection.set_related_url(temp_video_type, video.title, video.year, self.get_name(), url)
        if url and video.video_type == VIDEO_TYPES.EPISODE:
            result = self.db_connection.get_related_url(VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(), video.season, video.episode)
            if result:
                url = result[0][0]
                log_utils.log('Got local related url: |%s|%s|%s|' % (video, self.get_name(), url))
            else:
                show_url = url
                # subclass hook: derive the episode url from the show page
                url = self._get_episode_url(show_url, video)
                if url:
                    self.db_connection.set_related_url(VIDEO_TYPES.EPISODE, video.title, video.year, self.get_name(), url, video.season, video.episode)
        return url

    def _cached_http_get(self, url, base_url, timeout, cookies=None, data=None, headers=None, cache_limit=8):
        """Fetch |url| with cookie persistence and a DB-backed response cache.

        cache_limit is the cache age limit in hours; on any fetch error an
        empty string is returned (callers treat '' as "no page").
        """
        if cookies is None: cookies = {}
        if timeout == 0: timeout = None  # 0 means "no timeout" for urllib2
        if headers is None: headers = {}
        referer = headers['Referer'] if 'Referer' in headers else url
        log_utils.log('Getting Url: %s cookie=|%s| data=|%s| extra headers=|%s|' % (url, cookies, data, headers))
        db_connection = DB_Connection()
        _, html = db_connection.get_cached_url(url, cache_limit)
        if html:
            log_utils.log('Returning cached result for: %s' % (url), xbmc.LOGDEBUG)
            return html
        try:
            cj = self._set_cookies(base_url, cookies)
            if data is not None: data = urllib.urlencode(data, True)
            request = urllib2.Request(url, data=data)
            request.add_header('User-Agent', USER_AGENT)
            request.add_unredirected_header('Host', request.get_host())
            request.add_unredirected_header('Referer', referer)
            for key in headers: request.add_header(key, headers[key])
            response = urllib2.urlopen(request, timeout=timeout)
            cj.save(ignore_discard=True, ignore_expires=True)
            # transparently decompress gzip'd responses
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = response.read()
        except Exception as e:
            log_utils.log('Error (%s) during scraper http get: %s' % (str(e), url), xbmc.LOGWARNING)
            return ''
        db_connection.cache_url(url, html)
        return html

    def _set_cookies(self, base_url, cookies):
        """Install a per-scraper LWP cookie jar, seeded with |cookies|, and return it."""
        domain = urlparse.urlsplit(base_url).hostname
        cookie_file = os.path.join(COOKIEPATH, '%s_cookies.lwp' % (self.get_name()))
        cj = cookielib.LWPCookieJar(cookie_file)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        for key in cookies:
            c = cookielib.Cookie(0, key, cookies[key], port=None, port_specified=False, domain=domain, domain_specified=True,
                                 domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None,
                                 discard=False, comment=None, comment_url=None, rest={})
            cj.set_cookie(c)
        # cookie file may not exist on first run
        # NOTE(review): bare except also hides real I/O/parse errors
        try: cj.load(ignore_discard=True)
        except: pass
        return cj

    def _do_recaptcha(self, key, tries=None, max_tries=None):
        """Show the recaptcha image for |key| and collect the user's answer.

        Returns the dict of form fields to post back; raises if the user
        confirms an empty answer.
        """
        challenge_url = CAPTCHA_BASE_URL + '/challenge?k=%s' % (key)
        html = self._cached_http_get(challenge_url, CAPTCHA_BASE_URL, timeout=DEFAULT_TIMEOUT, cache_limit=0)
        match = re.search("challenge\s+\:\s+'([^']+)", html)
        captchaimg = 'http://www.google.com/recaptcha/api/image?c=%s' % (match.group(1))
        img = xbmcgui.ControlImage(450, 0, 400, 130, captchaimg)
        wdlg = xbmcgui.WindowDialog()
        wdlg.addControl(img)
        wdlg.show()
        header = 'Type the words in the image'
        if tries and max_tries: header += ' (Try: %s/%s)' % (tries, max_tries)
        kb = xbmc.Keyboard('', header, False)
        kb.doModal()
        solution = ''
        if kb.isConfirmed():
            solution = kb.getText()
            if not solution: raise Exception('You must enter text in the image to access video')
        wdlg.close()
        return {'recaptcha_challenge_field': match.group(1), 'recaptcha_response_field': solution}

    def _default_get_episode_url(self, show_url, video, episode_pattern, title_pattern='', airdate_pattern=''):
        """Find an episode url on a show page via SxxExx, airdate, or title match.

        Tries episode_pattern first (unless title matching is forced for this
        show), then the airdate pattern (with {year}/{month}/... placeholders
        substituted), then a normalized episode-title comparison. Returns the
        site-relative url, or None implicitly when nothing matches.
        """
        log_utils.log('Default Episode Url: |%s|%s|%s|' % (self.base_url, show_url, str(video).decode('utf-8', 'replace')), xbmc.LOGDEBUG)
        url = urlparse.urljoin(self.base_url, show_url)
        html = self._http_get(url, cache_limit=2)
        if html:
            force_title = self._force_title(video)
            if not force_title:
                match = re.search(episode_pattern, html, re.DOTALL)
                if match:
                    url = match.group(1)
                    return url.replace(self.base_url, '')
                if xbmcaddon.Addon().getSetting('airdate-fallback') == 'true' and airdate_pattern and video.ep_airdate:
                    # expand date placeholders in the airdate pattern
                    airdate_pattern = airdate_pattern.replace('{year}', str(video.ep_airdate.year))
                    airdate_pattern = airdate_pattern.replace('{month}', str(video.ep_airdate.month))
                    airdate_pattern = airdate_pattern.replace('{p_month}', '%02d' % (video.ep_airdate.month))
                    airdate_pattern = airdate_pattern.replace('{month_name}', MONTHS[video.ep_airdate.month - 1])
                    airdate_pattern = airdate_pattern.replace('{short_month}', SHORT_MONS[video.ep_airdate.month - 1])
                    airdate_pattern = airdate_pattern.replace('{day}', str(video.ep_airdate.day))
                    airdate_pattern = airdate_pattern.replace('{p_day}', '%02d' % (video.ep_airdate.day))
                    log_utils.log('Air Date Pattern: %s' % (airdate_pattern), xbmc.LOGDEBUG)
                    match = re.search(airdate_pattern, html, re.DOTALL | re.I)
                    if match:
                        url = match.group(1)
                        return url.replace(self.base_url, '')
            else:
                log_utils.log('Skipping S&E matching as title search is forced on: %s' % (video.slug), xbmc.LOGDEBUG)
            if (force_title or xbmcaddon.Addon().getSetting('title-fallback') == 'true') and video.ep_title and title_pattern:
                norm_title = self._normalize_title(video.ep_title)
                for match in re.finditer(title_pattern, html, re.DOTALL | re.I):
                    url, title = match.groups()
                    if norm_title == self._normalize_title(title):
                        return url.replace(self.base_url, '')

    def _force_title(self, video):
        # True when the user listed this show's slug in force_title_match
        slug_str = xbmcaddon.Addon().getSetting('force_title_match')
        slug_list = slug_str.split('|') if slug_str else []
        return video.slug in slug_list

    def _normalize_title(self, title):
        # uppercase and strip all non-word characters for fuzzy comparison
        new_title = title.upper()
        new_title = re.sub('\W', '', new_title)
        return new_title

    def _blog_proc_results(self, html, post_pattern, date_format, video_type, title, year):
        """Parse blog post listings into search results.

        post_pattern must use named groups (url, post_title, optionally
        quality and date). Posts older than the per-scraper filter setting
        are skipped; titles are matched against |title| after normalization.
        """
        results = []
        # strip a trailing SxxExx or date from the query to get the bare show title
        match = re.search('(.*?)\s*S\d+E\d+\s*', title)
        if match:
            show_title = match.group(1)
        else:
            match = re.search('(.*?)\s*\d{4}\.\d{2}\.\d{2}\s*', title)
            if match:
                show_title = match.group(1)
            else:
                show_title = title
        norm_title = self._normalize_title(show_title)
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        for match in re.finditer(post_pattern, html, re.DOTALL):
            post_data = match.groupdict()
            post_title = post_data['post_title']
            if 'quality' in post_data:
                post_title += '- [%s]' % (post_data['quality'])
            if filter_days:
                # strptime on datetime can raise TypeError on some platforms; fall back to time.strptime
                try: post_date = datetime.datetime.strptime(post_data['date'], date_format).date()
                except TypeError: post_date = datetime.datetime(*(time.strptime(post_data['date'], date_format)[0:6])).date()
                if today - post_date > filter_days: continue
            match_year = ''
            match_title = ''
            # normalize typographic dash/apostrophe
            post_title = post_title.replace('–', '-')
            post_title = post_title.replace('’', "'")
            full_title = post_title
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', post_title)
                if match:
                    match_title, match_year, extra_title = match.groups()
                    full_title = '%s [%s]' % (match_title, extra_title)
            else:
                # tv: split on SxxExx, else on a date
                match = re.search('(.*?)\s*S\d+E\d+\s*(.*)', post_title)
                if match:
                    match_title, extra_title = match.groups()
                    full_title = '%s [%s]' % (match_title, extra_title)
                else:
                    match = re.search('(.*?)\s*\d{4}[ .]?\d{2}[ .]?\d{2}\s*(.*)', post_title)
                    if match:
                        match_title, extra_title = match.groups()
                        full_title = '%s [%s]' % (match_title, extra_title)
            match_norm_title = self._normalize_title(match_title)
            # substring match in either direction + optional year agreement
            if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                result = {'url': post_data['url'].replace(self.base_url, ''), 'title': full_title, 'year': match_year}
                results.append(result)
        return results

    def _blog_get_url(self, video, delim='.'):
        """Shared get_url for blog-style sites, with airdate fallback search.

        |delim| separates the date parts in the fallback query (e.g. '.' for
        "Show 2014.01.02").
        """
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            # 0 == "Most Recent"; otherwise pick the highest quality result
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                temp_title = re.sub('[^A-Za-z0-9 ]', '', video.title)
                if not self._force_title(video):
                    search_title = '%s S%02dE%02d' % (temp_title, int(video.season), int(video.episode))
                    fallback_search = '%s %s' % (temp_title, video.ep_airdate.strftime('%Y{0}%m{0}%d'.format(delim)))
                else:
                    if not video.ep_title: return None
                    search_title = '%s %s' % (temp_title, video.ep_title)
                    fallback_search = ''
            else:
                search_title = '%s %s' % (video.title, video.year)
                fallback_search = ''
            results = self.search(video.video_type, search_title, video.year)
            if not results and fallback_search:
                results = self.search(video.video_type, fallback_search, video.year)
            if results:
                if select == 0:
                    best_result = results[0]
                else:
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        # quality string is a trailing "[...]" on the result title
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            if Q_ORDER[quality] >= best_qorder:
                                # at equal order, prefer 1080 over other HD strings
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                # NOTE(review): if select != 0 and no result title has a "[...]"
                # suffix, best_result is never bound and this raises NameError
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    def _blog_get_quality(self, video, q_str, host):
        """Derive quality from the post's quality string, then let the host
        lower (never raise) it."""
        # NOTE(review): str.replace returns a new string; these two calls are
        # no-ops as written — likely intended: q_str = q_str.replace(...)
        q_str.replace(video.title, '')
        q_str.replace(str(video.year), '')
        q_str = q_str.upper()
        post_quality = None
        for key in Q_LIST:
            if any(q in q_str for q in BLOG_Q_MAP[key]): post_quality = key
        return self._get_quality(video, host, post_quality)

    def _get_quality(self, video, host, base_quality=None):
        """Combine a base quality with the host's known quality ceiling."""
        # Assume movies are low quality, tv shows are high quality
        if base_quality is None:
            if video.video_type == VIDEO_TYPES.MOVIE:
                quality = QUALITIES.LOW
            else:
                quality = QUALITIES.HIGH
        else:
            quality = base_quality
        host_quality = None
        if host:
            hl = host.lower()
            for key in HOST_Q:
                if any(hostname in hl for hostname in HOST_Q[key]):
                    host_quality = key
                    break
        # the host may only lower the quality, never raise it
        if host_quality is not None and Q_ORDER[host_quality] < Q_ORDER[quality]:
            quality = host_quality
        return quality

    def _width_get_quality(self, width):
        # map a video width in pixels onto a QUALITIES bucket
        width = int(width)
        if width >= 1280:
            quality = QUALITIES.HD
        elif width > 640:
            quality = QUALITIES.HIGH
        elif width > 320:
            quality = QUALITIES.MEDIUM
        else:
            quality = QUALITIES.LOW
        return quality

    def _height_get_quality(self, height):
        # map a video height in pixels onto a QUALITIES bucket
        height = int(height)
        if height > 480:
            quality = QUALITIES.HD
        elif height >= 400:
            quality = QUALITIES.HIGH
        elif height > 200:
            quality = QUALITIES.MEDIUM
        else:
            quality = QUALITIES.LOW
        return quality
class DirectDownload_Scraper(scraper.Scraper):
    """Scraper for DirectDownload.tv's JSON search API (account required)."""
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))
        self.username = xbmcaddon.Addon().getSetting('%s-username' % (self.get_name()))
        self.password = xbmcaddon.Addon().getSetting('%s-password' % (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'DirectDownload.tv'

    def resolve_link(self, link):
        # links produced by get_sources are already direct hoster urls
        return link

    def format_source_label(self, item):
        return '[%s] (%s) %s' % (item['quality'], item['dd_qual'], item['host'])

    def get_sources(self, video):
        """Return hoster dicts parsed from the site's JSON search response.

        The related url may carry a 'quality' query parameter restricting
        which site quality strings are accepted.
        """
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            if html:
                js_result = json.loads(html)
                query = urlparse.parse_qs(urlparse.urlparse(url).query)
                match_quality = Q_ORDER  # no filter -> accept every known quality
                if 'quality' in query:
                    temp_quality = re.sub('\s', '', query['quality'][0])
                    match_quality = temp_quality.split(',')
                import urlresolver
                for result in js_result:
                    if result['quality'] in match_quality:
                        for link in result['links']:
                            # validate url since host validation fails for real-debrid; mark links direct to avoid unusable check
                            if urlresolver.HostedMediaFile(link['url']):
                                hostname = urlparse.urlparse(link['url']).hostname
                                hoster = {'multi-part': False, 'class': self, 'views': None, 'url': link['url'],
                                          'rating': None, 'host': hostname, 'quality': QUALITY_MAP[result['quality']],
                                          'dd_qual': result['quality'], 'direct': True}
                                hosters.append(hoster)
        return hosters

    def get_url(self, video):
        """Return the cached related url, or search and keep the highest-quality hit."""
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            results = self.search(video.video_type, search_title, '')
            best_q_index = -1
            for result in results:
                if Q_DICT[result['quality']] > best_q_index:
                    best_q_index = Q_DICT[result['quality']]
                    url = result['url']
            self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        """Extend the base scraper settings with username/password fields."""
        settings = super(DirectDownload_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(' <setting id="%s-username" type="text" label=" Username" default="" visible="eq(-6,true)"/>' % (name))
        settings.append(' <setting id="%s-password" type="text" label=" Password" option="hidden" default="" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        """Search the site's JSON API; return [{'url','title','quality','year'}, ...]."""
        search_url = urlparse.urljoin(self.base_url, '/search?query=')
        # FIX: url-encode the title; the raw concatenation broke the url for
        # titles containing spaces/&/# and corrupted the parse_qs round-trip
        # performed by __translate_search (quote_plus <-> parse_qs round-trips)
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        if html:
            js_result = json.loads(html)
            for match in js_result:
                # keep the quality in the url so get_sources can filter on it later
                url = search_url + '&quality=%s' % match['quality']
                result = {'url': url.replace(self.base_url, ''), 'title': match['release'],
                          'quality': match['quality'], 'year': ''}
                results.append(result)
        return results

    def _http_get(self, url, data=None, cache_limit=8):
        """Fetch |url|, translating local search urls and re-logging-in when needed."""
        # return all uncached blank pages if no user or pass
        if not self.username or not self.password:
            return ''
        if 'search?query' in url:
            log_utils.log('Translating Search Url: %s' % (url), xbmc.LOGDEBUG)
            url = self.__translate_search(url)
        html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, data=data, cache_limit=cache_limit)
        # the site returns a "fake" JSON result when the session has expired
        fake = None
        try:
            js_result = json.loads(html)
            fake = False
            fake = js_result[0]['fake']
        except Exception:  # non-JSON page or JSON without the fake marker
            pass
        if fake or (fake is None and not re.search(LOGOUT, html)):
            log_utils.log('Logging in for url (%s)' % (url), xbmc.LOGDEBUG)
            self.__login()
            # re-fetch uncached now that we have a fresh session
            html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, data=data, cache_limit=0)
        return html

    def __translate_search(self, url):
        # convert our local /search?query=... url into the site's real API url
        query = urlparse.parse_qs(urlparse.urlparse(url).query)
        quality = re.sub('\s', '', query['quality'][0]) if 'quality' in query else ','.join(Q_ORDER)
        return urlparse.urljoin(self.base_url, (SEARCH_URL % (urllib.quote(query['query'][0]), quality)))

    def __login(self):
        """Post credentials to the site; raise when the login marker is absent."""
        url = self.base_url
        data = {'username': self.username, 'password': self.password, 'Login': '******'}
        html = super(DirectDownload_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, data=data, cache_limit=0)
        if not re.search(LOGOUT, html):
            raise Exception('directdownload.tv login failed')
class MyVidLinks_Scraper(scraper.Scraper):
    """Scraper for MyVideoLinks.eu (blog-style release site)."""
    base_url = BASE_URL

    def __init__(self, timeout=scraper.DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.db_connection = DB_Connection()
        self.base_url = xbmcaddon.Addon().getSetting('%s-base_url' % (self.get_name()))

    @classmethod
    def provides(cls):
        return frozenset([VIDEO_TYPES.MOVIE, VIDEO_TYPES.EPISODE])

    @classmethod
    def get_name(cls):
        return 'MyVideoLinks.eu'

    def resolve_link(self, link):
        # links scraped from the post pages are used as-is
        return link

    def format_source_label(self, item):
        return '[%s] %s (%s Views) (%s/100)' % (item['quality'], item['host'], item['views'], item['rating'])

    def get_sources(self, video):
        """Scrape hoster links (and the view count) from the post page for |video|."""
        source_url = self.get_url(video)
        hosters = []
        if source_url:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            views = None
            pattern = '<span[^>]+>(\d+)\s+Views'
            match = re.search(pattern, html)
            if match: views = int(match.group(1))
            if video.video_type == VIDEO_TYPES.MOVIE:
                return self.__get_movie_links(video, views, html)
            else:
                return self.__get_episode_links(video, views, html)
        return hosters

    def __get_movie_links(self, video, views, html):
        # movie posts carry their quality string in the bookmark title
        pattern = 'rel="bookmark"\s+title="Permanent Link to ([^"]+)'
        match = re.search(pattern, html)
        q_str = ''
        if match: q_str = match.group(1)
        return self.__get_links(video, views, html, q_str)

    def __get_episode_links(self, video, views, html):
        # episode posts are split into one <h4>quality</h4><ul>links</ul> section each
        pattern = '<h4>(.*?)</h4>(.*?)</ul>'
        hosters = []
        for match in re.finditer(pattern, html, re.DOTALL):
            q_str, fragment = match.groups()
            hosters += self.__get_links(video, views, fragment, q_str)
        return hosters

    def __get_links(self, video, views, html, q_str):
        """Turn each <li><a href="http..."> in |html| into a hoster dict."""
        pattern = 'li>\s*<a\s+href="(http[^"]+)'
        hosters = []
        for match in re.finditer(pattern, html):
            url = match.group(1)
            hoster = {'multi-part': False, 'class': self, 'views': views, 'url': url,
                      'rating': None, 'quality': None, 'direct': False}
            hoster['host'] = urlparse.urlsplit(url).hostname
            hoster['quality'] = self._blog_get_quality(video, q_str, hoster['host'])
            hosters.append(hoster)
        return hosters

    def get_url(self, video):
        """Return the cached related url, or search and pick the best hit."""
        url = None
        result = self.db_connection.get_related_url(video.video_type, video.title, video.year, self.get_name(), video.season, video.episode)
        if result:
            url = result[0][0]
            log_utils.log('Got local related url: |%s|%s|%s|%s|%s|' % (video.video_type, video.title, video.year, self.get_name(), url))
        else:
            # 0 == "Most Recent"; otherwise pick the highest quality (movies only)
            select = int(xbmcaddon.Addon().getSetting('%s-select' % (self.get_name())))
            if video.video_type == VIDEO_TYPES.EPISODE:
                search_title = '%s S%02dE%02d' % (video.title, int(video.season), int(video.episode))
            else:
                search_title = '%s %s' % (video.title, video.year)
            results = self.search(video.video_type, search_title, video.year)
            if results:
                # episodes don't tell us the quality on the search screen so just return the 1st result
                if select == 0 or video.video_type == VIDEO_TYPES.EPISODE:
                    best_result = results[0]
                else:
                    # FIX: seed best_result so a NameError can't occur when no
                    # result title carries a "[quality]" suffix
                    best_result = results[0]
                    best_qorder = 0
                    best_qstr = ''
                    for result in results:
                        match = re.search('\[(.*)\]$', result['title'])
                        if match:
                            q_str = match.group(1)
                            quality = self._blog_get_quality(video, q_str, '')
                            if Q_ORDER[quality] >= best_qorder:
                                # at equal quality order, prefer a 1080 source
                                if Q_ORDER[quality] > best_qorder or (quality == QUALITIES.HD and '1080' in q_str and '1080' not in best_qstr):
                                    best_qstr = q_str
                                    best_result = result
                                    best_qorder = Q_ORDER[quality]
                url = best_result['url']
                self.db_connection.set_related_url(video.video_type, video.title, video.year, self.get_name(), url)
        return url

    @classmethod
    def get_settings(cls):
        """Extend the base scraper settings with the age filter and auto-select options."""
        settings = super(MyVidLinks_Scraper, cls).get_settings()
        settings = cls._disable_sub_check(settings)
        name = cls.get_name()
        settings.append(' <setting id="%s-filter" type="slider" range="0,180" option="int" label=" Filter results older than (0=No Filter) (days)" default="30" visible="eq(-6,true)"/>' % (name))
        settings.append(' <setting id="%s-select" type="enum" label=" Automatically Select (Movies only)" values="Most Recent|Highest Quality" default="0" visible="eq(-7,true)"/>' % (name))
        return settings

    def search(self, video_type, title, year):
        """Search the blog; return [{'url', 'title', 'year'}, ...] results."""
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        # 0 days == no filtering (timedelta(0) is falsy)
        filter_days = datetime.timedelta(days=int(xbmcaddon.Addon().getSetting('%s-filter' % (self.get_name()))))
        today = datetime.date.today()
        pattern = '<h4>\s*<a\s+href="([^"]+)"\s+rel="bookmark"\s+title="([^"]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, post_title = match.groups('')
            if filter_days:
                # post urls embed the publish date as /YYYY/MM/DD/
                match = re.search('/(\d{4})/(\d{2})/(\d{2})/', url)
                if match:
                    # FIX: use distinct names — the original rebound the `year`
                    # parameter here, corrupting the year filter for every
                    # result after the first dated one
                    post_year, post_month, post_day = match.groups()
                    post_date = datetime.date(int(post_year), int(post_month), int(post_day))
                    if today - post_date > filter_days: continue
            match_year = ''
            # normalize the typographic dash used by the blog
            post_title = post_title.replace('–', '-')
            if video_type == VIDEO_TYPES.MOVIE:
                # pull an optional "(YYYY)"/"[YYYY]" year out of the title
                match = re.search('(.*?)\s*[\[(]?(\d{4})[)\]]?\s*(.*)', post_title)
                if match:
                    post_title, match_year, extra_title = match.groups()
                    post_title = '%s [%s]' % (post_title, extra_title)
            if not year or not match_year or year == match_year:
                result = {'url': url.replace(self.base_url, ''), 'title': post_title, 'year': match_year}
                results.append(result)
        return results

    def _http_get(self, url, cache_limit=8):
        # thin wrapper supplying this scraper's base_url and timeout
        return super(MyVidLinks_Scraper, self)._cached_http_get(url, self.base_url, self.timeout, cache_limit=cache_limit)