def get_sources(self, video):
     source_url = self.get_url(video)
     hosters = []
     if not source_url or source_url == FORCE_NO_MATCH: return hosters
     page_url = scraper_utils.urljoin(self.base_url, source_url)
     html = self._http_get(page_url, cache_limit=.5)
     fragment = dom_parser2.parse_dom(html, 'div', {'class': 'film-container'})
     if fragment:
         iframe_url = dom_parser2.parse_dom(fragment[0].content, 'iframe', req='src')
         if iframe_url:
             iframe_url = scraper_utils.urljoin(self.base_url, iframe_url[0].attrs['src'])
             headers = {'Referer': page_url}
             html = self._http_get(iframe_url, headers=headers, cache_limit=.5)
             sources = scraper_utils.parse_sources_list(self, html)
             for source in sources:
                 quality = sources[source]['quality']
                 host = scraper_utils.get_direct_hostname(self, source)
                 stream_url = source + scraper_utils.append_headers({'User-Agent': scraper_utils.get_ua(), 'Referer': iframe_url})
                 hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                 match = re.search('(\d+[a-z]bps)', source)
                 if match:
                     hoster['extra'] = match.group(1)
                 hosters.append(hoster)
                     
     hosters.sort(key=lambda x: x.get('extra', ''), reverse=True)
     return hosters
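
A note on the stream URLs built above: scraper_utils.append_headers presumably serializes the header dict into Kodi's "url|Header=value&Header=value" suffix so the player sends those headers when it fetches the stream. A minimal sketch of that assumption:

import urllib

def append_headers(headers):
    # Hypothetical stand-in for scraper_utils.append_headers: Kodi accepts
    # extra request headers appended to a stream URL after a '|' separator.
    return '|' + '&'.join('%s=%s' % (key, urllib.quote_plus(str(value)))
                          for key, value in headers.items())

# 'http://host/v.mp4' + append_headers({'Referer': 'http://example.com/'})
# -> 'http://host/v.mp4|Referer=http%3A%2F%2Fexample.com%2F'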
Example #2
    def _get_episode_url(self, show_url, video):
        show_url = scraper_utils.urljoin(self.base_url, show_url)
        html = self._http_get(show_url, cache_limit=8)
        pattern = '''href=['"]([^'"]+)[^>]+>\s*%s\.\s*Sezon<''' % (
            video.season)
        match = re.search(pattern, html)
        if match:
            episode_pattern = '''href=['"]([^'"]+-%s-sezon-%s-bolum[^'"]*)''' % (
                video.season, video.episode)
            season_url = scraper_utils.urljoin(self.base_url, match.group(1))
            html = self._http_get(season_url, cache_limit=2)
            ep_url = self._default_get_episode_url(html, video,
                                                   episode_pattern)
            if ep_url: return ep_url

        # front page fallback
        html = self._http_get(self.base_url, cache_limit=2)
        for slug in reversed(show_url.split('/')):
            if slug: break

        ep_url_frag = 'href="([^"]+/{slug}-{season}-sezon-{episode}-bolum[^"]*)'.format(
            slug=slug, season=video.season, episode=video.episode)
        match = re.search(ep_url_frag, html)
        if match:
            return scraper_utils.pathify_url(match.group(1))
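
The front-page fallback above walks show_url's path segments in reverse only to grab the last non-empty slug. A more direct equivalent of that idiom, for clarity:

def last_slug(show_url):
    # Same result as the reversed()/break loop above: the final non-empty
    # path segment, tolerating a trailing slash.
    parts = [part for part in show_url.split('/') if part]
    return parts[-1] if parts else ''

# last_slug('/dizi/kurtlar-vadisi/') -> 'kurtlar-vadisi'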
Example #3
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        url = scraper_utils.urljoin(self.base_url, source_url)
        entry = ''
        while True:
            html = self._http_get(url, cache_limit=.5)
            if not html:
                url = scraper_utils.urljoin(BASE_URL2, source_url)
                html = self._http_get(url, cache_limit=.5)
                
            entry = dom_parser2.parse_dom(html, 'div', {'class': 'entry'})
            if entry:
                entry = entry[0].content
                match = re.search('Watch it here\s*:.*?href="([^"]+)', entry, re.I)
                if not match: break
                url = match.group(1)
            else:
                entry = ''
                break

        for _attribs, tab in dom_parser2.parse_dom(entry, 'div', {'class': 'postTabs_divs'}):
            match = dom_parser2.parse_dom(tab, 'iframe', req='src')
            if not match: continue
            link = match[0].attrs['src']
            host = urlparse.urlparse(link).hostname
            hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': scraper_utils.get_quality(video, host, QUALITIES.HIGH), 'views': None, 'rating': None, 'url': link, 'direct': False}
            hosters.append(hoster)

        return hosters
    def _get_episode_url(self, show_url, video):
        show_url = scraper_utils.urljoin(self.base_url, show_url)
        headers = {'Referer': self.base_url}
        html = self._http_get(show_url, headers=headers, cache_limit=.25)
        data = dom_parser2.parse_dom(html,
                                     'div', {'id': 'dizidetay'},
                                     req=['data-dizi', 'data-id'])
        if not data: return

        episode_pattern = '''href=['"]([^'"]*/%s-sezon-%s-[^'"]*bolum[^'"]*)''' % (
            video.season, video.episode)
        title_pattern = '''href=['"](?P<url>[^'"]+)[^>]*>(?P<title>[^<]+)'''
        airdate_pattern = '''href=['"]([^"']+)[^>]*>[^<]*</a>\s*</td>\s*<td class="right aligned">{p_day}\.{p_month}\.{year}'''

        season_url = scraper_utils.urljoin(self.base_url, SEASON_URL)
        queries = {
            'sekme': 'bolumler',
            'id': data[0].attrs['data-id'],
            'dizi': data[0].attrs['data-dizi']
        }
        headers = {'Referer': show_url, 'Content-Length': 0}
        headers.update(XHR)

        html = self._http_get(season_url,
                              params=queries,
                              headers=headers,
                              method='POST',
                              cache_limit=2)
        result = self._default_get_episode_url(html, video, episode_pattern,
                                               title_pattern, airdate_pattern)
        if result and 'javascript:;' not in result:
            return result
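
The XHR constant merged into the headers above is presumably the usual marker dict that makes the site treat the POST as an AJAX call and return the episode-list fragment rather than a full page. A sketch of that assumption:

# Assumed definition of the XHR constant used throughout these scrapers.
XHR = {'X-Requested-With': 'XMLHttpRequest'}

headers = {'Referer': 'http://example.com/dizi/', 'Content-Length': 0}
headers.update(XHR)
# headers now carries Referer, Content-Length, and X-Requested-With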
Example #5
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/')
        search_url = scraper_utils.urljoin(search_url,
                                           urllib.quote_plus(title))
        html = self._http_get(search_url, require_debrid=True, cache_limit=8)
        for _attrs, fragment in dom_parser2.parse_dom(html, 'div',
                                                      {'class': 'list'}):
            if not dom_parser2.parse_dom(fragment, 'div',
                                         {'class': 'lists_titles'}):
                continue
            for attrs, match_title_year in dom_parser2.parse_dom(
                    fragment, 'a', {'class': 'title'}, req='href'):
                match_url = attrs['href']
                match_title_year = re.sub('</?[^>]*>', '', match_title_year)
                is_show = re.search('\(\d{4}-\)', match_title_year)
                if (is_show and video_type == VIDEO_TYPES.MOVIE) or (
                        not is_show and video_type == VIDEO_TYPES.TVSHOW):
                    continue

                match_title, match_year = scraper_utils.extra_year(
                    match_title_year)
                if not year or not match_year or year == match_year:
                    result = {
                        'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year
                    }
                    results.append(result)

        return results
Example #6
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        sources = {}
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(url, require_debrid=True, cache_limit=.5)
        if not html:
            url = scraper_utils.urljoin(self.old_base_url, source_url)
            html = self._http_get(url, require_debrid=True, cache_limit=.5)

        sources.update(self.__get_post_links(html, video))

        if kodi.get_setting('%s-include_comments' %
                            (self.get_name())) == 'true':
            for _attrs, comment in dom_parser2.parse_dom(
                    html, 'div', {'id': re.compile('commentbody-\d+')}):
                sources.update(self.__get_comment_links(comment, video))

        for source in sources:
            if scraper_utils.excluded_link(source): continue
            host = urlparse.urlparse(source).hostname
            hoster = {
                'multi-part': False,
                'host': host,
                'class': self,
                'views': None,
                'url': source,
                'rating': None,
                'quality': sources[source],
                'direct': False
            }
            hosters.append(hoster)
        return hosters
Example #7
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        if video_type == VIDEO_TYPES.MOVIE:
            url = scraper_utils.urljoin(self.base_url, '/movies/a-z/')
        else:
            url = scraper_utils.urljoin(self.base_url, '/tv/a-z/')

        if title.upper().startswith('THE '):
            search_title = title[4:]
        elif title.upper().startswith('A '):
            search_title = title[2:]
        else:
            search_title = title

        if search_title[:1] in string.digits:
            first_letter = '1'
        else:
            first_letter = search_title[:1]
        url = url + first_letter.upper()
        
        html = self._http_get(url, cache_limit=48)
        norm_title = scraper_utils.normalize_title(title)
        pattern = 'class=star.*?href=([^>]+)>(.*?)</a>'
        for match in re.finditer(pattern, html, re.DOTALL):
            match_url, match_title_year = match.groups()
            match_title, match_year = scraper_utils.extra_year(match_title_year)
            if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                result = {'url': match_url, 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
        return results
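
The fuzzy matching above leans on two helpers: normalize_title presumably reduces a title to lowercase alphanumerics so the substring check ignores case and punctuation, and cleanse_title unescapes HTML entities for display. A rough sketch of the normalize assumption:

import re

def normalize_title(title):
    # Hypothetical stand-in for scraper_utils.normalize_title.
    return re.sub('[^a-z0-9]', '', title.lower()) if title else ''

# normalize_title('Star Trek: Discovery') == normalize_title('star trek discovery')
# -> True, so 'norm_title in match_norm_title' tolerates punctuation noise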
    def search(self, video_type, title, year, season=''):
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/movies/search')
        html = self._http_get(search_url, params={'s': title}, cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'div', {'class': 'item_movie'}):
            match = dom_parser2.parse_dom(item, 'a', req=['href', 'title'])
            if not match: continue
            
            match_title_year = match[0].attrs['title']
            match_url = match[0].attrs['href']
            is_season = re.search('S(?:eason\s+)?(\d+)', match_title_year, re.I)
            match_vt = (video_type == VIDEO_TYPES.MOVIE and not is_season) or (video_type == VIDEO_TYPES.SEASON and is_season)
            match_year = ''
            if video_type == VIDEO_TYPES.SEASON:
                if not season and not match_vt: continue
                if match_vt:
                    if season and int(is_season.group(1)) != int(season): continue
                else:
                    if season and int(season) != 1: continue
                    site_title, site_year = scraper_utils.extra_year(match_title_year)
                    if scraper_utils.normalize_title(site_title) not in scraper_utils.normalize_title(title) or year != site_year: continue
                    
                match_title = match_title_year
            else:
                if not match_vt: continue
                match_title, match_year = scraper_utils.extra_year(match_title_year)

            match_url = scraper_utils.urljoin(match_url, 'watching.html')
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
        return results
 def __login(self):
     url = scraper_utils.urljoin(self.base_url, '/apis/v2/user/login.json')
     data = {'email': self.username, 'password': self.password, 'rememberMe': True}
     referer = scraper_utils.urljoin(self.base_url, '/login')
     headers = {'Content-Type': 'application/json', 'Referer': referer}
     headers.update(XHR)
     html = super(self.__class__, self)._http_get(url, data=json.dumps(data), headers=headers, cache_limit=0)
     js_data = scraper_utils.parse_json(html, url)
     return js_data.get('status') == 'success'
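
__login above posts a JSON body and treats {"status": "success"} as a successful login; scraper_utils.parse_json presumably falls back to an empty dict on malformed responses, which is why the bare .get() is safe. A sketch of the assumed exchange (values are illustrative):

import json

request_body = json.dumps({'email': 'user@example.com',
                           'password': 'hunter2',
                           'rememberMe': True})
response_text = '{"status": "success"}'
js_data = json.loads(response_text)        # parse_json, minus error handling
print(js_data.get('status') == 'success')  # True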
Example #10
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)

        views = None
        fragment = dom_parser2.parse_dom(
            html, 'img', {'src': re.compile('[^"]*view_icon.png')})
        if fragment:
            match = re.search('(\d+)', fragment[0].content)
            if match:
                views = match.group(1)

        match = re.search('href="([^"]+-full-movie-[^"]+)', html)
        if match:
            url = match.group(1)
            html = self._http_get(url, cache_limit=.5)

        sources = self.__get_embedded(html)
        for link in dom_parser2.parse_dom(html,
                                          'span', {'class': 'btn-eps'},
                                          req='link'):
            link = link.attrs['link']
            ajax_url = scraper_utils.urljoin(self.base_url, AJAX_URL)
            headers = {'Referer': url}
            headers.update(XHR)
            html = self._http_get(ajax_url,
                                  params={'v': link},
                                  headers=headers,
                                  cache_limit=.5)
            sources.update(self.__get_sources(html))

        for source in sources:
            if sources[source]['direct']:
                host = scraper_utils.get_direct_hostname(self, source)
            else:
                host = urlparse.urlparse(source).hostname
            stream_url = source + scraper_utils.append_headers(
                {'User-Agent': scraper_utils.get_ua()})
            direct = sources[source]['direct']
            quality = sources[source]['quality']
            hoster = {
                'multi-part': False,
                'host': host,
                'class': self,
                'quality': quality,
                'views': views,
                'rating': None,
                'url': stream_url,
                'direct': direct
            }
            hosters.append(hoster)

        return hosters
Example #11
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        sources = {}
        headers = {'Accept-Language': 'en-US,en;q=0.5'}
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, headers=headers, cache_limit=2)
        if video.video_type == VIDEO_TYPES.MOVIE:
            sources.update(self.__scrape_sources(html, page_url))
            pages = set([
                r.attrs['href'] for r in dom_parser2.parse_dom(
                    html, 'a', {'class': 'btn-eps'}, req='href')
            ])
            active = set([
                r.attrs['href'] for r in dom_parser2.parse_dom(
                    html, 'a', {'class': 'active'}, req='href')
            ])
            for page in list(pages - active):
                page_url = scraper_utils.urljoin(self.base_url, page)
                html = self._http_get(page_url, headers=headers, cache_limit=2)
                sources.update(self.__scrape_sources(html, page_url))
        else:
            for page in self.__match_episode(video, html):
                page_url = scraper_utils.urljoin(self.base_url, page)
                html = self._http_get(page_url, headers=headers, cache_limit=2)
                sources.update(self.__scrape_sources(html, page_url))

        for source, values in sources.iteritems():
            if not source.lower().startswith('http'): continue
            if values['direct']:
                host = scraper_utils.get_direct_hostname(self, source)
                if host != 'gvideo':
                    stream_url = source + scraper_utils.append_headers(
                        {
                            'User-Agent': scraper_utils.get_ua(),
                            'Referer': page_url
                        })
                else:
                    stream_url = source
            else:
                host = urlparse.urlparse(source).hostname
                stream_url = source
            hoster = {
                'multi-part': False,
                'host': host,
                'class': self,
                'quality': values['quality'],
                'views': None,
                'rating': None,
                'url': stream_url,
                'direct': values['direct']
            }
            hosters.append(hoster)

        return hosters
Example #12
    def search(self, video_type, title, year, season=''):
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/')
        headers = {'Accept-Language': 'en-US,en;q=0.5'}
        html = self._http_get(search_url,
                              params={'q': title},
                              headers=headers,
                              cache_limit=8)
        norm_title = scraper_utils.normalize_title(title)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'ml-item'}):
            match_title = dom_parser2.parse_dom(item, 'span',
                                                {'class': 'mli-info'})
            match_url = dom_parser2.parse_dom(item, 'a', req='href')
            year_frag = dom_parser2.parse_dom(item, 'img', req='alt')
            is_episodes = dom_parser2.parse_dom(item, 'span',
                                                {'class': 'mli-eps'})

            if (video_type == VIDEO_TYPES.MOVIE
                    and not is_episodes) or (video_type == VIDEO_TYPES.SEASON
                                             and is_episodes):
                if match_title and match_url:
                    match_url = match_url[0].attrs['href']
                    match_title = match_title[0].content
                    match_title = re.sub('</?h2>', '', match_title)
                    match_title = re.sub('\s+\d{4}$', '', match_title)
                    if video_type == VIDEO_TYPES.SEASON:
                        if season and not re.search(
                                'Season\s+0*%s$' % (season), match_title):
                            continue

                    if not match_url.endswith('/'): match_url += '/'
                    match_url = scraper_utils.urljoin(match_url, 'watch/')
                    match_year = ''
                    if video_type == VIDEO_TYPES.MOVIE and year_frag:
                        match = re.search('\s*-\s*(\d{4})$',
                                          year_frag[0].attrs['alt'])
                        if match:
                            match_year = match.group(1)

                    match_norm_title = scraper_utils.normalize_title(match_title)
                    title_match = (norm_title in match_norm_title
                                   or match_norm_title in norm_title)
                    if title_match and (not year or not match_year
                                        or year == match_year):
                        result = {
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)
                        }
                        results.append(result)

        return results
    def get_sources(self, video):
        hosters = []
        sources = {}
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.5)
        match = re.search("load_player\('([^']+)", html)
        if not match: return hosters
        
        headers = {'Referer': page_url, 'Server': 'cloudflare-nginx', 'Accept': 'text/html, */*; q=0.01',
                   'Accept-Language': 'en-US,en;q=0.5', 'Accept-Formating': 'application/json, text/javascript', 'Accept-Encoding': 'gzip, deflate'}
        headers.update(XHR)
        params = {'id': match.group(1)}
        player_url = scraper_utils.urljoin(self.base_url, PLAYER_URL)
        html = self._http_get(player_url, params=params, headers=headers, cache_limit=1)
        js_data = scraper_utils.parse_json(html, player_url)
        pl_url = js_data.get('value') or js_data.get('download')
        if not pl_url: return hosters
        
        headers = {'Referer': page_url}
        if pl_url.startswith('//'): pl_url = 'https:' + pl_url
        html = self._http_get(pl_url, headers=headers, allow_redirect=False, cache_limit=0)
        if html.startswith('http'):
            streams = [(html, '')]
        else:
            js_data = scraper_utils.parse_json(html, pl_url)
            try: streams = [(source['file'], source.get('label', '')) for source in js_data['playlist'][0]['sources']]
            except: streams = []
            
        for stream in streams:
            stream_url, label = stream
            if scraper_utils.get_direct_hostname(self, stream_url) == 'gvideo':
                sources[stream_url] = {'quality': scraper_utils.gv_get_quality(stream_url), 'direct': True}
            else:
                if label:
                    quality = scraper_utils.height_get_quality(label)
                else:
                    quality = QUALITIES.HIGH
                sources[stream_url] = {'quality': quality, 'direct': False}
                    
        for source, value in sources.iteritems():
            direct = value['direct']
            quality = value['quality']
            if direct:
                host = scraper_utils.get_direct_hostname(self, source)
            else:
                host = urlparse.urlparse(source).hostname

            stream_url = source + scraper_utils.append_headers({'User-Agent': scraper_utils.get_ua()})
            hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': direct}
            hosters.append(hoster)
            
        return hosters
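
Two quality helpers appear above: gv_get_quality presumably derives quality from a Google Video stream URL, while height_get_quality buckets a label such as "720p" into the QUALITIES constants. A rough sketch of the latter assumption (cutoffs and return values are illustrative, not the library's exact ones):

def height_get_quality(label):
    # Hypothetical stand-in for scraper_utils.height_get_quality.
    digits = ''.join(ch for ch in str(label) if ch.isdigit())
    height = int(digits) if digits else 0
    if height >= 1080: return 'HD1080'
    if height >= 720: return 'HD720'
    if height >= 480: return 'HIGH'
    if height > 0: return 'MEDIUM'
    return 'HIGH'

# height_get_quality('720p') -> 'HD720'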
Example #14
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, require_debrid=True, cache_limit=.5)
        if video.video_type == VIDEO_TYPES.MOVIE:
            page_url = self.__get_release(html, video)
            if page_url is None: return hosters

            page_url = scraper_utils.urljoin(self.base_url, page_url)
            html = self._http_get(page_url,
                                  require_debrid=True,
                                  cache_limit=.5)

        hevc = False
        for _attrs, content in dom_parser2.parse_dom(
                html, 'span', {'class': 'releaselabel'}):
            if re.search('(hevc|x265)', content, re.I):
                hevc = 'x265'

            match = re.search('(\d+)x(\d+)', content)
            if match:
                _width, height = match.groups()
                quality = scraper_utils.height_get_quality(height)
                break
        else:
            quality = QUALITIES.HIGH

        streams = [
            attrs['href'] for attrs, _content in dom_parser2.parse_dom(
                html, 'a', {'class': 'links'}, req='href')
        ]
        streams += [
            content for _attrs, content in dom_parser2.parse_dom(
                html, 'pre', {'class': 'links'})
        ]
        for stream_url in streams:
            if scraper_utils.excluded_link(stream_url): continue
            host = urlparse.urlparse(stream_url).hostname
            hoster = {
                'multi-part': False,
                'host': host,
                'class': self,
                'views': None,
                'url': stream_url,
                'rating': None,
                'quality': quality,
                'direct': False
            }
            if hevc: hoster['format'] = hevc
            hosters.append(hoster)

        return hosters
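
One subtlety in the release scan above: the else clause belongs to the for loop, so quality falls back to QUALITIES.HIGH only when the loop finishes without break, i.e. when no WxH resolution label was found. A minimal illustration of that Python rule:

import re

quality = None
for content in ['HEVC x265', '1920x1080 WEB-DL']:
    match = re.search('(\d+)x(\d+)', content)
    if match:
        quality = 'from height %s' % match.group(2)
        break
else:
    quality = 'default HIGH'
# quality == 'from height 1080'; with no match, the else branch would run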
Example #15
 def _get_episode_url(self, show_url, video):
     episode_pattern = 'href="([^"]+/season/%s/episode/%s/?)"' % (
         video.season, video.episode)
     title_pattern = 'href="(?P<url>[^"]+)"[^>]+title="(?:S\d+\s*E\d+:\s*)?(?P<title>[^"]+)'
     headers = {'Referer': scraper_utils.urljoin(self.base_url, show_url)}
     season_url = scraper_utils.urljoin(show_url,
                                        '/season/%s' % (video.season))
     season_url = scraper_utils.urljoin(self.base_url, season_url)
     html = self._http_get(season_url, headers=headers, cache_limit=2)
     fragment = dom_parser2.parse_dom(html, 'div', {'id': 'episodes'})
     return self._default_get_episode_url(fragment, video, episode_pattern,
                                          title_pattern)
Example #16
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=8)
        fragment = dom_parser2.parse_dom(html, 'div', {'class': 'playex'})
        if fragment: html = fragment[0].content
        iframe_url = dom_parser2.parse_dom(html, 'iframe', req='src')
        if not iframe_url: return hosters
        iframe_url = iframe_url[0].attrs['src']
        if iframe_url.startswith('/'):
            iframe_url = scraper_utils.urljoin(self.base_url, iframe_url)
        html = self._http_get(iframe_url,
                              headers={'Referer': page_url},
                              cache_limit=.5)
        obj = dom_parser2.parse_dom(html, 'object', req='data')
        if obj:
            streams = dict((stream_url, {
                'quality': scraper_utils.gv_get_quality(stream_url),
                'direct': True
            }) for stream_url in scraper_utils.parse_google(
                self, obj[0].attrs['data']))
        else:
            streams = scraper_utils.parse_sources_list(self, html)

        for stream_url, values in streams.iteritems():
            host = scraper_utils.get_direct_hostname(self, stream_url)
            if host == 'gvideo':
                quality = scraper_utils.gv_get_quality(stream_url)
            else:
                quality = values['quality']
                stream_url += scraper_utils.append_headers(
                    {'User-Agent': scraper_utils.get_ua(), 'Referer': page_url})

            source = {
                'multi-part': False,
                'url': stream_url,
                'host': host,
                'class': self,
                'quality': quality,
                'views': None,
                'rating': None,
                'direct': True
            }
            hosters.append(source)

        return hosters
Example #17
    def get_sources(self, video):
        sources = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return sources
        try:
            url = scraper_utils.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=2)

            pattern = '<iframe id="videoframe" src="([^"]+)'
            match = re.search(pattern, html)
            url = scraper_utils.urljoin(self.base_url, match.group(1))
            html = self._http_get(url, cache_limit=0)

            match = re.search('lastChild\.value="([^"]+)"(?:\s*\+\s*"([^"]+))?', html)
            secret = ''.join(match.groups(''))

            match = re.search('"&t=([^"]+)', html)
            t = match.group(1)
            
            match = re.search('(?:\s+|,)s\s*=(\d+)', html)
            s_start = int(match.group(1))
            
            match = re.search('(?:\s+|,)m\s*=(\d+)', html)
            m_start = int(match.group(1))
            
            for _attrs, fragment in dom_parser2.parse_dom(html, 'div', {'class': 'ripdiv'}):
                match = re.match('<b>(.*?)</b>', fragment)
                if match:
                    q_str = match.group(1).replace(' ', '').upper()
                    quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)
                else:
                    quality = QUALITIES.HIGH

                for attrs, label in dom_parser2.parse_dom(fragment, 'a', {'rel': re.compile('\d+')}, req='onclick'):
                    link_id = re.sub('[^\d]', '', attrs['onclick'])
                    match = re.match('([^:]+:)\s*(.*)', label)
                    if not match: continue
                    
                    version, host_fragment = match.groups()
                    source = {'multi-part': False, 'quality': quality, 'class': self, 'version': version, 'rating': None, 'views': None, 'direct': False}
                    source['host'] = re.sub('(</?[^>]*>)', '', host_fragment)
                    s = s_start + random.randint(3, 100)
                    m = m_start + random.randint(21, 100)
                    url = AJAX_URL.format(link_id=link_id, s=s, m=m, secret=secret, t=t)
                    source['url'] = url
                    sources.append(source)
                    
        except Exception as e:
            logger.log('Failure (%s) during icefilms get sources: |%s|' % (str(e), video), log_utils.LOGWARNING)
            
        return sources
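
The icefilms flow above scrapes a per-page secret, a token t, and two counters s and m, then bumps the counters by random offsets before filling the AJAX template. AJAX_URL itself is defined elsewhere in the module; a hypothetical placeholder showing only the shape the format() call expects:

# Placeholder only: the real AJAX_URL constant lives in the scraper module.
AJAX_URL = '/video.php?id={link_id}&s={s}&m={m}&sec={secret}&t={t}'
url = AJAX_URL.format(link_id='12345', s=103, m=121, secret='abc', t='xyz')
# -> '/video.php?id=12345&s=103&m=121&sec=abc&t=xyz'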
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/results')
        params = {'q': title}
        referer = search_url + '?' + urllib.urlencode(params)
        headers = {'Referer': referer}
        headers.update(XHR)
        _html = self._http_get(scraper_utils.urljoin(self.base_url, 'av'),
                               headers=headers,
                               method='POST',
                               cache_limit=0)

        cookies = {'begin_referer': referer, 'prounder': 1}
        html = self._http_get(search_url,
                              params=params,
                              cookies=cookies,
                              cache_limit=8)
        if any('jquery.js' in match.attrs['src']
               for match in dom_parser2.parse_dom(html, 'script', req='src')):
            html = self._http_get(search_url,
                                  params=params,
                                  cookies=cookies,
                                  cache_limit=0)

        for _attrs, result in dom_parser2.parse_dom(html, 'div',
                                                    {'class': 'cell'}):
            title_frag = dom_parser2.parse_dom(result, 'div',
                                               {'class': 'video_title'})
            year_frag = dom_parser2.parse_dom(result, 'div',
                                              {'class': 'video_quality'})
            if not title_frag: continue
            match = dom_parser2.parse_dom(title_frag[0].content,
                                          'a',
                                          req='href')
            if not match: continue
            match_url = match[0].attrs['href']
            match_title = match[0].content
            try:
                match = re.search('\s+(\d{4})\s+', year_frag[0].content)
                match_year = match.group(1)
            except:
                match_year = ''

            if not year or not match_year or year == match_year:
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)
        return results
    def __add_torrent(self, hash_id):
        list_url = scraper_utils.urljoin(self.base_url, LIST_URL)
        js_data = self._json_get(list_url, cache_limit=0)
        for transfer in js_data.get('transfers', []):
            if transfer['hash'].lower() == hash_id:
                return True

        add_url = scraper_utils.urljoin(self.base_url, ADD_URL)
        data = {'src': MAGNET_LINK % hash_id}
        js_data = self._json_get(add_url, data=data, cache_limit=0)
        if js_data.get('status') == 'success':
            return True
        else:
            return False
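
__add_torrent above first checks the transfer list for the hash, then submits a magnet link built from MAGNET_LINK. That constant is defined elsewhere; a standard BitTorrent magnet URI is the natural assumption:

# Assumed shape of MAGNET_LINK: a v1 magnet URI keyed by the info-hash.
MAGNET_LINK = 'magnet:?xt=urn:btih:%s'
data = {'src': MAGNET_LINK % '0123456789abcdef0123456789abcdef01234567'}
# -> {'src': 'magnet:?xt=urn:btih:0123456789abcdef...'}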
Example #20
 def _get_episode_url(self, show_url, video):
     url = scraper_utils.urljoin(self.base_url, show_url)
     html = self._http_get(url, cache_limit=24)
     fragment = dom_parser2.parse_dom(html, 'div', {'class': 'poster'})
     if not fragment: return
     show_url = dom_parser2.parse_dom(fragment[0].content, 'a', req='href')
     if not show_url: return
     show_url = scraper_utils.urljoin(self.base_url,
                                      show_url[0].attrs['href'])
     html = self._http_get(show_url, cache_limit=2)
     fragment = dom_parser2.parse_dom(html, 'div', {'id': 'servers'})
     episode_pattern = 'href="([^"]+)[^>]+>[Ee][Pp]\s*(?:[Ss]0*%s-)?E?p?0*%s(?!\d)' % (
         video.season, video.episode)
     return self._default_get_episode_url(fragment or html, video,
                                          episode_pattern)
Example #21
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/advanced-search/')
        headers = {'Referer': self.base_url}
        params = {'search_query': title, 'orderby': '', 'order': '', 'wpas': 1}
        html = self._http_get(search_url,
                              params=params,
                              headers=headers,
                              cache_limit=8)
        norm_title = scraper_utils.normalize_title(title)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'datos'}):
            match = dom_parser2.parse_dom(item, 'a', req='href')
            if not match: continue

            match_url = match[0].attrs['href']
            is_tvshow = '/tvshows/' in match_url
            if (is_tvshow and video_type == VIDEO_TYPES.MOVIE) or (not is_tvshow and video_type == VIDEO_TYPES.TVSHOW):
                continue

            match_title = match[0].content
            match_title, match_year = scraper_utils.extra_year(match_title)
            if scraper_utils.normalize_title(match_title) in norm_title and (
                    not year or not match_year or year == match_year):
                result = {
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(match_url)
                }
                results.append(result)

        return results
 def get_sources(self, video):
     hosters = []
     source_url = self.get_url(video)
     if not source_url or source_url == FORCE_NO_MATCH: return hosters
     page_url = scraper_utils.urljoin(self.base_url, source_url)
     html = self._http_get(page_url, cache_limit=.5)
     
     best_quality = QUALITIES.HIGH
     fragment = dom_parser2.parse_dom(html, 'div', {'class': 'entry'})
     if fragment:
         for match in re.finditer('href="[^"]*/movies-quality/[^"]*[^>]*>([^<]+)', fragment[0].content, re.I):
             quality = Q_MAP.get(match.group(1).upper(), QUALITIES.HIGH)
             if Q_ORDER[quality] > Q_ORDER[best_quality]:
                 best_quality = quality
                 
     sources = []
     for attrs, _content in dom_parser2.parse_dom(html, 'a', req='data-vid'):
         try:
             vid_url = dom_parser2.parse_dom(scraper_utils.cleanse_title(attrs['data-vid']), 'iframe', req='src')
             sources.append(vid_url[0])
         except:
             pass
         
     fragment = dom_parser2.parse_dom(html, 'table', {'class': 'additional-links'})
     if fragment:
         sources += dom_parser2.parse_dom(fragment[0].content, 'a', req='href')
             
     for stream_url in sources:
         stream_url = stream_url.attrs.get('href') or stream_url.attrs.get('src')
         host = urlparse.urlparse(stream_url).hostname
         quality = scraper_utils.get_quality(video, host, best_quality)
         hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': False}
         hosters.append(hoster)
     return hosters
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search')
        html = self._http_get(search_url, params={'q': title}, cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'movie-item'}):
            match = dom_parser2.parse_dom(item,
                                          'a', {'itemprop': 'url'},
                                          req='href')
            if not match: continue

            match_url = match[0].attrs['href']
            match_title_year = match[0].content
            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not match_year:
                try:
                    match_year = dom_parser2.parse_dom(
                        item, 'div', {'class': 'overlay-year'})[0].content
                except:
                    match_year = ''

            if not year or not match_year or year == match_year:
                result = {
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(match_url)
                }
                results.append(result)

        return results
Example #24
 def _get_episode_url(self, show_url, video):
     episode_pattern = 'href=([^>]+)>0*%sx0*%s\s+' % (video.season, video.episode)
     title_pattern = 'href=(?P<url>[^>]+)>(?:\d+x\d+\s+)+(?P<title>[^<]+)'
     show_url = scraper_utils.urljoin(self.base_url, show_url)
     html = self._http_get(show_url, cache_limit=2)
     fragment = dom_parser2.parse_dom(html, 'span', {'class': 'list'})
     return self._default_get_episode_url(fragment, video, episode_pattern, title_pattern)
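
The episode pattern above targets "SxE"-style listings such as "3x07", with 0* tolerating zero-padding on either number. A worked match against illustrative HTML:

import re

html = '<a href=/serie/ep-307>3x07 Some Title</a>'
pattern = 'href=([^>]+)>0*%sx0*%s\s+' % (3, 7)
match = re.search(pattern, html)
# match.group(1) == '/serie/ep-307'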
Example #25
    def _get_episode_url(self, show_url, video):
        force_title = scraper_utils.force_title(video)
        title_fallback = kodi.get_setting('title-fallback') == 'true'
        norm_title = scraper_utils.normalize_title(video.ep_title)
        page_url = [show_url]
        too_old = False
        while page_url and not too_old:
            url = scraper_utils.urljoin(self.base_url, page_url[0])
            html = self._http_get(url, require_debrid=True, cache_limit=1)
            for _attrs, post in dom_parser2.parse_dom(
                    html, 'div', {'id': re.compile('post-\d+')}):
                if self.__too_old(post):
                    too_old = True
                    break
                if show_url not in post: continue
                match = dom_parser2.parse_dom(post, 'a', req='href')
                if match:
                    url, title = match[0].attrs['href'], match[0].content
                    if not force_title:
                        if scraper_utils.release_check(video,
                                                       title,
                                                       require_title=False):
                            return scraper_utils.pathify_url(url)
                    else:
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(
                                    match.group(1)):
                                return scraper_utils.pathify_url(url)

            page_url = dom_parser2.parse_dom(html,
                                             'a', {'class': 'nextpostslink'},
                                             req='href')
            if page_url: page_url = [page_url[0].attrs['href']]
Example #26
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        seen_urls = set()
        for page in ['/latest-added/', '/popular-today/', '/most-popular/']:
            url = scraper_utils.urljoin(self.base_url, page)
            html = self._http_get(url, cache_limit=24)
            fragment = dom_parser2.parse_dom(html, 'div', {'class': 'home'})
            if fragment:
                norm_title = scraper_utils.normalize_title(title)
                for attrs, match_title_year in dom_parser2.parse_dom(
                        fragment[0].content, 'a', req='href'):
                    match_url = attrs['href']
                    match_title, match_year = scraper_utils.extra_year(
                        match_title_year)
                    if norm_title in scraper_utils.normalize_title(
                            match_title) and (not year or not match_year
                                              or year == match_year):
                        match_url = scraper_utils.pathify_url(match_url)
                        if match_url in seen_urls: continue
                        seen_urls.add(match_url)
                        result = {
                            'url': match_url,
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year
                        }
                        results.append(result)

        return results
Example #27
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        for _attrs, button in dom_parser2.parse_dom(
                html, 'li', {'class': 'playing_button'}):
            try:
                link = dom_parser2.parse_dom(button, 'a', req='href')
                match = re.search('php\?.*?=?([^"]+)', link[0].attrs['href'])
                stream_url = base64.b64decode(match.group(1))
                match = re.search('(https?://.*)', stream_url)
                stream_url = match.group(1)
                host = urlparse.urlparse(stream_url).hostname
                quality = scraper_utils.get_quality(video, host,
                                                    QUALITIES.HIGH)
                hoster = {
                    'multi-part': False,
                    'host': host,
                    'class': self,
                    'quality': quality,
                    'views': None,
                    'rating': None,
                    'url': stream_url,
                    'direct': False
                }
                hosters.append(hoster)
            except Exception as e:
                logger.log(
                    'Exception during tvonline source: %s - |%s|' %
                    (e, button), log_utils.LOGDEBUG)

        return hosters
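
The button links above hide the real stream behind base64: the first regex grabs the encoded payload after "php?", b64decode unpacks it, and the second regex trims any leading junk before the http URL. A worked example with an illustrative link:

import base64
import re

href = 'watch.php?' + base64.b64encode('go:http://hoster.example/v/abc123')
encoded = re.search('php\?.*?=?([^"]+)', href).group(1)
decoded = base64.b64decode(encoded)
stream_url = re.search('(https?://.*)', decoded).group(1)
# stream_url == 'http://hoster.example/v/abc123'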
Example #28
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_in = 'm' if video_type == VIDEO_TYPES.MOVIE else 't'
        search_url = scraper_utils.urljoin(self.base_url, '/search')
        html = self._http_get(search_url,
                              data={
                                  'searchquery': title,
                                  'searchin': search_in
                              },
                              cache_limit=8)
        fragment = dom_parser2.parse_dom(html, 'div', {'class': 'search-page'})
        if not fragment: return results
        fragment = dom_parser2.parse_dom(fragment[0].content, 'table')
        if not fragment: return results
        for attrs, match_title_year in dom_parser2.parse_dom(
                fragment[0].content, 'a', req='href'):
            match_url = attrs['href']
            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not year or not match_year or year == match_year:
                result = {
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                }
                results.append(result)

        return results
Example #29
    def get_sources(self, video):
        hosters = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return hosters

        url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        match = re.search('This movie is of poor quality', html, re.I)
        if match:
            quality = QUALITIES.LOW
        else:
            quality = QUALITIES.HIGH

        for match in re.finditer('href="([^"]+/embed\d*/[^"]+)', html):
            url = match.group(1)
            embed_html = self._http_get(url, cache_limit=.5)
            hosters += self.__get_links(embed_html)

        pattern = 'href="([^"]+)[^>]*>\s*<[^>]+play_video.gif'
        for match in re.finditer(pattern, html, re.I):
            stream_url = match.group(1)
            host = urlparse.urlparse(stream_url).hostname
            quality = scraper_utils.get_quality(video, host, quality)
            hoster = {
                'multi-part': False,
                'url': stream_url,
                'host': host,
                'class': self,
                'quality': quality,
                'rating': None,
                'views': None,
                'direct': False
            }
            hosters.append(hoster)
        return hosters
Example #30
    def __get_ajax_sources(self, html, page_url):
        hosters = []
        match = re.search('''url\s*:\s*"([^"]+)"\s*,\s*data:'id=''', html)
        if match:
            ajax_url = match.group(1)
            for data_id in re.findall("kaynakdegis\('([^']+)", html):
                url = scraper_utils.urljoin(self.base_url, ajax_url)
                data = {'id': data_id}
                headers = {'Referer': page_url}
                headers.update(XHR)
                result = self._http_get(url,
                                        data=data,
                                        headers=headers,
                                        cache_limit=.5)
                js_data = scraper_utils.parse_json(result, url)
                if 'iframe' in js_data:
                    if self.base_url in js_data['iframe']:
                        hosters += self.__get_iframe_sources(
                            js_data['iframe'], page_url)
                    else:
                        hosters.append(
                            self.__create_source(js_data['iframe'],
                                                 720,
                                                 page_url,
                                                 direct=False))
                else:
                    hosters += self.__get_js_sources(js_data, page_url)

        return hosters