    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/ajax_search')
        html = self._http_get(search_url, params={'q': title}, headers=XHR, cache_limit=1)
        js_result = scraper_utils.parse_json(html, search_url)
        match_year = ''  # year never appears in this response, so any year filter passes
        for series in js_result.get('series', []):
            match_url = series.get('seo')
            match_title = series.get('label')
            if match_url and match_title and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url('/' + match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)

        return results
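
For context, a minimal sketch of how a scraper's search() results are typically consumed; the driver function, the scrapers list, and the use of VIDEO_TYPES.MOVIE here are illustrative assumptions, not code from the examples:

    # Hypothetical driver: run each scraper's search() and pair every
    # normalized result dict ('url', 'title', 'year') with its scraper.
    def collect_results(scrapers, title, year):
        matches = []
        for scraper in scrapers:
            for result in scraper.search(VIDEO_TYPES.MOVIE, title, year):
                matches.append((scraper, result))
        return matches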
Example #2
    def __get_posts(self, html):
        sources = {}
        pattern = r'\$\.post\("([^"]+)"\s*,\s*\{(.*?)\}'
        match = re.search(pattern, html)
        if not match: return sources

        post_url, post_data = match.groups()
        data = self.__get_data(post_data)
        html = self._http_get(post_url, data=data, cache_limit=.5)
        js_result = scraper_utils.parse_json(html, post_url)
        for key in js_result:
            stream_url = js_result[key]
            host = scraper_utils.get_direct_hostname(self, stream_url)
            if host == 'gvideo':
                quality = scraper_utils.gv_get_quality(stream_url)
            else:
                quality = scraper_utils.height_get_quality(key)
            sources[stream_url] = quality
        return sources
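
__get_data() is called above but not shown; a minimal sketch, assuming the captured $.post body holds simple key: 'value' pairs:

    # Hypothetical helper: turn the JavaScript object body captured by the
    # regex in __get_posts into a dict that _http_get can POST.
    def __get_data(self, post_data):
        data = {}
        for key, value in re.findall(r"(\w+)\s*:\s*'([^']*)'", post_data):
            data[key] = value
        return data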
Example #3
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, SEARCH_URL)
        referer = scraper_utils.urljoin(self.base_url, '/search/?q=%s')
        referer = referer % (urllib.quote_plus(title))
        headers = {'Referer': referer}
        headers.update(XHR)
        params = {'searchTerm': title, 'type': SEARCH_TYPES[video_type], 'limit': 500}
        html = self._http_get(search_url, params=params, headers=headers, auth=False, cache_limit=2)
        js_data = scraper_utils.parse_json(html, search_url)
        if 'results' in js_data:
            for result in js_data['results']:
                match_year = str(result.get('year', ''))
                match_url = result.get('permalink', '')
                match_title = result.get('title', '')
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                    results.append(result)
        return results
Example #4
    def __get_json_links(self, html, sub):
        hosters = []
        js_data = scraper_utils.parse_json(html)
        if 'sources' in js_data:
            for source in js_data.get('sources', []):
                stream_url = source.get('file')
                if stream_url is None: continue

                host = scraper_utils.get_direct_hostname(self, stream_url)
                if host == 'gvideo':
                    quality = scraper_utils.gv_get_quality(stream_url)
                elif 'label' in source:
                    quality = scraper_utils.height_get_quality(source['label'])
                else:
                    quality = QUALITIES.HIGH
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                hoster['subs'] = sub
                hosters.append(hoster)
        return hosters
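
For reference, the JSON that __get_json_links() walks looks like the common JWPlayer setup block; the sample below is illustrative, not captured output:

    # Assumed response shape:
    # {"sources": [{"file": "http://host/video.mp4", "label": "720"},
    #              {"file": "http://host/video_lo.mp4"}]}
    # Entries without a 'label' fall back to QUALITIES.HIGH.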
Example #5
    def __search(self, video_type, title, year, season=''):
        results = []
        search_url = SEARCH_URL % urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        js_data = scraper_utils.parse_json(html)
        norm_title = scraper_utils.normalize_title(title)
        for item in js_data.get('results', []):
            if '/watch/' not in item['url'].lower(): continue
            is_season = re.search(r'Season\s+(\d+)', item['titleNoFormatting'],
                                  re.IGNORECASE)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (
                    is_season and video_type == VIDEO_TYPES.SEASON):
                match_title_year = item['titleNoFormatting']
                match_title_year = re.sub(r'^Watch\s+', '', match_title_year)
                match_url = item['url']
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE:
                    match = re.search(r'(.*?)(?:\s+\(?(\d{4})\)?)',
                                      match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                else:
                    if season and int(is_season.group(1)) != int(season):
                        continue
                    match = re.search(r'(.*?)\s+\(\d{4}\)', match_title_year)
                    if match:
                        match_title = match.group(1)
                    else:
                        match_title = match_title_year

                if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results
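
The items iterated by __search() appear to come from a Google custom-search-style API; an illustrative item, inferred from the keys used above:

    # Assumed item shape:
    # {'url': 'http://example.site/watch/some-show/',
    #  'titleNoFormatting': 'Watch Some Show (2014) Season 2'}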
Example #6
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if not source_url or source_url == FORCE_NO_MATCH: return hosters
        url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        js_result = scraper_utils.parse_json(html, url)
        if 'error' in js_result:
            logger.log(
                'Direct API error: "%s" @ %s' % (js_result['error'], url),
                log_utils.LOGWARNING)
            return hosters

        for result in js_result:
            if not scraper_utils.release_check(
                    video, result['release'], require_title=False):
                continue
            if result['quality'] in self.q_order:
                for key in result['links']:
                    url = result['links'][key][0]
                    if re.search(r'\.rar(\.|$)', url):
                        continue

                    hostname = urlparse.urlparse(url).hostname
                    hoster = {
                        'multi-part': False,
                        'class': self,
                        'views': None,
                        'url': url,
                        'rating': None,
                        'host': hostname,
                        'quality': QUALITY_MAP[result['quality']],
                        'direct': False
                    }
                    hoster['format'] = result['quality']
                    if 'x265' in result['release'] and result['quality'] != '1080P-X265':
                        hoster['format'] += '-x265'
                    hosters.append(hoster)

        return hosters
Example #7
    def __get_linked(self, html):
        sources = {}
        match = re.search('dizi=([^"]+)', html)
        if not match: return sources
        html = self._http_get(AJAX_URL,
                              params={'dizi': match.group(1)},
                              headers=XHR,
                              cache_limit=.5)
        js_result = scraper_utils.parse_json(html, AJAX_URL)
        for source in js_result.get('success', []):
            stream_url = source.get('src')
            if stream_url is None: continue

            if scraper_utils.get_direct_hostname(self, stream_url) == 'gvideo':
                quality = scraper_utils.gv_get_quality(stream_url)
            elif 'label' in source:
                quality = scraper_utils.height_get_quality(source['label'])
            else:
                quality = QUALITIES.HIGH
            sources[stream_url] = quality
        return sources
Example #8
    def __get_ht_links(self, html, page_url):
        sources = {}
        match = re.search(r'Htplugins_Make_Player\("([^"]+)', html)
        if match:
            data = {'data': match.group(1)}
            url = scraper_utils.urljoin(self.base_url, LINK_URL2)
            headers = {'Referer': page_url}
            html = self._http_get(url, data=data, headers=headers, cache_limit=.25)
            js_data = scraper_utils.parse_json(html, url)
            if 'l' in js_data:
                for link in js_data['l']:
                    if scraper_utils.get_direct_hostname(self, link) == 'gvideo':
                        quality = scraper_utils.gv_get_quality(link)
                    else:
                        quality = QUALITIES.HIGH
                    sources[link] = quality
        return sources
Example #9
    def __get_source_page(self, video_type, page_url):
        match = re.search(r'/movie/(.*?)-(\d+)\.html', page_url)
        if not match: return '', '', ''
        slug, movie_id = match.groups()

        vid_type = 'movie' if video_type == VIDEO_TYPES.MOVIE else 'series'
        qp_url = QP_URL.format(slug=slug, movie_id=movie_id, vid_type=vid_type)
        qp_url = scraper_utils.urljoin(self.base_url, qp_url)
        headers = {'Referer': scraper_utils.urljoin(self.base_url, page_url)}
        headers.update(XHR)
        html = self._http_get(qp_url, headers=headers, cache_limit=8)
        watching_url = dom_parser2.parse_dom(
            html, 'a', {'title': re.compile('View all episodes')}, req='href')
        if not watching_url: return '', '', ''

        watching_url = watching_url[0].attrs['href']
        page_html = self._http_get(watching_url,
                                   headers={
                                       'Referer':
                                       scraper_utils.urljoin(
                                           self.base_url, page_url)
                                   },
                                   cache_limit=8)
        # fetch each hidden tracker image; the site appears to require these
        # requests before it will serve the source list below
        for attrs, _content in dom_parser2.parse_dom(page_html,
                                                     'img',
                                                     {'class': 'hidden'},
                                                     req='src'):
            _img = self._http_get(attrs['src'],
                                  headers={'Referer': watching_url},
                                  cache_limit=8)

        sl_url = SL_URL.format(movie_id=movie_id)
        sl_url = scraper_utils.urljoin(self.base_url, sl_url)
        html = self._http_get(sl_url, headers=headers, cache_limit=8)
        js_data = scraper_utils.parse_json(html, sl_url)
        try:
            html = js_data['html']
        except:
            html = ''
        return movie_id, watching_url, html
Example #10
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            page_url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(page_url, cache_limit=.5)
            match = re.search(r'var\s*video_id="([^"]+)', html)
            if match:
                video_id = match.group(1)
                data = {'v': video_id}
                headers = {'Referer': page_url}
                headers.update(XHR)
                html = self._http_get(self.info_url, data=data, headers=headers, cache_limit=0)
                sources = scraper_utils.parse_json(html, self.info_url)
                for source in sources:
                    match = re.search('url=(.*)', sources[source])
                    if match:
                        stream_url = urllib.unquote(match.group(1))
                        host = self._get_direct_hostname(stream_url)
                        if host == 'gvideo':
                            quality = scraper_utils.gv_get_quality(stream_url)
                        else:
                            quality = scraper_utils.height_get_quality(source)
                        stream_url += scraper_utils.append_headers({'User-Agent': scraper_utils.get_ua()})
                        hoster = {
                            'multi-part': False,
                            'host': host,
                            'class': self,
                            'quality': quality,
                            'views': None,
                            'rating': None,
                            'url': stream_url,
                            'direct': True
                        }
                        hosters.append(hoster)
        return hosters
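
A note on the append_headers() call above: it presumably encodes the headers after a '|' separator, the Kodi convention for attaching HTTP headers to a media URL:

    # Illustrative result (assumed behavior):
    #   http://host/video.mp4|User-Agent=Mozilla%2F5.0...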
Example #11
    def get_sources(self, video):
        sources = []
        source_url = self.get_url(video)
        if not source_url or source_url == FORCE_NO_MATCH: return sources
        object_id = self.__extract_id(source_url)
        if object_id is None: return sources
        source_url = TITLE_URL.format(id=object_id)
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._authed_http_get(page_url, cache_limit=.5)
        js_data = scraper_utils.parse_json(html, page_url)
        if video.video_type == VIDEO_TYPES.MOVIE:
            links = js_data.get('links', {})
        else:
            links = self.__episode_match(js_data, video)

        prefix = js_data.get('domain', {}).get('prefix')
        suffix = js_data.get('domain', {}).get('suffix')
        for key, path in links.get('links', {}).iteritems():
            for mirror in sorted(list(set(links.get('mirrors', [])))):
                stream_url = TEMPLATE.format(prefix=prefix,
                                             mirror=mirror,
                                             suffix=suffix,
                                             path=path)
                host = scraper_utils.get_direct_hostname(self, stream_url)
                quality = Q_MAP.get(key, QUALITIES.HIGH)
                source = {
                    'multi-part': False,
                    'url': stream_url,
                    'host': host,
                    'class': self,
                    'quality': quality,
                    'views': None,
                    'rating': None,
                    'direct': True
                }
                source['version'] = '(Mirror %d)' % (mirror)
                sources.append(source)

        return sources
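
TEMPLATE itself is not shown; judging by the fields it is given above, it is presumably something like the following, yielding one URL per (quality, mirror) pair:

    # Assumed shape of TEMPLATE:
    #   TEMPLATE = 'http://{prefix}{mirror}.{suffix}/{path}'
    # e.g. prefix='s', mirror=3, suffix='example.com', path='v/abc.mp4'
    #   -> 'http://s3.example.com/v/abc.mp4'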
Example #12
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search?query=')
        search_url += title.replace("'", "")
        html = self._http_get(search_url, cache_limit=.25)
        js_result = scraper_utils.parse_json(html, search_url)
        if 'error' in js_result:
            logger.log(
                'Direct API error: "%s" @ %s' %
                (js_result['error'], search_url), log_utils.LOGWARNING)
            return results

        for match in js_result:
            url = search_url + '&quality=%s' % (match['quality'])
            result = {
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match['release']),
                'quality': match['quality'],
                'year': ''
            }
            results.append(result)
        return results
Example #13
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(SEARCH_BASE,
                                           '/1/indexes/al_titles_index/query')
        params = {
            'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.22.1',
            'x-algolia-application-id': 'XXDAZCOUL3',
            'x-algolia-api-key': 'c5c1279f5ad09819ecf2af9d6b5ee06a'
        }
        data = {
            'params':
            urllib.urlencode({
                'query': title,
                'facets': '*',
                'hitsPerPage': 30
            })
        }
        headers = {'Origin': self.base_url}
        html = self._http_get(search_url,
                              params=params,
                              data=json.dumps(data),
                              headers=headers,
                              cache_limit=8)
        js_data = scraper_utils.parse_json(html, search_url)
        media_type = '/movies/' if video_type == VIDEO_TYPES.MOVIE else '/tv/'
        for item in js_data.get('hits', []):
            if 'permalink' in item and 'title' in item and media_type in item['permalink']:
                match_year = str(item.get('yr', ''))
                if not year or not match_year or year == match_year:
                    result = {
                        'title': scraper_utils.cleanse_title(item['title']),
                        'url': scraper_utils.pathify_url(item['permalink']),
                        'year': match_year
                    }
                    results.append(result)

        return results
Example #14
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            match = re.search(r'href="([^"]+)"\s*class="player_btn_big"', html)
            if match:
                url = match.group(1)
                html = self._http_get(url, cache_limit=.5)

                q_str = ''
                match = re.search('class="status">([^<]+)', html)
                if match:
                    q_str = match.group(1)
                page_quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)

                views = None
                match = re.search(r'Views:</dt>\s*<dd>(\d+)', html, re.DOTALL)
                if match:
                    views = match.group(1)

                for src in dom_parser.parse_dom(html, 'iframe', ret='SRC'):
                    html = self._http_get(src, cache_limit=.5)
                    for match in re.finditer(r'index.php.*?link\s*:\s*"([^"]+)', html):
                        data = {'link': match.group(1)}
                        headers = dict(XHR)  # copy so the shared XHR dict isn't mutated
                        headers['Referer'] = url
                        gk_url = urlparse.urljoin(src, GK_URL)
                        html = self._http_get(gk_url,
                                              data=data,
                                              headers=headers,
                                              cache_limit=.25)
                        js_result = scraper_utils.parse_json(html, gk_url)
                        if 'link' in js_result and 'func' not in js_result:
                            if isinstance(js_result['link'], list):
                                sources = dict(
                                    (link['link'],
                                     scraper_utils.height_get_quality(
                                         link['label']))
                                    for link in js_result['link'])
                            else:
                                sources = {js_result['link']: page_quality}

                            for source in sources:
                                host = self._get_direct_hostname(source)
                                if Q_ORDER[page_quality] < Q_ORDER[sources[source]]:
                                    quality = page_quality
                                else:
                                    quality = sources[source]
                                hoster = {
                                    'multi-part': False,
                                    'url': source,
                                    'class': self,
                                    'quality': quality,
                                    'host': host,
                                    'rating': None,
                                    'views': views,
                                    'direct': True
                                }
                                hosters.append(hoster)
        return hosters
Example #15
    def get_sources(self, video):
        source_url = self.get_url(video)
        sources = []
        if not source_url or source_url == FORCE_NO_MATCH: return sources
        page_url = scraper_utils.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=8)
        for attrs, _content in dom_parser2.parse_dom(
                html, 'img', req=['data-id', 'data-name']):
            film_id, data_name = attrs['data-id'], attrs['data-name']
            data = {'id': film_id, 'n': data_name}
            server_url = scraper_utils.urljoin(self.base_url, SERVER_URL)
            server_url = server_url % (film_id)
            headers = {'Referer': page_url}
            headers.update(XHR)
            html = self._http_get(server_url,
                                  data=data,
                                  headers=headers,
                                  cache_limit=.5)
            for attrs, _content in dom_parser2.parse_dom(html,
                                                         'a',
                                                         req='data-id'):
                data = {'epid': attrs['data-id']}
                ep_url = scraper_utils.urljoin(self.base_url, EP_URL)
                ep_url = ep_url % (attrs['data-id'])
                headers = {'Referer': page_url}
                headers.update(XHR)
                html = self._http_get(ep_url,
                                      data=data,
                                      headers=headers,
                                      cache_limit=.5)
                js_data = scraper_utils.parse_json(html, ep_url)
                try:
                    links = [
                        r.attrs['src'] for r in dom_parser2.parse_dom(
                            js_data['link']['embed'], 'iframe', req='src')
                    ]
                except:
                    try:
                        links = js_data['link']['l']
                    except:
                        links = []
                try:
                    heights = js_data['link']['q']
                except:
                    heights = []
                for stream_url, height in map(None, links, heights):  # Py2 idiom: zip, padding the shorter list with None
                    match = re.search('movie_url=(.*)', stream_url)
                    if match:
                        stream_url = match.group(1)

                    host = scraper_utils.get_direct_hostname(self, stream_url)
                    if host == 'gvideo':
                        quality = scraper_utils.gv_get_quality(stream_url)
                        stream_url += scraper_utils.append_headers({'User-Agent': scraper_utils.get_ua(), 'Referer': page_url})
                        direct = True
                    else:
                        host = urlparse.urlparse(stream_url).hostname
                        if height:
                            quality = scraper_utils.height_get_quality(height)
                        else:
                            quality = QUALITIES.HD720
                        direct = False
                    source = {
                        'multi-part': False,
                        'url': stream_url,
                        'host': host,
                        'class': self,
                        'quality': quality,
                        'views': None,
                        'rating': None,
                        'direct': direct
                    }
                    sources.append(source)

        return sources
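
The nested try/except blocks in this example cope with two response shapes; both sketches below are inferred from the code, not confirmed payloads:

    # Shape 1: embedded player markup, scraped for iframe src attributes:
    #   {'link': {'embed': '<iframe src="http://host/embed/123"></iframe>'}}
    # Shape 2: parallel lists of stream links and pixel heights:
    #   {'link': {'l': ['http://host/v.mp4'], 'q': ['720']}}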
Example #16
    def __get_links(self, url, video):
        hosters = []
        search_url, params = self.__translate_search(url)
        html = self._http_get(search_url, params=params, cache_limit=.5)
        js_result = scraper_utils.parse_json(html, search_url)
        down_url = js_result.get('downURL')
        dl_farm = js_result.get('dlFarm')
        dl_port = js_result.get('dlPort')
        for item in js_result.get('data', []):
            post_hash, size, post_title, ext, duration = item['0'], item['4'], item['10'], item['11'], item['14']
            checks = [False] * 6
            if not scraper_utils.release_check(video, post_title):
                checks[0] = True
            if 'alangs' in item and item['alangs'] and 'eng' not in item['alangs']:
                checks[1] = True
            if re.match(r'^\d+s', duration) or re.match(r'^[0-5]m', duration):
                checks[2] = True
            if 'passwd' in item and item['passwd']: checks[3] = True
            if 'virus' in item and item['virus']: checks[4] = True
            if 'type' in item and item['type'].upper() != 'VIDEO':
                checks[5] = True
            if any(checks):
                logger.log(
                    'EasyNews Post excluded: %s - |%s|' % (checks, item),
                    log_utils.LOGDEBUG)
                continue

            stream_url = down_url + urllib.quote(
                '/%s/%s/%s%s/%s%s' %
                (dl_farm, dl_port, post_hash, ext, post_title, ext))
            stream_url = stream_url + '|Authorization=%s' % (urllib.quote(
                self.auth))
            host = scraper_utils.get_direct_hostname(self, stream_url)
            quality = None
            if 'width' in item:
                try:
                    width = int(item['width'])
                except:
                    width = 0
                if width:
                    quality = scraper_utils.width_get_quality(width)

            if quality is None:
                if video.video_type == VIDEO_TYPES.MOVIE:
                    meta = scraper_utils.parse_movie_link(post_title)
                else:
                    meta = scraper_utils.parse_episode_link(post_title)
                quality = scraper_utils.height_get_quality(meta['height'])

            if self.max_bytes:
                match = re.search(r'([\d.]+)\s+(.*)', size)
                if match:
                    size_bytes = scraper_utils.to_bytes(*match.groups())
                    if size_bytes > self.max_bytes:
                        logger.log(
                            'Result skipped, Too big: |%s| - %s (%s) > %s (%s GB)'
                            % (post_title, size_bytes, size, self.max_bytes,
                               self.max_gb))
                        continue

            hoster = {
                'multi-part': False,
                'class': self,
                'views': None,
                'url': stream_url,
                'rating': None,
                'host': host,
                'quality': quality,
                'direct': True
            }
            if any(i in post_title.upper() for i in ['X265', 'HEVC']):
                hoster['format'] = 'x265'
            if size: hoster['size'] = size
            if post_title: hoster['extra'] = post_title
            hosters.append(hoster)
        return hosters
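
__translate_search() is called at the top of this example but not shown; a minimal sketch, assuming the stored URL carries the query string and that the JSON search endpoint accepts it as 'gps'/'sbj' parameters:

    # Hypothetical helper: rebuild the stored search URL as endpoint + params.
    def __translate_search(self, url):
        query = urlparse.parse_qs(urlparse.urlparse(url).query).get('query', [''])[0]
        params = {'st': 'adv', 'sb': 1, 'gps': query, 'sbj': query}
        return scraper_utils.urljoin(self.base_url, SEARCH_URL), params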