示例#1
0
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site's archive AJAX endpoint for series.

        :param video_type: accepted for scraper-interface compatibility; unused
        :param title: query title; matched by normalized-substring containment
        :param year: unused (the endpoint supplies no year; 'year' is always '')
        :param season: unused
        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        url = scraper_utils.urljoin(self.base_url, AJAX_URL)
        data = {'type': 'getDizi'}
        # XHR headers plus the /arsiv referer mimic the site's own AJAX request
        headers = {'Referer': scraper_utils.urljoin(self.base_url, '/arsiv')}
        headers.update(XHR)
        html = self._http_get(url, data=data, headers=headers, cache_limit=48)
        norm_title = scraper_utils.normalize_title(title)
        match_year = ''
        js_data = scraper_utils.parse_json(html, url)
        for item in js_data.get('data', []):
            # 'adi' presumably carries the series name -- TODO confirm against the endpoint
            match_title = item.get('adi', '')
            if 'url' in item and norm_title in scraper_utils.normalize_title(match_title):
                result = {'url': scraper_utils.pathify_url(item['url']), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)

        return results
    def search(self, video_type, title, year, season=''):
        """Search /search/<title>.html and filter hits by video type, season
        number, title similarity, and (for movies) release year.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/%s.html')
        search_url = search_url % (urllib.quote_plus(title))
        html = self._http_get(search_url, cache_limit=1)
        fragment = dom_parser2.parse_dom(html, 'div', {'class': 'movie'})
        if not fragment: return results

        norm_title = scraper_utils.normalize_title(title)
        for _attrs, item in dom_parser2.parse_dom(fragment[0].content, 'li'):
            match_url = dom_parser2.parse_dom(item, 'a', req='href')
            match_title = dom_parser2.parse_dom(item, 'span',
                                                {'class': 'text'})
            match_year = dom_parser2.parse_dom(item, 'span', {'class': 'year'})
            if not match_url or not match_title: continue

            match_url = match_url[0].attrs['href']
            match_title = re.sub('</?strong>', '', match_title[0].content)
            # a trailing "Season N" in the title marks a TV-season listing
            is_season = re.search('Season\s+(\d+)$', match_title, re.I)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (
                    is_season and video_type == VIDEO_TYPES.SEASON):
                if video_type == VIDEO_TYPES.MOVIE:
                    if match_year:
                        match_year = match_year[0].content
                    else:
                        match_year = ''
                else:
                    # skip listings for a different season than requested
                    if season and int(is_season.group(1)) != int(season):
                        continue
                    match_year = ''

                # accept containment in either direction to tolerate extra words
                match_norm_title = scraper_utils.normalize_title(match_title)
                title_match = (norm_title
                               in match_norm_title) or (match_norm_title
                                                        in norm_title)
                if title_match and (not year or not match_year
                                    or year == match_year):
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results
示例#3
0
 def __list(self, title):
     """List series from the site's character-map index page for the first
     character of *title*; multi-episode "pack" entries are skipped.

     :return: list of {'url', 'title', 'year', 'quality', 'season', 'q_str'}
              dicts ('year' is always '' -- the index provides none)
     """
     results = []
     search_url = scraper_utils.urljoin(self.base_url, 'index.php')
     params = {'do': 'charmap', 'name': 'series-list', 'args': '/' + title[0]}
     html = self._http_get(search_url, params=params, require_debrid=True, cache_limit=48)
     
     fragment = dom_parser2.parse_dom(html, 'div', {'class': 'downpara-list'})
     if not fragment: return results
     
     for match in dom_parser2.parse_dom(fragment[0].content, 'a', req='href'):
         match_url = match.attrs['href']
         match_title_extra = match.content
         # __get_title_parts presumably splits the link label into
         # title/season/quality/pack-flag -- TODO confirm against the helper
         match_title, match_season, q_str, is_pack = self.__get_title_parts(match_title_extra)
         if is_pack: continue
         quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)
         result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': '', 'quality': quality,
                   'season': match_season, 'q_str': q_str}
         results.append(result)
     return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site's ajax_search endpoint for series.

        Fix: removed dead leftover code that built an unused Google search
        URL (self.search_link/self.goog) and issued a wasted client.request()
        HTTP call whose result was immediately overwritten by the real
        ajax_search request below.

        :param video_type: accepted for interface compatibility; unused
        :param title: query string sent as the 'q' parameter
        :param year: compared against match_year, which is always '' here,
                     so the year clause never rejects anything
        :param season: unused
        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/ajax_search')
        html = self._http_get(search_url, params={'q': title}, headers=XHR, cache_limit=1)
        js_result = scraper_utils.parse_json(html, search_url)
        match_year = ''  # the endpoint supplies no year information
        for series in js_result.get('series', []):
            match_url = series.get('seo')
            match_title = series.get('label')
            if match_url and match_title and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url('/' + match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)

        return results
示例#5
0
 def search(self, video_type, title, year, season=''):
     """Scrape the SEARCH_BASE_URL/search/<title> result list.

     NOTE(review): the inner loop rebinds the *title* parameter; harmless
     here because the query title is not needed after the loop starts.

     :return: list of {'url', 'title', 'year'} result dicts
     """
     search_url = scraper_utils.urljoin(SEARCH_BASE_URL, '/search/')
     search_url += urllib.quote_plus(title)
     html = self._http_get(search_url, cache_limit=0)  # cache_limit=0: always fetch fresh
     results = []
     match = re.search('ul class="list-film"(.*?)</ul>', html, re.DOTALL)
     if match:
         result_fragment = match.group(1)
         # capture href, title, and the 4-digit year from 'Watch <title> (YYYY)'
         pattern = 'class="name">\s*<a\s+href="([^"]+)"\s+title="Watch\s+(.*?)\s+\((\d{4})\)'
         for match in re.finditer(pattern, result_fragment, re.DOTALL):
             url, title, match_year = match.groups('')
             if not year or not match_year or year == match_year:
                 result = {
                     'url': scraper_utils.pathify_url(url),
                     'title': scraper_utils.cleanse_title(title),
                     'year': match_year
                 }
                 results.append(result)
     return results
示例#6
0
    def search(self, video_type, title, year, season=''):
        """Probe for a direct page by slugifying the title (plus optional
        year) and requesting it; any non-empty response counts as a hit.

        :return: list with at most one {'title', 'year', 'url'} dict
        """
        results = []
        # slugify: drop apostrophes, collapse non-alphanumerics to single
        # spaces, then hyphenate
        slug = title.replace("'", '')
        slug = re.sub(r'[^a-zA-Z0-9\s]+', ' ', slug).lower().strip()
        slug = re.sub('\s+', ' ', slug).replace(' ', '-')
        if year:
            slug += '-%s' % (year)

        page_url = urlparse.urljoin(self.base_url, slug)
        if self._http_get(page_url, cache_limit=1):
            results.append({
                'title': scraper_utils.cleanse_title(title),
                'year': year,
                'url': scraper_utils.pathify_url(page_url)
            })

        return results
示例#7
0
    def __search(self, video_type, title, year, season=''):
        """Search via a JSON search endpoint (SEARCH_URL) and keep only
        /watch/ links that match the requested type, season, and year.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        search_url = (SEARCH_URL) % (urllib.quote_plus(title))
        html = self._http_get(search_url, cache_limit=1)
        js_data = scraper_utils.parse_json(html)
        norm_title = scraper_utils.normalize_title(title)
        for item in js_data.get('results', []):
            if '/watch/' not in item['url'].lower(): continue
            # "Season N" anywhere in the raw title marks a TV-season result
            is_season = re.search('Season\s+(\d+)', item['titleNoFormatting'],
                                  re.IGNORECASE)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (
                    is_season and video_type == VIDEO_TYPES.SEASON):
                match_title_year = item['titleNoFormatting']
                match_title_year = re.sub('^Watch\s+', '', match_title_year)
                match_url = item['url']
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE:
                    # movies: split "<title> (YYYY)" when a year is present
                    match = re.search('(.*?)(?:\s+\(?(\d{4})\)?)',
                                      match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                else:
                    # seasons: require the requested season number, then
                    # strip any trailing "(YYYY)" from the title
                    if season and int(is_season.group(1)) != int(season):
                        continue
                    match = re.search('(.*?)\s+\(\d{4}\)', match_title_year)
                    if match:
                        match_title = match.group(1)
                    else:
                        match_title = match_title_year

                if norm_title in scraper_utils.normalize_title(
                        match_title) and (not year or not match_year
                                          or year == match_year):
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results
示例#8
0
    def search(self, video_type, title, year, season=''):
        """Search /search/<title>.html ('cfv' list) and filter by type,
        season, title similarity, and year.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(
            self.base_url, '/search/%s.html' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=1)
        fragment = dom_parser2.parse_dom(html, 'ul', {'class': 'cfv'})
        if not fragment: return results

        norm_title = scraper_utils.normalize_title(title)
        for _attrs, item in dom_parser2.parse_dom(fragment[0].content, 'li'):
            # a 'status' div presumably appears only on episodic (TV) items --
            # TODO confirm against the site markup
            is_season = dom_parser2.parse_dom(item, 'div', {'class': 'status'})
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (
                    is_season and video_type == VIDEO_TYPES.SEASON):
                match = dom_parser2.parse_dom(item, 'a', req=['href', 'title'])
                if not match: continue

                match_title = match[0].attrs['title']
                match_url = match[0].attrs['href']
                match_year = ''
                if video_type == VIDEO_TYPES.SEASON:
                    # require the requested season number at the title's end
                    if season and not re.search('Season\s+%s$' %
                                                (season), match_title, re.I):
                        continue
                else:
                    # movies: pull the year out of the URL slug, e.g. '-2016-'
                    match = re.search('-(\d{4})[-.]', match_url)
                    if match:
                        match_year = match.group(1)

                # accept containment in either direction
                match_norm_title = scraper_utils.normalize_title(match_title)
                title_match = (norm_title
                               in match_norm_title) or (match_norm_title
                                                        in norm_title)
                if title_match and (not year or not match_year
                                    or year == match_year):
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results
示例#9
0
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search /search/<title> and parse each result's 'inner' block.

        The year is taken from the "Title (YYYY)" text when present,
        otherwise from a separate <span class="year"> element.

        :return: list of {'title', 'url', 'year'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        for _attrs, fragment in dom_parser2.parse_dom(html, 'div',
                                                      {'class': 'inner'}):
            name = dom_parser2.parse_dom(fragment, 'div', {'class': 'name'})
            if not name: continue

            match = dom_parser2.parse_dom(name[0].content, 'a', req='href')
            if not match: continue

            match_url, match_title_year = match[0].attrs['href'], match[
                0].content
            # the search page mixes movies and shows; skip series links when
            # looking for a movie
            if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE:
                continue

            # strip markup and boilerplate, un-escape the curly apostrophe
            match_title_year = re.sub('</?[^>]*>', '', match_title_year)
            match_title_year = re.sub('[Ww]atch\s+[Mm]ovie\s*', '',
                                      match_title_year)
            match_title_year = match_title_year.replace('&#8217;', "'")
            match_title, match_year = scraper_utils.extra_year(
                match_title_year)
            if not match_year:
                # fall back to the year span next to the title
                year_span = dom_parser2.parse_dom(fragment, 'span',
                                                  {'class': 'year'})
                if year_span:
                    year_text = dom_parser2.parse_dom(year_span[0].content,
                                                      'a')
                    if year_text:
                        match_year = year_text[0].content.strip()

            if not year or not match_year or year == match_year:
                result = {
                    'title': scraper_utils.cleanse_title(match_title),
                    'url': scraper_utils.pathify_url(match_url),
                    'year': match_year
                }
                results.append(result)

        return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search /search-movies/<title>.html; titles are embedded in each
        item's onmouseover tooltip markup.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(
            self.base_url,
            '/search-movies/%s.html' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'li',
                                                  {'class': 'item'}):
            match_url = dom_parser2.parse_dom(item, 'a', req='href')
            # the tooltip markup holds the title/year details
            match_title_year = re.search('onmouseover="([^"]+)', item)
            if match_url and match_title_year:
                match_url = match_url[0].attrs['href']
                match_title_year = match_title_year.group(1)
                # title sits inside <b>(<i>)?...</b> within the tooltip
                match = re.search('<b>(?:<i>)?\s*(.*?)\s*(?:</i>)?</b>',
                                  match_title_year)
                if not match: continue

                match_title, match_year = scraper_utils.extra_year(
                    match.group(1))
                is_season = re.search('season\s+(\d+)', match_title_year, re.I)
                if (is_season and video_type == VIDEO_TYPES.MOVIE) or (
                        not is_season and video_type == VIDEO_TYPES.SEASON):
                    continue

                if video_type == VIDEO_TYPES.MOVIE:
                    if not match_year:
                        # fall back to the "Release: YYYY" line in the tooltip
                        match_year = re.search('>Release:\s*(\d{4})',
                                               match_title_year)
                        match_year = match_year.group(1) if match_year else ''
                else:
                    # require the requested season number
                    if season and int(season) != int(is_season.group(1)):
                        continue

                if not year or not match_year or year == match_year:
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results
示例#11
0
    def __alt_search(self, video_type, title, year, season=''):
        """Alternate search: query /search with "<title> [year] [Season N]"
        as a single key and scrape the result captions.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        params = title.lower()
        if year: params += ' %s' % (year)
        if video_type == VIDEO_TYPES.SEASON and season:
            params += ' Season %s' % (season)
        params = {'key': params}
        search_url = urlparse.urljoin(self.base_url, '/search')
        html = self._http_get(search_url, params=params, cache_limit=1)
        norm_title = scraper_utils.normalize_title(title)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'caption'}):
            match = re.search('href="([^"]+)[^>]+>(.*?)<span[^>]*>', item)
            if match:
                match_url, match_title = match.groups()
                # '-season-N' in the URL marks a TV-season result
                is_season = re.search('-season-\d+', match_url)
                if (video_type == VIDEO_TYPES.MOVIE
                        and not is_season) or (video_type == VIDEO_TYPES.SEASON
                                               and is_season):
                    if video_type == VIDEO_TYPES.SEASON:
                        # require the requested season number (allow leading zeros)
                        if season and not re.search('season-0*%s$' %
                                                    (season), match_url):
                            continue

                    # strip markup and "Full Movie" boilerplate from the title
                    match_title = re.sub('</?[^>]*>', '', match_title)
                    match_title = re.sub('\s+Full\s+Movie', '', match_title)
                    # year comes from the URL slug, e.g. '-2016' at a segment end
                    match = re.search('-(\d{4})(?:$|-)', match_url)
                    if match:
                        match_year = match.group(1)
                    else:
                        match_year = ''

                    if norm_title in scraper_utils.normalize_title(
                            match_title) and (not year or not match_year
                                              or year == match_year):
                        result = {
                            'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)
                        }
                        results.append(result)

        return results
示例#12
0
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search /search.php (param 'dayq') and read results from the
        'topic_content' table cells; titles come from the poster img alt text.

        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        page_url = scraper_utils.urljoin(self.base_url, '/search.php')
        html = self._http_get(page_url, params={'dayq': title}, cache_limit=48)
        # drop HTML comments so commented-out markup can't produce matches
        html = re.sub('<!--.*?-->', '', html)
        norm_title = scraper_utils.normalize_title(title)
        for _attrs, td in dom_parser2.parse_dom(html, 'td', {'class': 'topic_content'}):
            match_url = dom_parser2.parse_dom(td, 'a', req='href')
            match_title_year = dom_parser2.parse_dom(td, 'img', req='alt')
            if not match_url or not match_title_year: continue

            match_url = match_url[0].attrs['href']
            match_title_year = match_title_year[0].attrs['alt']
            # relative links on this page live under /tvseries/
            if not match_url.startswith('/'): match_url = '/tvseries/' + match_url
            match_title, match_year = scraper_utils.extra_year(match_title_year)
            if (norm_title in scraper_utils.normalize_title(match_title)) and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)

        return results
 def search(self, video_type, title, year, season=''):  # @UnusedVariable
     """Search /search.php (param 'q') and read each 'movie_about' block.

     :return: list of {'url', 'title', 'year'} result dicts
     """
     results = []
     search_url = urlparse.urljoin(self.base_url, '/search.php')
     html = self._http_get(search_url, params={'q': title}, cache_limit=4)
     for item in dom_parser.parse_dom(html, 'div',
                                      {'class': 'movie_about'}):
         urls = dom_parser.parse_dom(item, 'a', ret='href')
         anchors = dom_parser.parse_dom(item, 'a')
         if not urls or not anchors: continue

         match_title, match_year = scraper_utils.extra_year(anchors[0])
         # reject only when both years are known and disagree
         if year and match_year and year != match_year: continue

         results.append({
             'url': scraper_utils.pathify_url(urls[0]),
             'title': scraper_utils.cleanse_title(match_title),
             'year': match_year
         })
     return results
示例#14
0
    def __tv_search(self, title, year):
        """Match *title* against the site's full TV show list page.

        NOTE(review): match_year is always '' here, so the year clause in
        the filter never rejects anything -- *year* is effectively unused.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.tv_base_url, '/showlist/')
        html = self._http_get(search_url, cache_limit=48)
        match_year = ''
        norm_title = scraper_utils.normalize_title(title)
        for attrs, match_title in dom_parser2.parse_dom(
                html, 'a', {'class': 'thread_link'}, req='href'):
            match_url = attrs['href']
            # convert "Title, The" style listings back to "The Title"
            if match_title.upper().endswith(', THE'):
                match_title = 'The ' + match_title[:-5]

            if norm_title in scraper_utils.normalize_title(match_title) and (
                    not year or not match_year or year == match_year):
                result = {
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(match_url)
                }
                results.append(result)
        return results
 def _get_episode_url(self, show_url, video):
     """Resolve the page URL for a specific episode.

     First walks show page -> "<N>. Sezon" season link -> episode link
     ('-N-sezon-M-bolum' slug); if that fails, falls back to scanning the
     site's front page for an episode link built from the show's URL slug.
     Returns the pathified episode URL, or None implicitly when not found.
     """
     show_url = scraper_utils.urljoin(self.base_url, show_url)
     html = self._http_get(show_url, cache_limit=8)
     pattern = '''href=['"]([^'"]+)[^>]+>\s*%s\.\s*Sezon<''' % (video.season)
     match = re.search(pattern, html)
     if match:
         episode_pattern = '''href=['"]([^'"]+-%s-sezon-%s-bolum[^'"]*)''' % (video.season, video.episode)
         season_url = scraper_utils.urljoin(self.base_url, match.group(1))
         html = self._http_get(season_url, cache_limit=2)
         ep_url = self._default_get_episode_url(html, video, episode_pattern)
         if ep_url: return ep_url
     
     # front page fallback
     html = self._http_get(self.base_url, cache_limit=2)
     # pick the last non-empty path segment of the show URL as its slug
     for slug in reversed(show_url.split('/')):
         if slug: break
         
     ep_url_frag = 'href="([^"]+/{slug}-{season}-sezon-{episode}-bolum[^"]*)'.format(slug=slug, season=video.season, episode=video.episode)
     match = re.search(ep_url_frag, html)
     if match:
         return scraper_utils.pathify_url(match.group(1))
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Query the site's /suggest.php autocomplete endpoint for TV shows.

        The year, when present, is embedded in the link URL as '(YYYY)'.

        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/suggest.php')
        headers = {'Referer': self.base_url}
        headers.update(XHR)
        params = {'ajax': 1, 's': title, 'type': 'TVShows'}
        html = self._http_get(search_url, params=params, cache_limit=8)
        for attrs, label in dom_parser2.parse_dom(html, 'a', req='href'):
            match_url = attrs['href']
            clean_title = re.sub('</?[^>]*>', '', label)
            year_match = re.search('\((\d{4})\)$', match_url)
            match_year = year_match.group(1) if year_match else ''
            # reject only when both years are known and disagree
            if year and match_year and year != match_year: continue

            results.append({
                'url': scraper_utils.pathify_url(match_url),
                'title': scraper_utils.cleanse_title(clean_title),
                'year': match_year
            })

        return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search /search with a type filter ('movies' or 'series') and parse
        each 'one_movie-item' block.

        NOTE(review): when a 'movie-series' div exists but its content is not
        'TV SERIE', media_type stays a parse-result list and can never equal
        video_type, so such items are always skipped -- confirm intended.

        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search')
        search_type = 'movies' if video_type == VIDEO_TYPES.MOVIE else 'series'
        html = self._http_get(search_url,
                              params={
                                  'query': title.lower(),
                                  'type': search_type
                              },
                              cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'one_movie-item'}):
            match_url = dom_parser2.parse_dom(item, 'a', req='href')
            match_title = dom_parser2.parse_dom(item, 'img', req='alt')
            media_type = dom_parser2.parse_dom(item, 'div',
                                               {'class': 'movie-series'})
            if not media_type:
                # no marker div: treat the item as a movie
                media_type = VIDEO_TYPES.MOVIE
            elif media_type[0].content == 'TV SERIE':
                media_type = VIDEO_TYPES.TVSHOW

            if match_url and match_title and video_type == media_type:
                match_url = match_url[0].attrs['href']
                match_title = match_title[0].attrs['alt']

                # year comes from the URL slug, e.g. '-2016-'
                match_year = re.search('-(\d{4})-', match_url)
                if match_year:
                    match_year = match_year.group(1)
                else:
                    match_year = ''

                if not year or not match_year or year == match_year:
                    result = {
                        'url': scraper_utils.pathify_url(match_url),
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year
                    }
                    results.append(result)

        return results
    def search(self, video_type, title, year, season=''):
        """Search Rainierland: site search for movies, front-page category
        list for shows.

        Bug fix: both branches produce (url, title) pairs of plain strings --
        dom_parser.parse_dom called with an attribute argument returns
        attribute strings, and re.findall returns tuples of strings -- but
        the loop previously did item[0].attrs['href'], which raises
        AttributeError on str. The pair is now unpacked directly.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        if video_type == VIDEO_TYPES.MOVIE:
            search_url = urlparse.urljoin(self.base_url, '/?s=')
            search_url += urllib.quote_plus('%s' % (title))
            html = self._http_get(search_url, cache_limit=1)
            links = dom_parser.parse_dom(html, 'a', {'class': 'clip-link'},
                                         'href')
            titles = dom_parser.parse_dom(html, 'a', {'class': 'clip-link'},
                                          'title')
            matches = zip(links, titles)
        else:
            html = self._http_get(self.base_url, cache_limit=8)
            matches = re.findall(
                '<li\s+class="cat-item[^>]+>\s*<a\s+href="([^"]+)[^>]+>([^<]+)',
                html)
        norm_title = scraper_utils.normalize_title(title)
        for url, match_title_year in matches:
            # split "<title> (YYYY)" when a year is present
            match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''

            if norm_title in scraper_utils.normalize_title(match_title) and (
                    not year or not match_year or year == match_year):
                log_utils.log('Rainierland - search - Match Found: ' +
                              str(norm_title))
                result = {
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(url)
                }
                results.append(result)

        return results
示例#19
0
    def search(self, video_type, title, year, season=''):
        """Match *title* against the site's /search listing page.

        Any trailing "(Season N" suffix is stripped before comparing; the
        listing provides no year, so 'year' is always ''.

        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search')
        html = self._http_get(search_url, cache_limit=48)
        norm_title = scraper_utils.normalize_title(title)
        for item in dom_parser.parse_dom(html, 'li'):
            link = re.search('''href=["']([^"']+)[^>]+>([^<]+)''', item)
            if not link: continue

            url, match_title = link.groups()
            season_match = re.search('(.*?)\s*\(Season\s+\d+', match_title)
            if season_match:
                match_title = season_match.group(1)

            if norm_title in scraper_utils.normalize_title(match_title):
                results.append({
                    'url': scraper_utils.pathify_url(url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': ''
                })

        return results
示例#20
0
    def __search(self, video_type, title, year):
        """Match *title* against the advanced-search index ('tagindex' div).

        The index provides no year, so 'year' is always '' and the *year*
        parameter is unused.

        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        url = urlparse.urljoin(
            self.base_url, '/advanced-search/menu-id-111.html?view=buscador')
        html = self._http_get(url, cache_limit=48)
        norm_title = scraper_utils.normalize_title(title)
        fragment = dom_parser.parse_dom(html, 'div', {'class': 'tagindex'})
        if not fragment:
            return results

        for link in re.finditer('href="([^"]+)[^>]+>(.*?)</a>', fragment[0]):
            url, match_title = link.groups()
            # strip the trailing item count, e.g. " (12)", then un-escape '&'
            match_title = re.sub('\s+\(\d+\)$', '', match_title)
            match_title = match_title.replace('&amp;', '&')
            if norm_title in scraper_utils.normalize_title(match_title):
                results.append({
                    'url': scraper_utils.pathify_url(url),
                    'title': match_title,
                    'year': ''
                })

        return results
示例#21
0
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search /search (params q=title, s=t) and read each 'title_list'
        span's first anchor for the URL and "Title (YYYY)" text.

        :return: list of {'url', 'title', 'year'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search')
        params = {'q': title, 's': 't'}
        html = self._http_get(search_url, params=params, cache_limit=1)
        for _attrs, content in dom_parser2.parse_dom(html, 'span',
                                                     {'class': 'title_list'}):
            anchors = dom_parser2.parse_dom(content, 'a', req=['href', 'title'])
            if not anchors: continue

            link = anchors[0].attrs
            match_title, match_year = scraper_utils.extra_year(link['title'])
            if not year or not match_year or year == match_year:
                results.append({
                    'url': scraper_utils.pathify_url(link['href']),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year
                })

        return results
示例#22
0
    def search(self, video_type, title, year, season=''):
        """Search /search/<title>.html ('ml-item' grid) and filter by type,
        season, and year.

        :return: list of {'title', 'year', 'url'} result dicts
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/')
        # the site chokes on punctuation; strip to alphanumerics and spaces
        title = re.sub('[^A-Za-z0-9 ]', '', title)
        search_url += '%s.html' % (urllib.quote_plus(title))
        html = self._http_get(search_url, cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'ml-item'}):
            match_title = dom_parser2.parse_dom(item, 'span',
                                                {'class': 'mli-info'})
            match_url = dom_parser2.parse_dom(item, 'a', req='href')
            match_year = re.search('class="jt-info">(\d{4})<', item)
            # an episode-count badge marks episodic (TV) items
            is_episodes = dom_parser2.parse_dom(item, 'span',
                                                {'class': 'mli-eps'})

            if (video_type == VIDEO_TYPES.MOVIE
                    and not is_episodes) or (video_type == VIDEO_TYPES.SEASON
                                             and is_episodes):
                if not match_title or not match_url: continue

                match_url = match_url[0].attrs['href']
                match_title = match_title[0].content
                # drop the <h2> wrapper and any trailing year from the title
                match_title = re.sub('</?h2>', '', match_title)
                match_title = re.sub('\s+\d{4}$', '', match_title)
                if video_type == VIDEO_TYPES.SEASON:
                    # require the requested season number (allow leading zeros)
                    if season and not re.search('Season\s+0*%s$' %
                                                (season), match_title):
                        continue

                match_year = match_year.group(1) if match_year else ''
                if not year or not match_year or year == match_year:
                    result = {
                        'title': scraper_utils.cleanse_title(match_title),
                        'year': match_year,
                        'url': scraper_utils.pathify_url(match_url)
                    }
                    results.append(result)

        return results
 def search(self, video_type, title, year, season=''):  # @UnusedVariable
     """Search a blog-style index via its AJAX search endpoint and run the
     posts through the shared blog-result processor.

     NOTE(review): the pattern '(?P<post_title>.+)(?P<url>.*?)' leaves the
     'url' group empty by construction; the real URL is filled in from
     post['post_name'] below -- confirm this matches _blog_proc_results'
     expectations.

     :return: list of result dicts with 'url' pathified from post_name
     """
     results = []
     referer = urlparse.urljoin(SEARCH_BASE_URL, '/search/')
     headers = {'Referer': referer + urllib.quote_plus(title)}
     headers.update(XHR)
     search_url = urlparse.urljoin(SEARCH_BASE_URL, '/lib/search526049.php')
     params = {'phrase': title, 'pindex': 1}
     html = self._http_get(search_url,
                           params=params,
                           headers=headers,
                           require_debrid=True,
                           cache_limit=1)
     for post in dom_parser.parse_dom(html, 'div', {'class': 'entry post'}):
         # keep only posts in the category for the requested video type,
         # and drop posts older than the configured cutoff
         if not CATEGORIES[video_type] in post: continue
         if self.__too_old(post): continue
         result = self._blog_proc_results(post.get('post_title', ''),
                                          '(?P<post_title>.+)(?P<url>.*?)',
                                          '', video_type, title, year)
         if result:
             result[0]['url'] = scraper_utils.pathify_url(post['post_name'])
             results.append(result[0])
     return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Scrape the site's search results page and collect title/year hits.

        Each 'box-bg' entry inside the 'single-post' container contributes one
        result when its year is unknown or matches the requested |year|.
        """
        results = []
        params = {'a': title, 'submit': 'Search'}
        html = self._http_get(self.base_url, params=params, cache_limit=8)
        fragment = dom_parser.parse_dom(html, 'div', {'id': 'single-post'})
        if not fragment:
            return results

        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'box-bg'}):
            match = re.search('href="([^"]+)[^>]+>([^<]+)', item)
            if not match:
                continue
            match_url, match_title_year = match.groups()
            match_title, match_year = scraper_utils.extra_year(match_title_year)
            # Keep the hit unless both years are known and disagree.
            if not year or not match_year or year == match_year:
                results.append({
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(match_url)})

        return results
示例#25
0
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the Direct API endpoint and return one result per release.

        Apostrophes are stripped from |title| because the endpoint rejects
        them. Returns an empty list (after logging) on an API error payload.
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search?query=')
        search_url += title.replace("'", "")
        html = self._http_get(search_url, cache_limit=.25)
        js_result = scraper_utils.parse_json(html, search_url)
        if 'error' in js_result:
            logger.log(
                'Direct API error: "%s" @ %s' %
                (js_result['error'], search_url), log_utils.LOGWARNING)
            return results

        for match in js_result:
            # BUG FIX: |url| was referenced below but never assigned (its
            # defining line had been commented out), so the first hit raised
            # NameError. Restore the quality-qualified URL.
            url = search_url + '&quality=%s' % match['quality']
            result = {
                'url': scraper_utils.pathify_url(url),
                'title': scraper_utils.cleanse_title(match['release']),
                'quality': match['quality'],
                'year': ''
            }
            results.append(result)
        return results
示例#26
0
    def search(self, video_type, title, year, season=''):
        """Search the /movie/search/ listing for movie or season matches.

        Entries with an episode-count badge are treated as seasons; the rest
        as movies. The final URL points at the item's 'watching.html' page.
        """
        search_url = urlparse.urljoin(self.base_url, '/movie/search/')
        title = re.sub('[^A-Za-z0-9 ]', '', title)
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
            match_title = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
            match_url = re.search('href="([^"]+)', item, re.DOTALL)
            match_year = re.search('class="jt-info">(\d{4})<', item)
            is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})

            # The episode badge decides movie vs. season; skip mismatches.
            wanted = ((video_type == VIDEO_TYPES.MOVIE and not is_episodes)
                      or (video_type == VIDEO_TYPES.SEASON and is_episodes))
            if not wanted or not match_title or not match_url:
                continue

            match_title = re.sub('</?h2>', '', match_title[0])
            match_title = re.sub('\s+\d{4}$', '', match_title)
            if (video_type == VIDEO_TYPES.SEASON and season
                    and not re.search('Season\s+%s$' % (season), match_title)):
                continue

            url = urlparse.urljoin(match_url.group(1), 'watching.html')
            match_year = match_year.group(1) if match_year else ''
            if not year or not match_year or year == match_year:
                results.append({
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(url)})

        return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search the site's Algolia index and filter hits by media type.

        The Algolia query is POSTed as a JSON body whose 'params' value is a
        URL-encoded query string; hits are kept when their permalink lives
        under the path segment for the requested |video_type|.
        """
        results = []
        search_url = scraper_utils.urljoin(SEARCH_BASE,
                                           '/1/indexes/al_titles_index/query')
        params = {
            'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.22.1',
            'x-algolia-application-id': 'XXDAZCOUL3',
            'x-algolia-api-key': 'c5c1279f5ad09819ecf2af9d6b5ee06a'
        }
        query = urllib.urlencode({'query': title, 'facets': '*',
                                  'hitsPerPage': 30})
        headers = {'Origin': self.base_url}
        html = self._http_get(search_url, params=params,
                              data=json.dumps({'params': query}),
                              headers=headers, cache_limit=8)
        js_data = scraper_utils.parse_json(html, search_url)
        media_type = '/movies/' if video_type == VIDEO_TYPES.MOVIE else '/tv/'
        for item in js_data.get('hits', []):
            if ('permalink' not in item or 'title' not in item
                    or media_type not in item['permalink']):
                continue
            match_year = str(item.get('yr', ''))
            if not year or not match_year or year == match_year:
                results.append({
                    'title': scraper_utils.cleanse_title(item['title']),
                    'url': scraper_utils.pathify_url(item['permalink']),
                    'year': match_year})

        return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search via /search.php, using movie- or series-specific params.

        Results come from anchors inside the 'cbp-rfgrid' list; title and
        year are parsed out of each anchor's title attribute.
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search.php')
        if video_type == VIDEO_TYPES.MOVIE:
            params = {'all': 'all', 'searchin': 'mov', 'subtitles': '',
                      'imdbfrom': '', 'yearrange': '', 'keywords': title}
        else:
            params = {'all': 'all', 'vselect': 'ser', 'keywords': title}
        html = self._http_get(search_url, params=params, cache_limit=8)
        fragment = dom_parser2.parse_dom(html, 'ul', {'class': 'cbp-rfgrid'})
        if not fragment:
            return results

        for item in dom_parser2.parse_dom(fragment, 'li'):
            anchors = dom_parser2.parse_dom(item, 'a', req=['title', 'href'])
            if not anchors:
                continue
            match_url = anchors[0].attrs['href']
            match_title, match_year = scraper_utils.extra_year(
                anchors[0].attrs['title'])
            if not year or not match_year or year == match_year:
                results.append({
                    'url': scraper_utils.pathify_url(match_url),
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year})

        return results
示例#29
0
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Scan the full /tv-lists/ page for shows whose title contains |title|.

        The site has no search endpoint for shows, so every list entry is
        checked with a normalized substring match; duplicate URLs are dropped.
        """
        show_list_url = scraper_utils.urljoin(self.base_url, '/tv-lists/')
        html = self._http_get(show_list_url, cache_limit=8)
        results = []
        seen_urls = set()
        norm_title = scraper_utils.normalize_title(title)
        for _attrs, item in dom_parser2.parse_dom(html, 'li'):
            links = dom_parser2.parse_dom(item, 'a', req='href')
            if not links:
                continue
            match_url = scraper_utils.pathify_url(links[0].attrs['href'])
            if match_url in seen_urls:
                continue
            seen_urls.add(match_url)
            # Strip <strong> wrappers the site uses to highlight entries.
            match_title = re.sub('</?strong[^>]*>', '', links[0].content)
            if norm_title in scraper_utils.normalize_title(match_title):
                results.append({
                    'url': match_url,
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': ''})

        return results
    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        """Search via the slug-based /search/<slug>.html results page.

        Title and year come from each poster image's alt text; the link's
        href supplies the result URL.
        """
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/%s.html')
        search_url = search_url % (scraper_utils.to_slug(title))
        html = self._http_get(search_url, cache_limit=8)
        for _attrs, item in dom_parser2.parse_dom(html, 'div',
                                                  {'class': 'slideposter'}):
            links = dom_parser2.parse_dom(item, 'a', req='href')
            alts = dom_parser2.parse_dom(item, 'img', req='alt')
            if not (links and alts):
                continue
            match_title, match_year = scraper_utils.extra_year(
                alts[0].attrs['alt'])
            if not year or not match_year or year == match_year:
                results.append({
                    'title': scraper_utils.cleanse_title(match_title),
                    'year': match_year,
                    'url': scraper_utils.pathify_url(links[0].attrs['href'])})

        return results