Example No. 1
    def horrible_entries(requests, page_url):
        entries = []

        try:
            soup = get_soup(requests.get(page_url).content)
        except RequestException as e:
            log.error('HorribleSubs request failed: %s', e)
            return entries

        for li_label in soup.findAll('li'):
            title = '[HorribleSubs] {0}{1}'.format(
                str(li_label.find('span').next_sibling), str(li_label.find('strong').text)
            )
            log.debug('Found title `%s`', title)
            url = li_label.find('a')['href']
            episode = re.sub(r'.*#', '', url)
            # Get show ID
            try:
                soup = get_soup(requests.get('https://horriblesubs.info/{0}'.format(url)).content)
            except RequestException as e:
                log.error('HorribleSubs request failed: %s', e)
                return entries
            show_id = re.sub(r'[^0-9]', '', soup(text=re.compile('hs_showid'))[0])
            entries = HorribleSubs.horrible_get_downloads(
                requests,
                title,
                'https://horriblesubs.info/api.php?method=getshows&type=show&mode=filter&showid={0}&value={1}'.format(
                    show_id, episode
                ),
            )
        return entries
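Every example on this page feeds a fetched page into get_soup before walking the markup. As a rough stand-in (an assumption about the helper, not FlexGet's actual implementation), it behaves like a thin BeautifulSoup wrapper:

    # Minimal sketch of a get_soup-style helper; the real flexget.utils.soup
    # version may pick a different parser and handle encoding detection itself.
    from bs4 import BeautifulSoup

    def get_soup(markup, parser='html.parser'):
        return BeautifulSoup(markup, parser)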
Example No. 2
    def parse_html_list(self, task, config, url, params, headers):
        page = self.fetch_page(task, url, params, headers)
        soup = get_soup(page.text)
        try:
            item_text = soup.find('div', class_='lister-total-num-results').string.split()
            total_item_count = int(item_text[0].replace(',', ''))
            log.verbose('imdb list contains %d items', total_item_count)
        except AttributeError:
            total_item_count = 0
        except (ValueError, TypeError) as e:
            # TODO Something is wrong if we get a ValueError, I think
            raise plugin.PluginError(
                'Received invalid movie count: %s ; %s'
                % (soup.find('div', class_='lister-total-num-results').string, e)
            )

        if not total_item_count:
            log.verbose('No movies were found in imdb list: %s', config['list'])
            return

        entries = []
        items_processed = 0
        page_no = 1
        while items_processed < total_item_count:
            # Fetch the next page unless we've just begun
            if items_processed:
                page_no += 1
                params['page'] = page_no
                page = self.fetch_page(task, url, params, headers)
                soup = get_soup(page.text)

            items = soup.find_all('div', class_='lister-item')
            if not items:
                log.debug('no items found on page: %s, aborting.', url)
                break
            log.debug('%d items found on page %d', len(items), page_no)

            for item in items:
                items_processed += 1
                a = item.find('h3', class_='lister-item-header').find('a')
                if not a:
                    log.debug('no title link found for row, skipping')
                    continue

                link = ('http://www.imdb.com' + a.get('href')).rstrip('/')
                entry = Entry()
                entry['title'] = a.text
                try:
                    year = int(item.find('span', class_='lister-item-year').text)
                    entry['title'] += ' (%s)' % year
                    entry['imdb_year'] = year
                except (ValueError, TypeError):
                    pass
                entry['url'] = link
                entry['imdb_id'] = extract_id(link)
                entry['imdb_name'] = entry['title']
                entries.append(entry)

        return entries
Example No. 3
    def on_task_input(self, task, config):
        # use rss plugin
        # since we have to do 2 page lookups per trailer, use all_entries False to lighten load
        rss_config = {'url': self.rss_url, 'all_entries': False}
        rss_entries = super(AppleTrailers, self).on_task_input(task, rss_config)

        # Multiple entries can point to the same movie page (trailer 1, clip1, etc.)
        trailers = {}
        for entry in rss_entries:
            url = entry['original_url']
            trailers.setdefault(url, []).append(entry['title'])

        result = []
        if config == '720p':
            url_extension = 'includes/extralarge.html'
        else:
            url_extension = 'includes/large.html'
        for url, titles in trailers.iteritems():
            inc_url = url + url_extension
            try:
                page = task.requests.get(inc_url)
            except RequestException as err:
                log.warning("RequestException when opening playlist page: %s" % err)
                continue

            soup = get_soup(page.text)
            for title in titles:
                trailer = soup.find(text=title.split(' - ')[1])
                if not trailer:
                    log.debug('did not find trailer link')
                    continue
                trailers_link = trailer.find_parent('a')
                if not trailers_link:
                    log.debug('did not find trailer link')
                    continue
                try:
                    page = task.requests.get(urlparse.urljoin(url, trailers_link['href']))
                except RequestException as e:
                    log.debug('error getting trailers page')
                    continue
                trailer_soup = get_soup(page.text)
                link = trailer_soup.find('a', attrs={'class': 'movieLink'})
                if not link:
                    log.debug('could not find download link')
                    continue
                # Need to add an 'h' in front of the resolution
                entry_url = link['href']
                entry_url = entry_url[:entry_url.find(config + '.mov')] + 'h%s.mov' % config
                entry = Entry(title, entry_url)
                # Populate a couple entry fields for making pretty filenames
                entry['movie_name'], entry['apple_trailers_name'] = title.split(' - ')
                result.append(entry)

        return result
Example No. 4
 def get_fuzer_soup(search_term, categories_list):
     params = {'matchquery': 'any'}
     page = requests.get(
         'https://www.fuzer.me/browse.php?ref_=advanced&query={}&{}'.format(search_term, '&'.join(categories_list)),
         params=params)
     log.debug('Using %s as fuzer search url' % page.url)
     return get_soup(page.content)
Example No. 5
  def search(self, entry, config):
    url = "https://tehconnection.eu/torrents.php?searchstr=%s" \
      % entry.get("imdb_id")

    page = urlopener(url, log)
    soup = get_soup(page)

    results = set()

    for row in soup.find_all("tr", class_="group_torrent"):
      link = row.find(title="Download")
      info = row.find(colspan="1").contents[3].contents[0].strip()
      seeders = int(row.find_all("td")[6].contents[0].strip())
      leechers = int(row.find_all("td")[7].contents[0].strip())

      result = Entry()
      result["title"] = entry.get("title") + " / " + info
      result["imdb_id"] = entry.get("imdb_id")
      result["url"] = "https://tehconnection.eu" + link.get("href")
      result["torrent_seeds"] = seeders
      result["torrent_leeches"] = leechers
      result["search_sort"] = torrent_availability(result['torrent_seeds'],
                                                   result['torrent_leeches'])

      results.add(result)

    return results
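Several of the search examples rank results with torrent_availability(seeds, leeches). A minimal sketch of such a scoring function, assuming it simply weights seeders over leechers (FlexGet's exact formula may differ):

    def torrent_availability(seeds, leeches):
        # Seeders count double when ranking otherwise equal results.
        return seeds * 2 + leeches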
Example No. 6
    def search(self, task, entry, config=None):
        """
            Search for entries on SceneAccess
        """

        if not session.cookies:
            log.debug('Logging in to %s...' % URL)
            params = {'username': config['username'],
                      'password': config['password'],
                      'submit': 'come on in'}
            session.post(URL + 'login', data=params)

        multip = config.get('gravity_multiplier', 1)

        # Prepare queries...
        BASE_URLS = list()
        entries = set()
        for category in self.processCategories(config):
            BASE_URLS.append(URL + '%(url_path)s?method=2%(category_url_string)s' % category)

        # Search...
        for search_string in entry.get('search_strings', [entry['title']]):
            search_string_normalized = normalize_unicode(clean_title(search_string))
            search_string_url_fragment = '&search=' + quote(search_string_normalized.encode('utf8'))

            for url in BASE_URLS:
                url += search_string_url_fragment
                log.debug('Search URL for `%s`: %s' % (search_string, url))

                page = session.get(url).content
                soup = get_soup(page)

                for result in soup.findAll('tr', attrs={'class': 'tt_row'}):
                    entry = Entry()
                    entry['title'] = result.find('a', href=re.compile(r'details\?id=\d+'))['title']
                    entry['url'] = URL + result.find('a', href=re.compile(r'.torrent$'))['href']

                    entry['torrent_seeds'] = int(result.find('td', attrs={'class': 'ttr_seeders'}).text)
                    entry['torrent_leeches'] = int(result.find('td', attrs={'class': 'ttr_leechers'}).text)
                    entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches']) * multip

                    size = result.find('td', attrs={'class': 'ttr_size'}).next
                    size = re.search(r'(\d+(?:[.,]\d+)*)\s?([KMG]B)', size)

                    if size:
                        if size.group(2) == 'GB':
                            entry['content_size'] = int(float(size.group(1)) * 1000 ** 3 / 1024 ** 2)
                        elif size.group(2) == 'MB':
                            entry['content_size'] = int(float(size.group(1)) * 1000 ** 2 / 1024 ** 2)
                        elif size.group(2) == 'KB':
                            entry['content_size'] = int(float(size.group(1)) * 1000 / 1024 ** 2)
                        else:
                            entry['content_size'] = int(float(size.group(1)) / 1024 ** 2)

                    entries.add(entry)

        return entries
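The size branch above repeats the same decimal-unit-to-MiB arithmetic for every suffix. The same conversion as a small standalone helper (an illustrative sketch, not part of the plugin):

    def to_mebibytes(value, unit):
        # Convert a decimal KB/MB/GB figure to mebibytes, mirroring the branches above.
        factors = {'KB': 1000, 'MB': 1000 ** 2, 'GB': 1000 ** 3}
        return int(float(value) * factors.get(unit, 1) / 1024 ** 2)

    # to_mebibytes('1.5', 'GB') == 1430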
Example No. 7
    def url_rewrite(self, task, entry):
        url = entry['url']
        page = None
        for (scheme, netloc) in EZTV_MIRRORS:
            try:
                _, _, path, params, query, fragment = urlparse(url)
                url = urlunparse((scheme, netloc, path, params, query, fragment))
                page = task.requests.get(url).content
            except RequestException as e:
                log.debug('Eztv mirror `%s` seems to be down', url)
                continue
            break

        if not page:
            raise UrlRewritingError('No mirrors found for url %s' % entry['url'])

        log.debug('Eztv mirror `%s` chosen', url)
        try:
            soup = get_soup(page)
            mirrors = soup.find_all('a', attrs={'class': re.compile(r'download_\d')})
        except Exception as e:
            raise UrlRewritingError(e)

        log.debug('%d torrent mirrors found', len(mirrors))

        if not mirrors:
            raise UrlRewritingError('Unable to locate download link from url %s' % url)

        entry['urls'] = [m.get('href') for m in mirrors]
        entry['url'] = mirrors[0].get('href')
Example No. 8
    def search(self, task, entry, config=None):

        txheaders = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '300',
            'Connection': 'keep-alive',
        }
        nzbs = set()
        for search_string in entry.get('search_strings', [entry['title']]):
            query = search_string
            url = u'http://newzleech.com/?%s' % str(urllib.urlencode({'q': query.encode('latin1'),
                                                                      'm': 'search', 'group': '', 'min': 'min',
                                                                      'max': 'max', 'age': '', 'minage': '',
                                                                      'adv': ''}))
            # log.debug('Search url: %s' % url)

            req = urllib2.Request(url, headers=txheaders)
            page = urlopener(req, log)
            soup = get_soup(page)

            for item in soup.find_all('table', attrs={'class': 'contentt'}):
                subject_tag = item.find('td', attrs={'class': 'subject'}).next
                subject = ''.join(subject_tag.find_all(text=True))
                complete = item.find('td', attrs={'class': 'complete'}).contents[0]
                size = item.find('td', attrs={'class': 'size'}).contents[0]
                nzb_url = 'http://newzleech.com/' + item.find('td', attrs={'class': 'get'}).next.get('href')

                # generate regexp from entry title and see if it matches subject
                regexp = query
                wildcardize = [' ', '-']
                for wild in wildcardize:
                    regexp = regexp.replace(wild, '.')
                regexp = '.*' + regexp + '.*'
                # log.debug('Title regexp: %s' % regexp)

                if re.match(regexp, subject):
                    log.debug('%s matches to regexp' % subject)
                    if complete != u'100':
                        log.debug('Match is incomplete %s from newzleech, skipping ..' % query)
                        continue
                    log.info('Found \'%s\'' % query)

                    try:
                        size_num = float(size[:-3])
                    except (ValueError, TypeError):
                        log.error('Failed to parse_size %s' % size)
                        size_num = 0
                    # convert into megabytes
                    if 'GB' in size:
                        size_num *= 1024
                    if 'KB' in size:
                        size_num /= 1024

                    # choose largest file
                    nzbs.add(Entry(title=subject, url=nzb_url, content_size=size_num, search_sort=size_num))

        return nzbs
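A quick standalone check of the wildcard matching built above (title and subject are made up for illustration):

    import re

    query = 'Ubuntu 9.10'
    regexp = '.*' + query.replace(' ', '.').replace('-', '.') + '.*'
    assert re.match(regexp, 'Ubuntu-9.10 Desktop amd64 [newzleech]')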
Example No. 9
    def on_task_input(self, task, config):
        session = requests.Session()
        data = {'username': config['username'], 'password': config['password'], 'sub_login': '******'}
        try:
            r = session.post('http://www.pogdesign.co.uk/cat/', data=data)
            if 'U / P Invalid' in r.text:
                raise plugin.PluginError('Invalid username/password for pogdesign.')
            page = session.get('http://www.pogdesign.co.uk/cat/showselect.php')
        except requests.RequestException as e:
            raise plugin.PluginError('Error retrieving source: %s' % e)
        soup = get_soup(page.text)
        entries = []
        for row in soup.find_all('label', {'class': 'label_check'}):
            if row.find(attrs={'checked': 'checked'}):
                t = row.find('strong').text
                if t.endswith('[The]'):
                    t = 'The ' + t[:-6]

                # Make certain names friendlier
                if t in self.name_map:
                    t = self.name_map[t]

                e = Entry()
                e['title'] = t
                url = row.find_next('a', {'class': 'slink'})
                e['url'] = 'http://www.pogdesign.co.uk' + url['href']
                entries.append(e)
        return entries
Example No. 10
    def parse_download_page(self, url):
        if 'newpct1.com' in url:
            log.verbose('Newpct1 URL: %s', url)
            url = url.replace('newpct1.com/', 'newpct1.com/descarga-torrent/')
        else:
            log.verbose('Newpct URL: %s', url)

        try:
            page = requests.get(url)
        except requests.exceptions.RequestException as e:
            raise UrlRewritingError(e)
        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(e)

        if 'newpct1.com' in url:
            torrent_id_prog = re.compile(r'descargar-torrent/(.+)/')
            torrent_ids = soup.findAll(href=torrent_id_prog)
        else:
            torrent_id_prog = re.compile(r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)'\s*:\s*'(\d+)'")
            torrent_ids = soup.findAll(text=torrent_id_prog)

        if len(torrent_ids) == 0:
            raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)

        if 'newpct1.com' in url:
            torrent_id = torrent_id_prog.search(torrent_ids[0]['href']).group(1)
            return NEWPCT1_TORRENT_FORMAT.format(torrent_id)
        else:
            torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1)
            return NEWPCT_TORRENT_FORMAT.format(torrent_id)
Example No. 11
    def _get_watchlist_entries(self, task, config):
        email = config.get("email")
        log.info("Retrieving npo.nl episode watchlist for %s", email)

        response = self._get_page(task, config, "https://mijn.npo.nl/profiel/kijklijst")
        page = get_soup(response.content)

        self.csrf_token = page.find("meta", attrs={"name": "csrf-token"})["content"]

        entries = list()
        for listItem in page.findAll("div", class_="watch-list-item"):
            url = listItem.find("a")["href"]
            series_name = next(listItem.find("h3").stripped_strings)
            remove_url = listItem.find("a", class_="unwatch-confirm")["href"]
            entry_date = self._parse_date(listItem.find("span", class_="global__content-info").text)

            episode_id = url.split("/")[-1]
            title = "{} ({})".format(series_name, episode_id)

            e = Entry()
            e["url"] = self._prefix_url("https://mijn.npo.nl", url)
            e["title"] = title
            e["series_name"] = series_name
            e["series_name_plain"] = self._strip_accents(series_name)
            e["series_date"] = entry_date
            e["series_id_type"] = "date"
            e["description"] = listItem.find("p").text
            e["remove_url"] = self._prefix_url("https://mijn.npo.nl", remove_url)

            if config.get("remove_accepted"):
                e.on_complete(self.entry_complete, task=task)

            entries.append(e)

        return entries
Example No. 12
    def search(self, task, entry, config=None):
        """
        Search for name from piratebay.
        """
        if not isinstance(config, dict):
            config = {}
        self.set_urls(config.get('url', URL))
        sort = SORT.get(config.get('sort_by', 'seeds'))
        if config.get('sort_reverse'):
            sort += 1
        if isinstance(config.get('category'), int):
            category = config['category']
        else:
            category = CATEGORIES.get(config.get('category', 'all'))
        filter_url = '/0/%d/%d' % (sort, category)

        entries = set()
        for search_string in entry.get('search_strings', [entry['title']]):
            query = normalize_unicode(search_string)

            # TPB search doesn't like dashes or quotes
            query = query.replace('-', ' ').replace("'", " ")

            # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
            url = '%s/search/%s%s' % (self.url, quote(query.encode('utf-8')), filter_url)
            log.debug('Using %s as piratebay search url' % url)
            page = task.requests.get(url).content
            soup = get_soup(page)
            for link in soup.find_all('a', attrs={'class': 'detLink'}):
                entry = Entry()
                entry['title'] = self.extract_title(link)
                if not entry['title']:
                    log.error('Malformed search result. No title or url found. Skipping.')
                    continue
                href = link.get('href')
                if href.startswith('/'):  # relative link?
                    href = self.url + href
                entry['url'] = href
                tds = link.parent.parent.parent.find_all('td')
                entry['torrent_seeds'] = int(tds[-2].contents[0])
                entry['torrent_leeches'] = int(tds[-1].contents[0])
                entry['torrent_availability'] = torrent_availability(
                    entry['torrent_seeds'], entry['torrent_leeches']
                )
                # Parse content_size
                size_text = link.find_next(attrs={'class': 'detDesc'}).get_text()
                if size_text:
                    size = re.search(r'Size (\d+(\.\d+)?\xa0(?:[PTGMK])?i?B)', size_text)
                    if size:
                        entry['content_size'] = parse_filesize(size.group(1))
                    else:
                        log.error(
                            'Malformed search result? Title: "%s", No size? %s',
                            entry['title'],
                            size_text,
                        )

                entries.add(entry)

        return sorted(entries, reverse=True, key=lambda x: x.get('torrent_availability'))
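The size regex above expects the non-breaking space (\xa0) that the site places between the number and the unit. A standalone illustration with a made-up description string:

    import re

    size_text = 'Uploaded 03-14 2010, Size 1.37\xa0GiB, ULed by example'
    match = re.search(r'Size (\d+(\.\d+)?\xa0(?:[PTGMK])?i?B)', size_text)
    assert match and match.group(1) == '1.37\xa0GiB'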
Example No. 13
    def on_feed_input(self, feed, config):
        config = self.build_config(config)

        log.debug('InputPlugin html requesting url %s' % config['url'])

        if config.get('username') and config.get('password'):
            log.debug('Basic auth enabled. User: %s Password: %s' % (config['username'], config['password']))
            passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
            passman.add_password(None, config['url'], config['username'], config['password'])
            handlers = [urllib2.HTTPBasicAuthHandler(passman)]
        else:
            handlers = None
        page = urlopener(config['url'], log, handlers=handlers)
        soup = get_soup(page)
        log.debug('Detected encoding %s' % soup.originalEncoding)

        # dump received content into a file
        if 'dump' in config:
            name = config['dump']
            log.info('Dumping %s into %s' % (config['url'], name))
            data = soup.prettify()
            with open(name, 'w') as f:
                f.write(data)

        return self.create_entries(config['url'], soup, config)
Example No. 14
    def horrible_get_downloads(requests, title, page_url):
        entries = []

        try:
            soup = get_soup(requests.get(page_url).content)
        except RequestException as e:
            log.error('HorribleSubs request failed: %s', e)
            return entries
        for div in soup.findAll('div', attrs={'class': 'rls-link'}):
            ttitle = '{0} [{1}]'.format(title, re.sub(r'.*-', '', div['id']))
            urls = []
            for url in div.findAll('a'):
                # skip non torrent based links
                if (
                    'hs-ddl-link' in url.parent.attrs['class']
                    or 'hs-xdcc-link' in url.parent.attrs['class']
                ):
                    continue
                log.debug('Found url `%s`', url)
                urls.append(url.attrs['href'])
            # move magnets to last, a bit hacky
            for url in urls[:]:
                if url.startswith('magnet'):
                    urls.remove(url)
                    urls.append(url)
            entries.append(Entry(title=ttitle, url=urls[0], urls=urls))
        return entries
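The "move magnets to last" loop (flagged as hacky in the source comment) can also be written as a single stable sort; a small equivalent sketch:

    urls = ['magnet:?xt=urn:btih:abc', 'https://example.org/release.torrent']
    # Stable sort: non-magnet links keep their order, magnet links move to the end.
    urls = sorted(urls, key=lambda u: u.startswith('magnet'))
    assert urls[-1].startswith('magnet')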
Example No. 15
    def _get_favorites_entries(self, task, config):
        email = config.get('email')
        max_age = config.get('max_episode_age_days')

        log.info('Retrieving npo.nl favorite series for %s', email)
        response = self._get_page(task, config, 'https://mijn.npo.nl/profiel/favorieten')
        page = get_soup(response.content)

        entries = list()
        for listItem in page.findAll('div', class_='thumb-item'):
            url = listItem.find('a')['href']
            
            if url == '/profiel/favorieten/favorieten-toevoegen':
                log.debug("Skipping 'add favorite' button")
                continue

            url = self._prefix_url('https://mijn.npo.nl', url)
            series_name = next(listItem.find('div', class_='thumb-item__title').stripped_strings)
            
            last_aired_text = listItem.find('div', class_='thumb-item__subtitle').text
            last_aired_text = last_aired_text.rsplit('Laatste aflevering ')[-1]
            last_aired = self._parse_date(last_aired_text)
            
            if last_aired is None:
                log.info('Series %s did not yet start', series_name)
                continue
            elif max_age >= 0 and (date.today() - last_aired) > timedelta(days=max_age):
                log.debug('Skipping %s, last aired on %s', series_name, last_aired)
                continue
            elif (date.today() - last_aired) > timedelta(days=365*2):
                log.info('Series %s last aired on %s', series_name, last_aired)
            
            entries += self._get_series_episodes(task, config, series_name, url)

        return entries
Example No. 16
    def _get_watchlist_entries(self, task, config):
        email = config.get('email')
        log.info('Retrieving npo.nl episode watchlist for %s', email)

        response = self._get_page(task, config, 'https://mijn.npo.nl/profiel/kijklijst')
        page = get_soup(response.content)

        self.csrf_token = page.find('meta', attrs={'name': 'csrf-token'})['content']

        entries = list()
        for list_item in page.findAll('div', class_='watch-list-item'):
            url = list_item.find('a')['href']
            series_name = next(list_item.find('h3').stripped_strings)
            remove_url = list_item.find('a', class_='unwatch-confirm')['href']
            entry_date = self._parse_date(list_item.find('span', class_='global__content-info').text)

            episode_id = url.split('/')[-1]
            title = '{} ({})'.format(series_name, episode_id)

            e = Entry()
            e['url'] = self._prefix_url('https://mijn.npo.nl', url)
            e['title'] = title
            e['series_name'] = series_name
            e['series_name_plain'] = self._convert_plain(series_name)
            e['series_date'] = entry_date
            e['series_id_type'] = 'date'
            e['description'] = list_item.find('p').text
            e['remove_url'] = self._prefix_url('https://mijn.npo.nl', remove_url)

            if config.get('remove_accepted'):
                e.on_complete(self.entry_complete, task=task)

            entries.append(e)

        return entries
Example No. 17
    def horrible_entries(requests, page_url):
        entries = []

        try:
            soup = get_soup(requests.get(page_url).content)
        except RequestException as e:
            log.error('HorribleSubs request failed: %s', e)
            return entries

        for td_label in soup.findAll('td', attrs={'class': 'dl-label'}):
            title = '[HorribleSubs] {0}'.format(str(td_label.find('i').string))
            urls = []
            log.debug('Found title `{0}`'.format(title))
            for span in td_label.parent.findAll('span', attrs={'class': 'dl-link'}):
                # skip non torrent based links
                if 'hs-ddl-link' in span.parent.attrs['class']:
                    continue
                url = str(span.find('a').attrs['href'])
                log.debug('Found url `{0}`'.format(url))
                urls.append(url)
            # move magnets to last, a bit hacky
            for url in urls[:]:
                if url.startswith('magnet'):
                    urls.remove(url)
                    urls.append(url)
            entries.append(Entry(title=title, url=urls[0], urls=urls))
        return entries
Example No. 18
    def _solveCaptcha(self, output, url_auth, params, opener):
        """
        When trying to connect too many times with wrong password, a captcha can be requested.
        This captcha is really simple and can be solved by the provider.

        <label for="pass">204 + 65 = </label>
            <input type="text" size="40" name="captchaAnswer" id="lgn" value=""/>
            <input type="hidden" name="captchaQuery" value="204 + 65 = ">
            <input type="hidden" name="captchaToken" value="005d54a7428aaf587460207408e92145">
        <br/>

        :param output: initial login output
        :return: output after captcha resolution
        """
        html = get_soup(output)

        query = html.find("input", {"name": "captchaQuery"})
        token = html.find("input", {"name": "captchaToken"})
        if not query or not token:
            log.error("Unable to solve login captcha.")
            return output

        query_expr = query.attrs["value"].strip("= ")
        log.debug("Captcha query: " + query_expr)
        answer = arithmeticEval(query_expr)

        log.debug("Captcha answer: %s" % answer)

        params["captchaAnswer"] = answer
        params["captchaQuery"] = query.attrs["value"]
        params["captchaToken"] = token.attrs["value"]

        return opener.open(url_auth, urllib.parse.urlencode(params)).read()
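The captcha solver delegates the arithmetic to arithmeticEval, whose implementation is not shown here. A hypothetical stand-in that safely evaluates the simple "204 + 65" style expressions with the ast module (Python 3.8+):

    import ast
    import operator

    _OPS = {ast.Add: operator.add, ast.Sub: operator.sub,
            ast.Mult: operator.mul, ast.Div: operator.truediv}

    def arithmetic_eval(expr):
        # Walk the parsed expression and only allow numbers and basic operators.
        def _eval(node):
            if isinstance(node, ast.Expression):
                return _eval(node.body)
            if isinstance(node, ast.BinOp) and type(node.op) in _OPS:
                return _OPS[type(node.op)](_eval(node.left), _eval(node.right))
            if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
                return node.value
            raise ValueError('unsupported expression: %r' % expr)
        return _eval(ast.parse(expr, mode='eval'))

    assert arithmetic_eval('204 + 65') == 269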
Example No. 19
    def url_rewrite(self, task, entry):
        url = entry["url"]
        page = None
        for (scheme, netloc) in EZTV_MIRRORS:
            try:
                _, _, path, params, query, fragment = urlparse(url)
                url = urlunparse((scheme, netloc, path, params, query, fragment))
                page = task.requests.get(url).content
            except RequestException as e:
                log.debug("Eztv mirror `%s` seems to be down", url)
                continue
            break

        if not page:
            raise UrlRewritingError("No mirrors found for url %s" % entry["url"])

        log.debug("Eztv mirror `%s` chosen", url)
        try:
            soup = get_soup(page)
            mirrors = soup.find_all("a", attrs={"class": re.compile(r"download_\d")})
        except Exception as e:
            raise UrlRewritingError(e)

        log.debug("%d torrent mirrors found", len(mirrors))

        if not mirrors:
            raise UrlRewritingError("Unable to locate download link from url %s" % url)

        entry["urls"] = [m.get("href") for m in mirrors]
        entry["url"] = mirrors[0].get("href")
Example No. 20
    def on_task_input(self, task, config=None):
        config = self.build_config(config)
        url = base_url + config['p_slug'] + config['sort_by']
        max_results = config.get('max_results', 1)
        rcount = 0
        next_page = ''

        log.verbose('Looking for films in Letterboxd list: %s' % url)

        entries = []
        while next_page is not None and rcount < max_results:
            try:
                page = requests.get(url).content
            except RequestException as e:
                raise plugin.PluginError('Error retrieving list from Letterboxd: %s' % e)
            soup = get_soup(page)

            for film in soup.find_all(attrs={config['f_slug']: True}):
                if rcount < max_results:
                    entry = self.parse_film(film, config)
                    entries.append(entry)
                    if 'max_results' in config:
                        rcount += 1

            next_page = soup.select_one('.paginate-nextprev .next')
            if next_page is not None:
                next_page = next_page.get('href')
                if next_page is not None:
                    url = base_url + next_page

        return entries
Example No. 21
    def parse_site(self, url, task):
        """Parse configured url and return releases array"""

        try:
            page = task.requests.get(url).content
        except RequestException as e:
            raise plugin.PluginError('Error getting input page: %s' % e)
        soup = get_soup(page)

        releases = []
        for entry in soup.find_all('div', attrs={'class': 'entry'}):
            release = {}
            title = entry.find('h2')
            if not title:
                log.debug('No h2 entrytitle')
                continue
            release['title'] = title.a.contents[0].strip()

            log.debug('Processing title %s' % (release['title']))

            for link in entry.find_all('a'):
                # no content in the link
                if not link.contents:
                    continue
                link_name = link.contents[0]
                if link_name is None:
                    continue
                if not isinstance(link_name, NavigableString):
                    continue
                link_name = link_name.strip().lower()
                if link.has_attr('href'):
                    link_href = link['href']
                else:
                    continue
                log.debug('found link %s -> %s' % (link_name, link_href))
                # handle imdb link
                if link_name.lower() == 'imdb':
                    log.debug('found imdb link %s' % link_href)
                    release['imdb_id'] = extract_id(link_href)

                # test if entry with this url would be rewritable by known plugins (ie. downloadable)
                temp = {}
                temp['title'] = release['title']
                temp['url'] = link_href
                urlrewriting = plugin.get_plugin_by_name('urlrewriting')
                if urlrewriting['instance'].url_rewritable(task, temp):
                    release['url'] = link_href
                    log.trace('--> accepting %s (resolvable)' % link_href)
                else:
                    log.trace('<-- ignoring %s (non-resolvable)' % link_href)

            # reject if no torrent link
            if 'url' not in release:
                from flexget.utils.log import log_once
                log_once('%s skipped due to missing or unsupported (unresolvable) download link' % (release['title']),
                         log)
            else:
                releases.append(release)

        return releases
Example No. 22
    def search(self, task, entry, config=None):
        """
        Search for name from iptorrents
        """

        categories = config.get('category', 'All')
        # Make sure categories is a list
        if not isinstance(categories, list):
            categories = [categories]

        # If there are any text categories, turn them into their id number
        categories = [c if isinstance(c, int) else CATEGORIES[c]
                      for c in categories]
        filter_url = '&'.join((str(c) + '=') for c in categories)

        entries = set()

        for search_string in entry.get('search_strings', [entry['title']]):
            query = normalize_unicode(search_string)
            query = quote_plus(query.encode('utf8'))

            url = "{base_url}/t?{filter}&q={query}&qf=".format(base_url=BASE_URL, filter=filter_url, query=query)
            log.debug('searching with url: %s' % url)
            req = requests.get(url, cookies={'uid': str(config['uid']), 'pass': config['password']})

            if '/u/' + str(config['uid']) not in req.text:
                raise plugin.PluginError("Invalid cookies (user not logged in)...")

            soup = get_soup(req.content, parser="html.parser")
            torrents = soup.find('table', {'id': 'torrents'})

            results = torrents.findAll('tr')
            for torrent in results:
                if torrent.th and 'ac' in torrent.th.get('class'):
                    # Header column
                    continue
                if torrent.find('td', {'colspan': '99'}):
                    log.debug('No results found for search %s', search_string)
                    break
                entry = Entry()
                link = torrent.find('a', href=re.compile('download'))['href']
                entry['url'] = "{base}{link}?torrent_pass={key}".format(
                    base=BASE_URL, link=link, key=config.get('rss_key'))
                entry['title'] = torrent.find('a', href=re.compile('details')).text

                seeders = torrent.findNext('td', {'class': 'ac t_seeders'}).text
                leechers = torrent.findNext('td', {'class': 'ac t_leechers'}).text
                entry['torrent_seeds'] = int(seeders)
                entry['torrent_leeches'] = int(leechers)
                entry['search_sort'] = torrent_availability(entry['torrent_seeds'],
                                                            entry['torrent_leeches'])

                size = torrent.findNext(text=re.compile(r'^([\.\d]+) ([GMK]?)B$'))
                size = re.search(r'^([\.\d]+) ([GMK]?)B$', size)

                entry['content_size'] = parse_filesize(size.group(0))
                log.debug('Found entry %s', entry)
                entries.add(entry)

        return entries
Example No. 23
    def parse_download_page(self, url):
        if 'newpct1.com' in url:
            log.verbose('Newpct1 URL: %s', url)
            url = url.replace('newpct1.com/', 'newpct1.com/descarga-torrent/')
        else:
            log.verbose('Newpct URL: %s', url)

        try:
            page = requests.get(url)
        except requests.exceptions.RequestException as e:
            raise UrlRewritingError(e)
        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(e)

        if 'newpct1.com' in url:
            torrent_id_prog = re.compile(r'descargar-torrent/(.+)/')
            torrent_ids = soup.findAll(href=torrent_id_prog)
        else:
            torrent_id_prog = re.compile(r"'(?:torrentID|id)'\s*:\s*'(\d+)'")
            torrent_ids = soup.findAll(text=torrent_id_prog)

        if len(torrent_ids) == 0:
            raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)

        if 'newpct1.com' in url:
            torrent_id = torrent_id_prog.search(torrent_ids[0]['href']).group(1)
            return 'http://www.newpct1.com/download/%s.torrent' % torrent_id
        else:
            torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1)
            return 'http://www.newpct.com/torrents/{:0>6}.torrent'.format(torrent_id)
Example No. 24
    def search(self, query, comparator=StringComparator(), config=None):
        """
        Search for name from piratebay.
        """
        if not isinstance(config, dict):
            config = {}
        sort = SORT.get(config.get('sort_by', 'seeds'))
        if config.get('sort_reverse'):
            sort += 1
        if isinstance(config.get('category'), int):
            category = config['category']
        else:
            category = CATEGORIES.get(config.get('category', 'all'))
        filter_url = '/0/%d/%d' % (sort, category)

        comparator.set_seq1(query)
        query = comparator.search_string()
        # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
        url = 'http://thepiratebay.se/search/' + urllib.quote(query.encode('utf-8')) + filter_url
        log.debug('Using %s as piratebay search url' % url)
        page = requests.get(url).content
        soup = get_soup(page)
        entries = []
        for link in soup.find_all('a', attrs={'class': 'detLink'}):
            comparator.set_seq2(link.contents[0])
            log.debug('name: %s' % comparator.a)
            log.debug('found name: %s' % comparator.b)
            log.debug('confidence: %s' % comparator.ratio())
            if not comparator.matches():
                continue
            entry = Entry()
            entry['title'] = link.contents[0]
            entry['url'] = 'http://thepiratebay.se' + link.get('href')
            tds = link.parent.parent.parent.find_all('td')
            entry['torrent_seeds'] = int(tds[-2].contents[0])
            entry['torrent_leeches'] = int(tds[-1].contents[0])
            entry['search_ratio'] = comparator.ratio()
            entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches'])
            # Parse content_size
            size = link.find_next(attrs={'class': 'detDesc'}).contents[0]
            size = re.search(r'Size ([\.\d]+)\xa0([GMK])iB', size)
            if size:
                if size.group(2) == 'G':
                    entry['content_size'] = int(float(size.group(1)) * 1000 ** 3 / 1024 ** 2)
                elif size.group(2) == 'M':
                    entry['content_size'] = int(float(size.group(1)) * 1000 ** 2 / 1024 ** 2)
                else:
                    entry['content_size'] = int(float(size.group(1)) * 1000 / 1024 ** 2)
            entries.append(entry)

        if not entries:
            dashindex = query.rfind('-')
            if dashindex != -1:
                return self.search(query[:dashindex], comparator=comparator)
            else:
                raise PluginWarning('No close matches for %s' % query, log, log_once=True)

        entries.sort(reverse=True, key=lambda x: x.get('search_sort'))

        return entries
Example No. 25
    def _get_page(self, task, config, url):
        login_response = task.requests.get(url)
        if login_response.url == url:
            log.debug("Already logged in")
            return login_response
        elif login_response.url != "https://mijn.npo.nl/inloggen":
            raise plugin.PluginError("Unexpected login page: {}".format(login_response.url))

        login_page = get_soup(login_response.content)
        token = login_page.find("input", attrs={"name": "authenticity_token"})["value"]

        email = config.get("email")
        password = config.get("password")

        try:
            profile_response = task.requests.post(
                "https://mijn.npo.nl/sessions", {"authenticity_token": token, "email": email, "password": password}
            )
        except requests.RequestException as e:
            raise plugin.PluginError("Request error: %s" % e.args[0])

        if profile_response.url == "https://mijn.npo.nl/sessions":
            raise plugin.PluginError("Failed to login. Check username and password.")
        elif profile_response.url != url:
            raise plugin.PluginError("Unexpected page: {} (expected {})".format(profile_response.url, url))

        return profile_response
Example No. 26
    def on_task_input(self, task, config=None):
        config = self.build_config(config)
        url = base_url + config["p_slug"] + config["sort_by"]
        max_results = config.get("max_results", 1)
        rcount = 0
        next_page = ""

        log.verbose("Looking for films in Letterboxd list: %s" % url)

        entries = []
        while next_page is not None and rcount < max_results:
            try:
                page = requests.get(url).content
            except RequestException as e:
                raise plugin.PluginError("Error retrieving list from Letterboxd: %s" % e)
            soup = get_soup(page)

            for film in soup.find_all(attrs={config["f_slug"]: True}):
                if rcount < max_results:
                    entry = self.parse_film(film, config)
                    entries.append(entry)
                    if "max_results" in config:
                        rcount += 1

            next_page = soup.find(class_="paginate-next")
            if next_page is not None:
                next_page = next_page.get("href")
                if next_page is not None:
                    url = base_url + next_page

        return entries
Example No. 27
    def parse_download_page(self, url):
        txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        req = urllib2.Request(url, None, txheaders)
        page = urlopener(req, log)
        try:
            soup = get_soup(page)
        except Exception as e:
            raise UrlRewritingError(e)

        config = self.config or {}
        config.setdefault('quality', 'hd')

        links = soup.find_all('a', text="Descargar", href=re.compile("/subtitles"))
        if not links:
            raise UrlRewritingError('Unable to locate subtitle download link from url %s' % url)

        subtitle_url = ''
        for link in links:
            sub_url = link['href']
            log.verbose('Found url %s', sub_url)
            if config['quality'] == 'hd' and re.search("720p|1080p", sub_url):
                subtitle_url = 'http://www.argenteam.net' + sub_url
                log.verbose('is a match')
                break
            if config['quality'] == 'sd' and not re.search("720p|1080p", sub_url):
                subtitle_url = 'http://www.argenteam.net' + sub_url
                log.verbose('is a match')
                break
        if subtitle_url == '':
            raise UrlRewritingError('Unable to locate download link %s from url %s' % (config['quality'], url))
        return subtitle_url
Example No. 28
    def _get_page(self, task, config, url):
        login_response = requests.get(url)
        if login_response.url == url:
            log.debug('Already logged in')
            return login_response
        elif login_response.url != 'https://mijn.npo.nl/inloggen':
            raise plugin.PluginError('Unexpected login page: {}'.format(login_response.url))

        login_page = get_soup(login_response.content)
        token = login_page.find('input', attrs={'name': 'authenticity_token'})['value']

        email = config.get('email')
        password = config.get('password')

        try:
            profile_response = requests.post('https://mijn.npo.nl/sessions',
                                                  {'authenticity_token': token,
                                                   'email': email,
                                                   'password': password})
        except requests.RequestException as e:
            raise plugin.PluginError('Request error: %s' % e.args[0])

        if profile_response.url == 'https://mijn.npo.nl/sessions':
            raise plugin.PluginError('Failed to login. Check username and password.')
        elif profile_response.url != url:
            raise plugin.PluginError('Unexpected page: {} (expected {})'.format(profile_response.url, url))

        return profile_response
Example No. 29
    def url_rewrite(self, task, entry):
        log.debug('Requesting %s' % entry['url'])
        page = requests.get(entry['url'])
        soup = get_soup(page.text)

        for link in soup.findAll('a', attrs={'href': re.compile(r'^/url')}):
            # Extract correct url from google internal link
            href = 'http://google.com' + link['href']
            args = parse_qs(urlparse(href).query)
            href = args['q'][0]


            # Test if entry with this url would be recognized by some urlrewriter
            log.trace('Checking if %s is known by some rewriter' % href)
            fake_entry = {'title': entry['title'], 'url': href}
            urlrewriting = plugin.get_plugin_by_name('urlrewriting')
            if urlrewriting['instance'].url_rewritable(task, fake_entry):
                log.debug('--> rewriting %s (known url pattern)' % href)
                entry['url'] = href
                return
            else:
                log.debug('<-- ignoring %s (unknown url pattern)' % href)
        raise UrlRewritingError('Unable to resolve')
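A standalone check of the /url?q= extraction used above, with a made-up redirect link and the Python 3 urllib.parse names:

    from urllib.parse import urlparse, parse_qs

    href = 'http://google.com/url?q=https://example.org/download/123&sa=U&ved=0'
    target = parse_qs(urlparse(href).query)['q'][0]
    assert target == 'https://example.org/download/123'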
Example No. 30
    def _login(self, task, config):
        if 'isAuthenticatedUser' in requests.cookies:
            log.debug('Already logged in')
            return

        login_url = 'https://www.npostart.nl/login'
        login_api_url = 'https://www.npostart.nl/api/login'

        try:
            login_response = requests.get(login_url)
            if login_response.url != login_url:
                raise plugin.PluginError('Unexpected login page: {}'.format(login_response.url))

            login_page = get_soup(login_response.content)
            token = login_page.find('input', attrs={'name': '_token'})['value']

            email = config.get('email')
            password = config.get('password')

            profile_response = requests.post(login_api_url,
                                             {'_token': token,
                                              'username': email,
                                              'password': password})

            if 'isAuthenticatedUser' not in profile_response.cookies:
                raise plugin.PluginError('Failed to login. Check username and password.')
            log.debug('Successfully logged in: %s', email)
        except RequestException as e:
            raise plugin.PluginError('Request error: %s' % str(e))
Example No. 31
    def search(self, task, entry, config=None):
        """
        Search for name from piratebay.
        """
        if not isinstance(config, dict):
            config = {}
        sort = SORT.get(config.get('sort_by', 'seeds'))
        if config.get('sort_reverse'):
            sort += 1
        if isinstance(config.get('category'), int):
            category = config['category']
        else:
            category = CATEGORIES.get(config.get('category', 'all'))
        filter_url = '/0/%d/%d' % (sort, category)

        entries = set()
        for search_string in entry.get('search_strings', [entry['title']]):
            query = normalize_unicode(search_string)
            # TPB search doesn't like dashes
            query = query.replace('-', ' ')
            # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
            url = 'http://thepiratebay.%s/search/%s%s' % (
                CUR_TLD, urllib.quote(query.encode('utf-8')), filter_url)
            log.debug('Using %s as piratebay search url' % url)
            page = requests.get(url).content
            soup = get_soup(page)
            for link in soup.find_all('a', attrs={'class': 'detLink'}):
                entry = Entry()
                entry['title'] = link.contents[0]
                entry['url'] = 'http://thepiratebay.%s%s' % (CUR_TLD,
                                                             link.get('href'))
                tds = link.parent.parent.parent.find_all('td')
                entry['torrent_seeds'] = int(tds[-2].contents[0])
                entry['torrent_leeches'] = int(tds[-1].contents[0])
                entry['search_sort'] = torrent_availability(
                    entry['torrent_seeds'], entry['torrent_leeches'])
                # Parse content_size
                size = link.find_next(attrs={'class': 'detDesc'}).contents[0]
                size = re.search(r'Size ([\.\d]+)\xa0([GMK])iB', size)
                if size:
                    if size.group(2) == 'G':
                        entry['content_size'] = int(
                            float(size.group(1)) * 1000**3 / 1024**2)
                    elif size.group(2) == 'M':
                        entry['content_size'] = int(
                            float(size.group(1)) * 1000**2 / 1024**2)
                    else:
                        entry['content_size'] = int(
                            float(size.group(1)) * 1000 / 1024**2)
                entries.add(entry)

        return sorted(entries,
                      reverse=True,
                      key=lambda x: x.get('search_sort'))
Example No. 32
 def parse_download_page(self, url, requests):
     txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
     page = requests.get(url, headers=txheaders)
     try:
         soup = get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(e)
     down_link = soup.find('a', attrs={'href': re.compile(".+mp4")})
     if not down_link:
         raise UrlRewritingError('Unable to locate download link from url %s' % url)
     return down_link.get('href')
Example No. 33
    def authenticate(self):
        """Authenticates a session with imdb, and grabs any IDs needed for getting/modifying list."""
        try:
            r = self._session.get(
                'https://www.imdb.com/ap/signin?openid.return_to=https%3A%2F%2Fwww.imdb.com%2Fap-signin-'
                'handler&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&'
                'openid.assoc_handle=imdb_mobile_us&openid.mode=checkid_setup&openid.claimed_id=http%3A%'
                '2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.ope'
                'nid.net%2Fauth%2F2.0')
        except RequestException as e:
            raise PluginError(e.args[0])
        soup = get_soup(r.content)
        inputs = soup.select('form#ap_signin_form input')
        data = dict(
            (i['name'], i.get('value')) for i in inputs if i.get('name'))
        data['email'] = self.config['login']
        data['password'] = self.config['password']
        d = self._session.post('https://www.imdb.com/ap/signin', data=data)
        # Get user id by extracting from redirect url
        r = self._session.head('http://www.imdb.com/profile',
                               allow_redirects=False)
        if not r.headers.get('location') or 'login' in r.headers['location']:
            raise plugin.PluginError(
                'Login to imdb failed. Check your credentials.')
        self.user_id = re.search(r'ur\d+(?!\d)', r.headers['location']).group()
        # Get list ID
        if self.config['list'] == 'watchlist':
            data = {'consts[]': 'tt0133093', 'tracking_tag': 'watchlistRibbon'}
            wl_data = self._session.post(
                'http://www.imdb.com/list/_ajax/watchlist_has',
                data=data).json()
            try:
                self.list_id = wl_data['list_id']
            except KeyError:
                raise PluginError(
                    'No list ID could be received. Please initialize list by '
                    'manually adding an item to it and try again')
        elif self.config['list'] in IMMUTABLE_LISTS or self.config[
                'list'].startswith('ls'):
            self.list_id = self.config['list']
        else:
            data = {'tconst': 'tt0133093'}
            list_data = self._session.post(
                'http://www.imdb.com/list/_ajax/wlb_dropdown',
                data=data).json()
            for li in list_data['items']:
                if li['wlb_text'] == self.config['list']:
                    self.list_id = li['data_list_id']
                    break
            else:
                raise plugin.PluginError('Could not find list %s' %
                                         self.config['list'])

        self._authenticated = True
Example No. 34
 def parse_download_page(self, url):
     try:
         page = requests.get(url).content
         soup = get_soup(page, 'html.parser')
         download_link = soup.findAll(
             href=re.compile('redirect|redirectlink'))
         download_href = download_link[0]['href']
         return download_href
     except Exception:
         raise UrlRewritingError('Unable to locate torrent from url %s' %
                                 url)
Example No. 35
    def on_task_input(self, task, config):
        # use rss plugin
        rss_config = {'url': self.rss_url}
        rss_entries = super(AppleTrailers,
                            self).on_task_input(task, rss_config)

        # Multiple entries can point to the same movie page (trailer 1, clip
        # 1, etc.)
        entries = {}
        for entry in rss_entries:
            url = entry['original_url']
            if url in entries:
                continue
            else:
                title = entry['title']
                entries[url] = title[:title.rfind('-')].rstrip()

        result = []

        for url, title in entries.iteritems():
            inc_url = url + 'includes/playlists/web.inc'
            try:
                page = urlopener(inc_url, log)
            except HTTPError as err:
                log.warning("HTTPError when opening playlist page: %d %s" %
                            (err.code, err.reason))
                continue

            soup = get_soup(page)
            links = soup.find_all('a',
                                  attrs={
                                      'class': 'target-quicktimeplayer',
                                      'href': re.compile(r'_h?480p\.mov$')
                                  })
            for link in links:
                url = link.get('href')
                url = url[:url.rfind('_')]
                quality = self.quality.lower()

                if quality == 'ipod':
                    url += '_i320.m4v'
                else:
                    url += '_h' + quality + '.mov'

                entry = Entry()
                entry['url'] = url
                entry['title'] = title

                match = re.search(r'.*/([^?#]*)', url)
                entry['filename'] = match.group(1)

                result.append(entry)
                log.debug('found trailer %s', url)

        return result
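The quality handling above amounts to trimming the trailing size marker from the trailer URL and re-appending one that matches the configured quality; a standalone sketch with an invented URL:

url = 'http://trailers.apple.com/movies/example/example-tlr1_h480p.mov'  # hypothetical
quality = '720p'  # hypothetical config value

base = url[:url.rfind('_')]
if quality.lower() == 'ipod':
    rewritten = base + '_i320.m4v'
else:
    rewritten = base + '_h' + quality.lower() + '.mov'
print(rewritten)  # -> http://trailers.apple.com/movies/example/example-tlr1_h720p.mov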
Exemplo n.º 36
0
    def search(self, task, entry, config=None):
        if not config:
            log.debug('Divxatope disabled')
            return set()
        log.debug('Search DivxATope')
        url_search = 'http://divxatope1.com/buscar/descargas'
        results = set()
        for search_string in entry.get('search_strings', [entry['title']]):
            query = normalize_unicode(search_string)
            query = re.sub(r' \(\d\d\d\d\)$', '', query)
            log.debug('Searching DivxATope %s' % query)
            query = query.encode('utf8', 'ignore')
            data = {'q': query}
            try:
                response = task.requests.post(url_search, data=data)
            except requests.RequestException as e:
                log.error('Error searching DivxATope: %s' % e)
                return
            content = response.content

            soup = get_soup(content)
            if 'divxatope1.com' in url_search:
                soup2 = soup.find('ul', attrs={'class': 'buscar-list'})
            else:
                soup2 = soup.find('ul', attrs={'class': 'peliculas-box'})
            children = soup2.findAll('a', href=True)
            for child in children:
                entry = Entry()
                entry['url'] = child['href']
                entry_title = child.find('h2')
                if entry_title is None:
                    continue
                entry_title = entry_title.contents
                if not entry_title:
                    continue
                else:
                    entry_title = entry_title[0]
                quality_lan = child.find('strong')
                if quality_lan is None:
                    continue
                quality_lan = quality_lan.contents
                if len(quality_lan) > 2:
                    if isinstance(quality_lan[0], Tag):
                        entry_quality_lan = quality_lan[1]
                    else:
                        entry_quality_lan = quality_lan[0] + ' ' + quality_lan[2]
                elif len(quality_lan) == 2:
                    entry_quality_lan = quality_lan[1]
                else:
                    # skip results where the quality/language text can't be determined
                    continue
                entry['title'] = entry_title + ' ' + entry_quality_lan
                results.add(entry)
        log.debug('Finish search DivxATope with %d entries' % len(results))
        return results
Exemplo n.º 37
0
    def parse_download_page(self, url, requests):
        txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        page = requests.get(url, headers=txheaders)
        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(e)
        tag_a = soup.find('a', attrs={'class': 'download_link'})
        if not tag_a:
            raise UrlRewritingError('Unable to locate download link from url %s' % url)
        torrent_url = 'http://www.bakabt.com' + tag_a.get('href')
        return torrent_url
Exemplo n.º 38
0
    def on_task_input(self, task, config):
        if not task.requests.cookies:
            username = config['username']
            password = config['password']

            log.debug("Logging in to %s ..." % URL)
            params = {
                'username': username,
                'password': password,
                'action': 'Login'
            }
            loginsrc = task.requests.post(URL + 'login.php?action=login', data=params).content
            if str(username) not in loginsrc:
                raise plugin.PluginWarning(('Login to myepisodes.com failed, please check '
                                 'your account data or see if the site is down.'), log)

        page = task.requests.get(URL + "myshows/manage/").content
        try:
            soup = get_soup(page)
        except Exception as e:
            raise plugin.PluginError("Unable to parse myepisodes.com page: %s" % (e,))

        entries = []

        def show_list(select_id):
            return soup.find('select', {'id': select_id}).findAll('option')

        options = show_list('shows')
        if config['include_ignored']:
            options = chain(options, show_list('ignored_shows'))
        for option in options:
            name = option.text
            if config.get('strip_dates'):
                # Remove year from end of name if present
                name = re.sub(r'\s+\(\d{4}\)$', '', name)
            showid = option.get('value')
            url = '%sviews.php?type=epsbyshow&showid=%s' % (URL, showid)

            entry = Entry()
            entry['title'] = name
            entry['url'] = url
            entry['series_name'] = name
            entry['myepisodes_id'] = showid

            if entry.isvalid():
                entries.append(entry)
            else:
                log.debug('Invalid entry created? %s' % entry)

        if not entries:
            log.warn("No shows found on myepisodes.com list. Maybe you need to add some first?")

        return entries
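The show list parsed above is just a <select> of <option value="id">name</option> pairs; a minimal standalone parse of that structure (the markup is invented):

from bs4 import BeautifulSoup

html = '''
<select id="shows">
  <option value="101">Example Show (2010)</option>
  <option value="102">Another Show</option>
</select>
'''
soup = BeautifulSoup(html, 'html.parser')
for option in soup.find('select', {'id': 'shows'}).find_all('option'):
    print(option.get('value'), option.text.strip())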
Exemplo n.º 39
0
    def parse_download_page(self, url):
        page = requests.get(url)
        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(e)
        torrent_id_prog = re.compile(r"'(?:torrentID|id)'\s*:\s*'(\d+)'")
        torrent_ids = soup.findAll(text=torrent_id_prog)
        if len(torrent_ids) == 0:
            raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)
        torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1)
        return 'http://www.newpct.com/descargar/torrent/%s/dummy.html' % torrent_id
Exemplo n.º 40
0
    def parse_downloads(self, series_url, search_title):
        page = requests.get(series_url).content
        try:
            soup = get_soup(page)
        except Exception as e:
            raise UrlRewritingError(e)

        urls = []
        # find all titles
        episode_titles = self.find_all_titles(search_title)
        if not episode_titles:
            raise UrlRewritingError('Unable to find episode')

        for ep_title in episode_titles:
            # find matching download
            episode_title = soup.find('strong', text=re.compile(ep_title, re.I))
            if not episode_title:
                continue

            # find download container
            episode = episode_title.parent
            if not episode:
                continue

            # find episode language
            episode_lang = episode.find_previous('strong', text=re.compile('Sprache')).next_sibling
            if not episode_lang:
                log.warning('No language found for: %s', series_url)
                continue

            # filter language
            if not self.check_language(episode_lang):
                log.warning('languages not matching: %s <> %s', self.config['language'], episode_lang)
                continue

            # find download links
            links = episode.find_all('a')
            if not links:
                log.warning('No links found for: %s', series_url)
                continue

            for link in links:
                if not link.has_attr('href'):
                    continue

                url = link['href']
                pattern = r'http://download\.serienjunkies\.org.*%s_.*\.html' % self.config['hoster']

                if re.match(pattern, url) or self.config['hoster'] == 'all':
                    urls.append(url)
                else:
                    continue
        return urls
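The hoster filter above is a plain regular-expression match against the download URL; the same check in isolation (URLs and hoster name are illustrative only):

import re

hoster = 'uploaded'  # hypothetical config value
urls = [
    'http://download.serienjunkies.org/f-abc/uploaded_episode.html',
    'http://download.serienjunkies.org/f-abc/otherhost_episode.html',
]
pattern = r'http://download\.serienjunkies\.org.*%s_.*\.html' % hoster
matching = [u for u in urls if re.match(pattern, u) or hoster == 'all']
print(matching)  # -> only the uploaded_ link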
Exemplo n.º 41
0
    def search(self, task, entry, config=None):
        config = self.prepare_config(config)

        if not session.cookies:
            log.debug('Logging in to %s...' % URL)
            params = {
                'username': config['username'],
                'password': config['password'],
                'keeplogged': '1',
                'login': '******'
            }
            session.post(URL + 'login.php', data=params)

        cat = ''.join([
            '&' + ('filter_cat[%s]' % id) + '=1' for id in config['category']
        ])
        rls = 'release_type=' + config['type']
        url_params = rls + cat
        multip = config['gravity_multiplier']

        entries = set()
        for search_string in entry.get('search_strings', [entry['title']]):
            srch = normalize_unicode(clean_title(search_string))
            srch = '&searchstr=' + quote(srch.encode('utf8'))

            url = URL + 'torrents.php?' + url_params + srch
            log.debug('Fetching URL for `%s`: %s' % (search_string, url))

            page = session.get(url).content
            soup = get_soup(page)

            for result in soup.findAll('tr', attrs={'class': 'torrent'}):
                entry = Entry()
                entry['title'] = result.find('span',
                                             attrs={
                                                 'class': 'torrent_name_link'
                                             }).text
                entry['url'] = URL + result.find(
                    'a', href=re.compile(
                        'torrents\.php\?action=download')).get('href')
                entry['torrent_seeds'], entry['torrent_leeches'] = [
                    r.text for r in result.findAll('td')[-2:]
                ]
                entry['search_sort'] = torrent_availability(
                    entry['torrent_seeds'], entry['torrent_leeches']) * multip

                size = result.findAll('td')[-4].text
                size = re.search(r'(\d+(?:[.,]\d+)*)\s?([KMG]B)', size)

                entry['content_size'] = parse_filesize(size.group(0))

                entries.add(entry)
        return entries
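The size handling above first isolates a "<number> <unit>B" token with a regular expression and then delegates the conversion to parse_filesize; the extraction step on its own (the cell text is invented):

import re

cell_text = 'Uploaded 01-02 2020, Size 1.46 GB'  # hypothetical table cell
match = re.search(r'(\d+(?:[.,]\d+)*)\s?([KMG]B)', cell_text)
if match:
    print(match.group(0))  # -> '1.46 GB', which the plugin passes to parse_filesize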
Exemplo n.º 42
0
    def on_task_input(self, task):
        pageurl = "http://tvtorrents.com/loggedin/recently_aired.do"
        log.debug("InputPlugin tvtorrents requesting url %s" % pageurl)

        page = urlopener(pageurl, log)
        soup = get_soup(page)

        hscript = soup.find('script', src=None).contents[0]
        hlines = hscript.splitlines()
        hash = hlines[15].strip().split("'")[1]
        digest = hlines[16].strip().split("'")[1]
        hurl = hlines[17].strip().split("'")
        hashurl = hurl[1] + "%s" + hurl[3] + digest + hurl[5] + hash

        for link in soup.find_all('a'):
            if not link.has_attr('href'):
                continue
            url = link['href']
            title = link.contents[0]

            if link.has_attr(
                    'onclick') and link['onclick'].find("loadTorrent") != -1:
                infohash = link['onclick'].split("'")[1]
                td = link.parent.parent.contents[4]
                sname = td.contents[0].strip()
                epi = td.contents[2].contents[0].strip()
                title = "%s - %s" % (sname, epi)
                url = hashurl % (infohash, )
            else:
                continue
            if title is None:
                continue

            title = title.strip()
            if not title:
                continue

            # fix broken urls
            if url.startswith('//'):
                url = "http:" + url
            elif not url.startswith(('http://', 'https://')):
                url = urlparse.urljoin(pageurl, url)

            # in case the title contains xxxxxxx.torrent - foooo.torrent, clean it a bit (keep up to the first .torrent)
            if title.lower().find('.torrent') > 0:
                title = title[:title.lower().find(".torrent")]

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            task.entries.append(entry)
Exemplo n.º 43
0
    def search(self, task, entry, config):
        if not session.cookies:
            try:
                login_params = {
                    'username': config['username'],
                    'password': config['password'],
                    'loginkey': config['login_key'],
                }
                r = session.post('https://piratethenet.org/takelogin.php',
                                 data=login_params,
                                 verify=False)
            except requests.RequestException as e:
                log.error('Error while logging in to PtN: %s', e)
                raise plugin.PluginError('Could not log in to PtN')

            passkey = re.search(r'passkey=([\d\w]+)"', r.text)
            if not passkey:
                log.error("It doesn't look like PtN login worked properly.")
                raise plugin.PluginError('PTN cookie info invalid')

        search_params = default_search_params.copy()
        if 'movie_name' in entry:
            if 'movie_year' in entry:
                search_params['advancedsearchparameters'] = '[year=%s]' % entry['movie_year']
            searches = [entry['movie_name']]
        else:
            searches = entry.get('search_strings', [entry['title']])

        results = set()
        for search in searches:
            search_params['searchstring'] = search
            try:
                r = session.get('http://piratethenet.org/torrentsutils.php',
                                params=search_params)
            except requests.RequestException as e:
                log.error('Error searching ptn: %s' % e)
                continue
            # html5parser doesn't work properly for some reason
            soup = get_soup(r.text, parser='html.parser')
            for movie in soup.select('.torrentstd'):
                imdb_id = movie.find('a', href=re.compile(r'.*imdb\.com/title/tt'))
                if imdb_id:
                    imdb_id = extract_id(imdb_id['href'])
                if imdb_id and 'imdb_id' in entry and imdb_id != entry['imdb_id']:
                    continue
                results.update(self.create_entries(movie, imdb_id=imdb_id))

        return results
Exemplo n.º 44
0
    def on_task_input(self, task, config):
        # Create entries by parsing AniDB wishlist page html using beautifulsoup
        log.verbose('Retrieving AniDB list: mywishlist')
        url = 'http://anidb.net/perl-bin/animedb.pl?show=mywishlist&uid=%s' % config['user_id']
        log.debug('Requesting: %s' % url)

        page = task.requests.get(url)
        if page.status_code != 200:
            raise plugin.PluginError('Unable to get AniDB list. Either the list is private or does not exist.')

        soup = get_soup(page.text)
        soup = soup.find('table', class_='wishlist')

        trs = soup.find_all('tr')
        if not trs:
            log.verbose('No movies were found in AniDB list: mywishlist')
            return

        entries = []
        entry_type = ''
        if config['type'] == 'movies':
            entry_type = 'Type: Movie'
        elif config['type'] == 'shows':
            entry_type = 'Type: TV Series'
        for tr in trs:
            if tr.find('span', title=entry_type):
                a = tr.find('td', class_='name').find('a')
                if not a:
                    log.debug('No title link found for the row, skipping')
                    continue

                anime_title = a.string
                if config.get('strip_dates'):
                    # Remove year from end of series name if present
                    anime_title = re.sub(r'\s+\(\d{4}\)$', '', anime_title)

                link = ('http://anidb.net/perl-bin/' + a.get('href'))

                anime_id = ""
                match = re.search(r'aid=([\d]{1,5})', a.get('href'))
                if match:
                    anime_id = match.group(1)

                entry = Entry()
                entry['title'] = anime_title
                entry['url'] = link
                entry['anidb_id'] = anime_id
                entry['anidb_name'] = entry['title']
                entries.append(entry)
            else:
                log.verbose('Entry does not match the requested type')
        return entries
Exemplo n.º 45
0
    def on_task_input(self, task, config):
        config = self.prepare_config(config)
        url = config.get('url')
        user_agent = config.get('user-agent')
        cookie = config.get('cookie')
        root_element_selector = config.get('root_element_selector')
        fields = config.get('fields')
        params = config.get('params')
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'user-agent': user_agent
        }

        entries = []

        try:
            task.requests.headers.update(headers)
            task.requests.cookies.update(NetUtils.cookie_str_to_dict(cookie))
            response = task.requests.get(url, timeout=60)
            content = NetUtils.decode(response)
        except RequestException as e:
            raise plugin.PluginError(
                'Unable to download the Html for task {} ({}): {}'.format(
                    task.name, url, e))
        elements = get_soup(content).select(root_element_selector)
        if len(elements) == 0:
            logger.debug(f'no elements found in response: {content}')
            return entries

        for element in elements:
            logger.debug('element in element_selector: {}', element)
            entry = Entry()
            for key, value in fields.items():
                entry[key] = ''
                sub_element = element.select_one(value['element_selector'])
                if sub_element:
                    if value['attribute'] == 'textContent':
                        sub_element_content = sub_element.get_text()
                    else:
                        sub_element_content = sub_element.get(
                            value['attribute'], '')
                    entry[key] = sub_element_content
                logger.debug('key: {}, value: {}', key, entry[key])
            if entry['title'] and entry['url']:
                base_url = urljoin(url, entry['url'])
                if params.startswith("&"):
                    entry['url'] = base_url + params
                else:
                    entry['url'] = urljoin(base_url, params)
                entry['original_url'] = entry['url']
                entries.append(entry)
        return entries
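The field extraction above maps each configured key to a CSS selector plus either an attribute name or the literal string 'textContent'; a reduced standalone version of that dispatch, using a plain dict instead of a FlexGet Entry (selectors, HTML and field names are made up):

from bs4 import BeautifulSoup

html = '<div class="item"><a class="t" href="/detail/1" title="Example title">Example title</a></div>'
fields = {
    'title': {'element_selector': 'a.t', 'attribute': 'textContent'},
    'url': {'element_selector': 'a.t', 'attribute': 'href'},
}
element = BeautifulSoup(html, 'html.parser').select_one('div.item')
entry = {}
for key, value in fields.items():
    sub_element = element.select_one(value['element_selector'])
    if sub_element is None:
        entry[key] = ''
    elif value['attribute'] == 'textContent':
        entry[key] = sub_element.get_text()
    else:
        entry[key] = sub_element.get(value['attribute'], '')
print(entry)  # -> {'title': 'Example title', 'url': '/detail/1'}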
Exemplo n.º 46
0
    def search(self, task, entry, config=None):
        """
        Search for name from iptorrents
        """

        categories = config.get('category', 'all')
        # Make sure categories is a list
        if not isinstance(categories, list):
            categories = [categories]

        # If there are any text categories, turn them into their id number
        categories = [c if isinstance(c, int) else CATEGORIES[c]
                      for c in categories]
        filter_url = '&'.join((str(c) + '=') for c in categories)

        entries = set()

        for search_string in entry.get('search_strings', [entry['title']]):
            query = normalize_unicode(search_string)
            query = quote_plus(query.encode('utf8'))

            url = "{base_url}/t?{filter}&q={query}&qf=".format(base_url=BASE_URL, filter=filter_url, query=query)
            log.debug('searching with url: %s' % url)
            req = requests.get(url, cookies={'uid': str(config['uid']), 'pass': config['password']})

            if '/u/' + str(config.get('uid')) not in req.content:
                raise plugin.PluginError("Invalid cookies (user not logged in)...")

            soup = get_soup(req.content, parser="html.parser")
            torrents = soup.find('table', {'class': 'torrents'})

            for torrent in torrents.findAll('a', href=re.compile('\.torrent$')):
                entry = Entry()
                entry['url'] = "{base}{link}?torrent_pass={key}".format(
                    base=BASE_URL, link=torrent['href'], key=config.get('rss_key'))
                entry['title'] = torrent.findPrevious("a", attrs={'class': 't_title'}).text

                seeders = torrent.findNext('td', {'class': 'ac t_seeders'}).text
                leechers = torrent.findNext('td', {'class': 'ac t_leechers'}).text
                entry['torrent_seeds'] = int(seeders)
                entry['torrent_leeches'] = int(leechers)
                entry['search_sort'] = torrent_availability(entry['torrent_seeds'],
                                                            entry['torrent_leeches'])

                size = torrent.findNext(text=re.compile(r'^([\.\d]+) ([GMK]?)B$'))
                size = re.search(r'^([\.\d]+) ([GMK]?)B$', size)

                entry['content_size'] = parse_filesize(size.group(0))

                entries.add(entry)

        return entries
Exemplo n.º 47
0
    def parse_page(self, scraper, url: str):
        try:
            logger.debug('page url: {}', url)
            page = scraper.get(url)
        except RequestException as e:
            raise plugin.PluginError(str(e))
        if page.status_code == 404:
            raise Page404Error()
        if page.status_code != 200:
            raise plugin.PluginError(
                f'HTTP Request failed {page.status_code}. Url: {url}')

        soup = get_soup(page.text)
        soup_table = soup.find('table', class_='download')
        if not soup_table:
            # very likely no result
            return
        table_tbody = soup_table.find('tbody')
        if not table_tbody:
            raise plugin.PluginError(
                'Parsing crashed, no tbody, please report the issue')

        trs = table_tbody.find_all('tr')
        if not trs:
            logger.critical('Nothing to parse')
            return
        for tr in trs:
            try:
                magnet_td = tr.find('td', class_='m')
                if not magnet_td:
                    # skip empty trs
                    continue
                magnet_a = magnet_td.find('a')
                magnet = magnet_a['href']
                title_td = tr.find('td', class_='n')
                title_a = title_td.find('a')
                title = title_a['title']
                seed_td = tr.find('td', class_='s')
                seed = int(seed_td.text)
                leech = int(tr.find('td', class_='l').text)
                content_size = parse_filesize(seed_td.previous_sibling.text)
                yield Entry(
                    url=magnet,
                    title=title,
                    torrent_seeds=seed,
                    torrent_leech=leech,
                    content_size=content_size,
                )
            except AttributeError as e:
                raise plugin.PluginError(
                    'Parsing crashed, please report the issue') from e
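The row parsing above reads the magnet link, title, seed and leech cells of each <tr> by class; a stripped-down standalone version that yields plain dicts instead of FlexGet Entry objects (the table markup is a made-up stand-in):

from bs4 import BeautifulSoup

html = '''
<table class="download"><tbody>
  <tr>
    <td class="n"><a title="Example.Release.1080p">Example.Release.1080p</a></td>
    <td class="m"><a href="magnet:?xt=urn:btih:0000">magnet</a></td>
    <td class="s">12</td>
    <td class="l">3</td>
  </tr>
</tbody></table>
'''
tbody = BeautifulSoup(html, 'html.parser').find('table', class_='download').find('tbody')
for tr in tbody.find_all('tr'):
    magnet_td = tr.find('td', class_='m')
    if not magnet_td:
        continue  # skip rows without a magnet link, as the plugin does
    print({
        'url': magnet_td.find('a')['href'],
        'title': tr.find('td', class_='n').find('a')['title'],
        'torrent_seeds': int(tr.find('td', class_='s').text),
        'torrent_leech': int(tr.find('td', class_='l').text),
    })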
Exemplo n.º 48
0
    def get_nexusphp_message(self, task, entry):
        message_url = entry['message_url'] or urljoin(entry['url'], '/messages.php')
        message_box_response = self._request(task,
                                             entry,
                                             'get',
                                             message_url,
                                             headers=entry['headers'])
        state = self.check_net_state(entry, message_box_response, message_url)
        if state:
            entry['messages'] = 'Can not read message box!'
            return
        if message_box_response:
            unread_elements = get_soup(self._decode(
                message_box_response)).select('td > img[alt*="Unread"]')
            for unread_element in unread_elements:
                td = unread_element.parent.nextSibling.nextSibling
                title = td.text
                href = td.a.get('href')
                message_url = urljoin(message_url, href)
                message_response = self._request(task,
                                                 entry,
                                                 'get',
                                                 message_url,
                                                 headers=entry['headers'])

                message_body = 'Can not read message body!'
                if message_response:
                    body_element = get_soup(self._decode(
                        message_response)).select_one('td[colspan*="2"]')
                    if body_element:
                        message_body = body_element.text.strip()

                entry['messages'] = entry['messages'] + (
                    '\nTitle: {}\nLink: {}\n{}'.format(title, message_url,
                                                       message_body))
        else:
            entry['messages'] = 'Can not read message box!'
Exemplo n.º 49
0
    def _request_url(self, task, config, url, auth, dump_name=None):
        log.verbose('Requesting: %s' % url)
        page = task.requests.get(url, auth=auth)
        log.verbose('Response: %s (%s)' % (page.status_code, page.reason))
        soup = get_soup(page.content)

        # dump received content into a file
        if dump_name:
            log.verbose('Dumping: %s' % dump_name)
            data = soup.prettify()
            with open(dump_name, 'w') as f:
                f.write(data)

        return self.create_entries(url, soup, config)
Exemplo n.º 50
0
    def search(self, task, entry, config=None):
        """
        Search for name from piratebay.
        """
        if not isinstance(config, dict):
            config = {}
        sort = SORT.get(config.get('sort_by', 'seeds'))
        if config.get('sort_reverse'):
            sort += 1
        if isinstance(config.get('category'), int):
            category = config['category']
        else:
            category = CATEGORIES.get(config.get('category', 'all'))
        filter_url = '/0/%d/%d' % (sort, category)

        entries = set()
        for search_string in entry.get('search_strings', [entry['title']]):
            query = normalize_unicode(search_string)

            # TPB search doesn't like dashes or quotes
            query = query.replace('-', ' ').replace("'", " ")

            # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
            url = 'http://thepiratebay.%s/search/%s%s' % (CUR_TLD, quote(query.encode('utf-8')), filter_url)
            log.debug('Using %s as piratebay search url' % url)
            page = task.requests.get(url).content
            soup = get_soup(page)
            for link in soup.find_all('a', attrs={'class': 'detLink'}):
                entry = Entry()
                entry['title'] = self.extract_title(link)
                if not entry['title']:
                    log.error('Malformed search result. No title or url found. Skipping.')
                    continue
                entry['url'] = 'http://thepiratebay.%s%s' % (CUR_TLD, link.get('href'))
                tds = link.parent.parent.parent.find_all('td')
                entry['torrent_seeds'] = int(tds[-2].contents[0])
                entry['torrent_leeches'] = int(tds[-1].contents[0])
                entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches'])
                # Parse content_size
                size_text = link.find_next(attrs={'class': 'detDesc'}).get_text()
                if size_text:
                    size = re.search(r'Size (\d+(\.\d+)?\xa0(?:[PTGMK])?i?B)', size_text)
                    if size:
                        entry['content_size'] = parse_filesize(size.group(1))
                    else:
                        log.error('Malformed search result? Title: "%s", No size? %s', entry['title'], size_text)

                entries.add(entry)

        return sorted(entries, reverse=True, key=lambda x: x.get('search_sort'))
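The content-size match above has to cope with the non-breaking space (\xa0) that the site puts between the number and the unit; the regular expression on its own, with an invented description string:

import re

desc = 'Uploaded 03-14\xa02020, Size 1.37\xa0GiB, ULed by someone'  # hypothetical detDesc text
match = re.search(r'Size (\d+(\.\d+)?\xa0(?:[PTGMK])?i?B)', desc)
if match:
    print(repr(match.group(1)))  # -> '1.37\xa0GiB', ready for parse_filesize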
Exemplo n.º 51
0
    def parse_download_page(self, url, requests):
        txheaders = {
            'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        }
        page = requests.get(url, headers=txheaders)
        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(e)
        tag_a = soup.select_one('a[href^="magnet:"]')
        if not tag_a:
            raise UrlRewritingError(
                f"Unable to locate download link from url {url}")
        return tag_a.get('href')
Exemplo n.º 53
0
    def get_nexusphp_message(self,
                             entry,
                             config,
                             messages_url='/messages.php'):
        message_url = urljoin(entry['url'], messages_url)
        message_box_response = self._request(entry, 'get', message_url)
        net_state = self.check_net_state(entry, message_box_response,
                                         message_url)
        if net_state:
            entry.fail_with_prefix(
                'Can not read message box! url:{}'.format(message_url))
            return

        unread_elements = get_soup(self._decode(message_box_response)).select(
            'td > img[alt*="Unread"]')
        failed = False
        for unread_element in unread_elements:
            td = unread_element.parent.nextSibling.nextSibling
            title = td.text
            href = td.a.get('href')
            message_url = urljoin(message_url, href)
            message_response = self._request(entry, 'get', message_url)
            net_state = self.check_net_state(entry, message_response,
                                             message_url)
            if net_state:
                message_body = 'Can not read message body!'
                failed = True
            else:
                body_element = get_soup(self._decode(
                    message_response)).select_one('td[colspan*="2"]')
                if body_element:
                    message_body = body_element.text.strip()
            entry['messages'] = entry['messages'] + (
                '\nTitle: {}\nLink: {}\n{}'.format(title, message_url,
                                                   message_body))
        if failed:
            entry.fail_with_prefix('Can not read message body!')
Exemplo n.º 54
0
    def on_task_input(self, task, config):
        config = self.prepare_config(config)
        url = config.get('url')
        root_element_selector = config.get('root_element_selector')
        fields = config.get('fields')
        params = config.get('params')

        entries = []
        elements = []
        if url and root_element_selector:
            try:
                if brotli:
                    config.get(
                        'headers')['accept-encoding'] = 'gzip, deflate, br'
                response = task.requests.get(url,
                                             headers=config.get('headers'),
                                             timeout=60)
                content = self._decode(response)
            except RequestException as e:
                raise plugin.PluginError(
                    'Unable to download the Html for task {} ({}): {}'.format(
                        task.name, url, e))
            elements = get_soup(content).select(root_element_selector)
            if len(elements) == 0:
                return entries

        for element in elements:
            logger.debug('element in element_selector: {}', element)
            entry = Entry()
            for key, value in fields.items():
                entry[key] = ''
                sub_element = element.select_one(value['element_selector'])
                if sub_element:
                    if value['attribute'] == 'textContent':
                        sub_element_content = sub_element.get_text()
                    else:
                        sub_element_content = sub_element.get(
                            value['attribute'], '')
                    entry[key] = sub_element_content
                logger.debug('key: {}, value: {}', key, entry[key])
            if entry['title'] and entry['url']:
                base_url = urljoin(url, entry['url'])
                if params.startswith("&"):
                    entry['url'] = base_url + params
                else:
                    entry['url'] = urljoin(base_url, params)
                entry['original_url'] = entry['url']
                entries.append(entry)
        return entries
Exemplo n.º 55
0
    def parse_download_page(self, page_url):
        page = urlopener(page_url, log)
        try:
            soup = get_soup(page)
        except Exception as e:
            raise UrlRewritingError(e)
        tag_a = soup.find("a", {"class": "dl_link"})
        if not tag_a:
            raise UrlRewritingError(
                'FTDB Unable to locate download link from url %s and tag_a is : %s'
                % (page_url, tag_a))
        torrent_url = "http://www3.frenchtorrentdb.com" + tag_a.get('href') + "&js=1"
        log.debug('TORRENT URL is : %s' % torrent_url)
        return torrent_url
Exemplo n.º 56
0
    def get_gazelle_message(self, task, entry):
        message_url = urljoin(entry['url'], '/inbox.php')
        message_box_response = self._request(task,
                                             entry,
                                             'get',
                                             message_url,
                                             headers=entry['headers'])
        state = self.check_net_state(entry, message_box_response, message_url)
        if state:
            entry['messages'] = 'Can not read message box!'
            return
        if message_box_response:
            unread_elements = get_soup(self._decode(
                message_box_response)).select("tr.unreadpm > td > strong > a")
            for unread_element in unread_elements:
                title = unread_element.text
                href = unread_element.get('href')
                message_url = urljoin(message_url, href)
                message_response = self._request(task,
                                                 entry,
                                                 'get',
                                                 message_url,
                                                 headers=entry['headers'])

                message_body = 'Can not read message body!'
                if message_response:
                    body_element = get_soup(self._decode(
                        message_response)).select_one('div[id*="message"]')
                    if body_element:
                        message_body = body_element.text.strip()

                entry['messages'] = entry['messages'] + (
                    '\nTitle: {}\nLink: {}\n{}'.format(title, message_url,
                                                       message_body))
        else:
            entry['messages'] = 'Can not read message box!'
Exemplo n.º 57
0
    def parse_download_page(self, url):
        page = urlopener(url, log)
        log.debug('%s opened', url)
        try:
            soup = get_soup(page)
            torrent_url = 'http://www.t411.me' + soup.find(
                text='Télécharger').findParent().get('href')
        except Exception as e:
            raise UrlRewritingError(e)

        if not torrent_url:
            raise UrlRewritingError(
                'Unable to locate download link from url %s' % url)

        return torrent_url
Exemplo n.º 58
0
    def parse_download_page(self, url, requests):
        page = requests.get(url).content
        try:
            soup = get_soup(page)
            tag_div = soup.find('div', attrs={'class': 'download'})
            if not tag_div:
                raise UrlRewritingError('Unable to locate download link from url %s' % url)
            tag_a = tag_div.find('a')
            torrent_url = tag_a.get('href')
            # URL is sometimes missing the scheme
            if torrent_url.startswith('//'):
                torrent_url = urlparse(url).scheme + ':' + torrent_url
            return torrent_url
        except Exception as e:
            raise UrlRewritingError(e)
Exemplo n.º 59
0
    def entries_from_search(self, name, url=None):
        """Parses torrent download url from search results"""
        name = normalize_unicode(name)
        if not url:
            url = 'http://www.newtorrents.info/search/%s' % urllib.quote(name.encode('utf-8'), safe=':/~?=&%')

        log.debug('search url: %s' % url)

        html = urlopener(url, log).read()
        # fix </SCR'+'IPT> so that BS does not crash
        # TODO: should use beautifulsoup massage
        html = re.sub(r'(</SCR.*?)...(.*?IPT>)', r'\1\2', html)

        soup = get_soup(html)
        # saving torrents in dict
        torrents = []
        for link in soup.find_all('a', attrs={'href': re.compile('down.php')}):
            torrent_url = 'http://www.newtorrents.info%s' % link.get('href')
            release_name = link.parent.next.get('title')
            # quick dirty hack
            seed = link.find_next('td', attrs={'class': re.compile('s')}).renderContents()
            if seed == 'n/a':
                seed = 0
            else:
                try:
                    seed = int(seed)
                except ValueError:
                    log.warning('Error converting seed value (%s) from newtorrents to integer.' % seed)
                    seed = 0

            #TODO: also parse content_size and peers from results
            torrents.append(Entry(title=release_name, url=torrent_url, torrent_seeds=seed,
                                  search_sort=torrent_availability(seed, 0)))
        # sort with seed number Reverse order
        torrents.sort(reverse=True, key=lambda x: x.get('search_sort', 0))
        # choose the torrent
        if not torrents:
            dashindex = name.rfind('-')
            if dashindex != -1:
                return self.entries_from_search(name[:dashindex])
            else:
                raise PluginWarning('No matches for %s' % name, log, log_once=True)
        else:
            if len(torrents) == 1:
                log.debug('found only one matching search result.')
            else:
                log.debug('search result contains multiple matches, sorted %s by most seeders' % torrents)
            return torrents
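The "massage" step above repairs script tags that the site emits as </SCR'+'IPT> so the parser does not choke on them; the substitution in isolation, on an invented fragment:

import re

html = "<a href='down.php?id=1'>x</a></SCR'+'IPT>"  # hypothetical broken fragment
fixed = re.sub(r'(</SCR.*?)...(.*?IPT>)', r'\1\2', html)
print(fixed)  # -> <a href='down.php?id=1'>x</a></SCRIPT>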
Exemplo n.º 60
0
    def parse_download_page(self, url):
        txheaders = {
            'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        }
        req = urllib2.Request(url, None, txheaders)
        page = urlopener(req, log)
        try:
            soup = get_soup(page)
        except Exception as e:
            raise UrlRewritingError(e)
        down_link = soup.find(
            'a', attrs={'href': re.compile(r"download/\d+/.*\.torrent")})
        if not down_link:
            raise UrlRewritingError(
                'Unable to locate download link from url %s' % url)
        return 'http://www.deadfrog.us/' + down_link.get('href')