Example No. 1
    def url_rewrite(self, task, entry):
        url = entry['url']
        page = None
        for (scheme, netloc) in EZTV_MIRRORS:
            try:
                _, _, path, params, query, fragment = urlparse(url)
                url = urlunparse((scheme, netloc, path, params, query, fragment))
                page = task.requests.get(url).content
            except RequestException:
                log.debug('Eztv mirror `%s` seems to be down', url)
                continue
            break

        if not page:
            raise UrlRewritingError('No mirrors found for url %s' % entry['url'])

        log.debug('Eztv mirror `%s` chosen', url)
        try:
            soup = get_soup(page)
            mirrors = soup.find_all('a', attrs={'class': re.compile(r'download_\d')})
        except Exception as e:
            raise UrlRewritingError(e)

        log.debug('%d torrent mirrors found', len(mirrors))

        if not mirrors:
            raise UrlRewritingError('Unable to locate download link from url %s' % url)

        entry['urls'] = [m.get('href') for m in mirrors]
        entry['url'] = mirrors[0].get('href')
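
The loop above rotates through known mirrors, rebuilding the URL with each mirror's scheme and host until one answers. A minimal standalone sketch of the same idea, using plain requests and a hypothetical MIRRORS list in place of the plugin's EZTV_MIRRORS and task.requests:

import logging
from urllib.parse import urlparse, urlunparse

import requests
from requests import RequestException

log = logging.getLogger(__name__)

# Hypothetical mirror list; the real plugin uses EZTV_MIRRORS.
MIRRORS = [('https', 'eztv.example'), ('https', 'eztv-backup.example')]


def fetch_from_mirrors(url, timeout=10):
    """Try each (scheme, netloc) pair until one mirror answers."""
    for scheme, netloc in MIRRORS:
        # keep path/query, swap only scheme and host
        _, _, path, params, query, fragment = urlparse(url)
        candidate = urlunparse((scheme, netloc, path, params, query, fragment))
        try:
            return candidate, requests.get(candidate, timeout=timeout).content
        except RequestException:
            log.debug('Mirror `%s` seems to be down', candidate)
    return None, None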
Example No. 2
    def url_rewrite(self, task, entry):
        soup = self._get_soup(task, entry['url'])

        link_re = re.compile(r'rarefile\.net.*\.rar$')

        # grab links from the main entry:
        blog_entry = soup.find('div', class_="entry")
        num_links = 0
        link_list = None
        for paragraph in blog_entry.find_all('p'):
            links = paragraph.find_all('a', href=link_re)
            if len(links) > num_links:
                link_list = links
                num_links = len(links)
        if 'urls' in entry:
            urls = list(entry['urls'])
        else:
            urls = []
        if link_list is not None:
            for link in link_list:
                urls.append(normalize_unicode(link['href']))
        else:
            raise UrlRewritingError('No useable links found at %s' %
                                    entry['url'])

        num_links = len(urls)
        log.verbose('Found %d links at %s.', num_links, entry['url'])
        if num_links:
            entry['urls'] = urls
            entry['url'] = urls[0]
        else:
            raise UrlRewritingError('No useable links found at %s' %
                                    entry['url'])
Example No. 3
    def parse_downloads(self, series_url, search_title):
        page = requests.get(series_url).content
        try:
            soup = get_soup(page)
        except Exception as e:
            raise UrlRewritingError(e)

        urls = []
        # find all titles
        episode_titles = self.find_all_titles(search_title)
        if not episode_titles:
            raise UrlRewritingError('Unable to find episode')

        for ep_title in episode_titles:
            # find matching download
            episode_title = soup.find('strong',
                                      text=re.compile(ep_title, re.I))
            if not episode_title:
                continue

            # find download container
            episode = episode_title.parent
            if not episode:
                continue

            # find episode language
            lang_label = episode.find_previous(
                'strong', text=re.compile('Sprache'))
            episode_lang = lang_label.next_sibling if lang_label else None
            if not episode_lang:
                log.warning('No language found for: %s', series_url)
                continue

            # filter language
            if not self.check_language(episode_lang):
                log.warning('languages not matching: %s <> %s',
                            self.config['language'], episode_lang)
                continue

            # find download links
            links = episode.find_all('a')
            if not links:
                log.warning('No links found for: %s', series_url)
                continue

            for link in links:
                if not link.has_attr('href'):
                    continue

                url = link['href']
                pattern = (
                    r'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' %
                    self.config['hoster'])

                if re.match(pattern, url) or self.config['hoster'] == 'all':
                    urls.append(url)
                else:
                    continue
        return urls
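
The parser above leans on BeautifulSoup's find / find_previous / next_sibling navigation to read the language label near each episode block. A small self-contained sketch of that navigation against a made-up HTML fragment (the real serienjunkies markup may differ):

import re

from bs4 import BeautifulSoup

# Tiny fragment mimicking the page structure the parser above expects
# (layout assumed for illustration only).
html = """
<p><strong>Sprache:</strong> german</p>
<p><strong>Some.Show.S01E01</strong>
   <a href="http://download.serienjunkies.org/f-abc/uploaded_1.html">DL</a></p>
"""

soup = BeautifulSoup(html, 'html.parser')
title = soup.find('strong', text=re.compile('Some.Show', re.I))
container = title.parent
# Walk backwards to the nearest "Sprache" label and read the text after it.
language = container.find_previous('strong', text=re.compile('Sprache')).next_sibling
print(language.strip())  # -> german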
Example No. 4
 def _get_soup(self, task, url):
     try:
         page = task.requests.get(url)
     except RequestException as e:
         raise UrlRewritingError(str(e))
     try:
         return get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(str(e))
Example No. 5
 def parse_download_page(self, url, requests):
     txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
     page = requests.get(url, headers=txheaders)
     try:
         soup = get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(e)
     down_link = soup.find('a', attrs={'href': re.compile(r"download/\d+/.*\.torrent")})
     if not down_link:
         raise UrlRewritingError('Unable to locate download link from url %s' % url)
     return 'http://www.deadfrog.us/' + down_link.get('href')
Example No. 6
 def parse_download_page(self, url, requests):
     txheaders = {
         'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
     }
     page = requests.get(url, headers=txheaders)
     try:
         soup = get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(e)
     tag_a = soup.select_one('a[href^="magnet:"]')
     if not tag_a:
         raise UrlRewritingError(
             f"Unable to locate download link from url {url}")
     return tag_a.get('href')
Example No. 7
 def parse_download_page(self, url, requests):
     page = requests.get(url).content
     try:
         soup = get_soup(page)
         tag_div = soup.find('div', attrs={'class': 'download'})
         if not tag_div:
             raise UrlRewritingError('Unable to locate download link from url %s' % url)
         tag_a = tag_div.find('a')
         torrent_url = tag_a.get('href')
         # URL is sometimes missing the schema
         if torrent_url.startswith('//'):
             torrent_url = urlparse(url).scheme + ':' + torrent_url
         return torrent_url
     except Exception as e:
         raise UrlRewritingError(e)
Example No. 8
 def parse_download_page(self, url, requests):
     txheaders = {
         'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
     }
     page = requests.get(url, headers=txheaders)
     try:
         soup = get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(e)
     tag_a = soup.find('a', attrs={'class': 'download_link'})
     if not tag_a:
         raise UrlRewritingError(
             'Unable to locate download link from url %s' % url)
     torrent_url = 'https://bakabt.me/' + tag_a.get('href')
     return torrent_url
Example No. 9
    def url_rewrite(self, task, entry):
        """
            Gets the download information for 1337x result
        """

        url = entry['url']

        logger.info('1337x rewriting download url: {}', url)

        try:
            page = task.requests.get(url)
            logger.debug('requesting: {}', page.url)
        except RequestException as e:
            logger.error('1337x request failed: {}', e)
            raise UrlRewritingError('1337x request failed: %s' % e)

        soup = get_soup(page.content)

        magnet_url = str(
            soup.find('a',
                      href=re.compile(r'^magnet:\?')).get('href')).lower()
        torrent_url = str(
            soup.find('a',
                      href=re.compile(r'\.torrent$')).get('href')).lower()

        entry['url'] = torrent_url
        entry.setdefault('urls', []).append(torrent_url)
        entry['urls'].append(magnet_url)
Example No. 10
 def url_from_page(self, url):
     """Parses torrent url from newtorrents download page"""
     try:
         page = requests.get(url)
         data = page.text
     except requests.RequestException:
         raise UrlRewritingError('URLerror when retrieving page')
     p = re.compile(r"copy\(\'(.*)\'\)", re.IGNORECASE)
     f = p.search(data)
     if not f:
         # the link on which the plugin relies is missing!
         raise UrlRewritingError(
             'Failed to get url from download page. Plugin may need an update.'
         )
     else:
         return f.group(1)
Example No. 11
    def url_rewrite(self, task, entry):
        """
            Gets torrent information for topic from rutracker api
        """

        url = entry['url']
        logger.info('rewriting download url: {}', url)

        topic_id = parse_qs(urlparse(url).query)['t'][0]

        api_url = f"{self.base_url}/v1/get_tor_topic_data"
        api_params = {
            'by': 'topic_id',
            'val': topic_id,
        }
        try:
            topic_request = task.requests.get(api_url, params=api_params)
        except RequestException as e:
            raise UrlRewritingError(f'rutracker request failed: {e}')

        topic = topic_request.json()['result'][topic_id]

        magnet = {
            'xt': f"urn:btih:{topic['info_hash']}",
            'tr': [f'http://bt{i}.t-ru.org/ann?magnet' for i in ['', '2', '3', '4']],
            'dn': topic['topic_title']
        }
        magnet_qs = urlencode(magnet, doseq=True, safe=':')
        magnet_uri = f"magnet:?{magnet_qs}"
        entry['url'] = magnet_uri
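
The magnet URI here is assembled as an ordinary query string. A short sketch of the same urlencode call with hypothetical topic data, showing what doseq and safe contribute:

from urllib.parse import urlencode

# Hypothetical topic data standing in for the rutracker API response.
topic = {'info_hash': '0123456789ABCDEF0123456789ABCDEF01234567',
         'topic_title': 'Some Show S01 1080p'}

magnet = {
    'xt': f"urn:btih:{topic['info_hash']}",
    'tr': [f'http://bt{i}.t-ru.org/ann?magnet' for i in ['', '2', '3', '4']],
    'dn': topic['topic_title'],
}
# doseq=True expands the tracker list into repeated tr= parameters;
# safe=':' keeps the colons in urn:btih:... and http:// unescaped.
print('magnet:?' + urlencode(magnet, doseq=True, safe=':'))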
Example No. 12
    def url_rewrite(self, task, entry):
        logger.debug('Requesting {}', entry['url'])
        page = requests.get(entry['url'])
        soup = get_soup(page.text)

        for link in soup.findAll('a', attrs={'href': re.compile(r'^/url')}):
            # Extract correct url from google internal link
            href = 'http://google.com' + link['href']
            args = parse_qs(urlparse(href).query)
            href = args['q'][0]

            # Test if entry with this url would be recognized by some urlrewriter
            logger.trace('Checking if {} is known by some rewriter', href)
            fake_entry = {'title': entry['title'], 'url': href}
            urlrewriting = plugin.get('urlrewriting', self)
            if urlrewriting.url_rewritable(task, fake_entry):
                logger.debug('--> rewriting {} (known url pattern)', href)
                entry['url'] = href
                return
            else:
                logger.debug('<-- ignoring {} (unknown url pattern)', href)
        raise UrlRewritingError('Unable to resolve')
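
The key step is unwrapping Google's /url?q=... redirect to recover the real target. A quick standalone sketch with a made-up link:

from urllib.parse import parse_qs, urlparse

# Hypothetical Google-style redirect link as found in search result pages.
link_href = '/url?q=https://example.org/some.torrent&sa=U&ved=abc'

href = 'http://google.com' + link_href
target = parse_qs(urlparse(href).query)['q'][0]
print(target)  # -> https://example.org/some.torrent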
Example No. 13
    def url_rewrite(self, task, entry):
        url = entry['url']
        if url.startswith('http://www.newtorrents.info/?q=') or url.startswith(
                'http://www.newtorrents.info/search'):
            results = self.entries_from_search(entry['title'], url=url)
            if not results:
                raise UrlRewritingError("No matches for %s" % entry['title'])
            url = results[0]['url']
        else:
            url = self.url_from_page(url)

        if url:
            entry['url'] = url
            self.resolved.append(url)
        else:
            raise UrlRewritingError('Bug in newtorrents urlrewriter')
Example No. 14
 def url_rewrite(self, task, entry):
     try:
         page = task.requests.get(entry['url'])
     except RequestException as e:
         raise UrlRewritingError(str(e))
     try:
         soup = get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(str(e))
     link_elements = soup.find_all('pre', class_='links')
     if 'urls' in entry:
         urls = list(entry['urls'])
     else:
         urls = []
     for element in link_elements:
         urls.extend(element.text.splitlines())
     regexps = self.config.get('filehosters_re', [])
     filtered_urls = []
     for i, url in enumerate(urls):
         urls[i] = normalize_unicode(url)
         for regexp in regexps:
             if re.search(regexp, urls[i]):
                 filtered_urls.append(urls[i])
                 log.debug('Url: "%s" matched filehoster filter: %s',
                           urls[i], regexp)
                 break
         else:
             if regexps:
                 log.debug(
                     'Url: "%s" does not match any of the given filehoster filters: %s',
                     urls[i],
                     str(regexps),
                 )
     if regexps:
         log.debug('Using filehosters_re filters: %s', str(regexps))
         urls = filtered_urls
     else:
         log.debug(
             'No filehoster filters configured, using all found links.')
     num_links = len(urls)
     log.verbose('Found %d links at %s.', num_links, entry['url'])
     if num_links:
         entry['urls'] = urls
         entry['url'] = urls[0]
     else:
         raise UrlRewritingError('No useable links found at %s' %
                                 entry['url'])
Example No. 15
    def parse_download_page(self, url, requests):
        txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        try:
            page = requests.get(url, headers=txheaders)
        except requests.exceptions.RequestException as e:
            msg = 'Cannot open "%s" : %s' % (url, str(e))
            logger.error(msg)
            raise UrlRewritingError(msg)

        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(str(e))

        down_link = soup.find('a', attrs={'href': re.compile(r"down\.php\?.*")})
        if not down_link:
            raise UrlRewritingError('Unable to locate download link from url "%s"' % url)
        return 'http://bt.hliang.com/' + down_link.get('href')
Example No. 16
 def url_rewrite(self, task, entry):
     if 'url' not in entry:
         log.error('Didn\'t actually get a URL...')
     else:
         log.debug('Got the URL: %s', entry['url'])
     if entry['url'].startswith('https://www.torrentday.com/browse'):
         # use search
         results = self.search(task, entry)
         if not results:
             raise UrlRewritingError('No search results found')
         entry['url'] = results[0]['url']
Example No. 17
 def parse_download_page(self, page_url, requests):
     page = requests.get(page_url)
     try:
         soup = get_soup(page.text)
     except Exception as e:
         raise UrlRewritingError(e)
     tag_a = soup.find("a", {"class": "dl_link"})
     if not tag_a:
         if soup.findAll(text="Connexion ?"):
             raise UrlRewritingError(
                 'You are not logged in, check if your cookie for '
                 'authentication is up to date')
         else:
             raise UrlRewritingError(
                 'You have reached your download limit per 24 hours, '
                 'so I cannot get the torrent')
     torrent_url = "http://www.frenchtorrentdb.com" + tag_a.get(
         'href') + "&js=1"
     logger.debug('TORRENT URL is : {}', torrent_url)
     return torrent_url
Example No. 18
 def url_rewrite(self, task, entry):
     try:
         # need to fake user agent
         txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
         page = task.requests.get(entry['url'], headers=txheaders)
         soup = get_soup(page.text)
         results = soup.find_all('a', attrs={'class': 'l'})
         if not results:
             raise UrlRewritingError('No results')
         for res in results:
             url = res.get('href')
             url = url.replace('/interstitial?url=', '')
             # generate match regexp from google search result title
             regexp = '.*'.join([x.contents[0] for x in res.find_all('em')])
             if re.match(regexp, entry['title']):
                 logger.debug('resolved, found with {}', regexp)
                 entry['url'] = url
                 return
         raise UrlRewritingError('Unable to resolve')
     except Exception as e:
         raise UrlRewritingError(e)
Example No. 19
 def url_rewrite(self, task, entry):
     if 'url' not in entry:
         logger.error("Didn't actually get a URL...")
     else:
         logger.debug('Got the URL: {}', entry['url'])
     if entry['url'].startswith('https://www.torrentleech.org/torrents/browse/list/query/'):
         # use search
         results = self.search(task, entry)
         if not results:
             raise UrlRewritingError("No search results found")
         # TODO: Search doesn't enforce close match to title, be more picky
         entry['url'] = results[0]['url']
Example No. 20
 def url_rewrite(self, task, entry):
     if 'url' not in entry:
         logger.error("Didn't actually get a URL...")
     else:
         logger.debug('Got the URL: {}', entry['url'])
     if entry['url'].startswith(SEARCH_URL):
         # use search
         results = self.search(task, entry)
         if not results:
             raise UrlRewritingError("No search results found")
         # TODO: Search doesn't enforce close match to title, be more picky
         entry['url'] = results[0]['url']
Example No. 21
 def url_rewrite(self, task, entry):
     if 'url' not in entry:
         logger.error("Didn't actually get a URL...")
     else:
         logger.debug('Got the URL: {}', entry['url'])
     if self.url_search.match(entry['url']):
         # use search
         results = self.search(task, entry)
         if not results:
             raise UrlRewritingError("No search results found")
         # TODO: Close matching was taken out of search methods, this may need to be fixed to be more picky
         entry['url'] = results[0]['url']
     else:
         torrent_id = self.url_match(entry['url']).group(1)
         url = f"{self.url}/api/v1/torrents/{torrent_id}"
         logger.debug('Getting info for torrent ID {}', torrent_id)
         json_result = task.requests.get(url).json()
         # if json_result['error'] == '404':
         if 'result' not in json_result:
             raise UrlRewritingError("Torrent with ID %s does not exist." %
                                     torrent_id)
         entry['url'] = json_result['result']['rating']['magnet']
Example No. 22
 def url_rewrite(self, task, entry):
     if 'url' not in entry:
         logger.error("Didn't actually get a URL...")
     else:
         logger.debug('Got the URL: {}', entry['url'])
     if self.url_search.match(entry['url']):
         # use search
         results = self.search(task, entry)
         if not results:
             raise UrlRewritingError("No search results found")
         # TODO: Close matching was taken out of search methods, this may need to be fixed to be more picky
         entry['url'] = results[0]['url']
     else:
         torrent_id = self.url_match.match(entry['url']).group(1)
         url = f"{self.url}/t.php?id={torrent_id}"
         logger.debug('Getting info for torrent ID {}', torrent_id)
         json_result = task.requests.get(url).json()
         if json_result['id'] == '0':
             raise UrlRewritingError(
                 f"Torrent with ID {torrent_id} does not exist.")
         entry['url'] = self.info_hash_to_magnet(json_result['info_hash'],
                                                 json_result['name'])
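
info_hash_to_magnet is a helper not shown here; a hypothetical standalone equivalent might look like the following (the plugin's real helper may also attach trackers or other parameters):

from urllib.parse import quote


def info_hash_to_magnet(info_hash, name):
    """Hypothetical standalone equivalent of the helper used above:
    wrap an info hash and display name into a bare magnet URI."""
    return f"magnet:?xt=urn:btih:{info_hash}&dn={quote(name)}"


print(info_hash_to_magnet('0123456789ABCDEF0123456789ABCDEF01234567',
                          'Some Show S01E01'))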
Example No. 23
    def parse_download_page(self, url, task):
        log.verbose('Descargas2020 URL: %s', url)

        try:
            page = self.requests.get(url)
        except requests.RequestException as e:
            raise UrlRewritingError(e)
        try:
            soup = get_soup(page.text)
        except Exception as e:
            raise UrlRewritingError(e)

        torrent_id = None
        url_format = DESCARGAS2020_TORRENT_FORMAT

        torrent_id_prog = re.compile(
            r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)"
            "'\s*:\s*'(\d+)'")
        torrent_ids = soup.findAll(text=torrent_id_prog)
        if torrent_ids:
            match = torrent_id_prog.search(torrent_ids[0])
            if match:
                torrent_id = match.group(1)
        if not torrent_id:
            log.debug('torrent ID not found, searching openTorrent script')
            match = re.search(
                r'function openTorrent.*\n.*\{.*(\n.*)+window\.location\.href =\s*\".*\/(\d+.*)\";',
                page.text,
                re.MULTILINE,
            )
            if match:
                torrent_id = match.group(2).rstrip('/')

        if not torrent_id:
            raise UrlRewritingError('Unable to locate torrent ID from url %s' %
                                    url)

        return url_format.format(torrent_id)
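
Both regular expressions probe specific page layouts. A standalone sketch running them against hypothetical page fragments (the real Descargas2020 markup may differ), to show which group carries the torrent ID:

import re

# Hypothetical page fragments illustrating the two layouts probed above.
script_with_params = """
var parametros = {
    'subaction': 'downloadTorrent',
    'torrentID': '12345'
};
"""
script_with_open = '''
function openTorrent( torrent )
{
    window.location.href = "http://torrentsite.example/descargar-torrent/12345/";
}
'''

torrent_id_prog = re.compile(
    r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)"
    r"'\s*:\s*'(\d+)'")
match = torrent_id_prog.search(script_with_params)
print(match.group(1))  # -> 12345

match = re.search(
    r'function openTorrent.*\n.*\{.*(\n.*)+window\.location\.href =\s*\".*\/(\d+.*)\";',
    script_with_open, re.MULTILINE)
print(match.group(2).rstrip('/'))  # -> 12345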
Example No. 24
 def url_rewrite(self, task, entry):
     if 'url' not in entry:
         log.error("Didn't actually get a URL...")
     else:
         log.debug("Got the URL: %s" % entry['url'])
     if self.url_search.match(entry['url']):
         # use search
         results = self.search(task, entry)
         if not results:
             raise UrlRewritingError("No search results found")
         # TODO: Close matching was taken out of search methods, this may need to be fixed to be more picky
         entry['url'] = results[0]['url']
     else:
         # parse download page
         entry['url'] = self.parse_download_page(entry['url'], task.requests)
Example No. 25
    def url_rewrite(self, task, entry):
        for name, config in self.resolves.get(task.name, {}).items():
            regexp = config['regexp_compiled']
            fmt = config['format']
            if regexp.search(entry['url']):
                log.debug('Regexp resolving %s with %s', entry['url'], name)

                # run the regexp
                entry['url'] = regexp.sub(fmt, entry['url'])

                if regexp.match(entry['url']):
                    entry.fail('urlrewriting')
                    raise UrlRewritingError(
                        'Regexp %s result should NOT continue to match!' % name
                    )
                return
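
Each rule pairs a compiled regexp with a replacement format that regexp.sub applies to the URL. A tiny sketch with a hypothetical rewrite rule:

import re

# Hypothetical rewrite rule: turn a details-page URL into a download URL,
# in the spirit of the regexp/format pairs consumed above.
regexp = re.compile(r'https://tracker\.example/details\.php\?id=(\d+)')
fmt = r'https://tracker.example/download.php?id=\1'

url = 'https://tracker.example/details.php?id=4242'
if regexp.search(url):
    print(regexp.sub(fmt, url))  # -> https://tracker.example/download.php?id=4242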
Example No. 26
    def url_rewrite(self, task, entry):
        soup = self._get_soup(task, entry['url'])

        # grab links from the main post:
        link_elements = []
        log.debug(
            'Searching %s for a tags where the text matches one of: %s',
            entry['url'],
            str(self.config.get('link_text_re')),
        )
        for regexp in self.config.get('link_text_re'):
            link_elements.extend(soup.find_all('a', string=re.compile(regexp)))
        if 'urls' in entry:
            urls = list(entry['urls'])
            log.debug('Original urls: %s', str(entry['urls']))
        else:
            urls = []
        log.debug('Found link elements: %s', str(link_elements))
        for element in link_elements:
            if re.search('nfo1.rlsbb.(ru|com)', element['href']):
                # grab multipart links
                urls.extend(self.grab_multilinks(task, element['href']))
            else:
                urls.append(element['href'])

        # grab links from comments
        regexps = self.config.get('filehosters_re', [])
        if self.config.get('parse_comments'):
            comments = soup.find_all('div', id=re.compile("commentbody"))
            log.debug('Comment parsing enabled: found %d comments.',
                      len(comments))
            if comments and not regexps:
                log.warning(
                    'You have enabled comment parsing but you did not define any filehosters_re filter. You may get a lot of unwanted and potentially dangerous links from the comments.'
                )
            for comment in comments:
                links = comment.find_all('a')
                for link in links:
                    urls.append(link['href'])

        # filter urls:
        filtered_urls = []
        for i, url in enumerate(urls):
            urls[i] = normalize_unicode(url)
            for regexp in regexps:
                if re.search(regexp, urls[i]):
                    filtered_urls.append(urls[i])
                    log.debug('Url: "%s" matched filehoster filter: %s',
                              urls[i], regexp)
                    break
            else:
                if regexps:
                    log.debug(
                        'Url: "%s" was discarded because it does not match any of the given filehoster filters: %s',
                        urls[i],
                        str(regexps),
                    )
        if regexps:
            log.debug('Using filehosters_re filters: %s', str(regexps))
            urls = filtered_urls
        else:
            log.debug(
                'No filehoster filters configured, using all found links.')
        num_links = len(urls)
        log.verbose('Found %d links at %s.', num_links, entry['url'])
        if num_links:
            entry['urls'] = urls
            entry['url'] = urls[0]
        else:
            raise UrlRewritingError('No useable links found at %s' %
                                    entry['url'])
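
This example and Example No. 14 share the same for/else filtering idiom. A compact sketch with made-up links and filters, showing that the else branch only fires when no filehoster pattern matched:

import re

# Hypothetical scraped links and filehoster filters, mirroring the loop above.
urls = [
    'https://rapidgator.example/file/abc/show.s01e01.rar',
    'https://unknownhost.example/file/xyz',
]
filehosters_re = [r'rapidgator\.example', r'nitroflare\.example']

filtered = []
for url in urls:
    for regexp in filehosters_re:
        if re.search(regexp, url):
            filtered.append(url)
            break
    else:
        # no filter matched; the for/else drops the link
        print(f'discarding {url}')

print(filtered)  # only the rapidgator link survives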