def horrible_entries(requests, page_url):
    entries = []
    try:
        soup = get_soup(requests.get(page_url).content)
    except RequestException as e:
        log.error('HorribleSubs request failed: %s', e)
        return entries
    for li_label in soup.findAll('li'):
        title = '[HorribleSubs] {0}{1}'.format(
            str(li_label.find('span').next_sibling), str(li_label.find('strong').text)
        )
        log.debug('Found title `%s`', title)
        url = li_label.find('a')['href']
        episode = re.sub(r'.*#', '', url)
        # Get show ID
        try:
            soup = get_soup(requests.get('https://horriblesubs.info/{0}'.format(url)).content)
        except RequestException as e:
            log.error('HorribleSubs request failed: %s', e)
            return entries
        show_id = re.sub(r'[^0-9]', '', soup(text=re.compile('hs_showid'))[0])
        entries = HorribleSubs.horrible_get_downloads(
            requests,
            title,
            'https://horriblesubs.info/api.php?method=getshows&type=show&mode=filter&showid={0}&value={1}'.format(
                show_id, episode
            ),
        )
    return entries
def parse_html_list(self, task, config, url, params, headers): page = self.fetch_page(task, url, params, headers) soup = get_soup(page.text) try: item_text = soup.find('div', class_='lister-total-num-results').string.split() total_item_count = int(item_text[0].replace(',', '')) log.verbose('imdb list contains %d items', total_item_count) except AttributeError: total_item_count = 0 except (ValueError, TypeError) as e: # TODO Something is wrong if we get a ValueError, I think raise plugin.PluginError( 'Received invalid movie count: %s ; %s' % (soup.find('div', class_='lister-total-num-results').string, e) ) if not total_item_count: log.verbose('No movies were found in imdb list: %s', config['list']) return entries = [] items_processed = 0 page_no = 1 while items_processed < total_item_count: # Fetch the next page unless we've just begun if items_processed: page_no += 1 params['page'] = page_no page = self.fetch_page(task, url, params, headers) soup = get_soup(page.text) items = soup.find_all('div', class_='lister-item') if not items: log.debug('no items found on page: %s, aborting.', url) break log.debug('%d items found on page %d', len(items), page_no) for item in items: items_processed += 1 a = item.find('h3', class_='lister-item-header').find('a') if not a: log.debug('no title link found for row, skipping') continue link = ('http://www.imdb.com' + a.get('href')).rstrip('/') entry = Entry() entry['title'] = a.text try: year = int(item.find('span', class_='lister-item-year').text) entry['title'] += ' (%s)' % year entry['imdb_year'] = year except (ValueError, TypeError): pass entry['url'] = link entry['imdb_id'] = extract_id(link) entry['imdb_name'] = entry['title'] entries.append(entry) return entries
def on_task_input(self, task, config): # use rss plugin # since we have to do 2 page lookups per trailer, use all_entries False to lighten load rss_config = {'url': self.rss_url, 'all_entries': False} rss_entries = super(AppleTrailers, self).on_task_input(task, rss_config) # Multiple entries can point to the same movie page (trailer 1, clip1, etc.) trailers = {} for entry in rss_entries: url = entry['original_url'] trailers.setdefault(url, []).append(entry['title']) result = [] if config == '720p': url_extension = 'includes/extralarge.html' else: url_extension = 'includes/large.html' for url, titles in trailers.iteritems(): inc_url = url + url_extension try: page = task.requests.get(inc_url) except RequestException as err: log.warning("RequestsException when opening playlist page: %s" % err) continue soup = get_soup(page.text) for title in titles: trailer = soup.find(text=title.split(' - ')[1]) if not trailer: log.debug('did not find trailer link') continue trailers_link = trailer.find_parent('a') if not trailers_link: log.debug('did not find trailer link') continue try: page = task.requests.get(urlparse.urljoin(url, trailers_link['href'])) except RequestException as e: log.debug('error getting trailers page') continue trailer_soup = get_soup(page.text) link = trailer_soup.find('a', attrs={'class': 'movieLink'}) if not link: log.debug('could not find download link') continue # Need to add an 'h' in front of the resolution entry_url = link['href'] entry_url = entry_url[:entry_url.find(config + '.mov')] + 'h%s.mov' % config entry = Entry(title, entry_url) # Populate a couple entry fields for making pretty filenames entry['movie_name'], entry['apple_trailers_name'] = title.split(' - ') result.append(entry) return result
def get_fuzer_soup(search_term, categories_list):
    params = {'matchquery': 'any'}
    page = requests.get(
        'https://www.fuzer.me/browse.php?ref_=advanced&query={}&{}'.format(
            search_term, '&'.join(categories_list)
        ),
        params=params,
    )
    log.debug('Using %s as fuzer search url' % page.url)
    return get_soup(page.content)
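Most of the snippets in this collection follow the same fetch-then-parse shape as the helper above. A minimal, self-contained sketch of that pattern, using requests and BeautifulSoup directly rather than FlexGet's get_soup wrapper, might look like this; the URL and CSS selector are placeholders, not taken from any plugin here.

import requests
from bs4 import BeautifulSoup

def fetch_rows(url, selector='table tr'):
    # Download a page and return the elements matching a CSS selector.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.select(selector)

# e.g. rows = fetch_rows('https://example.org/browse', 'tr.result-row')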
def search(self, entry, config):
    url = "https://tehconnection.eu/torrents.php?searchstr=%s" % entry.get("imdb_id")
    page = urlopener(url, log)
    soup = get_soup(page)
    results = set()
    for row in soup.find_all("tr", class_="group_torrent"):
        link = row.find(title="Download")
        info = row.find(colspan="1").contents[3].contents[0].strip()
        seeders = int(row.find_all("td")[6].contents[0].strip())
        leechers = int(row.find_all("td")[7].contents[0].strip())

        result = Entry()
        result["title"] = entry.get("title") + " / " + info
        result["imdb_id"] = entry.get("imdb_id")
        result["url"] = "https://tehconnection.eu" + link.get("href")
        result["torrent_seeds"] = seeders
        result["torrent_leeches"] = leechers
        result["search_sort"] = torrent_availability(result['torrent_seeds'], result['torrent_leeches'])

        results.add(result)
    return results
def search(self, task, entry, config=None):
    """
    Search for entries on SceneAccess
    """
    if not session.cookies:
        log.debug('Logging in to %s...' % URL)
        params = {
            'username': config['username'],
            'password': config['password'],
            'submit': 'come on in',
        }
        session.post(URL + 'login', data=params)

    # dict.has_key() no longer exists in Python 3; membership test works in both versions
    if 'gravity_multiplier' in config:
        multip = config['gravity_multiplier']
    else:
        multip = 1

    # Prepare queries...
    BASE_URLS = list()
    entries = set()
    for category in self.processCategories(config):
        BASE_URLS.append(URL + '%(url_path)s?method=2%(category_url_string)s' % category)

    # Search...
    for search_string in entry.get('search_strings', [entry['title']]):
        search_string_normalized = normalize_unicode(clean_title(search_string))
        search_string_url_fragment = '&search=' + quote(search_string_normalized.encode('utf8'))

        for url in BASE_URLS:
            url += search_string_url_fragment
            log.debug('Search URL for `%s`: %s' % (search_string, url))

            page = session.get(url).content
            soup = get_soup(page)

            for result in soup.findAll('tr', attrs={'class': 'tt_row'}):
                entry = Entry()
                entry['title'] = result.find('a', href=re.compile(r'details\?id=\d+'))['title']
                entry['url'] = URL + result.find('a', href=re.compile(r'.torrent$'))['href']
                entry['torrent_seeds'] = result.find('td', attrs={'class': 'ttr_seeders'}).text
                entry['torrent_leeches'] = result.find('td', attrs={'class': 'ttr_leechers'}).text
                entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches']) * multip

                size = result.find('td', attrs={'class': 'ttr_size'}).next
                size = re.search(r'(\d+(?:[.,]\d+)*)\s?([KMG]B)', size)

                if size:
                    if size.group(2) == 'GB':
                        entry['content_size'] = int(float(size.group(1)) * 1000 ** 3 / 1024 ** 2)
                    elif size.group(2) == 'MB':
                        entry['content_size'] = int(float(size.group(1)) * 1000 ** 2 / 1024 ** 2)
                    elif size.group(2) == 'KB':
                        entry['content_size'] = int(float(size.group(1)) * 1000 / 1024 ** 2)
                    else:
                        entry['content_size'] = int(float(size.group(1)) / 1024 ** 2)

                entries.add(entry)

    return entries
def url_rewrite(self, task, entry):
    url = entry['url']
    page = None
    for (scheme, netloc) in EZTV_MIRRORS:
        try:
            _, _, path, params, query, fragment = urlparse(url)
            url = urlunparse((scheme, netloc, path, params, query, fragment))
            page = task.requests.get(url).content
        except RequestException as e:
            log.debug('Eztv mirror `%s` seems to be down', url)
            continue
        break

    if not page:
        raise UrlRewritingError('No mirrors found for url %s' % entry['url'])

    log.debug('Eztv mirror `%s` chosen', url)

    try:
        soup = get_soup(page)
        mirrors = soup.find_all('a', attrs={'class': re.compile(r'download_\d')})
    except Exception as e:
        raise UrlRewritingError(e)

    log.debug('%d torrent mirrors found', len(mirrors))

    if not mirrors:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)

    entry['urls'] = [m.get('href') for m in mirrors]
    entry['url'] = mirrors[0].get('href')
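The loop above retries the same path against each (scheme, netloc) pair in EZTV_MIRRORS until one of them answers. A reduced, self-contained sketch of that failover idea, with made-up mirror hosts standing in for the real list:

from urllib.parse import urlparse, urlunparse

import requests
from requests import RequestException

MIRRORS = [('https', 'mirror-one.example'), ('https', 'mirror-two.example')]

def fetch_from_mirrors(url, mirrors=MIRRORS):
    # Rebuild the url against each mirror host and return the first page that loads.
    parts = urlparse(url)
    for scheme, netloc in mirrors:
        candidate = urlunparse((scheme, netloc, parts.path, parts.params, parts.query, parts.fragment))
        try:
            return requests.get(candidate, timeout=30).content
        except RequestException:
            continue  # mirror unreachable, try the next one
    raise RuntimeError('No working mirror for %s' % url)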
def search(self, task, entry, config=None):
    txheaders = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
    }

    nzbs = set()

    for search_string in entry.get('search_strings', [entry['title']]):
        # use the current search string rather than always the entry title
        query = search_string
        url = u'http://newzleech.com/?%s' % str(urllib.urlencode({'q': query.encode('latin1'), 'm': 'search',
                                                                  'group': '', 'min': 'min', 'max': 'max',
                                                                  'age': '', 'minage': '', 'adv': ''}))
        # log.debug('Search url: %s' % url)

        req = urllib2.Request(url, headers=txheaders)
        page = urlopener(req, log)
        soup = get_soup(page)

        for item in soup.find_all('table', attrs={'class': 'contentt'}):
            subject_tag = item.find('td', attrs={'class': 'subject'}).next
            subject = ''.join(subject_tag.find_all(text=True))
            complete = item.find('td', attrs={'class': 'complete'}).contents[0]
            size = item.find('td', attrs={'class': 'size'}).contents[0]
            nzb_url = 'http://newzleech.com/' + item.find('td', attrs={'class': 'get'}).next.get('href')

            # generate regexp from entry title and see if it matches subject
            regexp = query
            wildcardize = [' ', '-']
            for wild in wildcardize:
                regexp = regexp.replace(wild, '.')
            regexp = '.*' + regexp + '.*'
            # log.debug('Title regexp: %s' % regexp)

            if re.match(regexp, subject):
                log.debug('%s matches to regexp' % subject)
                if complete != u'100':
                    log.debug('Match is incomplete %s from newzleech, skipping ..' % query)
                    continue
                log.info('Found \'%s\'' % query)

                try:
                    size_num = float(size[:-3])
                except (ValueError, TypeError):
                    log.error('Failed to parse_size %s' % size)
                    size_num = 0
                # convert into megabytes
                if 'GB' in size:
                    size_num *= 1024
                if 'KB' in size:
                    size_num /= 1024

                # choose largest file
                nzbs.add(Entry(title=subject, url=nzb_url, content_size=size_num, search_sort=size_num))
    return nzbs
def on_task_input(self, task, config): session = requests.Session() data = {'username': config['username'], 'password': config['password'], 'sub_login': '******'} try: r = session.post('http://www.pogdesign.co.uk/cat/', data=data) if 'U / P Invalid' in r.text: raise plugin.PluginError('Invalid username/password for pogdesign.') page = session.get('http://www.pogdesign.co.uk/cat/showselect.php') except requests.RequestException as e: raise plugin.PluginError('Error retrieving source: %s' % e) soup = get_soup(page.text) entries = [] for row in soup.find_all('label', {'class': 'label_check'}): if row.find(attrs={'checked': 'checked'}): t = row.find('strong').text if t.endswith('[The]'): t = 'The ' + t[:-6] # Make certain names friendlier if t in self.name_map: t = self.name_map[t] e = Entry() e['title'] = t url = row.find_next('a', {'class': 'slink'}) e['url'] = 'http://www.pogdesign.co.uk' + url['href'] entries.append(e) return entries
def parse_download_page(self, url):
    if 'newpct1.com' in url:
        log.verbose('Newpct1 URL: %s', url)
        url = url.replace('newpct1.com/', 'newpct1.com/descarga-torrent/')
    else:
        log.verbose('Newpct URL: %s', url)

    try:
        page = requests.get(url)
    except requests.exceptions.RequestException as e:
        raise UrlRewritingError(e)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)

    if 'newpct1.com' in url:
        torrent_id_prog = re.compile(r'descargar-torrent/(.+)/')
        torrent_ids = soup.findAll(href=torrent_id_prog)
    else:
        torrent_id_prog = re.compile(
            r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)'\s*:\s*'(\d+)'"
        )
        torrent_ids = soup.findAll(text=torrent_id_prog)

    if len(torrent_ids) == 0:
        raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)

    if 'newpct1.com' in url:
        torrent_id = torrent_id_prog.search(torrent_ids[0]['href']).group(1)
        return NEWPCT1_TORRENT_FORMAT.format(torrent_id)
    else:
        torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1)
        return NEWPCT_TORRENT_FORMAT.format(torrent_id)
def _get_watchlist_entries(self, task, config): email = config.get("email") log.info("Retrieving npo.nl episode watchlist for %s", email) response = self._get_page(task, config, "https://mijn.npo.nl/profiel/kijklijst") page = get_soup(response.content) self.csrf_token = page.find("meta", attrs={"name": "csrf-token"})["content"] entries = list() for listItem in page.findAll("div", class_="watch-list-item"): url = listItem.find("a")["href"] series_name = next(listItem.find("h3").stripped_strings) remove_url = listItem.find("a", class_="unwatch-confirm")["href"] entry_date = self._parse_date(listItem.find("span", class_="global__content-info").text) episode_id = url.split("/")[-1] title = "{} ({})".format(series_name, episode_id) e = Entry() e["url"] = self._prefix_url("https://mijn.npo.nl", url) e["title"] = title e["series_name"] = series_name e["series_name_plain"] = self._strip_accents(series_name) e["series_date"] = entry_date e["series_id_type"] = "date" e["description"] = listItem.find("p").text e["remove_url"] = self._prefix_url("https://mijn.npo.nl", remove_url) if config.get("remove_accepted"): e.on_complete(self.entry_complete, task=task) entries.append(e) return entries
def search(self, task, entry, config=None): """ Search for name from piratebay. """ if not isinstance(config, dict): config = {} self.set_urls(config.get('url', URL)) sort = SORT.get(config.get('sort_by', 'seeds')) if config.get('sort_reverse'): sort += 1 if isinstance(config.get('category'), int): category = config['category'] else: category = CATEGORIES.get(config.get('category', 'all')) filter_url = '/0/%d/%d' % (sort, category) entries = set() for search_string in entry.get('search_strings', [entry['title']]): query = normalize_unicode(search_string) # TPB search doesn't like dashes or quotes query = query.replace('-', ' ').replace("'", " ") # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand url = '%s/search/%s%s' % (self.url, quote(query.encode('utf-8')), filter_url) log.debug('Using %s as piratebay search url' % url) page = task.requests.get(url).content soup = get_soup(page) for link in soup.find_all('a', attrs={'class': 'detLink'}): entry = Entry() entry['title'] = self.extract_title(link) if not entry['title']: log.error('Malformed search result. No title or url found. Skipping.') continue href = link.get('href') if href.startswith('/'): # relative link? href = self.url + href entry['url'] = href tds = link.parent.parent.parent.find_all('td') entry['torrent_seeds'] = int(tds[-2].contents[0]) entry['torrent_leeches'] = int(tds[-1].contents[0]) entry['torrent_availability'] = torrent_availability( entry['torrent_seeds'], entry['torrent_leeches'] ) # Parse content_size size_text = link.find_next(attrs={'class': 'detDesc'}).get_text() if size_text: size = re.search(r'Size (\d+(\.\d+)?\xa0(?:[PTGMK])?i?B)', size_text) if size: entry['content_size'] = parse_filesize(size.group(1)) else: log.error( 'Malformed search result? Title: "%s", No size? %s', entry['title'], size_text, ) entries.add(entry) return sorted(entries, reverse=True, key=lambda x: x.get('torrent_availability'))
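Several of the searchers here turn the human-readable size shown in the result row ('1.4 GiB', '700 MB') into a number; the snippet above delegates that to FlexGet's parse_filesize. A standalone sketch of the same conversion, whose regex and unit table are assumptions rather than a copy of that helper:

import re

UNIT_FACTORS = {
    'B': 1,
    'KB': 1000, 'KIB': 1024,
    'MB': 1000 ** 2, 'MIB': 1024 ** 2,
    'GB': 1000 ** 3, 'GIB': 1024 ** 3,
    'TB': 1000 ** 4, 'TIB': 1024 ** 4,
}

def size_text_to_mib(text):
    # Return the size in MiB, or None if the text does not contain a recognizable size.
    match = re.search(r'([\d.,]+)\s*([KMGT]?i?B)', text, re.IGNORECASE)
    if not match:
        return None
    value = float(match.group(1).replace(',', ''))
    return value * UNIT_FACTORS[match.group(2).upper()] / 1024 ** 2

# e.g. size_text_to_mib('Size 1.4\xa0GiB') -> 1433.6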
def on_feed_input(self, feed, config):
    config = self.build_config(config)

    log.debug('InputPlugin html requesting url %s' % config['url'])

    if config.get('username') and config.get('password'):
        log.debug('Basic auth enabled. User: %s Password: %s' % (config['username'], config['password']))
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, config['url'], config['username'], config['password'])
        handlers = [urllib2.HTTPBasicAuthHandler(passman)]
    else:
        handlers = None

    page = urlopener(config['url'], log, handlers=handlers)
    soup = get_soup(page)
    log.debug('Detected encoding %s' % soup.originalEncoding)

    # dump received content into a file
    if 'dump' in config:
        name = config['dump']
        log.info('Dumping %s into %s' % (config['url'], name))
        data = soup.prettify()
        with open(name, 'w') as f:
            f.write(data)

    return self.create_entries(config['url'], soup, config)
def horrible_get_downloads(requests, title, page_url):
    entries = []
    try:
        soup = get_soup(requests.get(page_url).content)
    except RequestException as e:
        log.error('HorribleSubs request failed: %s', e)
        return entries
    for div in soup.findAll('div', attrs={'class': 'rls-link'}):
        ttitle = '{0} [{1}]'.format(title, re.sub(r'.*-', '', div['id']))
        urls = []
        for url in div.findAll('a'):
            # skip non torrent based links
            if (
                'hs-ddl-link' in url.parent.attrs['class']
                or 'hs-xdcc-link' in url.parent.attrs['class']
            ):
                continue
            log.debug('Found url `%s`', url)
            urls.append(url.attrs['href'])
        # move magnets to last, a bit hacky
        for url in urls[:]:
            if url.startswith('magnet'):
                urls.remove(url)
                urls.append(url)
        entries.append(Entry(title=ttitle, url=urls[0], urls=urls))
    return entries
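The 'move magnets to last' block above removes and re-appends magnet links while iterating over a copy of the list. The same reordering can be written as a stable sort keyed on whether a URL is a magnet link, as in this small sketch (the sample URLs are invented):

def magnets_last(urls):
    # False sorts before True, so non-magnet urls keep their order and magnets go last.
    return sorted(urls, key=lambda u: u.startswith('magnet'))

urls = ['magnet:?xt=urn:btih:abc', 'https://example.org/a.torrent', 'https://example.org/b.torrent']
assert magnets_last(urls) == ['https://example.org/a.torrent', 'https://example.org/b.torrent', 'magnet:?xt=urn:btih:abc']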
def _get_favorites_entries(self, task, config): email = config.get('email') max_age = config.get('max_episode_age_days') log.info('Retrieving npo.nl favorite series for %s', email) response = self._get_page(task, config, 'https://mijn.npo.nl/profiel/favorieten') page = get_soup(response.content) entries = list() for listItem in page.findAll('div', class_='thumb-item'): url = listItem.find('a')['href'] if url == '/profiel/favorieten/favorieten-toevoegen': log.debug("Skipping 'add favorite' button") continue url = self._prefix_url('https://mijn.npo.nl', url) series_name = next(listItem.find('div', class_='thumb-item__title').stripped_strings) last_aired_text = listItem.find('div', class_='thumb-item__subtitle').text last_aired_text = last_aired_text.rsplit('Laatste aflevering ')[-1] last_aired = self._parse_date(last_aired_text) if last_aired is None: log.info('Series %s did not yet start', series_name) continue elif max_age >= 0 and (date.today() - last_aired) > timedelta(days=max_age): log.debug('Skipping %s, last aired on %s', series_name, last_aired) continue elif (date.today() - last_aired) > timedelta(days=365*2): log.info('Series %s last aired on %s', series_name, last_aired) entries += self._get_series_episodes(task, config, series_name, url) return entries
def _get_watchlist_entries(self, task, config): email = config.get('email') log.info('Retrieving npo.nl episode watchlist for %s', email) response = self._get_page(task, config, 'https://mijn.npo.nl/profiel/kijklijst') page = get_soup(response.content) self.csrf_token = page.find('meta', attrs={'name': 'csrf-token'})['content'] entries = list() for list_item in page.findAll('div', class_='watch-list-item'): url = list_item.find('a')['href'] series_name = next(list_item.find('h3').stripped_strings) remove_url = list_item.find('a', class_='unwatch-confirm')['href'] entry_date = self._parse_date(list_item.find('span', class_='global__content-info').text) episode_id = url.split('/')[-1] title = '{} ({})'.format(series_name, episode_id) e = Entry() e['url'] = self._prefix_url('https://mijn.npo.nl', url) e['title'] = title e['series_name'] = series_name e['series_name_plain'] = self._convert_plain(series_name) e['series_date'] = entry_date e['series_id_type'] = 'date' e['description'] = list_item.find('p').text e['remove_url'] = self._prefix_url('https://mijn.npo.nl', remove_url) if config.get('remove_accepted'): e.on_complete(self.entry_complete, task=task) entries.append(e) return entries
def horrible_entries(requests, page_url):
    entries = []
    try:
        soup = get_soup(requests.get(page_url).content)
    except RequestException as e:
        log.error('HorribleSubs request failed: %s', e)
        return entries
    for td_label in soup.findAll('td', attrs={'class': 'dl-label'}):
        title = '[HorribleSubs] {0}'.format(str(td_label.find('i').string))
        urls = []
        log.debug('Found title `{0}`'.format(title))
        for span in td_label.parent.findAll('span', attrs={'class': 'dl-link'}):
            # skip non torrent based links
            if 'hs-ddl-link' in span.parent.attrs['class']:
                continue
            url = str(span.find('a').attrs['href'])
            log.debug('Found url `{0}`'.format(url))
            urls.append(url)
        # move magnets to last, a bit hacky
        for url in urls[:]:
            if url.startswith('magnet'):
                urls.remove(url)
                urls.append(url)
        entries.append(Entry(title=title, url=urls[0], urls=urls))
    return entries
def _solveCaptcha(self, output, url_auth, params, opener):
    """
    When trying to connect too many times with wrong password, a captcha can be requested.
    This captcha is really simple and can be solved by the provider.

    <label for="pass">204 + 65 = </label>
    <input type="text" size="40" name="captchaAnswer" id="lgn" value=""/>
    <input type="hidden" name="captchaQuery" value="204 + 65 = ">
    <input type="hidden" name="captchaToken" value="005d54a7428aaf587460207408e92145">
    <br/>

    :param output: initial login output
    :return: output after captcha resolution
    """
    html = get_soup(output)

    query = html.find("input", {"name": "captchaQuery"})
    token = html.find("input", {"name": "captchaToken"})
    if not query or not token:
        log.error("Unable to solve login captcha.")
        return output

    query_expr = query.attrs["value"].strip("= ")
    log.debug("Captcha query: " + query_expr)
    answer = arithmeticEval(query_expr)

    log.debug("Captcha answer: %s" % answer)

    params["captchaAnswer"] = answer
    params["captchaQuery"] = query.attrs["value"]
    params["captchaToken"] = token.attrs["value"]

    return opener.open(url_auth, urllib.parse.urlencode(params)).read()
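arithmeticEval above answers the site's '204 + 65 =' captcha without handing site-controlled text to eval(). One safe way to implement such a helper is to parse the expression with the ast module and allow only numeric literals and basic operators; this is a sketch of that idea, not necessarily how arithmeticEval is actually written:

import ast
import operator

OPS = {ast.Add: operator.add, ast.Sub: operator.sub, ast.Mult: operator.mul, ast.Div: operator.truediv}

def safe_arithmetic_eval(expression):
    # Recursively evaluate an expression tree, rejecting anything but numbers and + - * /.
    def _eval(node):
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in OPS:
            return OPS[type(node.op)](_eval(node.left), _eval(node.right))
        raise ValueError('Unsupported captcha expression: %r' % expression)
    return _eval(ast.parse(expression, mode='eval'))

# safe_arithmetic_eval('204 + 65') -> 269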
def url_rewrite(self, task, entry): url = entry["url"] page = None for (scheme, netloc) in EZTV_MIRRORS: try: _, _, path, params, query, fragment = urlparse(url) url = urlunparse((scheme, netloc, path, params, query, fragment)) page = task.requests.get(url).content except RequestException as e: log.debug("Eztv mirror `%s` seems to be down", url) continue break if not page: raise UrlRewritingError("No mirrors found for url %s" % entry["url"]) log.debug("Eztv mirror `%s` chosen", url) try: soup = get_soup(page) mirrors = soup.find_all("a", attrs={"class": re.compile(r"download_\d")}) except Exception as e: raise UrlRewritingError(e) log.debug("%d torrent mirrors found", len(mirrors)) if not mirrors: raise UrlRewritingError("Unable to locate download link from url %s" % url) entry["urls"] = [m.get("href") for m in mirrors] entry["url"] = mirrors[0].get("href")
def on_task_input(self, task, config=None):
    config = self.build_config(config)

    url = base_url + config['p_slug'] + config['sort_by']
    max_results = config.get('max_results', 1)
    rcount = 0
    next_page = ''

    log.verbose('Looking for films in Letterboxd list: %s' % url)

    entries = []
    while next_page is not None and rcount < max_results:
        try:
            page = requests.get(url).content
        except RequestException as e:
            raise plugin.PluginError('Error retrieving list from Letterboxd: %s' % e)
        soup = get_soup(page)

        for film in soup.find_all(attrs={config['f_slug']: True}):
            if rcount < max_results:
                entry = self.parse_film(film, config)
                entries.append(entry)
                if 'max_results' in config:
                    rcount += 1

        next_page = soup.select_one('.paginate-nextprev .next')
        if next_page is not None:
            next_page = next_page.get('href')
            if next_page is not None:
                url = base_url + next_page

    return entries
def parse_site(self, url, task): """Parse configured url and return releases array""" try: page = task.requests.get(url).content except RequestException as e: raise plugin.PluginError('Error getting input page: %e' % e) soup = get_soup(page) releases = [] for entry in soup.find_all('div', attrs={'class': 'entry'}): release = {} title = entry.find('h2') if not title: log.debug('No h2 entrytitle') continue release['title'] = title.a.contents[0].strip() log.debug('Processing title %s' % (release['title'])) for link in entry.find_all('a'): # no content in the link if not link.contents: continue link_name = link.contents[0] if link_name is None: continue if not isinstance(link_name, NavigableString): continue link_name = link_name.strip().lower() if link.has_attr('href'): link_href = link['href'] else: continue log.debug('found link %s -> %s' % (link_name, link_href)) # handle imdb link if link_name.lower() == 'imdb': log.debug('found imdb link %s' % link_href) release['imdb_id'] = extract_id(link_href) # test if entry with this url would be rewritable by known plugins (ie. downloadable) temp = {} temp['title'] = release['title'] temp['url'] = link_href urlrewriting = plugin.get_plugin_by_name('urlrewriting') if urlrewriting['instance'].url_rewritable(task, temp): release['url'] = link_href log.trace('--> accepting %s (resolvable)' % link_href) else: log.trace('<-- ignoring %s (non-resolvable)' % link_href) # reject if no torrent link if 'url' not in release: from flexget.utils.log import log_once log_once('%s skipped due to missing or unsupported (unresolvable) download link' % (release['title']), log) else: releases.append(release) return releases
def search(self, task, entry, config=None): """ Search for name from iptorrents """ categories = config.get('category', 'All') # Make sure categories is a list if not isinstance(categories, list): categories = [categories] # If there are any text categories, turn them into their id number categories = [c if isinstance(c, int) else CATEGORIES[c] for c in categories] filter_url = '&'.join((str(c) + '=') for c in categories) entries = set() for search_string in entry.get('search_strings', [entry['title']]): query = normalize_unicode(search_string) query = quote_plus(query.encode('utf8')) url = "{base_url}/t?{filter}&q={query}&qf=".format(base_url=BASE_URL, filter=filter_url, query=query) log.debug('searching with url: %s' % url) req = requests.get(url, cookies={'uid': str(config['uid']), 'pass': config['password']}) if '/u/' + str(config['uid']) not in req.text: raise plugin.PluginError("Invalid cookies (user not logged in)...") soup = get_soup(req.content, parser="html.parser") torrents = soup.find('table', {'id': 'torrents'}) results = torrents.findAll('tr') for torrent in results: if torrent.th and 'ac' in torrent.th.get('class'): # Header column continue if torrent.find('td', {'colspan': '99'}): log.debug('No results found for search %s', search_string) break entry = Entry() link = torrent.find('a', href=re.compile('download'))['href'] entry['url'] = "{base}{link}?torrent_pass={key}".format( base=BASE_URL, link=link, key=config.get('rss_key')) entry['title'] = torrent.find('a', href=re.compile('details')).text seeders = torrent.findNext('td', {'class': 'ac t_seeders'}).text leechers = torrent.findNext('td', {'class': 'ac t_leechers'}).text entry['torrent_seeds'] = int(seeders) entry['torrent_leeches'] = int(leechers) entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches']) size = torrent.findNext(text=re.compile('^([\.\d]+) ([GMK]?)B$')) size = re.search('^([\.\d]+) ([GMK]?)B$', size) entry['content_size'] = parse_filesize(size.group(0)) log.debug('Found entry %s', entry) entries.add(entry) return entries
def parse_download_page(self, url): if 'newpct1.com' in url: log.verbose('Newpct1 URL: %s', url) url = url.replace('newpct1.com/', 'newpct1.com/descarga-torrent/') else: log.verbose('Newpct URL: %s', url) try: page = requests.get(url) except requests.exceptions.RequestException as e: raise UrlRewritingError(e) try: soup = get_soup(page.text) except Exception as e: raise UrlRewritingError(e) if 'newpct1.com' in url: torrent_id_prog = re.compile(r'descargar-torrent/(.+)/') torrent_ids = soup.findAll(href=torrent_id_prog) else: torrent_id_prog = re.compile("'(?:torrentID|id)'\s*:\s*'(\d+)'") torrent_ids = soup.findAll(text=torrent_id_prog) if len(torrent_ids) == 0: raise UrlRewritingError('Unable to locate torrent ID from url %s' % url) if 'newpct1.com' in url: torrent_id = torrent_id_prog.search(torrent_ids[0]['href']).group(1) return 'http://www.newpct1.com/download/%s.torrent' % torrent_id else: torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1) return 'http://www.newpct.com/torrents/{:0>6}.torrent'.format(torrent_id)
def search(self, query, comparator=StringComparator(), config=None): """ Search for name from piratebay. """ if not isinstance(config, dict): config = {} sort = SORT.get(config.get('sort_by', 'seeds')) if config.get('sort_reverse'): sort += 1 if isinstance(config.get('category'), int): category = config['category'] else: category = CATEGORIES.get(config.get('category', 'all')) filter_url = '/0/%d/%d' % (sort, category) comparator.set_seq1(query) query = comparator.search_string() # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand url = 'http://thepiratebay.se/search/' + urllib.quote(query.encode('utf-8')) + filter_url log.debug('Using %s as piratebay search url' % url) page = requests.get(url).content soup = get_soup(page) entries = [] for link in soup.find_all('a', attrs={'class': 'detLink'}): comparator.set_seq2(link.contents[0]) log.debug('name: %s' % comparator.a) log.debug('found name: %s' % comparator.b) log.debug('confidence: %s' % comparator.ratio()) if not comparator.matches(): continue entry = Entry() entry['title'] = link.contents[0] entry['url'] = 'http://thepiratebay.se' + link.get('href') tds = link.parent.parent.parent.find_all('td') entry['torrent_seeds'] = int(tds[-2].contents[0]) entry['torrent_leeches'] = int(tds[-1].contents[0]) entry['search_ratio'] = comparator.ratio() entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches']) # Parse content_size size = link.find_next(attrs={'class': 'detDesc'}).contents[0] size = re.search('Size ([\.\d]+)\xa0([GMK])iB', size) if size: if size.group(2) == 'G': entry['content_size'] = int(float(size.group(1)) * 1000 ** 3 / 1024 ** 2) elif size.group(2) == 'M': entry['content_size'] = int(float(size.group(1)) * 1000 ** 2 / 1024 ** 2) else: entry['content_size'] = int(float(size.group(1)) * 1000 / 1024 ** 2) entries.append(entry) if not entries: dashindex = query.rfind('-') if dashindex != -1: return self.search(query[:dashindex], comparator=comparator) else: raise PluginWarning('No close matches for %s' % query, log, log_once=True) entries.sort(reverse=True, key=lambda x: x.get('search_sort')) return entries
def _get_page(self, task, config, url): login_response = task.requests.get(url) if login_response.url == url: log.debug("Already logged in") return login_response elif login_response.url != "https://mijn.npo.nl/inloggen": raise plugin.PluginError("Unexpected login page: {}".format(login_response.url)) login_page = get_soup(login_response.content) token = login_page.find("input", attrs={"name": "authenticity_token"})["value"] email = config.get("email") password = config.get("password") try: profile_response = task.requests.post( "https://mijn.npo.nl/sessions", {"authenticity_token": token, "email": email, "password": password} ) except requests.RequestException as e: raise plugin.PluginError("Request error: %s" % e.args[0]) if profile_response.url == "https://mijn.npo.nl/sessions": raise plugin.PluginError("Failed to login. Check username and password.") elif profile_response.url != url: raise plugin.PluginError("Unexpected page: {} (expected {})".format(profile_response.url, url)) return profile_response
def on_task_input(self, task, config=None): config = self.build_config(config) url = base_url + config["p_slug"] + config["sort_by"] max_results = config.get("max_results", 1) rcount = 0 next_page = "" log.verbose("Looking for films in Letterboxd list: %s" % url) entries = [] while next_page is not None and rcount < max_results: try: page = requests.get(url).content except RequestException as e: raise plugin.PluginError("Error retrieving list from Letterboxd: %s" % e) soup = get_soup(page) for film in soup.find_all(attrs={config["f_slug"]: True}): if rcount < max_results: entry = self.parse_film(film, config) entries.append(entry) if "max_results" in config: rcount += 1 next_page = soup.find(class_="paginate-next") if next_page is not None: next_page = next_page.get("href") if next_page is not None: url = base_url + next_page return entries
def parse_download_page(self, url):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url, None, txheaders)
    page = urlopener(req, log)
    try:
        soup = get_soup(page)
    except Exception as e:
        raise UrlRewritingError(e)

    config = self.config or {}
    config.setdefault('quality', 'hd')

    links = soup.find_all('a', text="Descargar", href=re.compile("/subtitles"))
    if not links:
        raise UrlRewritingError('Unable to locate subtitle download link from url %s' % url)

    subtitle_url = ''
    for link in links:
        sub_url = link['href']
        log.verbose('Found url %s', sub_url)
        if config['quality'] == 'hd' and re.search("720p|1080p", sub_url):
            subtitle_url = 'http://www.argenteam.net' + sub_url
            log.verbose('is a match')
            break
        if config['quality'] == 'sd' and re.search("720p|1080p", sub_url) is None:
            subtitle_url = 'http://www.argenteam.net' + sub_url
            log.verbose('is a match')
            break

    if subtitle_url == '':
        raise UrlRewritingError('Unable to locate download link %s from url %s' % (config['quality'], url))
    return subtitle_url
def _get_page(self, task, config, url): login_response = requests.get(url) if login_response.url == url: log.debug('Already logged in') return login_response elif login_response.url != 'https://mijn.npo.nl/inloggen': raise plugin.PluginError('Unexpected login page: {}'.format(login_response.url)) login_page = get_soup(login_response.content) token = login_page.find('input', attrs={'name': 'authenticity_token'})['value'] email = config.get('email') password = config.get('password') try: profile_response = requests.post('https://mijn.npo.nl/sessions', {'authenticity_token': token, 'email': email, 'password': password}) except requests.RequestException as e: raise plugin.PluginError('Request error: %s' % e.args[0]) if profile_response.url == 'https://mijn.npo.nl/sessions': raise plugin.PluginError('Failed to login. Check username and password.') elif profile_response.url != url: raise plugin.PluginError('Unexpected page: {} (expected {})'.format(profile_response.url, url)) return profile_response
def url_rewrite(self, task, entry):
    log.debug('Requesting %s' % entry['url'])
    page = requests.get(entry['url'])
    soup = get_soup(page.text)

    for link in soup.findAll('a', attrs={'href': re.compile(r'^/url')}):
        # Extract the real target url from google's internal redirect link
        href = 'http://google.com' + link['href']
        args = parse_qs(urlparse(href).query)
        href = args['q'][0]
        # href = link['href'].lstrip('/url?q=').split('&')[0]

        # Test if an entry with this url would be recognized by some urlrewriter
        log.trace('Checking if %s is known by some rewriter' % href)
        fake_entry = {'title': entry['title'], 'url': href}
        urlrewriting = plugin.get_plugin_by_name('urlrewriting')
        if urlrewriting['instance'].url_rewritable(task, fake_entry):
            log.debug('--> rewriting %s (known url pattern)' % href)
            entry['url'] = href
            return
        else:
            log.debug('<-- ignoring %s (unknown url pattern)' % href)
    raise UrlRewritingError('Unable to resolve')
def _login(self, task, config): if 'isAuthenticatedUser' in requests.cookies: log.debug('Already logged in') return login_url = 'https://www.npostart.nl/login' login_api_url = 'https://www.npostart.nl/api/login' try: login_response = requests.get(login_url) if login_response.url != login_url: raise plugin.PluginError('Unexpected login page: {}'.format(login_response.url)) login_page = get_soup(login_response.content) token = login_page.find('input', attrs={'name': '_token'})['value'] email = config.get('email') password = config.get('password') profile_response = requests.post(login_api_url, {'_token': token, 'username': email, 'password': password}) if 'isAuthenticatedUser' not in profile_response.cookies: raise plugin.PluginError('Failed to login. Check username and password.') log.debug('Succesfully logged in: %s', email) except RequestException as e: raise plugin.PluginError('Request error: %s' % str(e))
def search(self, task, entry, config=None): """ Search for name from piratebay. """ if not isinstance(config, dict): config = {} sort = SORT.get(config.get('sort_by', 'seeds')) if config.get('sort_reverse'): sort += 1 if isinstance(config.get('category'), int): category = config['category'] else: category = CATEGORIES.get(config.get('category', 'all')) filter_url = '/0/%d/%d' % (sort, category) entries = set() for search_string in entry.get('search_strings', [entry['title']]): query = normalize_unicode(search_string) # TPB search doesn't like dashes query = query.replace('-', ' ') # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand url = 'http://thepiratebay.%s/search/%s%s' % ( CUR_TLD, urllib.quote(query.encode('utf-8')), filter_url) log.debug('Using %s as piratebay search url' % url) page = requests.get(url).content soup = get_soup(page) for link in soup.find_all('a', attrs={'class': 'detLink'}): entry = Entry() entry['title'] = link.contents[0] entry['url'] = 'http://thepiratebay.%s%s' % (CUR_TLD, link.get('href')) tds = link.parent.parent.parent.find_all('td') entry['torrent_seeds'] = int(tds[-2].contents[0]) entry['torrent_leeches'] = int(tds[-1].contents[0]) entry['search_sort'] = torrent_availability( entry['torrent_seeds'], entry['torrent_leeches']) # Parse content_size size = link.find_next(attrs={'class': 'detDesc'}).contents[0] size = re.search('Size ([\.\d]+)\xa0([GMK])iB', size) if size: if size.group(2) == 'G': entry['content_size'] = int( float(size.group(1)) * 1000**3 / 1024**2) elif size.group(2) == 'M': entry['content_size'] = int( float(size.group(1)) * 1000**2 / 1024**2) else: entry['content_size'] = int( float(size.group(1)) * 1000 / 1024**2) entries.add(entry) return sorted(entries, reverse=True, key=lambda x: x.get('search_sort'))
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    down_link = soup.find('a', attrs={'href': re.compile(".+mp4")})
    if not down_link:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    return down_link.get('href')
def authenticate(self): """Authenticates a session with imdb, and grabs any IDs needed for getting/modifying list.""" try: r = self._session.get( 'https://www.imdb.com/ap/signin?openid.return_to=https%3A%2F%2Fwww.imdb.com%2Fap-signin-' 'handler&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&' 'openid.assoc_handle=imdb_mobile_us&openid.mode=checkid_setup&openid.claimed_id=http%3A%' '2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.ope' 'nid.net%2Fauth%2F2.0') except RequestException as e: raise PluginError(e.args[0]) soup = get_soup(r.content) inputs = soup.select('form#ap_signin_form input') data = dict( (i['name'], i.get('value')) for i in inputs if i.get('name')) data['email'] = self.config['login'] data['password'] = self.config['password'] d = self._session.post('https://www.imdb.com/ap/signin', data=data) # Get user id by extracting from redirect url r = self._session.head('http://www.imdb.com/profile', allow_redirects=False) if not r.headers.get('location') or 'login' in r.headers['location']: raise plugin.PluginError( 'Login to imdb failed. Check your credentials.') self.user_id = re.search('ur\d+(?!\d)', r.headers['location']).group() # Get list ID if self.config['list'] == 'watchlist': data = {'consts[]': 'tt0133093', 'tracking_tag': 'watchlistRibbon'} wl_data = self._session.post( 'http://www.imdb.com/list/_ajax/watchlist_has', data=data).json() try: self.list_id = wl_data['list_id'] except KeyError: raise PluginError( 'No list ID could be received. Please initialize list by ' 'manually adding an item to it and try again') elif self.config['list'] in IMMUTABLE_LISTS or self.config[ 'list'].startswith('ls'): self.list_id = self.config['list'] else: data = {'tconst': 'tt0133093'} list_data = self._session.post( 'http://www.imdb.com/list/_ajax/wlb_dropdown', data=data).json() for li in list_data['items']: if li['wlb_text'] == self.config['list']: self.list_id = li['data_list_id'] break else: raise plugin.PluginError('Could not find list %s' % self.config['list']) self._authenticated = True
def parse_download_page(self, url):
    try:
        page = requests.get(url).content
        soup = get_soup(page, 'html.parser')
        download_link = soup.findAll(href=re.compile('redirect|redirectlink'))
        download_href = download_link[0]['href']
        return download_href
    except Exception:
        raise UrlRewritingError('Unable to locate torrent from url %s' % url)
def on_task_input(self, task, config):
    # use rss plugin
    rss_config = {'url': self.rss_url}
    rss_entries = super(AppleTrailers, self).on_task_input(task, rss_config)

    # Multiple entries can point to the same movie page (trailer 1, clip 1, etc.)
    entries = {}
    for entry in rss_entries:
        url = entry['original_url']
        if url in entries:
            continue
        else:
            title = entry['title']
            entries[url] = title[:title.rfind('-')].rstrip()

    result = []
    for url, title in entries.items():
        inc_url = url + 'includes/playlists/web.inc'
        try:
            page = urlopener(inc_url, log)
        except HTTPError as err:
            log.warning("HTTPError when opening playlist page: %d %s" % (err.code, err.reason))
            continue

        soup = get_soup(page)
        links = soup.find_all('a', attrs={'class': 'target-quicktimeplayer', 'href': re.compile(r'_h?480p\.mov$')})
        for link in links:
            url = link.get('href')
            url = url[:url.rfind('_')]
            quality = self.quality.lower()

            if quality == 'ipod':
                url += '_i320.m4v'
            else:
                url += '_h' + quality + '.mov'

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            match = re.search(r'.*/([^?#]*)', url)
            entry['filename'] = match.group(1)

            result.append(entry)
            log.debug('found trailer %s', url)

    return result
def search(self, task, entry, config=None): if not config: log.debug('Divxatope disabled') return set() log.debug('Search DivxATope') url_search = 'http://divxatope1.com/buscar/descargas' results = set() for search_string in entry.get('search_strings', [entry['title']]): query = normalize_unicode(search_string) query = re.sub(' \(\d\d\d\d\)$', '', query) log.debug('Searching DivxATope %s' % query) query = query.encode('utf8', 'ignore') data = {'q': query} try: response = task.requests.post(url_search, data=data) except requests.RequestException as e: log.error('Error searching DivxATope: %s' % e) return content = response.content soup = get_soup(content) if 'divxatope1.com' in url_search: soup2 = soup.find('ul', attrs={'class': 'buscar-list'}) else: soup2 = soup.find('ul', attrs={'class': 'peliculas-box'}) children = soup2.findAll('a', href=True) for child in children: entry = Entry() entry['url'] = child['href'] entry_title = child.find('h2') if entry_title is None: continue entry_title = entry_title.contents if not entry_title: continue else: entry_title = entry_title[0] quality_lan = child.find('strong') if quality_lan is None: continue quality_lan = quality_lan.contents if len(quality_lan) > 2: if (isinstance(quality_lan[0], Tag)): entry_quality_lan = quality_lan[1] else: entry_quality_lan = quality_lan[0] + ' ' + quality_lan[ 2] elif len(quality_lan) == 2: entry_quality_lan = quality_lan[1] entry['title'] = entry_title + ' ' + entry_quality_lan results.add(entry) log.debug('Finish search DivxATope with %d entries' % len(results)) return results
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    tag_a = soup.find('a', attrs={'class': 'download_link'})
    if not tag_a:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    torrent_url = 'http://www.bakabt.com' + tag_a.get('href')
    return torrent_url
def on_task_input(self, task, config): if not task.requests.cookies: username = config['username'] password = config['password'] log.debug("Logging in to %s ..." % URL) params = { 'username': username, 'password': password, 'action': 'Login' } loginsrc = task.requests.post(URL + 'login.php?action=login', data=params).content if str(username) not in loginsrc: raise plugin.PluginWarning(('Login to myepisodes.com failed, please check ' 'your account data or see if the site is down.'), log) page = task.requests.get(URL + "myshows/manage/").content try: soup = get_soup(page) except Exception as e: raise plugin.PluginError("Unable to parse myepisodes.com page: %s" % (e,)) entries = [] def show_list(select_id): return soup.find('select', {'id': select_id}).findAll('option') options = show_list('shows') if config['include_ignored']: options = chain(options, show_list('ignored_shows')) for option in options: name = option.text if config.get('strip_dates'): # Remove year from end of name if present name = re.sub(r'\s+\(\d{4}\)$', '', name) showid = option.get('value') url = '%sviews.php?type=epsbyshow&showid=%s' % (URL, showid) entry = Entry() entry['title'] = name entry['url'] = url entry['series_name'] = name entry['myepisodes_id'] = showid if entry.isvalid(): entries.append(entry) else: log.debug('Invalid entry created? %s' % entry) if not entries: log.warn("No shows found on myepisodes.com list. Maybe you need to add some first?") return entries
def parse_download_page(self, url):
    page = requests.get(url)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    torrent_id_prog = re.compile(r"'(?:torrentID|id)'\s*:\s*'(\d+)'")
    torrent_ids = soup.findAll(text=torrent_id_prog)
    if len(torrent_ids) == 0:
        raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)
    torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1)
    return 'http://www.newpct.com/descargar/torrent/%s/dummy.html' % torrent_id
def parse_downloads(self, series_url, search_title): page = requests.get(series_url).content try: soup = get_soup(page) except Exception as e: raise UrlRewritingError(e) urls = [] # find all titles episode_titles = self.find_all_titles(search_title) if not episode_titles: raise UrlRewritingError('Unable to find episode') for ep_title in episode_titles: # find matching download episode_title = soup.find('strong', text=re.compile(ep_title, re.I)) if not episode_title: continue # find download container episode = episode_title.parent if not episode: continue # find episode language episode_lang = episode.find_previous('strong', text=re.compile('Sprache')).next_sibling if not episode_lang: log.warning('No language found for: %s', series_url) continue # filter language if not self.check_language(episode_lang): log.warning('languages not matching: %s <> %s', self.config['language'], episode_lang) continue # find download links links = episode.find_all('a') if not links: log.warning('No links found for: %s', series_url) continue for link in links: if not link.has_attr('href'): continue url = link['href'] pattern = 'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % self.config['hoster'] if re.match(pattern, url) or self.config['hoster'] == 'all': urls.append(url) else: continue return urls
def search(self, task, entry, config=None): config = self.prepare_config(config) if not session.cookies: log.debug('Logging in to %s...' % URL) params = { 'username': config['username'], 'password': config['password'], 'keeplogged': '1', 'login': '******' } session.post(URL + 'login.php', data=params) cat = ''.join([ '&' + ('filter_cat[%s]' % id) + '=1' for id in config['category'] ]) rls = 'release_type=' + config['type'] url_params = rls + cat multip = config['gravity_multiplier'] entries = set() for search_string in entry.get('search_strings', [entry['title']]): srch = normalize_unicode(clean_title(search_string)) srch = '&searchstr=' + quote(srch.encode('utf8')) url = URL + 'torrents.php?' + url_params + srch log.debug('Fetching URL for `%s`: %s' % (search_string, url)) page = session.get(url).content soup = get_soup(page) for result in soup.findAll('tr', attrs={'class': 'torrent'}): entry = Entry() entry['title'] = result.find('span', attrs={ 'class': 'torrent_name_link' }).text entry['url'] = URL + result.find( 'a', href=re.compile( 'torrents\.php\?action=download')).get('href') entry['torrent_seeds'], entry['torrent_leeches'] = [ r.text for r in result.findAll('td')[-2:] ] entry['search_sort'] = torrent_availability( entry['torrent_seeds'], entry['torrent_leeches']) * multip size = result.findAll('td')[-4].text size = re.search('(\d+(?:[.,]\d+)*)\s?([KMG]B)', size) entry['content_size'] = parse_filesize(size.group(0)) entries.add(entry) return entries
def on_task_input(self, task):
    pageurl = "http://tvtorrents.com/loggedin/recently_aired.do"
    log.debug("InputPlugin tvtorrents requesting url %s" % pageurl)

    page = urlopener(pageurl, log)
    soup = get_soup(page)

    hscript = soup.find('script', src=None).contents[0]
    hlines = hscript.splitlines()
    hash = hlines[15].strip().split("'")[1]
    digest = hlines[16].strip().split("'")[1]
    hurl = hlines[17].strip().split("'")
    hashurl = hurl[1] + "%s" + hurl[3] + digest + hurl[5] + hash

    for link in soup.find_all('a'):
        # check for the href attribute; `'href' in link` tests the tag contents, not attributes
        if not link.has_attr('href'):
            continue
        url = link['href']
        title = link.contents[0]

        if link.has_attr('onclick') and link['onclick'].find("loadTorrent") != -1:
            infohash = link['onclick'].split("'")[1]
            td = link.parent.parent.contents[4]
            sname = td.contents[0].strip()
            epi = td.contents[2].contents[0].strip()
            title = "%s - %s" % (sname, epi)
            url = hashurl % (infohash,)
        else:
            continue
        if title is None:
            continue

        title = title.strip()
        if not title:
            continue

        # fix broken urls
        if url.startswith('//'):
            url = "http:" + url
        elif not url.startswith('http://') and not url.startswith('https://'):
            # only join relative urls against the page url
            url = urlparse.urljoin(pageurl, url)

        # in case the title contains xxxxxxx.torrent - foooo.torrent clean it a bit (get upto first .torrent)
        if title.lower().find('.torrent') > 0:
            title = title[:title.lower().find(".torrent")]

        entry = Entry()
        entry['url'] = url
        entry['title'] = title

        task.entries.append(entry)
def search(self, task, entry, config): if not session.cookies: try: login_params = { 'username': config['username'], 'password': config['password'], 'loginkey': config['login_key'], } r = session.post('https://piratethenet.org/takelogin.php', data=login_params, verify=False) except requests.RequestException as e: log.error('Error while logging in to PtN: %s', e) raise plugin.PluginError('Could not log in to PtN') passkey = re.search(r'passkey=([\d\w]+)"', r.text) if not passkey: log.error("It doesn't look like PtN login worked properly.") raise plugin.PluginError('PTN cookie info invalid') search_params = default_search_params.copy() if 'movie_name' in entry: if 'movie_year' in entry: search_params[ 'advancedsearchparameters'] = '[year=%s]' % entry[ 'movie_year'] searches = [entry['movie_name']] else: searches = entry.get('search_strings', [entry['title']]) results = set() for search in searches: search_params['searchstring'] = search try: r = session.get('http://piratethenet.org/torrentsutils.php', params=search_params) except requests.RequestException as e: log.error('Error searching ptn: %s' % e) continue # html5parser doesn't work properly for some reason soup = get_soup(r.text, parser='html.parser') for movie in soup.select('.torrentstd'): imdb_id = movie.find('a', href=re.compile('.*imdb\.com/title/tt')) if imdb_id: imdb_id = extract_id(imdb_id['href']) if imdb_id and 'imdb_id' in entry and imdb_id != entry[ 'imdb_id']: continue results.update(self.create_entries(movie, imdb_id=imdb_id)) return results
def on_task_input(self, task, config): # Create entries by parsing AniDB wishlist page html using beautifulsoup log.verbose('Retrieving AniDB list: mywishlist') url = 'http://anidb.net/perl-bin/animedb.pl?show=mywishlist&uid=%s' % config['user_id'] log.debug('Requesting: %s' % url) page = task.requests.get(url) if page.status_code != 200: raise plugin.PluginError('Unable to get AniDB list. Either the list is private or does not exist.') soup = get_soup(page.text) soup = soup.find('table', class_='wishlist') trs = soup.find_all('tr') if not trs: log.verbose('No movies were found in AniDB list: mywishlist') return entries = [] entry_type = '' if config['type'] == 'movies': entry_type = 'Type: Movie' elif config['type'] == 'shows': entry_type = 'Type: TV Series' for tr in trs: if tr.find('span', title=entry_type): a = tr.find('td', class_='name').find('a') if not a: log.debug('No title link found for the row, skipping') continue anime_title = a.string if config.get('strip_dates'): # Remove year from end of series name if present anime_title = re.sub(r'\s+\(\d{4}\)$', '', anime_title) link = ('http://anidb.net/perl-bin/' + a.get('href')) anime_id = "" match = re.search(r'aid=([\d]{1,5})', a.get('href')) if match: anime_id = match.group(1) entry = Entry() entry['title'] = anime_title entry['url'] = link entry['anidb_id'] = anime_id entry['anidb_name'] = entry['title'] entries.append(entry) else: log.verbose('Entry does not match the requested type') return entries
def on_task_input(self, task, config): config = self.prepare_config(config) url = config.get('url') user_agent = config.get('user-agent') cookie = config.get('cookie') root_element_selector = config.get('root_element_selector') fields = config.get('fields') params = config.get('params') headers = { 'accept-encoding': 'gzip, deflate, br', 'user-agent': user_agent } entries = [] try: task.requests.headers.update(headers) task.requests.cookies.update(NetUtils.cookie_str_to_dict(cookie)) response = task.requests.get(url, timeout=60) content = NetUtils.decode(response) except RequestException as e: raise plugin.PluginError( 'Unable to download the Html for task {} ({}): {}'.format( task.name, url, e)) elements = get_soup(content).select(root_element_selector) if len(elements) == 0: logger.debug(f'no elements found in response: {content}') return entries for element in elements: logger.debug('element in element_selector: {}', element) entry = Entry() for key, value in fields.items(): entry[key] = '' sub_element = element.select_one(value['element_selector']) if sub_element: if value['attribute'] == 'textContent': sub_element_content = sub_element.get_text() else: sub_element_content = sub_element.get( value['attribute'], '') entry[key] = sub_element_content logger.debug('key: {}, value: {}', key, entry[key]) if entry['title'] and entry['url']: base_url = urljoin(url, entry['url']) if params.startswith("&"): entry['url'] = base_url + params else: entry['url'] = urljoin(base_url, params) entry['original_url'] = entry['url'] entries.append(entry) return entries
def search(self, task, entry, config=None): """ Search for name from iptorrents """ categories = config.get('category', 'all') # Make sure categories is a list if not isinstance(categories, list): categories = [categories] # If there are any text categories, turn them into their id number categories = [c if isinstance(c, int) else CATEGORIES[c] for c in categories] filter_url = '&'.join((str(c) + '=') for c in categories) entries = set() for search_string in entry.get('search_strings', [entry['title']]): query = normalize_unicode(search_string) query = quote_plus(query.encode('utf8')) url = "{base_url}/t?{filter}&q={query}&qf=".format(base_url=BASE_URL, filter=filter_url, query=query) log.debug('searching with url: %s' % url) req = requests.get(url, cookies={'uid': str(config['uid']), 'pass': config['password']}) if '/u/' + str(config.get('uid')) not in req.content: raise plugin.PluginError("Invalid cookies (user not logged in)...") soup = get_soup(req.content, parser="html.parser") torrents = soup.find('table', {'class': 'torrents'}) for torrent in torrents.findAll('a', href=re.compile('\.torrent$')): entry = Entry() entry['url'] = "{base}{link}?torrent_pass={key}".format( base=BASE_URL, link=torrent['href'], key=config.get('rss_key')) entry['title'] = torrent.findPrevious("a", attrs={'class': 't_title'}).text seeders = torrent.findNext('td', {'class': 'ac t_seeders'}).text leechers = torrent.findNext('td', {'class': 'ac t_leechers'}).text entry['torrent_seeds'] = int(seeders) entry['torrent_leeches'] = int(leechers) entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches']) size = torrent.findNext(text=re.compile('^([\.\d]+) ([GMK]?)B$')) size = re.search('^([\.\d]+) ([GMK]?)B$', size) entry['content_size'] = parse_filesize(size.group(0)) entries.add(entry) return entries
def parse_page(self, scraper, url: str): try: logger.debug('page url: {}', url) page = scraper.get(url) except RequestException as e: raise plugin.PluginError(str(e)) if page.status_code == 404: raise Page404Error() if page.status_code != 200: raise plugin.PluginError( f'HTTP Request failed {page.status_code}. Url: {url}') soup = get_soup(page.text) soup_table = soup.find('table', class_='download') if not soup_table: # very likely no result return table_tbody = soup_table.find('tbody') if not table_tbody: raise plugin.PluginError( 'Parsing crashed, no tbody, please report the issue') trs = table_tbody.find_all('tr') if not trs: logger.critical('Nothing to parse') return for tr in trs: try: magnet_td = tr.find('td', class_='m') if not magnet_td: # skip empty trs continue magnet_a = magnet_td.find('a') magnet = magnet_a['href'] title_td = tr.find('td', class_='n') title_a = title_td.find('a') title = title_a['title'] seed_td = tr.find('td', class_='s') seed = int(seed_td.text) leech = int(tr.find('td', class_='l').text) content_size = parse_filesize(seed_td.previous_sibling.text) yield Entry( url=magnet, title=title, torrent_seeds=seed, torrent_leech=leech, content_size=content_size, ) except AttributeError as e: raise plugin.PluginError( 'Parsing crashed, please report the issue') from e
def get_nexusphp_message(self, task, entry):
    message_url = entry['message_url'] if entry['message_url'] else urljoin(entry['url'], '/messages.php')
    message_box_response = self._request(task, entry, 'get', message_url, headers=entry['headers'])
    state = self.check_net_state(entry, message_box_response, message_url)
    if state:
        entry['messages'] = 'Can not read message box!'
        return

    if message_box_response:
        unread_elements = get_soup(self._decode(message_box_response)).select('td > img[alt*="Unread"]')
        for unread_element in unread_elements:
            td = unread_element.parent.nextSibling.nextSibling
            title = td.text
            href = td.a.get('href')
            message_url = urljoin(message_url, href)
            message_response = self._request(task, entry, 'get', message_url, headers=entry['headers'])

            message_body = 'Can not read message body!'
            if message_response:
                body_element = get_soup(self._decode(message_response)).select_one('td[colspan*="2"]')
                if body_element:
                    message_body = body_element.text.strip()

            entry['messages'] = entry['messages'] + (
                '\nTitle: {}\nLink: {}\n{}'.format(title, message_url, message_body))
    else:
        entry['messages'] = 'Can not read message box!'
def _request_url(self, task, config, url, auth, dump_name=None):
    log.verbose('Requesting: %s' % url)
    page = task.requests.get(url, auth=auth)
    log.verbose('Response: %s (%s)' % (page.status_code, page.reason))
    soup = get_soup(page.content)

    # dump received content into a file
    if dump_name:
        log.verbose('Dumping: %s' % dump_name)
        data = soup.prettify()
        with open(dump_name, 'w') as f:
            f.write(data)

    return self.create_entries(url, soup, config)
def search(self, task, entry, config=None):
    """
    Search for name from piratebay.
    """
    if not isinstance(config, dict):
        config = {}
    sort = SORT.get(config.get('sort_by', 'seeds'))
    if config.get('sort_reverse'):
        sort += 1
    if isinstance(config.get('category'), int):
        category = config['category']
    else:
        category = CATEGORIES.get(config.get('category', 'all'))
    filter_url = '/0/%d/%d' % (sort, category)

    entries = set()
    for search_string in entry.get('search_strings', [entry['title']]):
        query = normalize_unicode(search_string)
        # TPB search doesn't like dashes or quotes
        query = query.replace('-', ' ').replace("'", " ")
        # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
        url = 'http://thepiratebay.%s/search/%s%s' % (CUR_TLD, quote(query.encode('utf-8')), filter_url)
        log.debug('Using %s as piratebay search url' % url)
        page = task.requests.get(url).content
        soup = get_soup(page)

        for link in soup.find_all('a', attrs={'class': 'detLink'}):
            entry = Entry()
            entry['title'] = self.extract_title(link)
            if not entry['title']:
                log.error('Malformed search result. No title or url found. Skipping.')
                continue
            entry['url'] = 'http://thepiratebay.%s%s' % (CUR_TLD, link.get('href'))
            tds = link.parent.parent.parent.find_all('td')
            entry['torrent_seeds'] = int(tds[-2].contents[0])
            entry['torrent_leeches'] = int(tds[-1].contents[0])
            entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches'])

            # Parse content_size
            size_text = link.find_next(attrs={'class': 'detDesc'}).get_text()
            if size_text:
                size = re.search(r'Size (\d+(\.\d+)?\xa0(?:[PTGMK])?i?B)', size_text)
                if size:
                    entry['content_size'] = parse_filesize(size.group(1))
                else:
                    log.error('Malformed search result? Title: "%s", No size? %s',
                              entry['title'], size_text)

            entries.add(entry)

    return sorted(entries, reverse=True, key=lambda x: x.get('search_sort'))
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    tag_a = soup.select_one('a[href^="magnet:"]')
    if not tag_a:
        raise UrlRewritingError(f"Unable to locate download link from url {url}")
    return tag_a.get('href')
def get_nexusphp_message(self, entry, config, messages_url='/messages.php'):
    message_url = urljoin(entry['url'], messages_url)
    message_box_response = self._request(entry, 'get', message_url)
    net_state = self.check_net_state(entry, message_box_response, message_url)
    if net_state:
        entry.fail_with_prefix('Can not read message box! url:{}'.format(message_url))
        return

    unread_elements = get_soup(self._decode(message_box_response)).select('td > img[alt*="Unread"]')
    failed = False
    for unread_element in unread_elements:
        td = unread_element.parent.nextSibling.nextSibling
        title = td.text
        href = td.a.get('href')
        message_url = urljoin(message_url, href)
        message_response = self._request(entry, 'get', message_url)

        # default body text also covers the case where the body element is missing,
        # so message_body is always defined before it is appended below
        message_body = 'Can not read message body!'
        net_state = self.check_net_state(entry, message_response, message_url)
        if net_state:
            failed = True
        else:
            body_element = get_soup(self._decode(message_response)).select_one('td[colspan*="2"]')
            if body_element:
                message_body = body_element.text.strip()

        entry['messages'] = entry['messages'] + (
            '\nTitle: {}\nLink: {}\n{}'.format(title, message_url, message_body))
    if failed:
        entry.fail_with_prefix('Can not read message body!')
def on_task_input(self, task, config):
    config = self.prepare_config(config)
    url = config.get('url')
    root_element_selector = config.get('root_element_selector')
    fields = config.get('fields')
    params = config.get('params')

    entries = []
    elements = []
    if url and root_element_selector:
        try:
            if brotli:
                config.get('headers')['accept-encoding'] = 'gzip, deflate, br'
            response = task.requests.get(url, headers=config.get('headers'), timeout=60)
            content = self._decode(response)
        except RequestException as e:
            raise plugin.PluginError(
                'Unable to download the Html for task {} ({}): {}'.format(task.name, url, e))

        elements = get_soup(content).select(root_element_selector)
        if len(elements) == 0:
            return entries

    for element in elements:
        logger.debug('element in element_selector: {}', element)
        entry = Entry()
        for key, value in fields.items():
            entry[key] = ''
            sub_element = element.select_one(value['element_selector'])
            if sub_element:
                if value['attribute'] == 'textContent':
                    sub_element_content = sub_element.get_text()
                else:
                    sub_element_content = sub_element.get(value['attribute'], '')
                entry[key] = sub_element_content
            logger.debug('key: {}, value: {}', key, entry[key])
        if entry['title'] and entry['url']:
            base_url = urljoin(url, entry['url'])
            if params.startswith("&"):
                entry['url'] = base_url + params
            else:
                entry['url'] = urljoin(base_url, params)
            entry['original_url'] = entry['url']
            entries.append(entry)
    return entries
def parse_download_page(self, page_url):
    page = urlopener(page_url, log)
    try:
        soup = get_soup(page)
    except Exception as e:
        raise UrlRewritingError(e)
    tag_a = soup.find("a", {"class": "dl_link"})
    if not tag_a:
        raise UrlRewritingError(
            'FTDB Unable to locate download link from url %s and tag_a is : %s' % (page_url, tag_a))
    torrent_url = "http://www3.frenchtorrentdb.com" + tag_a.get('href') + "&js=1"
    log.debug('TORRENT URL is : %s' % torrent_url)
    return torrent_url
def get_gazelle_message(self, task, entry):
    message_url = urljoin(entry['url'], '/inbox.php')
    message_box_response = self._request(task, entry, 'get', message_url, headers=entry['headers'])
    state = self.check_net_state(entry, message_box_response, message_url)
    if state:
        entry['messages'] = 'Can not read message box!'
        return

    if message_box_response:
        unread_elements = get_soup(self._decode(message_box_response)).select("tr.unreadpm > td > strong > a")
        for unread_element in unread_elements:
            title = unread_element.text
            href = unread_element.get('href')
            message_url = urljoin(message_url, href)
            message_response = self._request(task, entry, 'get', message_url, headers=entry['headers'])

            message_body = 'Can not read message body!'
            if message_response:
                body_element = get_soup(self._decode(message_response)).select_one('div[id*="message"]')
                if body_element:
                    message_body = body_element.text.strip()

            entry['messages'] = entry['messages'] + (
                '\nTitle: {}\nLink: {}\n{}'.format(title, message_url, message_body))
    else:
        entry['messages'] = 'Can not read message box!'
def parse_download_page(self, url):
    page = urlopener(url, log)
    log.debug('%s opened', url)
    try:
        soup = get_soup(page)
        torrent_url = 'http://www.t411.me' + soup.find(text='Télécharger').findParent().get('href')
    except Exception as e:
        raise UrlRewritingError(e)
    if not torrent_url:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    return torrent_url
def parse_download_page(self, url, requests):
    page = requests.get(url).content
    try:
        soup = get_soup(page)
        tag_div = soup.find('div', attrs={'class': 'download'})
        if not tag_div:
            raise UrlRewritingError('Unable to locate download link from url %s' % url)
        tag_a = tag_div.find('a')
        torrent_url = tag_a.get('href')
        # URL is sometimes missing the schema
        if torrent_url.startswith('//'):
            torrent_url = urlparse(url).scheme + ':' + torrent_url
        return torrent_url
    except Exception as e:
        raise UrlRewritingError(e)
def entries_from_search(self, name, url=None):
    """Parses torrent download url from search results"""
    name = normalize_unicode(name)
    if not url:
        url = 'http://www.newtorrents.info/search/%s' % urllib.quote(
            name.encode('utf-8'), safe=':/~?=&%')

    log.debug('search url: %s' % url)

    html = urlopener(url, log).read()
    # fix </SCR'+'IPT> so that BS does not crash
    # TODO: should use beautifulsoup massage
    html = re.sub(r'(</SCR.*?)...(.*?IPT>)', r'\1\2', html)

    soup = get_soup(html)
    # collect found torrents into a list of entries
    torrents = []
    for link in soup.find_all('a', attrs={'href': re.compile('down.php')}):
        torrent_url = 'http://www.newtorrents.info%s' % link.get('href')
        release_name = link.parent.next.get('title')
        # quick dirty hack
        seed = link.find_next('td', attrs={'class': re.compile('s')}).renderContents()
        if seed == 'n/a':
            seed = 0
        else:
            try:
                seed = int(seed)
            except ValueError:
                log.warning('Error converting seed value (%s) from newtorrents to integer.' % seed)
                seed = 0
        # TODO: also parse content_size and peers from results
        torrents.append(Entry(title=release_name, url=torrent_url, torrent_seeds=seed,
                              search_sort=torrent_availability(seed, 0)))
    # sort with seed number Reverse order
    torrents.sort(reverse=True, key=lambda x: x.get('search_sort', 0))
    # choose the torrent
    if not torrents:
        dashindex = name.rfind('-')
        if dashindex != -1:
            return self.entries_from_search(name[:dashindex])
        else:
            raise PluginWarning('No matches for %s' % name, log, log_once=True)
    else:
        if len(torrents) == 1:
            log.debug('found only one matching search result.')
        else:
            log.debug('search result contains multiple matches, sorted %s by most seeders' % torrents)
        return torrents
def parse_download_page(self, url):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url, None, txheaders)
    page = urlopener(req, log)
    try:
        soup = get_soup(page)
    except Exception as e:
        raise UrlRewritingError(e)
    down_link = soup.find('a', attrs={'href': re.compile(r'download/\d+/.*\.torrent')})
    if not down_link:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    return 'http://www.deadfrog.us/' + down_link.get('href')