def normalize(title):
    try:
        return ''.join(c for c in unicodedata.normalize('NFKD', py_tools.ensure_text(py_tools.ensure_str(title)))
                    if unicodedata.category(c) != 'Mn')
    except:
        error()
    return title
def get_sources_packs(self, link):
    # log_utils.log('link = %s' % str(link), __name__, log_utils.LOGDEBUG)
    try:
        r = py_tools.ensure_str(self.scraper.get(link).content, errors='replace')
        if not r: return
        posts = client.parseDOM(r, 'div', attrs={'class': 'tgxtable'})
        if not posts: return
    except:
        source_utils.scraper_error('TORRENTGALAXY')
        return
    for post in posts:
        try:
            links = zip(
                re.findall(r'href\s*=\s*["\'](magnet:[^"\']+)["\']', post, re.DOTALL | re.I),
                re.findall(r'<span\s*class\s*=\s*["\']badge\s*badge-secondary["\']\s*style\s*=\s*["\']border-radius:4px;["\']>(.*?)</span>', post, re.DOTALL | re.I),
                re.findall(r'<span\s*title\s*=\s*["\']Seeders/Leechers["\']>\[<font\s*color\s*=\s*["\']green["\']><b>(.*?)<', post, re.DOTALL | re.I))
            for link in links:
                url = unquote_plus(link[0]).split('&tr')[0].replace(' ', '.')
                url = source_utils.strip_non_ascii_and_unprintable(url)
                hash = re.compile(r'btih:(.*?)&', re.I).findall(url)[0]
                name = url.split('&dn=')[1]
                name = source_utils.clean_name(name)
                if not self.search_series:
                    if not self.bypass_filter:
                        if not source_utils.filter_season_pack(self.title, self.aliases, self.year, self.season_x, name):
                            continue
                    package = 'season'
                elif self.search_series:
                    if not self.bypass_filter:
                        valid, last_season = source_utils.filter_show_pack(self.title, self.aliases, self.imdb, self.year, self.season_x, name, self.total_seasons)
                        if not valid: continue
                    else:
                        last_season = self.total_seasons
                    package = 'show'
                name_info = source_utils.info_from_name(name, self.title, self.year, season=self.season_x, pack=package)
                if source_utils.remove_lang(name_info): continue
                try:
                    seeders = int(link[2])
                    if self.min_seeders > seeders: continue
                except:
                    seeders = 0
                quality, info = source_utils.get_release_quality(name_info, url)
                try:
                    dsize, isize = source_utils._size(link[1])
                    info.insert(0, isize)
                except:
                    dsize = 0
                info = ' | '.join(info)
                item = {'provider': 'torrentgalaxy', 'source': 'torrent', 'seeders': seeders, 'hash': hash, 'name': name, 'name_info': name_info,
                        'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize, 'package': package}
                if self.search_series:
                    item.update({'last_season': last_season})
                self.sources.append(item)
        except:
            source_utils.scraper_error('TORRENTGALAXY')
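# The torrent providers in this set all pull the info-hash and display name out
# of a magnet URI the same way. A minimal standalone sketch of that parsing;
# the magnet value in the usage comment is illustrative only, not a real torrent:
import re
from urllib.parse import unquote_plus

def _parse_magnet_sketch(magnet):
    url = unquote_plus(magnet).split('&tr')[0].replace(' ', '.')  # drop tracker params
    hash = re.search(r'btih:(.*?)&', url, re.I).group(1)          # info-hash after 'btih:'
    name = url.split('&dn=')[1]                                   # display-name param
    return hash, name

# _parse_magnet_sketch('magnet:?xt=urn:btih:ABC123&dn=Some.Show.S01&tr=udp://tracker')
# -> ('ABC123', 'Some.Show.S01')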
def get_sources_packs(self, link):
    # log_utils.log('link = %s' % str(link), __name__, log_utils.LOGDEBUG)
    try:
        headers = {'User-Agent': client.agent()}
        r = py_tools.ensure_str(self.scraper.get(link, headers=headers).content, errors='replace')
        if not r or '<table' not in r: return
        table = client.parseDOM(r, 'table', attrs={'class': 'table2'})[0]
        rows = client.parseDOM(table, 'tr')
        if not rows: return
    except:
        source_utils.scraper_error('LIMETORRENTS')
        return
    for row in rows:
        try:
            data = client.parseDOM(row, 'a', ret='href')[0]
            if '/search/' in data: continue
            data = re.sub(r'\s', '', data).strip()
            hash = re.compile(r'/torrent/(.+?)\.torrent', re.I).findall(data)[0]
            name = re.findall(r'title\s*=\s*(.+?)$', data, re.DOTALL | re.I)[0]
            name = source_utils.clean_name(name)
            url = 'magnet:?xt=urn:btih:%s&dn=%s' % (hash, name)
            if not self.search_series:
                if not self.bypass_filter:
                    if not source_utils.filter_season_pack(self.title, self.aliases, self.year, self.season_x, name):
                        continue
                package = 'season'
            elif self.search_series:
                if not self.bypass_filter:
                    valid, last_season = source_utils.filter_show_pack(self.title, self.aliases, self.imdb, self.year, self.season_x, name, self.total_seasons)
                    if not valid: continue
                else:
                    last_season = self.total_seasons
                package = 'show'
            name_info = source_utils.info_from_name(name, self.title, self.year, season=self.season_x, pack=package)
            if source_utils.remove_lang(name_info): continue
            try:
                seeders = int(client.parseDOM(row, 'td', attrs={'class': 'tdseed'})[0].replace(',', ''))
                if self.min_seeders > seeders: continue
            except:
                seeders = 0
            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', row)[0]
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)
            item = {'provider': 'limetorrents', 'source': 'torrent', 'seeders': seeders, 'hash': hash, 'name': name, 'name_info': name_info,
                    'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize, 'package': package}
            if self.search_series:
                item.update({'last_season': last_season})
            self.sources.append(item)
        except:
            source_utils.scraper_error('LIMETORRENTS')
def normalize(title):
    try:
        title = ''.join(c for c in unicodedata.normalize('NFKD', py_tools.ensure_text(py_tools.ensure_str(title)))
                    if unicodedata.category(c) != 'Mn')
        return str(title)
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return title
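# A minimal standalone sketch of the NFKD trick used above, stdlib only
# (assumption: py_tools merely coerces the text type, so plain str input
# needs no helper). NFKD decomposition splits 'é' into 'e' plus a combining
# accent (category 'Mn'); dropping the 'Mn' marks leaves the bare ASCII letter.
import unicodedata

def _normalize_sketch(title):
    return ''.join(c for c in unicodedata.normalize('NFKD', title)
                if unicodedata.category(c) != 'Mn')

# _normalize_sketch('Amélie') -> 'Amelie'   (illustrative value, not a repo fixture)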
def log(msg, caller=None, level=LOGNOTICE):
    debug_enabled = getSetting('debug.enabled') == 'true'
    if not debug_enabled: return
    debug_location = getSetting('debug.location')
    if isinstance(msg, int): msg = lang(msg) # for strings.po translations
    try:
        if py_tools.isPY3:
            if not msg.isprintable(): # ex. "\n" is not a printable character so returns False on those sort of cases
                msg = '%s (NORMALIZED by log_utils.log())' % normalize(msg)
            if isinstance(msg, py_tools.binary_type):
                msg = '%s (ENCODED by log_utils.log())' % (py_tools.ensure_str(msg, errors='replace'))
        else:
            if not is_printable(msg): # if not all(c in printable for c in msg): # isprintable() not available in py2
                msg = normalize(msg)
            if isinstance(msg, py_tools.binary_type):
                msg = '%s (ENCODED by log_utils.log())' % (py_tools.ensure_text(msg))
        if caller == 'scraper_error': pass
        elif caller is not None and level != LOGERROR:
            func = inspect.currentframe().f_back.f_code
            line_number = inspect.currentframe().f_back.f_lineno
            caller = "%s.%s()" % (caller, func.co_name)
            msg = 'From func name: %s Line # :%s\n msg : %s' % (caller, line_number, msg)
        elif caller is not None and level == LOGERROR:
            msg = 'From func name: %s.%s() Line # :%s\n msg : %s' % (caller[0], caller[1], caller[2], msg)
        if debug_location == '1':
            log_file = joinPath(LOGPATH, 'fenomscrapers.log')
            if not existsPath(log_file):
                f = open(log_file, 'w')
                f.close()
            with open(log_file, 'a', encoding='utf-8') as f: # with auto cleans up and closes
                line = '[%s %s] %s: %s' % (datetime.now().date(), str(datetime.now().time())[:8], DEBUGPREFIX % debug_list[level], msg)
                f.write(line.rstrip('\r\n') + '\n')
                # f.writelines([line1, line2]) ## maybe an option for the 2 lines without using "\n"
        else:
            # pass level as xbmc.log()'s second argument; it is not a format value
            xbmc.log('%s: %s' % (DEBUGPREFIX % debug_list[level], msg), level)
    except Exception as e:
        import traceback
        traceback.print_exc()
        xbmc.log('[ script.module.fenomscrapers ] log_utils.log() Logging Failure: %s' % (e), LOGERROR)
def sources(self, url, hostDict):
    self.sources = []
    if not url: return self.sources
    try:
        scraper = cfscrape.create_scraper()
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        self.title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        self.title = self.title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        self.aliases = data['aliases']
        self.episode_title = data['title'] if 'tvshowtitle' in data else None
        self.year = data['year']
        self.hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else self.year
        query = '%s %s' % (self.title, self.hdlr)
        query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)
        urls = []
        url = self.search_link % quote_plus(query)
        url = urljoin(self.base_link, url)
        urls.append(url)
        # urls.append('%s%s' % (url, '&page=2')) # next page is not working atm
        # urls.append('%s%s' % (url, '&page=3'))
        # log_utils.log('urls = %s' % urls, log_utils.LOGDEBUG)
        links = []
        for x in urls:
            r = py_tools.ensure_str(scraper.get(x).content, errors='replace')
            if not r: continue
            rows = client.parseDOM(r, 'tr', attrs={'class': 'tlr'}) # renamed from "list" to avoid shadowing the builtin
            rows += client.parseDOM(r, 'tr', attrs={'class': 'tlz'})
            for item in rows:
                links.append(item)
        threads = []
        for link in links:
            threads.append(workers.Thread(self.get_sources, link))
        [i.start() for i in threads]
        [i.join() for i in threads]
        return self.sources
    except:
        source_utils.scraper_error('EXTRATORRENT')
        return self.sources
def get_sources(self, link):
    # log_utils.log('link = %s' % link, log_utils.LOGDEBUG)
    try:
        headers = {'User-Agent': client.agent()}
        r = py_tools.ensure_str(self.scraper.get(link, headers=headers).content, errors='replace')
        if not r or '<table' not in r: return
        table = client.parseDOM(r, 'table', attrs={'class': 'table2'})[0]
        rows = client.parseDOM(table, 'tr')
        if not rows: return
    except:
        source_utils.scraper_error('LIMETORRENTS')
        return
    for row in rows:
        try:
            data = client.parseDOM(row, 'a', ret='href')[0]
            if '/search/' in data: continue
            data = re.sub(r'\s', '', data).strip()
            hash = re.compile(r'/torrent/(.+?)\.torrent', re.I).findall(data)[0]
            name = re.findall(r'title\s*=\s*(.+?)$', data, re.DOTALL | re.I)[0]
            name = source_utils.clean_name(name)
            if not source_utils.check_title(self.title, self.aliases, name, self.hdlr, self.year): continue
            name_info = source_utils.info_from_name(name, self.title, self.year, self.hdlr, self.episode_title)
            if source_utils.remove_lang(name_info): continue
            url = 'magnet:?xt=urn:btih:%s&dn=%s' % (hash, name)
            if not self.episode_title: # filter for eps returned in movie query (rare but movie and show exists for Run in 2020)
                ep_strings = [r'[.-]s\d{2}e\d{2}([.-]?)', r'[.-]s\d{2}([.-]?)', r'[.-]season[.-]?\d{1,2}[.-]?']
                if any(re.search(item, name.lower()) for item in ep_strings): continue
            try:
                seeders = int(client.parseDOM(row, 'td', attrs={'class': 'tdseed'})[0].replace(',', ''))
                if self.min_seeders > seeders: continue
            except:
                seeders = 0
            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', row)[0]
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)
            self.sources.append({'provider': 'limetorrents', 'source': 'torrent', 'seeders': seeders, 'hash': hash, 'name': name, 'name_info': name_info,
                                'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('LIMETORRENTS')
def get(title):
    try:
        if not title: return
        try: title = py_tools.ensure_str(title)
        except: pass
        title = re.sub(r'&#(\d+);', '', title).lower()
        title = re.sub(r'(&#[0-9]+)([^;^0-9]+)', '\\1;\\2', title)
        title = title.replace('&quot;', '"').replace('&amp;', '&') # decode the HTML entities
        title = re.sub(r'\n|([\[({].+?[})\]])|([:;–\-"\',!_.?~$@])|\s', '', title) # stop trying to remove alpha characters "vs" or "v", they're part of a title
        return title
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return title
def get_simple(title):
    try:
        if not title: return
        try: title = py_tools.ensure_str(title)
        except: pass
        title = re.sub(r'(\d{4})', '', title).lower()
        title = re.sub(r'&#(\d+);', '', title)
        title = re.sub(r'(&#[0-9]+)([^;^0-9]+)', '\\1;\\2', title)
        title = title.replace('&quot;', '"').replace('&amp;', '&') # decode the HTML entities
        title = re.sub(r'\n|[()[\]{}]|[:;–\-",\'!_.?~$@]|\s', '', title) # stop trying to remove alpha characters "vs" or "v", they're part of a title
        title = re.sub(r'<.*?>', '', title) # removes tags
        return title
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return title
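# Hedged sketch of the effect of get()/get_simple(): both reduce a title to a
# bare comparison key by lowercasing and stripping brackets, punctuation and
# whitespace. Regex copied from get() above; the sample title is an assumption,
# not a repo fixture.
import re

def _get_sketch(title):
    title = title.lower()
    return re.sub(r'\n|([\[({].+?[})\]])|([:;–\-"\',!_.?~$@])|\s', '', title)

# _get_sketch('Law & Order: SVU') -> 'law&ordersvu'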
def search(self, title, year):
    try:
        url = urljoin(self.base_link, self.search_link % (quote_plus(title)))
        # r = self.scraper.get(url, headers=self.headers).content
        r = py_tools.ensure_str(self.scraper.get(url, headers=self.headers).content, errors='replace') # switch to client.parseDOM() to rid import
        if not r: return None
        r = dom_parser.parse_dom(r, 'div', {'class': 'list_items'})[0]
        r = dom_parser.parse_dom(r.content, 'li')
        r = [(dom_parser.parse_dom(i, 'a', {'class': 'title'})) for i in r]
        r = [(i[0].attrs['href'], i[0].content) for i in r]
        r = [(urljoin(self.base_link, i[0])) for i in r if cleantitle.get(title) in cleantitle.get(i[1]) and year in i[1]]
        if r: return r[0]
        else: return None
    except:
        return None
def geturl(title):
    if not title: return
    try:
        try: title = py_tools.ensure_str(title)
        except: pass
        title = title.lower().rstrip()
        try:
            title = title.translate(None, ':*?"\'\.<>|&!,') # py2
        except:
            try:
                title = title.translate(title.maketrans('', '', ':*?"\'\.<>|&!,')) # py3
            except:
                for c in ':*?"\'\.<>|&!,':
                    title = title.replace(c, '')
        title = title.replace('/', '-').replace(' ', '-').replace('--', '-').replace('–', '-').replace('!', '')
        return title
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return title
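# geturl() builds URL slugs for site paths. A minimal sketch of the py3 branch
# only; the sample title in the usage comment is an assumption:
def _geturl_sketch(title):
    title = title.lower().rstrip()
    title = title.translate(title.maketrans('', '', ':*?"\'.<>|&!,'))  # strip punctuation (py3 branch)
    return title.replace('/', '-').replace(' ', '-').replace('--', '-').replace('–', '-')

# _geturl_sketch('Spider-Man: No Way Home') -> 'spider-man-no-way-home'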
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        scraper = cfscrape.create_scraper(delay=5)
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year
        isSeasonQuery = False
        query = '%s %s' % (title, hdlr)
        query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)
        # query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)
        query = re.sub(r'\s', '-', query)
        if int(year) >= 2021: self.base_link = self.base_new
        else: self.base_link = self.base_old
        url = urljoin(self.base_link, query)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)
        # r = scraper.get(url).content
        r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
        if not r or 'nothing was found' in r:
            if 'tvshowtitle' in data:
                season = re.search(r'S(.*?)E', hdlr).group(1)
                query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', title)
                # query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', title)
                query = re.sub(r'\s', '-', query)
                query = query + "-S" + season
                url = urljoin(self.base_link, query)
                # r = scraper.get(url).content
                r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
                isSeasonQuery = True
            else:
                return sources
        if not r or 'nothing was found' in r: return sources # may need to add fallback to use self.search_link if nothing found
        posts = client.parseDOM(r, "div", attrs={"class": "content"})
        if not posts: return sources
    except:
        source_utils.scraper_error('RLSBB')
        return sources
    release_title = re.sub(r'[^A-Za-z0-9\s\.-]+', '', title).replace(' ', '.')
    items = []
    count = 0
    for post in posts:
        if count >= 300: break # to limit large link list and slow scrape time
        try:
            post_titles = re.findall(r'(?:.*>|>\sRelease Name.*|\s)(%s.*?)<' % release_title, post, re.I) # parse all matching release_titles in each post(content) group
            items = []
            if len(post_titles) > 1:
                index = 0
                for name in post_titles:
                    start = post_titles[index].replace('[', '\\[').replace('(', '\\(').replace(')', '\\)').replace('+', '\\+').replace(' \\ ', ' \\\\ ')
                    end = (post_titles[index + 1].replace('[', '\\[').replace('(', '\\(').replace(')', '\\)').replace('+', '\\+')).replace(' \\ ', ' \\\\ ') if index + 1 < len(post_titles) else ''
                    try:
                        container = re.findall(r'(?:%s)([\S\s]+)(?:%s)' % (start, end), post, re.I)[0] # parse all data between release_titles in multi post(content) group
                    except:
                        source_utils.scraper_error('RLSBB')
                        continue
                    try:
                        size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', container)[0].replace(',', '.')
                    except:
                        size = '0'
                    container = client.parseDOM(container, 'a', ret='href')
                    items.append((name, size, container))
                    index += 1
            elif len(post_titles) == 1:
                name = post_titles[0]
                container = client.parseDOM(post, 'a', ret='href') # parse all links in a single post(content) group
                try:
                    size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', post)[0].replace(',', '.')
                except:
                    size = '0'
                items.append((name, size, container))
            else:
                continue
            for group_name, size, links in items:
                for i in links:
                    name = group_name
                    # if isSeasonQuery and hdlr not in name.upper():
                    #     name = i.rsplit("/", 1)[-1]
                    #     if hdlr not in name.upper(): continue
                    if hdlr not in name.upper():
                        name = i.rsplit("/", 1)[-1]
                        if hdlr not in name.upper(): continue
                    name = client.replaceHTMLCodes(name)
                    name = source_utils.strip_non_ascii_and_unprintable(name)
                    name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
                    url = py_tools.ensure_text(client.replaceHTMLCodes(str(i)), errors='replace')
                    if url in str(sources): continue
                    if url.endswith(('.rar', '.zip', '.iso', '.part', '.png', '.jpg', '.bmp', '.gif')): continue
                    valid, host = source_utils.is_host_valid(url, hostDict)
                    if not valid: continue
                    quality, info = source_utils.get_release_quality(name, url)
                    try:
                        if size == '0':
                            try:
                                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', name)[0].replace(',', '.')
                            except:
                                raise Exception()
                        dsize, isize = source_utils._size(size)
                        info.insert(0, isize)
                    except:
                        dsize = 0
                    info = ' | '.join(info)
                    sources.append({'provider': 'rlsbb', 'source': host, 'name': name, 'name_info': name_info, 'quality': quality,
                                    'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
                    count += 1
        except:
            source_utils.scraper_error('RLSBB')
    return sources
def sources(self, data, hostDict):
    self.sources = []
    if not data: return self.sources
    try:
        self.hostDict = hostDict
        self.title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        self.title = self.title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        self.aliases = data['aliases']
        self.episode_title = data['title'] if 'tvshowtitle' in data else None
        self.year = data['year']
        self.hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else self.year
        imdb = data['imdb']
        url = self.search(self.title, self.year)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)
        if not url: return self.sources
        # result = self.scraper.get(url, headers=self.headers).content
        result = py_tools.ensure_str(self.scraper.get(url, headers=self.headers).content, errors='replace')
        if not result: return self.sources
        r_pack = None
        if 'tvshowtitle' in data:
            r = dom_parser.parse_dom(result, 'ul', {'id': 'episodes'})
            # r_pack = dom_parser.parse_dom(result, 'ul', {'id': 'packs'}) # Rapidmoviez has pack files, needs more work
        else:
            r = dom_parser.parse_dom(result, 'ul', {'id': 'releases'})
        if not r and not r_pack: return self.sources
        if r:
            r = dom_parser.parse_dom(r[0].content, 'a', req=['href'])
            r = [(i.content, urljoin(self.base_link, i.attrs['href'])) for i in r if i and i.content != 'Watch']
            r = [(i[0], i[1]) for i in r if self.hdlr in i[0].upper()]
        # if r_pack:
        #     r_pack = dom_parser.parse_dom(r_pack[0].content, 'a', req=['href'])
        #     r_pack = [(i.content, urljoin(self.base_link, i.attrs['href'])) for i in r_pack if i and i.content != 'Watch']
        #     r += [(i[0], i[1]) for i in r_pack if 'S%02d' % int(data['season']) in i[0].upper()]
        #     r += [(i[0], i[1]) for i in r_pack if 'SEASON %02d' % int(data['season']) in i[0].upper()]
        # log_utils.log('r = %s' % r, log_utils.LOGDEBUG)
        threads = []
        for i in r:
            threads.append(workers.Thread(self.get_sources, i[0], i[1]))
        [i.start() for i in threads]
        alive = [x for x in threads if x.is_alive() is True]
        while alive:
            alive = [x for x in threads if x.is_alive() is True]
            time.sleep(0.1)
        return self.sources
    except:
        source_utils.scraper_error('RAPIDMOVIEZ')
        return self.sources
def get_sources(self, name, url):
    try:
        # r = self.scraper.get(url, headers=self.headers).content
        r = py_tools.ensure_str(self.scraper.get(url, headers=self.headers).content, errors='replace')
        name = client.replaceHTMLCodes(name)
        if name.startswith('['): name = name.split(']')[1]
        name = name.strip().replace(' ', '.')
        name_info = source_utils.info_from_name(name, self.title, self.year, self.hdlr, self.episode_title)
        if source_utils.remove_lang(name_info): return self.sources
        l = dom_parser.parse_dom(r, 'pre', {'class': 'links'})
        if l == []: return
        s = ''
        for i in l:
            s += i.content
        # search the accumulated link blocks, not just the last one iterated
        urls = re.findall(r'''((?:http|ftp|https)://[\w_-]+(?:(?:\.[\w_-]+)+)[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])''', s, flags=re.M | re.S)
        urls = [i for i in urls if not i.endswith(('.rar', '.zip', '.iso', '.idx', '.sub', '.srt'))]
        for link in urls:
            url = py_tools.ensure_text(client.replaceHTMLCodes(str(link)), errors='replace')
            if url in str(self.sources): continue
            valid, host = source_utils.is_host_valid(url, self.hostDict)
            if not valid: continue
            quality, info = source_utils.get_release_quality(name, url)
            try:
                size = re.search(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', name).group(0)
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)
            self.sources.append({'provider': 'rapidmoviez', 'source': host, 'name': name, 'name_info': name_info, 'quality': quality,
                                'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
    except:
        source_utils.scraper_error('RAPIDMOVIEZ')
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        scraper = cfscrape.create_scraper()
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year
        query = '%s %s' % (title, hdlr)
        query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)
        url = self.search_link % quote_plus(query)
        url = urljoin(self.base_link, url)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)
        # r = scraper.get(url).content
        r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
        posts = client.parseDOM(r, 'div', attrs={'class': 'post'})
        if not posts: return sources
    except:
        source_utils.scraper_error('SCENERLS')
        return sources
    items = []
    for post in posts:
        try:
            content = client.parseDOM(post, "div", attrs={"class": "postContent"})
            size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', content[0])[0]
            u = client.parseDOM(content, "h2")
            u = client.parseDOM(u, 'a', ret='href')
            u = [(i.strip('/').split('/')[-1], i, size) for i in u]
            items += u
        except:
            source_utils.scraper_error('SCENERLS')
            return sources
    for item in items:
        try:
            name = item[0]
            name = client.replaceHTMLCodes(name)
            if not source_utils.check_title(title, aliases, name, hdlr, year): continue
            name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
            if source_utils.remove_lang(name_info): continue
            # check year for reboot/remake show issues if year is available-crap shoot
            # if 'tvshowtitle' in data:
            #     if re.search(r'([1-3][0-9]{3})', name):
            #         if not any(value in name for value in [year, str(int(year)+1), str(int(year)-1)]):
            #             continue
            url = py_tools.ensure_text(client.replaceHTMLCodes(str(item[1])), errors='replace')
            if url in str(sources): continue
            valid, host = source_utils.is_host_valid(url, hostDict)
            if not valid: continue
            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                dsize, isize = source_utils._size(item[2])
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)
            sources.append({'provider': 'scenerls', 'source': host, 'name': name, 'name_info': name_info, 'quality': quality,
                            'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('SCENERLS')
    return sources
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        scraper = cfscrape.create_scraper(delay=5)
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year
        query = '%s %s' % (title, hdlr)
        query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)
        url = self.search_link % quote_plus(query)
        url = urljoin(self.base_link, url).replace('%3A+', '+')
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)
        # result = scraper.get(url).content
        result = py_tools.ensure_str(scraper.get(url).content, errors='replace')
        if not result or "Sorry, but you are looking for something that isn't here" in str(result): return sources
        posts = client.parseDOM(result, "div", attrs={"class": "post"})
        if not posts: return sources
    except:
        source_utils.scraper_error('MAXRLS')
        return sources
    for post in posts:
        try:
            post_title = client.parseDOM(post, "h2", attrs={"class": "postTitle"})
            post_title = client.parseDOM(post_title, 'a')[0]
            if not source_utils.check_title(title, aliases, post_title, hdlr, year): continue
            content = client.parseDOM(post, "div", attrs={"class": "postContent"})
            ltr = client.parseDOM(content, "p", attrs={"dir": "ltr"})
            if not ltr: continue
            for i in ltr:
                if '<strong>' not in i or 'imdb.com' in i: continue
                name = re.search(r'<strong>(.*?)<', i).group(1)
                name = re.sub(r'(<span.*?>)', '', name).replace('</span>', '')
                if title not in name: continue # IMDB and Links: can be in name so check for title match
                name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
                if source_utils.remove_lang(name_info): continue
                links = client.parseDOM(i, "a", ret="href")
                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', i, re.DOTALL)
                for link in links:
                    url = link
                    if url in str(sources): continue
                    valid, host = source_utils.is_host_valid(url, hostDict)
                    if not valid: continue
                    quality, info = source_utils.get_release_quality(name_info, url)
                    try:
                        dsize, isize = source_utils._size(size[0])
                        info.insert(0, isize)
                    except:
                        dsize = 0
                    info = ' | '.join(info)
                    sources.append({'provider': 'maxrls', 'source': host, 'name': name, 'name_info': name_info, 'quality': quality,
                                    'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('MAXRLS')
    return sources
def get_sources_packs(self, link):
    # log_utils.log('link = %s' % str(link), log_utils.LOGDEBUG)
    try:
        r = py_tools.ensure_str(self.scraper.get(link).content, errors='replace')
        if not r: return
        posts = client.parseDOM(r, 'tr', attrs={'class': 'tlr'})
        posts += client.parseDOM(r, 'tr', attrs={'class': 'tlz'})
    except:
        source_utils.scraper_error('EXTRATORRENT')
        return
    for post in posts:
        try:
            post = re.sub(r'\n', '', post)
            post = re.sub(r'\t', '', post)
            url = re.findall(r'href\s*=\s*["\'](magnet:[^"\']+)["\']', post, re.DOTALL | re.I)[0]
            url = unquote_plus(url).replace('&amp;', '&').replace(' ', '.').split('&tr')[0] # decode the HTML entity before splitting params
            url = source_utils.strip_non_ascii_and_unprintable(url)
            if url in str(self.sources): continue
            hash = re.compile(r'btih:(.*?)&', re.I).findall(url)[0]
            name = url.split('&dn=')[1]
            name = source_utils.clean_name(name)
            if not self.search_series:
                if not self.bypass_filter:
                    if not source_utils.filter_season_pack(self.title, self.aliases, self.year, self.season_x, name):
                        continue
                package = 'season'
            elif self.search_series:
                if not self.bypass_filter:
                    valid, last_season = source_utils.filter_show_pack(self.title, self.aliases, self.imdb, self.year, self.season_x, name, self.total_seasons)
                    if not valid: continue
                else:
                    last_season = self.total_seasons
                package = 'show'
            name_info = source_utils.info_from_name(name, self.title, self.year, season=self.season_x, pack=package)
            if source_utils.remove_lang(name_info): continue
            try:
                seeders = int(client.parseDOM(post, 'td', attrs={'class': 'sy'})[0].replace(',', ''))
                if self.min_seeders > seeders: continue
            except:
                seeders = 0
            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', post)[0]
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)
            item = {'provider': 'extratorrent', 'source': 'torrent', 'seeders': seeders, 'hash': hash, 'name': name, 'name_info': name_info,
                    'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize, 'package': package}
            if self.search_series:
                item.update({'last_season': last_season})
            self.sources.append(item)
        except:
            source_utils.scraper_error('EXTRATORRENT')
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        scraper = cfscrape.create_scraper()
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year
        if 'tvshowtitle' in data:
            query = '%s %s' % (title, hdlr)
            query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)
            url = self.search_link % quote_plus(query)
        else:
            url = self.search_link % data['imdb']
        url = urljoin(self.base_link, url)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)
        r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
        posts = client.parseDOM(r, 'div', attrs={'class': 'tgxtable'})
        if not posts: return sources
    except:
        source_utils.scraper_error('TORRENTGALAXY')
        return sources
    for post in posts:
        try:
            links = zip(
                re.findall(r'href\s*=\s*["\'](magnet:[^"\']+)["\']', post, re.DOTALL | re.I),
                re.findall(r'<span\s*class\s*=\s*["\']badge\s*badge-secondary["\']\s*style\s*=\s*["\']border-radius:4px;["\']>(.*?)</span>', post, re.DOTALL | re.I),
                re.findall(r'<span\s*title\s*=\s*["\']Seeders/Leechers["\']>\[<font\s*color\s*=\s*["\']green["\']><b>(.*?)<', post, re.DOTALL | re.I))
            for link in links:
                url = unquote_plus(link[0]).split('&tr')[0].replace(' ', '.')
                url = source_utils.strip_non_ascii_and_unprintable(url)
                hash = re.compile(r'btih:(.*?)&', re.I).findall(url)[0]
                name = url.split('&dn=')[1]
                name = source_utils.clean_name(name)
                if not source_utils.check_title(title, aliases, name, hdlr, year): continue
                name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
                if source_utils.remove_lang(name_info): continue
                if not episode_title: # filter for eps returned in movie query (rare but movie and show exists for Run in 2020)
                    ep_strings = [r'[.-]s\d{2}e\d{2}([.-]?)', r'[.-]s\d{2}([.-]?)', r'[.-]season[.-]?\d{1,2}[.-]?']
                    if any(re.search(item, name.lower()) for item in ep_strings): continue
                try:
                    seeders = int(link[2])
                    if self.min_seeders > seeders: continue
                except:
                    seeders = 0
                quality, info = source_utils.get_release_quality(name_info, url)
                try:
                    dsize, isize = source_utils._size(link[1])
                    info.insert(0, isize)
                except:
                    dsize = 0
                info = ' | '.join(info)
                sources.append({'provider': 'torrentgalaxy', 'source': 'torrent', 'seeders': seeders, 'hash': hash, 'name': name, 'name_info': name_info,
                                'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('TORRENTGALAXY')
    return sources
def get_sources(self, url):
    try:
        r = py_tools.ensure_str(self.scraper.get(url).content, errors='replace')
        if not r: return
        rows = client.parseDOM(r, 'tr', attrs={'class': 'tlr'})
        rows += client.parseDOM(r, 'tr', attrs={'class': 'tlz'})
    except:
        source_utils.scraper_error('EXTRATORRENT')
        return
    for row in rows:
        try:
            url = re.search(r'href\s*=\s*["\'](magnet:[^"\']+)["\']', row, re.I).group(1)
            url = unquote_plus(url).replace('&amp;', '&').replace(' ', '.').split('&tr')[0] # decode the HTML entity before splitting params
            url = source_utils.strip_non_ascii_and_unprintable(url)
            if url in str(self.sources): continue
            hash = re.search(r'btih:(.*?)&', url, re.I).group(1)
            name = url.split('&dn=')[1]
            name = source_utils.clean_name(name)
            if not source_utils.check_title(self.title, self.aliases, name, self.hdlr, self.year): continue
            name_info = source_utils.info_from_name(name, self.title, self.year, self.hdlr, self.episode_title)
            if source_utils.remove_lang(name_info): continue
            if not self.episode_title: # filter for eps returned in movie query (rare but movie and show exists for Run in 2020)
                ep_strings = [r'(?:\.|\-)s\d{2}e\d{2}(?:\.|\-|$)', r'(?:\.|\-)s\d{2}(?:\.|\-|$)', r'(?:\.|\-)season(?:\.|\-)\d{1,2}(?:\.|\-|$)']
                if any(re.search(item, name.lower()) for item in ep_strings): continue
            try:
                seeders = int(client.parseDOM(row, 'td', attrs={'class': 'sn'})[0].replace(',', ''))
                if self.min_seeders > seeders: continue
            except:
                seeders = 0
            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                size = re.search(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', row).group(0)
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)
            self.sources.append({'provider': 'extratorrent', 'source': 'torrent', 'seeders': seeders, 'hash': hash, 'name': name, 'name_info': name_info,
                                'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('EXTRATORRENT')