def __get_dom_content(html, name, match): try: if match.endswith('/>'): return '' # override tag name with tag from match if possible tag = re.match(r'<([^\s/>]+)', match) if tag: name = tag.group(1) start_str = '<%s' % name end_str = "</%s" % name # start/end tags without matching case cause issues start = html.find(match) end = html.find(end_str, start) pos = html.find(start_str, start + 1) while pos < end and pos != -1: # Ignore too early </endstr> return tend = html.find(end_str, end + len(end_str)) if tend != -1: end = tend pos = html.find(start_str, pos + 1) if start == -1 and end == -1: result = '' elif start > -1 and end > -1: result = html[start + len(match):end] elif end > -1: result = html[:end] elif start > -1: result = html[start + len(match):] else: result = '' return result except: from fenomscrapers.modules import log_utils log_utils.error() return ''
def sources(specified_folders=None):
    """Load every enabled scraper module and return [(module_name, source_instance), ...].

    :param specified_folders: optional list of sub-folder names to restrict the scan to.
    :returns: list of (module_name, module.source()) tuples; [] on any top-level failure.
    """
    try:
        sourceDict = []
        sourceFolder = getScraperFolder()
        sourceFolderLocation = os.path.join(os.path.dirname(__file__), sourceFolder)
        # The first tuple yielded by os.walk() lists the immediate sub-directories.
        sourceSubFolders = [x[1] for x in os.walk(sourceFolderLocation)][0]
        sourceSubFolders = [x for x in sourceSubFolders if '__pycache__' not in x]
        if specified_folders: sourceSubFolders = specified_folders
        for i in sourceSubFolders:
            for loader, module_name, is_pkg in walk_packages([os.path.join(sourceFolderLocation, i)]):
                if is_pkg: continue
                if enabledCheck(module_name):
                    try:
                        # NOTE(review): loader.find_module()/load_module() is deprecated and
                        # removed in Python 3.12 -- confirm the target Kodi runtime before porting.
                        module = loader.find_module(module_name).load_module(module_name)
                        sourceDict.append((module_name, module.source()))
                    except Exception as e:
                        # Per-module failures are logged (when debug) and skipped.
                        if debug:
                            from fenomscrapers.modules import log_utils
                            log_utils.log('Error: Loading module: "%s": %s' % (module_name, e), level=log_utils.LOGWARNING)
        return sourceDict
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return []
def remove_lang(release_info):
    """Return True when *release_info* marks an undesirable release.

    Checks, in order: dubbed markers, subbed markers, the user's
    "undesirables" list, foreign-single-audio markers without any English
    marker, and orphan .srt releases.
    """
    if not release_info:
        return False
    try:
        def _has_any(tokens):
            return any(token in release_info for token in tokens)

        if _has_any(DUBBED) or _has_any(SUBS):
            return True
        if home_getProperty('fenom.filter.undesirables') == 'true':
            if _has_any(get_undesirables()):
                return True
        if home_getProperty('fenom.filter.foreign.single.audio') == 'true':
            english = _has_any(['.eng.', '.en.', 'english'])
            if not english and _has_any(LANG):
                return True
            if not english and _has_any(ABV_LANG):
                return True
        if release_info.endswith('.srt.'):
            if not _has_any(['with.srt', '.avi', '.mkv', '.mp4']):
                return True
        return False
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return False
def get(function, duration, *args):
    """
    :param function: Function to be executed
    :param duration: Duration of validity of cache in hours
    :param args: Optional arguments for the provided function
    """
    try:
        key = _hash_function(function, args)
        cache_result = cache_get(key)
        if cache_result:
            result = literal_eval(cache_result['value'])
            # Serve straight from cache while still inside the validity window.
            if _is_cache_valid(cache_result['date'], duration):
                return result
        # Cache miss or stale: run the function and cache its repr().
        fresh_result = repr(function(*args))
        invalid = False
        try:  # Sometimes None is returned as a string instead of None type for "fresh_result"
            if not fresh_result: invalid = True
            elif fresh_result == 'None' or fresh_result == '' or fresh_result == '[]' or fresh_result == '{}': invalid = True
            elif len(fresh_result) == 0: invalid = True
        except: pass
        if invalid:  # If the cache is old, but we didn't get "fresh_result", return the old cache
            if cache_result: return result
            else: return None
        else:
            cache_insert(key, fresh_result)
            return literal_eval(fresh_result)
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return None
def enabledCheck(module_name):
    """Return True when the 'provider.<module_name>' setting is 'true' (True on any error)."""
    try:
        return getSetting('provider.' + module_name) == 'true'
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return True
def getScraperFolder():
    """Locate the scraper sub-folder (name containing 'fenomscrapers') next to this module."""
    try:
        subdirs = next(os.walk(os.path.dirname(__file__)))[1]
        return next(d for d in subdirs if 'fenomscrapers' in d.lower())
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return 'sources_fenomscrapers'
def strip_non_ascii_and_unprintable(text):
    """Drop every character that is not both printable and ASCII.

    On any failure the input is returned unchanged.
    """
    try:
        kept = [ch for ch in text if ch in printable]
        return ''.join(kept).encode('ascii', errors='ignore').decode('ascii', errors='ignore')
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return text
def six_decode(txt, char='utf-8'):
    """Decode *txt* from bytes to str on Python 3; anything else passes through unchanged."""
    try:
        if isPY3 and isinstance(txt, binary_type):
            return txt.decode(char)
        return txt
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return txt
def ensure_text(s, encoding='utf-8', errors='strict'):
    """Coerce *s* to text: decode bytes with *encoding*, pass str through.

    NOTE(review): any other input type falls through both branches and
    returns None implicitly -- confirm callers only ever pass str/bytes.
    """
    try:
        if isinstance(s, binary_type):
            return s.decode(encoding, errors)
        elif isinstance(s, text_type):
            return s
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return s
def normalize(title):
    """Strip diacritics from *title* via NFKD decomposition (combining marks removed)."""
    try:
        decomposed = unicodedata.normalize('NFKD', py_tools.ensure_text(py_tools.ensure_str(title)))
        without_marks = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
        return str(without_marks)
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return title
def parseJSString(self, s):
    """Evaluate an obfuscated JS arithmetic string (JSFuck-style) and return its int value.

    Returns None on failure (exception path has no explicit return).
    """
    try:
        # A leading '+' is a JS unary plus; skip it after translation.
        offset = 1 if s[0] == '+' else 0
        translated = (s.replace('!+[]', '1')
                       .replace('!![]', '1')
                       .replace('[]', '0')
                       .replace('(', 'str('))
        # NOTE: eval() on remote-derived input -- only used for these narrow
        # arithmetic challenge strings; do not widen its use.
        return int(eval(translated[offset:]))
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
def release_title_format(release_title):
    """Normalise a release title to the dotted form '.word.word.' used for token matching."""
    try:
        lowered = release_title.lower().replace("'", "").lstrip('.').rstrip('.')
        dotted = re.sub(r'[^a-z0-9-~]+', '.', lowered)
        # Collapse dot/hyphen seams left by the substitution.
        for seam, joined in (('.-.', '-'), ('-.', '-'), ('.-', '-'), ('--', '-')):
            dotted = dotted.replace(seam, joined)
        return '.%s.' % dotted
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return release_title
def check_title(title, aliases, release_title, hdlr, year, years=None):
    """Verify that *release_title* plausibly matches *title* (or one of its aliases).

    :param aliases: JSON string of alias dicts (parsed via jsloads), or falsy.
    :param hdlr: episode tag ('S01E02') or quality/year handler token.
    :param years: list of acceptable year strings (movies only; None for episodes).
    :returns: True when the cleaned release-name prefix equals a candidate title
        and the release carries the expected year (movies) or hdlr (episodes).
    """
    try: aliases = aliases_to_array(jsloads(aliases))
    except: aliases = None
    title_list = []
    title_list_append = title_list.append
    if aliases:
        # Build the candidate list from aliases, normalised the same way as title.
        for item in aliases:
            try:
                alias = item.replace('!', '').replace('(', '').replace(')', '').replace('&', 'and').replace(year, '')
                # alias = re.sub(r'[^A-Za-z0-9\s\.-]+', '', alias)
                if years:  # for movies only, scraper to pass None for episodes
                    for i in years: alias = alias.replace(i, '')
                if alias in title_list: continue
                title_list_append(alias)
            except:
                from fenomscrapers.modules import log_utils
                log_utils.error()
    try:
        match = True
        title = title.replace('!', '').replace('(', '').replace(')', '').replace('&', 'and')
        # title = re.sub(r'[^A-Za-z0-9\s\.-]+', '', title)
        title_list_append(title)
        release_title = release_title_format(release_title)  # converts to .lower()
        h = hdlr.lower()
        # Everything before the hdlr token, minus year/bracket/ampersand noise,
        # is treated as the title portion of the release name.
        t = release_title.split(h)[0].replace(year, '').replace('(', '').replace(')', '').replace('&', 'and')
        if years:
            for i in years: t = t.split(i)[0]
        # Also cut at any quality token that precedes the hdlr.
        t = t.split('2160p')[0].split('4k')[0].split('1080p')[0].split('720p')[0]
        if all(cleantitle.get(i) != cleantitle.get(t) for i in title_list): match = False
        if years:  # for movies only, scraper to pass None for episodes
            if not any(value in release_title for value in years): match = False
        else:
            if h not in release_title: match = False
        return match
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return match
def parseDOM(html, name='', attrs=None, ret=False):
    """Parse *html* for tags *name* matching *attrs*.

    :param ret: when it names an attribute, that attribute's values are
        returned; otherwise the tags' inner contents are returned.
    """
    try:
        if attrs:
            # Anchor each non-empty attribute value so it must match exactly.
            attrs = dict((key, re.compile(value + ('$' if value else ''))) for key, value in py_tools.iteritems(attrs))
        found = dom_parser.parse_dom(html, name, attrs, ret)
        if ret:
            return [element.attrs[ret.lower()] for element in found]
        return [element.content for element in found]
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
def __get_dom_elements(item, name, attrs):
    """Find opening tags named *name* in *item* whose attributes satisfy *attrs*.

    :param attrs: dict of attribute name -> wanted value; a value may be a str,
        a list of class-like tokens, or a compiled regex. With several attrs
        the result is the intersection of the per-attribute matches.
    :returns: list of matching opening-tag strings.
    """
    try:
        if not attrs:
            # No attribute filter: any opening tag of this name matches.
            pattern = r'(<%s(?:\s[^>]*>|/?>))' % name
            this_list = re.findall(pattern, item, re.M | re.S | re.I)
        else:
            last_list = None
            for key, value in py_tools.iteritems(attrs):
                value_is_regex = isinstance(value, re_type)
                value_is_str = isinstance(value, py_tools.string_types)
                # First pass: attribute value enclosed in matching quotes.
                pattern = r'''(<{tag}[^>]*\s{key}=(?P<delim>['"])(.*?)(?P=delim)[^>]*>)'''.format(tag=name, key=key)
                re_list = re.findall(pattern, item, re.M | re.S | re.I)
                if value_is_regex:
                    this_list = [r[0] for r in re_list if re.match(value, r[2])]
                else:
                    # Treat the attribute as a token set (class="a b") and require
                    # every wanted token to be present.
                    temp_value = [value] if value_is_str else value
                    this_list = [r[0] for r in re_list if set(temp_value) <= set(r[2].split(' '))]
                if not this_list:
                    # Second pass: unquoted attribute values -- only sensible when
                    # the wanted value itself contains no spaces.
                    has_space = (value_is_regex and ' ' in value.pattern) or (value_is_str and ' ' in value)
                    if not has_space:
                        pattern = r'''(<{tag}[^>]*\s{key}=((?:[^\s>]|/>)*)[^>]*>)'''.format(tag=name, key=key)
                        re_list = re.findall(pattern, item, re.M | re.S | re.I)
                        if value_is_regex:
                            this_list = [r[0] for r in re_list if re.match(value, r[1])]
                        else:
                            this_list = [r[0] for r in re_list if value == r[1]]
                if last_list is None:
                    last_list = this_list
                else:
                    # Intersect with the matches of previously-processed attributes.
                    last_list = [item for item in this_list if item in last_list]
            this_list = last_list
        return this_list
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return this_list
def aliases_to_array(aliases, filter=None):
    """Flatten alias dicts to a list of titles, optionally restricted to *filter* countries.

    A list that is already plain strings is returned untouched.
    """
    try:
        if all(isinstance(entry, str) for entry in aliases):
            return aliases
        if not filter:
            filter = []
        elif isinstance(filter, str):
            filter = [filter]
        titles = []
        for entry in aliases:
            if not filter or entry.get('country') in filter:
                titles.append(entry.get('title'))
        return titles
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return []
def _basic_request(url, headers=None, post=None, timeout='30', limit=None):
    """Perform a bare urllib request and return the (optionally size-limited) body.

    :param timeout: seconds, as str or int.
    :param limit: KB cap forwarded to _get_result ('0' = small sniff read).
    :returns: response body, or None on any failure.
    """
    try:
        try:
            # No-op update used purely to verify headers is dict-like;
            # None (or anything without .update) falls back to an empty dict.
            headers.update(headers)
        except:
            headers = {}
        req = urllib2.Request(url, data=post)
        _add_request_header(req, headers)
        response = urllib2.urlopen(req, timeout=int(timeout))
        return _get_result(response, limit)
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
def ensure_str(s, encoding='utf-8', errors='strict'):
    """Coerce *s* to the native str type (encode on Py2, decode on Py3).

    NOTE(review): for non-str/bytes input this returns the result of
    log_utils.log(...), i.e. presumably None -- confirm callers tolerate that.
    """
    from fenomscrapers.modules import log_utils
    try:
        if not isinstance(s, (text_type, binary_type)):
            return log_utils.log("not expecting type '%s'" % type(s), __name__, log_utils.LOGDEBUG)
        if isPY2 and isinstance(s, text_type):
            s = s.encode(encoding, errors)
        elif isPY3 and isinstance(s, binary_type):
            s = s.decode(encoding, errors)
        return s
    except:
        log_utils.error()
        return s
def clean_name(release_title):
    """Strip known tracker/site junk prefixes from *release_title*.

    The name is first reduced to printable ASCII and space-to-dot folded,
    then any leading entry from the blocklist below is removed.
    """
    try:
        # Blocklist of site tags that trackers prepend to release names.
        unwanted = [
            '[.www.tamilrockers.com.]', 'tamilrockers.com', 'www.tamilrockers.com', 'www.tamilrockers.ws',
            'www.tamilrockers.pl', '[.www.torrenting.com.]', 'www.torrenting.com', 'www.torrenting.org',
            'www-torrenting-com', 'www-torrenting-org', '[katmoviehd.eu]', '[katmoviehd.to]', '[katmoviehd.tv]',
            '+katmoviehd.pw+', 'katmoviehd-pw', '[.www.torrent9.uno.]', '[www.torrent9.ph.]', 'www.torrent9.nz',
            '[.torrent9.tv.]', '[.torrent9.cz.]', '[ torrent9.cz ]', 'torrent9-cz-.-', '[agusiq.torrents.pl]',
            '[agusiq-torrents.pl]', 'agusiq-torrents-pl', '[.oxtorrent.com.]', '[oxtorrent-com]', 'oxtorrent-com',
            '[movcr.com]', 'www.movcr.tv', 'movcr-com', 'www.movcr.to', '[ex-torrenty.org]', '[xtorrenty.org]',
            'xtorrenty.org', '[acesse.]', '[acesse-hd-elite-me]', '[acesse.hd-elite.me].', '[torrentcouch.net]',
            '[torrentcouch-net]', '[.www.cpasbien.cm.]', '[.www.cpasbien.pw.]', '[auratorrent.pl].nastoletni.wilkoak',
            '[auratorrent.pl]', '[.www.nextorrent.site.]', '[nextorrent.net]', '[www.scenetime.com]',
            'www.scenetime.com', '[kst.vn]', 'kst-vn', '[itfriend]', '[itf]', '(imax)', '.imax.',
            'www.2movierulz.ac', 'www.2movierulz.ms', 'www.3movierulz.com', 'www.3movierulz.tv', '[zooqle.com]',
            '[horriblesubs]', '[gktorrent.com]', '[.www.omgtorrent.com.]', '[3d.hentai]', '[dark.media]',
            '[devil-torrents.pl]', 'mkvcinemas.live', '[filetracker.pl]', 'www.bludv.tv', 'ramin.djawadi',
            '[prof]', '[reup]', '[.www.speed.cd.]', '[-bde4.com]', 'extramovies.casa', '[ah]', '[ul]', '+13.+',
            'taht.oyunlar', 'crazy4tv.com', '[tv]', '[noobsubs]', '[.freecourseweb.com.]', 'karibu', '989pa.com',
            '[aletorrenty.pl]', 'best-torrents-net', '[.www.torrentday.com.]', '1-3-3-8.com', 'ssrmovies.club',
            'www.tamilmv.bid', 'www.1tamilmv.org', '[h3h2.com]']
        if release_title.lower().startswith('rifftrax'):
            return release_title  # removed by "undesirables" anyway so exit
        release_title = strip_non_ascii_and_unprintable(release_title).lstrip('/ ').replace(' ', '.')
        for i in unwanted:
            if release_title.lower().startswith(i):
                # Escape a leading '[' or '+' so the anchored regex stays valid.
                pattern = r'\%s' % i if i.startswith('[') or i.startswith('+') else r'%s' % i
                release_title = re.sub(r'^%s' % pattern, '', release_title, 1, re.I)
        # Trim leftover separator junk the prefix removal may expose.
        release_title = release_title.lstrip(' 4.-[](){}')
        return release_title
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return release_title
def _get_result(response, limit=None):
    """Read the response body, honouring the size cap, and transparently gunzip.

    :param limit: '0' -> 224KB sniff read; other truthy -> that many KB;
        falsy -> 5MB hard ceiling.
    """
    try:
        if limit == '0':
            cap = 224 * 1024  # '0' means "sniff": just enough to inspect the page
        elif limit:
            cap = int(limit) * 1024
        else:
            cap = 5242880  # 5 MB hard ceiling
        body = response.read(cap)
        try:
            content_encoding = response.headers["Content-Encoding"]
        except:
            content_encoding = None
        if content_encoding == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
        return body
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
def info_from_name(release_title, title, year, hdlr=None, episode_title=None, season=None, pack=None):
    """Strip title/year/episode/pack tokens from a release name, leaving only the 'info' tokens.

    :returns: the residue wrapped in dots ('.x264.web.') so callers can use
        '.token.' containment tests; the dotted release name on failure.
    """
    try:
        def _dotted(text):
            # lower-case, fold '&' -> 'and', drop apostrophes, dot-separate tokens
            text = text.lower().replace('&', 'and').replace("'", "")
            return re.sub(r'[^a-z0-9]+', '.', text)

        release_title = _dotted(release_title)
        title = _dotted(title)
        name_info = release_title.replace(title, '').replace(year, '')
        if hdlr:
            name_info = name_info.replace(hdlr.lower(), '')
        if episode_title:
            name_info = name_info.replace(_dotted(episode_title), '')
        if pack == 'season':
            season_fill = season.zfill(2)
            removals = ['.s%s' % season, '.s%s' % season_fill, '.season.%s' % season,
                        '.season%s' % season, '.season.%s' % season_fill,
                        '.season%s' % season_fill, 'complete']
        elif pack == 'show':
            removals = ['.all.seasons', 'seasons', 'season', 'the.complete', 'complete',
                        'all.torrent', 'total.series', 'tv.series', 'series', 'edited', 's1', 's01']
        else:
            removals = []
        for token in removals:
            name_info = name_info.replace(token, '')
        return '.%s.' % name_info.lstrip('.').rstrip('.')
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return release_title
def _size(siz): try: if siz in ['0', 0, '', None]: return 0, '' div = 1 if siz.lower().endswith(('gb', 'gib')) else 1024 # if ',' in siz and siz.lower().endswith(('mb', 'mib')): siz = size.replace(',', '') # elif ',' in siz and siz.lower().endswith(('gb', 'gib')): siz = size.replace(',', '.') # float_size = float(re.sub(r'[^0-9|/.|/,]', '', siz.replace(',', '.'))) / div float_size = float( re.sub(r'[^0-9|/.|/,]', '', siz.replace(',', '')) ) / div #comma issue where 2,750 MB or 2,75 GB (sometimes replace with "." and sometimes not) str_size = '%.2f GB' % float_size return float_size, str_size except: from fenomscrapers.modules import log_utils log_utils.error() return 0, ''
def get_release_quality(release_info, release_link=None):
    """Determine quality from the release info, falling back to the link, then 'SD'.

    :returns: (quality, info) -- info is currently always an empty list.
    """
    try:
        quality = get_qual(release_info) if release_info else None
        if not quality and release_link:
            quality = get_qual(release_link.lower())
        return (quality or 'SD'), []
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return 'SD', []
def __get_attribs(element): try: attribs = {} for match in re.finditer( r'''\s+(?P<key>[^=]+)=\s*(?:(?P<delim>["'])(?P<value1>.*?)(?P=delim)|(?P<value2>[^"'][^>\s]*))''', element): match = match.groupdict() value1 = match.get('value1') value2 = match.get('value2') value = value1 if value1 is not None else value2 if value is None: continue attribs[match['key'].lower().strip()] = value return attribs except: from fenomscrapers.modules import log_utils log_utils.error() return attribs
def get(self, netloc, ua, timeout):
    """Solve the site's JS/XHR cookie challenge and return a cookie string.

    :param netloc: base URL of the site issuing the challenge.
    :param ua: User-Agent to present.
    :param timeout: request timeout (seconds).
    :returns: result of self.getCookieString, False when the challenge markup
        is absent, or None on error.
    """
    try:
        headers = {'User-Agent': ua, 'Referer': netloc}
        result = _basic_request(netloc, headers=headers, timeout=timeout)
        # The challenge page embeds the follow-up request in xhr.open("GET","...").
        match = re.findall(r'xhr\.open\("GET","([^,]+),', result, re.I)
        if not match: return False
        url_Parts = match[0].split('"')
        url_Parts[1] = '1680'  # substitute a fixed value for the dynamic middle segment
        url = urljoin(netloc, ''.join(url_Parts))
        # The rid token from the challenge URL becomes the rcksid cookie value.
        match = re.findall(r'rid\s*?=\s*?([0-9a-zA-Z]+)', url_Parts[0])
        if not match: return False
        headers['Cookie'] = 'rcksid=%s' % match[0]
        result = _basic_request(url, headers=headers, timeout=timeout)
        return self.getCookieString(result, headers['Cookie'])
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
def __top_domain(url): try: #Py2 from urlparse import urlparse except ImportError: #Py3 from urllib.parse import urlparse try: elements = urlparse(url) domain = elements.netloc or elements.path domain = domain.split('@')[-1].split(':')[0] regex = r"(?:www\.)?([\w\-]*\.[\w\-]{2,3}(?:\.[\w\-]{2,3})?)$" res = re.search(regex, domain) if res: domain = res.group(1) domain = domain.lower() return domain except: from fenomscrapers.modules import log_utils log_utils.error()
def get(title):
    """Normalise *title* for fuzzy comparison.

    Lower-cases, repairs/decodes HTML entities, then strips bracketed spans,
    punctuation and whitespace. Returns None for falsy input, and the
    original title on any error.
    """
    try:
        if not title: return
        try: title = py_tools.ensure_str(title)
        except: pass
        title = re.sub(r'&#(\d+);', '', title).lower()
        # Repair numeric entities that lost their trailing ';'.
        title = re.sub(r'(&#[0-9]+)([^;^0-9]+)', '\\1;\\2', title)
        # FIX: the entity names had been mangled into their already-decoded
        # characters (replace('"','\"'), replace('&','&')), turning both
        # replacements into no-ops; restore the intended entity strings.
        title = title.replace('&quot;', '\"').replace('&amp;', '&')
        title = re.sub(
            r'\n|([\[({].+?[})\]])|([:;–\-"\',!_.?~$@])|\s', '', title
        )  # stop trying to remove alpha characters "vs" or "v", they're part of a title
        return title
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return title
def cache_get(key):
    """Fetch the raw cache row for *key*, or None when absent or on any error.

    The connection and cursor are always closed, including on the error path.
    """
    dbcon = None
    dbcur = None
    try:
        dbcon = get_connection()
        dbcur = get_connection_cursor(dbcon)
        # Guard against a fresh database where the cache table does not exist yet.
        ck_table = dbcur.execute(
            '''SELECT * FROM sqlite_master WHERE type='table' AND name='cache';'''
        ).fetchone()
        if not ck_table: return None
        results = dbcur.execute('''SELECT * FROM cache WHERE key=?''', (key, )).fetchone()
        return results
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
        return None
    finally:
        # FIX: guard the close() calls -- if get_connection() itself raised,
        # dbcur/dbcon were never bound and the original finally raised NameError.
        if dbcur: dbcur.close()
        if dbcon: dbcon.close()
def _add_request_header(_request, headers):
    """Attach *headers* to a urllib Request, defaulting Host and Referer.

    Host/Referer are added unredirected so they are not carried across redirects.
    """
    try:
        if not headers: headers = {}
        if py_tools.isPY3:
            scheme = _request.type
            host = _request.host
        else:
            # Py2 urllib2 Request API
            scheme = _request.get_type()
            host = _request.get_host()
        # Default the Referer to the request's own origin when not supplied.
        referer = headers.get('Referer') if 'Referer' in headers else '%s://%s/' % (scheme, host)
        _request.add_unredirected_header('Host', host)
        _request.add_unredirected_header('Referer', referer)
        for key in headers:
            _request.add_header(key, headers[key])
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error()
def _replaceHTMLCodes(txt): try: if not txt: return '' txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt) txt = unescape(txt) txt = txt.replace(""", "\"") txt = txt.replace("&", "&") txt = txt.replace("<", "<") txt = txt.replace(">", ">") txt = txt.replace("&", "&") txt = txt.replace(" ", "") txt = txt.replace('…', '...') txt = txt.replace('’', '\'') txt = txt.replace('–', '-') txt = txt.strip() return txt except: from fenomscrapers.modules import log_utils log_utils.error() return txt