def urlencode(query):  # Dict[str, str] -> str
    """Build a URL query string from a mapping of parameters.

    Every key and value of ``query`` is passed through ``py2_encode`` before
    the mapping is handed to ``_urlencode``; the result is passed through
    ``py2_decode`` and returned.

    :param query: mapping of parameter names to values
    :type query: dict
    :return: the value produced by ``_urlencode``, run through ``py2_decode``
    """
    encoded_query = {}
    for param, arg in query.items():
        encoded_query[py2_encode(param)] = py2_encode(arg)
    return py2_decode(_urlencode(encoded_query))
def reinterpret_windows1252_as_utf8(wrong_text):
    """
    Re-read text as UTF-8 after mapping every character back to a single byte.

    Characters whose code point is listed in ``WINDOWS_1252_GREMLINS`` go
    through ``py2_encode`` with the Windows-1252 codec; every other character
    goes through ``py2_encode`` with Latin-1 and ``'replace'`` error handling.
    The pieces are joined and passed to ``py2_decode`` as UTF-8, again with
    ``'replace'``.

    :param wrong_text: text with problems
    :type wrong_text: str or unicode
    :return: corrected text
    :rtype: str or unicode
    """
    def _to_single_byte(char):
        # Gremlins only exist in Windows-1252; everything else is Latin-1.
        if ord(char) in WINDOWS_1252_GREMLINS:
            return py2_encode(char, 'WINDOWS_1252')
        return py2_encode(char, 'latin-1', 'replace')

    joined = ''.join(_to_single_byte(char) for char in wrong_text)
    return py2_decode(joined, 'utf-8', 'replace')
def reinterpret_latin1_as_windows1252(wrong_text):
    """
    Re-read Latin-1 text as Windows-1252.

    The text is passed through ``py2_encode`` with the Latin-1 codec and the
    result through ``py2_decode`` with the Windows-1252 codec, using
    ``'replace'`` error handling on the decode step.

    :param wrong_text: text with problems
    :type wrong_text: str or unicode
    :return: corrected text
    :rtype: str or unicode
    """
    latin1_form = py2_encode(wrong_text, 'latin-1')
    return py2_decode(latin1_form, 'WINDOWS_1252', 'replace')
def extract_subpage(q, name, torrent, size, seeds, peers, info_hash, referer):
    """Fetch a result's subpage, resolve the final torrent link and queue it.

    The function reads ``provider``, ``client``, ``definition``,
    ``provider_cache``, ``log``, ``Client`` and ``extract_from_page`` from the
    surrounding scope; none of them is defined in this block.

    Args:
        q: Queue-like object; the result tuple is pushed with ``put_nowait``.
        name: Result name, passed through unchanged.
        torrent (str): Subpage URL, optionally followed by ``|`` and extra
            data (cookies/headers for private trackers).
        size: Result size, passed through unchanged.
        seeds: Seed count, passed through unchanged.
        peers: Peer count, passed through unchanged.
        info_hash: Info hash, passed through unchanged.
        referer: Optional value for the ``Referer`` request header.

    The tuple ``(name, info_hash, torrent, size, seeds, peers)`` is put on
    ``q`` and the resolved link is stored in ``provider_cache`` under the
    subpage URL. Nothing is returned.
    """
    try:
        log.debug("[%s] Getting subpage at %s" % (provider, repr(torrent)))
    except Exception as e:
        import traceback
        log.error("[%s] Subpage logging failed with: %s" % (provider, repr(e)))
        # An explicit loop is required: on Python 3 ``map`` is lazy, so
        # ``map(log.debug, ...)`` would never emit the traceback lines.
        for line in traceback.format_exc().split("\n"):
            log.debug(line)

    # New client instance, otherwise it's race conditions all over the place
    subclient = Client()
    subclient.passkey = client.passkey
    headers = {}

    if "subpage_mode" in definition:
        if definition["subpage_mode"] == "xhr":
            headers['X-Requested-With'] = 'XMLHttpRequest'
            headers['Content-Language'] = ''

    if referer:
        headers['Referer'] = referer

    uri = torrent.split('|')  # Split cookies for private trackers
    subclient.open(py2_encode(uri[0]), headers=headers)

    if 'bittorrent' in subclient.headers.get('content-type', ''):
        log.debug('[%s] bittorrent content-type for %s' % (provider, repr(torrent)))
        if len(uri) > 1:  # Stick back cookies if needed
            # NOTE(review): ``torrent`` still contains ``uri[1]`` at this point,
            # so this appears to append it a second time - confirm intended.
            torrent = '%s|%s' % (torrent, uri[1])
    else:
        try:
            torrent = extract_from_page(provider, subclient.content)
            if torrent and not torrent.startswith('magnet') and len(uri) > 1:  # Stick back cookies if needed
                torrent = '%s|%s' % (torrent, uri[1])
        except Exception as e:
            import traceback
            log.error("[%s] Subpage extraction for %s failed with: %s" % (provider, repr(uri[0]), repr(e)))
            for line in traceback.format_exc().split("\n"):
                log.debug(line)

    log.debug("[%s] Subpage torrent for %s: %s" % (provider, repr(uri[0]), torrent))
    ret = (name, info_hash, torrent, size, seeds, peers)

    # Cache this subpage result if another query would need to request same url.
    provider_cache[uri[0]] = torrent
    q.put_nowait(ret)
def _execute(execPath):
    """Run ``execPath`` through the shell and return its decoded stdout.

    Anything written to stderr that is longer than two characters is reported
    through ``debug.logError``; the exit status is not inspected.

    :param execPath: command line to execute
    :type execPath: str or unicode
    :return: the command's standard output as text
    """
    # NOTE(review): shell=True executes the string via the system shell;
    # callers must never pass untrusted input here.
    process = subprocess.Popen(py2_encode(execPath),
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE)
    stdout_value, stderr_value = process.communicate()

    # Decode leniently so that output in an unexpected encoding cannot raise
    # UnicodeDecodeError and hide the command's result.
    stdout_value = stdout_value.decode('utf-8', 'replace')
    stderr_value = stderr_value.decode('utf-8', 'replace')

    # Very short stderr output is ignored (presumably a stray newline).
    if len(stderr_value) > 2:
        debug.logError(stderr_value)
    return stdout_value
def remove_accents(string):
    """
    Strip accents from a string by NFKD-normalizing it and keeping ASCII only.

    Input that is not ``unicode`` is first passed through
    ``normalize_string``. If the ASCII-only form is empty after stripping
    whitespace, the (normalized) input is returned instead.

    :param string: string to remove accents
    :type string: str or unicode
    :return: string without accents
    :rtype: unicode
    """
    if not isinstance(string, unicode):
        string = normalize_string(string)

    decomposed = unicodedata.normalize('NFKD', string)
    ascii_form = py2_encode(decomposed, 'ASCII', 'ignore').strip()

    if ascii_form == u'':
        # Nothing survived the ASCII filter: fall back to the input text.
        return string
    return ascii_form
def reinterpret_latin1_as_utf8(wrong_text):
    """
    Re-read Latin-1 text as UTF-8.

    The text is passed through ``py2_encode`` with the Latin-1 codec and the
    result through ``py2_decode`` with the UTF-8 codec; both steps use
    ``'replace'`` error handling.

    :param wrong_text: text with problems
    :type wrong_text: str or unicode
    :return: corrected text
    :rtype: str or unicode
    """
    return py2_decode(
        py2_encode(wrong_text, 'latin-1', 'replace'),
        'utf-8',
        'replace',
    )
def process(provider, generator, filtering, has_special, verify_name=True, verify_size=True, skip_auth=False,
            start_time=None, timeout=None):
    """
    Method for processing provider results using its generator and Filtering class instance

    For every (query, extra) pair in ``filtering`` the search URL and the
    POST/GET payloads are built, a token and/or login is obtained when the
    provider definition asks for one, the search request is performed and the
    generated results are appended to ``filtering.results``.

    Args:
        provider        (str): Provider ID
        generator  (function): Generator method, can be either ``extract_torrents`` or ``extract_from_api``
        filtering (Filtering): Filtering class instance
        has_special    (bool): Whether title contains special chars
        verify_name    (bool): Whether to double-check the results' names match the query or not
        verify_size    (bool): Whether to check the results' file sizes
        skip_auth      (bool): Initial value of the "logged in" flag for providers with a
                               ``login_object``; when true the login request is not performed
        start_time    (float): Start of the overall task as a ``time.time()`` value, or None
        timeout       (float): Overall task budget in seconds, or None; once fewer than
                               3 seconds remain, the remaining queries are skipped

    Returns:
        ``filtering.results`` after all queries were processed, or as it stands
        when a token or login step fails.
    """
    log.debug("[%s] execute_process for %s with %s" % (provider, provider, repr(generator)))
    definition = definitions[provider]
    definition = get_alias(definition, get_setting("%s_alias" % provider))

    client = Client(info=filtering.info, request_charset=definition['charset'],
                    response_charset=definition['response_charset'])
    # These three survive across queries so a token/login is obtained only once.
    token = None
    logged_in = False
    token_auth = False

    if get_setting('kodi_language', bool):
        kodi_language = xbmc.getLanguage(xbmc.ISO_639_1)
        if kodi_language:
            filtering.kodi_language = kodi_language
        language_exceptions = get_setting('language_exceptions')
        if language_exceptions.strip().lower():
            filtering.language_exceptions = re.split(r',\s?', language_exceptions)

    log.debug("[%s] Queries: %s" % (provider, filtering.queries))
    log.debug("[%s] Extras:  %s" % (provider, filtering.extras))

    for query, extra in zip(filtering.queries, filtering.extras):
        log.debug("[%s] Before keywords - Query: %s - Extra: %s" % (provider, repr(query), repr(extra)))
        if has_special:
            # Removing quotes, surrounding {title*} keywords, when title contains special chars
            query = re.sub("[\"']({title.*?})[\"']", '\\1', query)

        query = filtering.process_keywords(provider, query)
        extra = filtering.process_keywords(provider, extra)

        if not query:
            continue
        elif extra == '-' and filtering.results:
            continue
        elif start_time and timeout and time.time() - start_time + 3 >= timeout:
            # Stop doing requests if there is 3 seconds left for the overall task
            continue

        try:
            if 'charset' in definition and definition['charset'] and 'utf' not in definition['charset'].lower():
                query = quote(query.encode(definition['charset']))
                extra = quote(extra.encode(definition['charset']))
            else:
                query = quote(py2_encode(query))
                extra = quote(py2_encode(extra))
        except Exception as e:
            # Best effort: continue with the unquoted values.
            log.debug("[%s] Could not quote the query (%s): %s" % (provider, query, e))

        log.debug("[%s] After keywords  - Query: %s - Extra: %s" % (provider, repr(query), repr(extra)))
        if not query:
            return filtering.results

        url_search = filtering.url.replace('QUERY', query)
        if extra and extra != '-':
            url_search = url_search.replace('EXTRA', extra)
        else:
            url_search = url_search.replace('EXTRA', '')

        url_search = url_search.replace(' ', definition['separator'])
        if definition['separator'] != '%20':
            url_search = url_search.replace('%20', definition['separator'])

        # MagnetDL fix...
        url_search = url_search.replace('FIRSTLETTER', query[:1])

        # Creating the payload for POST method
        if 'post_data' in definition and not filtering.post_data:
            # NOTE(review): eval() of a definition string - definitions must come
            # from a trusted source.
            filtering.post_data = eval(definition['post_data'])

        payload = dict()
        for key, value in iteritems(filtering.post_data):
            if 'QUERY' in value:
                payload[key] = filtering.post_data[key].replace('QUERY', query)
            else:
                payload[key] = filtering.post_data[key]
            payload[key] = urllib.unquote(payload[key])

        # Creating the payload for GET method
        headers = None
        data = None
        if filtering.get_data:
            data = dict()
            for key, value in iteritems(filtering.get_data):
                if 'QUERY' in value:
                    data[key] = filtering.get_data[key].replace('QUERY', query)
                else:
                    data[key] = filtering.get_data[key]

        log.debug("-   %s query: %s" % (provider, repr(query)))
        log.debug("--  %s url_search before token: %s" % (provider, repr(url_search)))
        log.debug("--- %s using POST payload: %s" % (provider, repr(payload)))
        log.debug("----%s filtering with post_data: %s" % (provider, repr(filtering.post_data)))

        # Set search's "title" in filtering to double-check results' names
        if 'filter_title' in definition and definition['filter_title']:
            filtering.filter_title = True
            filtering.title = query

        if 'initial_url' in definition and definition['initial_url']:
            url = definition['initial_url']
            if not url.startswith('http'):
                url = definition['root_url'] + url
            client.open(url)

        if token:
            log.info('[%s] Reusing existing token' % provider)
            url_search = url_search.replace('TOKEN', token)
        elif 'token' in definition:
            token_url = definition['base_url'] + definition['token']
            log.debug("[%s] Getting token for %s at %s" % (provider, provider, repr(token_url)))
            client.open(py2_encode(token_url))
            try:
                token_data = json.loads(client.content)
            except Exception:
                # Missing or malformed token response: give up on this provider.
                log.error('%s: Failed to get token for %s' % (provider, repr(url_search)))
                return filtering.results
            log.debug("[%s] Token response for %s: %s" % (provider, provider, repr(token_data)))
            if 'token' in token_data:
                token = token_data['token']
                log.debug("[%s] Got token for %s: %s" % (provider, provider, repr(token)))
                url_search = url_search.replace('TOKEN', token)
            else:
                log.warning('%s: Unable to get token for %s' % (provider, repr(url_search)))

        if logged_in:
            log.info("[%s] Reusing previous login" % provider)
        elif token_auth:
            log.info("[%s] Reusing previous token authorization" % provider)
        elif 'private' in definition and definition['private']:
            username = get_setting('%s_username' % provider, unicode)
            password = get_setting('%s_password' % provider, unicode)
            passkey = get_setting('%s_passkey' % provider, unicode)
            if not username and not password and not passkey:
                # No credentials stored for this provider: try to import them
                # from the settings of the script.magnetic.* add-ons.
                for addon_name in ('script.magnetic.%s' % provider, 'script.magnetic.%s-mc' % provider):
                    for setting in ('username', 'password'):
                        try:
                            value = xbmcaddon.Addon(addon_name).getSetting(setting)
                            set_setting('%s_%s' % (provider, setting), value)
                            if setting == 'username':
                                username = value
                            if setting == 'password':
                                password = value
                        except Exception:
                            # Deliberately best effort (presumably the add-on
                            # is simply not installed).
                            pass

            if username:
                client.username = username
                url_search = url_search.replace('USERNAME', username)

            if passkey:
                logged_in = True
                client.passkey = passkey
                url_search = url_search.replace('PASSKEY', passkey)

            elif 'login_object' in definition and definition['login_object']:
                login_object = None
                login_headers = None
                logged_in = skip_auth

                try:
                    login_object = definition['login_object'].replace('USERNAME', 'u"%s"' % username).replace('PASSWORD', 'u"%s"' % password)
                except Exception as e:
                    log.error("Could not make login object for %s: %s" % (provider, e))
                try:
                    if 'login_headers' in definition and definition['login_headers']:
                        login_headers = eval(definition['login_headers'])
                except Exception as e:
                    log.error("Could not make login headers for %s: %s" % (provider, e))

                # TODO generic flags in definitions for those...
                if 'csrf_token' in definition and definition['csrf_token']:
                    client.open(definition['root_url'] + definition['login_path'])
                    if client.content:
                        csrf_token = re.search(r'name=\"_?csrf_token\" value=\"(.*?)\"', client.content)
                        if csrf_token:
                            login_object = login_object.replace('CSRF_TOKEN', '"%s"' % csrf_token.group(1))
                        else:
                            logged_in = True

                if 'token_auth' in definition:
                    # log.debug("[%s] logging in with: %s" % (provider, login_object))
                    if client.open(definition['root_url'] + definition['token_auth'], post_data=eval(login_object)):
                        try:
                            token_data = json.loads(client.content)
                        except Exception:
                            log.error('%s: Failed to get token from %s' % (provider, definition['token_auth']))
                            return filtering.results
                        log.debug("[%s] Token response for %s: %s" % (provider, provider, repr(token_data)))
                        if 'token' in token_data:
                            client.token = token_data['token']
                            log.debug("[%s] Auth token for %s: %s" % (provider, provider, repr(client.token)))
                        else:
                            log.error('[%s] Unable to get auth token for %s' % (provider, repr(url_search)))
                            return filtering.results
                        log.info('[%s] Token auth successful' % provider)
                        token_auth = True
                    else:
                        log.error("[%s] Token auth failed with response: %s" % (provider, repr(client.content)))
                        return filtering.results
                elif not logged_in and client.login(definition['root_url'], definition['login_path'],
                                                    eval(login_object), login_headers, definition['login_failed']):
                    log.info('[%s] Login successful' % provider)
                    logged_in = True
                elif not logged_in:
                    log.error("[%s] Login failed: %s", provider, client.status)
                    log.debug("[%s] Failed login content: %s", provider, repr(client.content))
                    return filtering.results

                if logged_in:
                    if provider == 'hd-torrents':
                        client.open(definition['root_url'] + '/torrents.php')
                        csrf_token = re.search(r'name="csrfToken" value="(.*?)"', client.content or '')
                        if csrf_token:
                            url_search = url_search.replace("CSRF_TOKEN", csrf_token.group(1))
                        else:
                            # Previously this raised AttributeError on a failed
                            # match and aborted the whole provider run.
                            log.warning("[%s] Could not find csrfToken in torrents page" % provider)
                    client.save_cookies()

        log.info("[%s] >  %s search URL: %s" % (provider, definition['name'].rjust(longest), url_search))

        if 'headers' in definition and definition['headers']:
            headers = eval(definition['headers'])
            log.info("[%s] >  %s headers: %s" % (provider, definition['name'].rjust(longest), headers))

        client.open(py2_encode(url_search), post_data=payload, get_data=data, headers=headers)
        filtering.results.extend(
            generate_payload(provider,
                             generator(provider, client),
                             filtering,
                             verify_name,
                             verify_size))
    return filtering.results
def extract_torrents(provider, client):
    """ Main torrent extraction generator for non-API based providers

    The page held in ``client.content`` is parsed and the provider's parser
    expressions (obtained with ``get_search_query``) are evaluated with
    ``eval`` to pull each field out of every result row. When the definition
    enables ``subpage``, the final links are resolved in one thread per result
    and yielded after all threads have finished.

    Args:
        provider  (str): Provider ID
        client (Client): Client class instance

    Yields:
        tuple: A torrent result as ``(name, info_hash, torrent, size, seeds, peers)``
    """
    definition = definitions[provider]
    definition = get_alias(definition, get_setting("%s_alias" % provider))
    log.debug("[%s] Extracting torrents from %s using definitions: %s" % (provider, provider, repr(definition)))

    if not client.content:
        if debug_parser:
            log.debug("[%s] Parser debug | Page content is empty" % provider)

        # A plain return ends the generator; ``raise StopIteration`` inside a
        # generator is turned into RuntimeError by PEP 479 (Python 3.7+).
        return

    # NOTE: ``dom``, ``key`` and ``item`` must keep these exact names - they
    # are presumably referenced by the eval()'d parser expressions below.
    dom = Html().feed(client.content)

    key_search = get_search_query(definition, "key")
    row_search = get_search_query(definition, "row")
    name_search = get_search_query(definition, "name")
    torrent_search = get_search_query(definition, "torrent")
    info_hash_search = get_search_query(definition, "infohash")
    size_search = get_search_query(definition, "size")
    seeds_search = get_search_query(definition, "seeds")
    peers_search = get_search_query(definition, "peers")
    referer_search = get_search_query(definition, "referer")

    log.debug("[%s] Parser: %s" % (provider, repr(definition['parser'])))

    q = Queue()
    threads = []
    needs_subpage = 'subpage' in definition and definition['subpage']

    if needs_subpage:
        def extract_subpage(q, name, torrent, size, seeds, peers, info_hash, referer):
            """Fetch one result's subpage, resolve its torrent link and queue the result."""
            try:
                log.debug("[%s] Getting subpage at %s" % (provider, repr(torrent)))
            except Exception as e:
                import traceback
                log.error("[%s] Subpage logging failed with: %s" % (provider, repr(e)))
                # Explicit loop: ``map`` is lazy on Python 3 and would log nothing.
                for line in traceback.format_exc().split("\n"):
                    log.debug(line)

            # New client instance, otherwise it's race conditions all over the place
            subclient = Client()
            subclient.passkey = client.passkey
            headers = {}

            if "subpage_mode" in definition:
                if definition["subpage_mode"] == "xhr":
                    headers['X-Requested-With'] = 'XMLHttpRequest'
                    headers['Content-Language'] = ''

            if referer:
                headers['Referer'] = referer

            uri = torrent.split('|')  # Split cookies for private trackers
            subclient.open(py2_encode(uri[0]), headers=headers)

            if 'bittorrent' in subclient.headers.get('content-type', ''):
                log.debug('[%s] bittorrent content-type for %s' % (provider, repr(torrent)))
                if len(uri) > 1:  # Stick back cookies if needed
                    # NOTE(review): ``torrent`` still contains ``uri[1]`` here, so
                    # this appears to append it a second time - confirm intended.
                    torrent = '%s|%s' % (torrent, uri[1])
            else:
                try:
                    torrent = extract_from_page(provider, subclient.content)
                    if torrent and not torrent.startswith('magnet') and len(uri) > 1:  # Stick back cookies if needed
                        torrent = '%s|%s' % (torrent, uri[1])
                except Exception as e:
                    import traceback
                    log.error("[%s] Subpage extraction for %s failed with: %s" % (provider, repr(uri[0]), repr(e)))
                    for line in traceback.format_exc().split("\n"):
                        log.debug(line)

            log.debug("[%s] Subpage torrent for %s: %s" % (provider, repr(uri[0]), torrent))
            ret = (name, info_hash, torrent, size, seeds, peers)

            # Cache this subpage result if another query would need to request same url.
            provider_cache[uri[0]] = torrent
            q.put_nowait(ret)

    if not dom:
        if debug_parser:
            log.debug("[%s] Parser debug | Could not parse DOM from page content" % provider)

        return

    if debug_parser:
        log.debug("[%s] Parser debug | Page content: %s" % (provider, client.content.replace('\r', '').replace('\n', '')))

    key = eval(key_search) if key_search else ""
    if key_search and debug_parser:
        key_str = key.__str__()
        log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'key', key_search, key_str.replace('\r', '').replace('\n', '')))

    items = eval(row_search)
    if debug_parser:
        log.debug("[%s] Parser debug | Matched %d items for '%s' query '%s'" % (provider, len(items), 'row', row_search))

    for item in items:
        if debug_parser:
            item_str = item.__str__()
            log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'row', row_search, item_str.replace('\r', '').replace('\n', '')))

        if not item:
            continue

        try:
            name = eval(name_search) if name_search else ""
            torrent = eval(torrent_search) if torrent_search else ""
            size = eval(size_search) if size_search else ""
            seeds = eval(seeds_search) if seeds_search else ""
            peers = eval(peers_search) if peers_search else ""
            info_hash = eval(info_hash_search) if info_hash_search else ""
            referer = eval(referer_search) if referer_search else ""

            # Drop anything that precedes the magnet link itself.
            if 'magnet:?' in torrent:
                torrent = torrent[torrent.find('magnet:?'):]

            if debug_parser:
                log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'name', name_search, name))
                log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'torrent', torrent_search, torrent))
                log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'size', size_search, size))
                log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'seeds', seeds_search, seeds))
                log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'peers', peers_search, peers))
                if info_hash_search:
                    log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'info_hash', info_hash_search, info_hash))
                if referer_search:
                    # Label corrected: this line reports the referer query, not info_hash.
                    log.debug("[%s] Parser debug | Matched '%s' iteration for query '%s': %s" % (provider, 'referer', referer_search, referer))

            # Pass client cookies with torrent if private
            if not torrent.startswith('magnet'):
                user_agent = USER_AGENT

                if client.passkey:
                    torrent = torrent.replace('PASSKEY', client.passkey)
                elif client.token:
                    headers = {'Authorization': client.token, 'User-Agent': user_agent}
                    log.debug("[%s] Appending headers: %s" % (provider, repr(headers)))
                    torrent = append_headers(torrent, headers)
                    log.debug("[%s] Torrent with headers: %s" % (provider, repr(torrent)))
                else:
                    parsed_url = urlparse(torrent.split('|')[0])
                    cookie_domain = '{uri.netloc}'.format(uri=parsed_url)
                    cookie_domain = re.sub(r'www\d*\.', '', cookie_domain)
                    cookies = []
                    for cookie in client._cookies:
                        if cookie_domain in cookie.domain:
                            cookies.append(cookie)
                    headers = {}
                    if cookies:
                        headers = {'User-Agent': user_agent}
                        if client.request_headers:
                            headers.update(client.request_headers)
                        if client.url:
                            headers['Referer'] = client.url
                            headers['Origin'] = client.url
                        # Need to set Cookie afterwards to avoid rewriting it with session Cookies
                        headers['Cookie'] = ";".join(["%s=%s" % (c.name, c.value) for c in cookies])
                    else:
                        headers = {'User-Agent': user_agent}

                    torrent = append_headers(torrent, headers)

            if name and torrent and needs_subpage and not torrent.startswith('magnet'):
                if not torrent.startswith('http'):
                    torrent = definition['root_url'] + py2_encode(torrent)
                # Check if this url was previously requested, to avoid doing same job again.
                uri = torrent.split('|')
                if uri and uri[0] and uri[0] in provider_cache and provider_cache[uri[0]]:
                    yield (name, info_hash, provider_cache[uri[0]], size, seeds, peers)
                    continue
                t = Thread(target=extract_subpage, args=(q, name, torrent, size, seeds, peers, info_hash, referer))
                threads.append(t)
            else:
                yield (name, info_hash, torrent, size, seeds, peers)
        except Exception as e:
            # One broken row must not abort the remaining rows.
            log.error("[%s] Got an exception while parsing results: %s" % (provider, repr(e)))

    if needs_subpage:
        log.debug("[%s] Starting subpage threads..." % provider)
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # All workers have finished, so the queue size is final here.
        for i in range(q.qsize()):
            ret = q.get_nowait()
            log.debug("[%s] Queue %d got: %s" % (provider, i, repr(ret)))
            yield ret
    # Save cookies in cookie jar
    client.save_cookies()
def strftime(format, t=None):
    """Format a time value with ``_strftime``, converting text on the way in and out.

    ``format`` is passed through ``py2_encode`` before the call and the result
    through ``py2_decode`` afterwards.

    :param format: format string
    :type format: str or unicode
    :param t: optional time value to format; when omitted, ``_strftime`` is
        called with the format only
    :return: the formatted time text
    """
    encoded_format = py2_encode(format)
    if t is None:
        return py2_decode(_strftime(encoded_format))
    # ``t`` used to be accepted but silently ignored; forward it.
    # NOTE(review): assumes ``_strftime`` takes the time value as an optional
    # second positional argument, as ``time.strftime`` does - confirm.
    return py2_decode(_strftime(encoded_format, t))
def parse_qs(qs):  # str -> Dict[str, List[str]]
    """Parse a query string into a mapping of parameter names to value lists.

    ``qs`` is passed through ``py2_encode`` and handed to ``_parse_qs``; every
    key and every value of the result is passed through ``py2_decode``.

    :param qs: query string to parse
    :type qs: str
    :return: mapping of each parameter name to its list of values
    :rtype: dict
    """
    decoded = {}
    for param, args in _parse_qs(py2_encode(qs)).items():
        decoded[py2_decode(param)] = [py2_decode(a) for a in args]
    return decoded