Пример #1
0
def extract_torrents(provider, client):
    """ Main torrent extraction generator for non-API based providers

    Feeds ``client.content`` to ``Html()``, evaluates the provider's parser
    queries (key, row, name, torrent, infohash, size, seeds, peers, referer)
    and yields one result per matched row. When the definition enables
    ``subpage``, non-magnet links are resolved by fetching each result's
    subpage in its own thread; those results are yielded after all threads
    have finished.

    Args:
        provider  (str): Provider ID
        client (Client): Client class instance

    Yields:
        tuple: A torrent result, as
            ``(name, info_hash, torrent, size, seeds, peers)``
    """
    definition = definitions[provider]
    definition = get_alias(definition, get_setting("%s_alias" % provider))
    log.debug("[%s] Extracting torrents from %s using definitions: %s" %
              (provider, provider, repr(definition)))

    # Loop-invariant for a single extraction run, so look it up only once.
    debug_parser = get_setting("use_debug_parser", bool)

    if not client.content:
        if debug_parser:
            log.debug("[%s] Parser debug | Page content is empty" % provider)

        # A plain return ends the generator. Raising StopIteration inside a
        # generator is turned into RuntimeError by PEP 479 (Python 3.7+).
        return

    dom = Html().feed(client.content)

    key_search = get_search_query(definition, "key")
    row_search = get_search_query(definition, "row")
    name_search = get_search_query(definition, "name")
    torrent_search = get_search_query(definition, "torrent")
    info_hash_search = get_search_query(definition, "infohash")
    size_search = get_search_query(definition, "size")
    seeds_search = get_search_query(definition, "seeds")
    peers_search = get_search_query(definition, "peers")
    referer_search = get_search_query(definition, "referer")

    log.debug("[%s] Parser: %s" % (provider, repr(definition['parser'])))

    q = Queue()
    threads = []
    needs_subpage = 'subpage' in definition and definition['subpage']

    if needs_subpage:

        def extract_subpage(q, name, torrent, size, seeds, peers, info_hash,
                            referer):
            """ Thread target: resolve the final torrent link from a subpage
            and put the resulting tuple on ``q``
            """
            try:
                log.debug("[%s] Getting subpage at %s" %
                          (provider, repr(torrent)))
            except Exception as e:
                import traceback
                log.error("[%s] Subpage logging failed with: %s" %
                          (provider, repr(e)))
                # Explicit loop: map() is lazy on Python 3 and would log nothing
                for line in traceback.format_exc().split("\n"):
                    log.debug(line)

            # New client instance, otherwise it's race conditions all over the place
            subclient = Client()
            subclient.passkey = client.passkey
            headers = {}

            if "subpage_mode" in definition:
                if definition["subpage_mode"] == "xhr":
                    headers['X-Requested-With'] = 'XMLHttpRequest'
                    headers['Content-Language'] = ''

            if referer:
                headers['Referer'] = referer

            uri = torrent.split('|')  # Split cookies for private trackers
            subclient.open(uri[0].encode('utf-8'), headers=headers)

            if 'bittorrent' in subclient.headers.get('content-type', ''):
                log.debug('[%s] bittorrent content-type for %s' %
                          (provider, repr(torrent)))
                # NOTE(review): torrent is unchanged here and still carries
                # its '|...' suffix, so this looks like it appends uri[1] a
                # second time - confirm against the consumer before changing.
                if len(uri) > 1:  # Stick back cookies if needed
                    torrent = '%s|%s' % (torrent, uri[1])
            else:
                try:
                    torrent = extract_from_page(provider, subclient.content)
                    if torrent and not torrent.startswith('magnet') and len(
                            uri) > 1:  # Stick back cookies if needed
                        torrent = '%s|%s' % (torrent, uri[1])
                except Exception as e:
                    import traceback
                    log.error(
                        "[%s] Subpage extraction for %s failed with: %s" %
                        (provider, repr(uri[0]), repr(e)))
                    for line in traceback.format_exc().split("\n"):
                        log.debug(line)

            ret = (name, info_hash, torrent, size, seeds, peers)
            q.put_nowait(ret)

    if not dom:
        if debug_parser:
            log.debug(
                "[%s] Parser debug | Could not parse DOM from page content" %
                provider)

        return

    if debug_parser:
        log.debug(
            "[%s] Parser debug | Page content: %s" %
            (provider, client.content.replace('\r', '').replace('\n', '')))

    # SECURITY: the parser queries below are passed to eval() and run with
    # this function's locals in scope, so only trusted provider definitions
    # must be loaded. Presumably the queries reference `dom`, `item` and
    # `key` - keep those local names stable (confirm against the definitions).
    key = eval(key_search) if key_search else ""
    if key_search and debug_parser:
        key_str = key.__str__()
        log.debug(
            "[%s] Parser debug | Matched '%s' iteration for query '%s': %s" %
            (provider, 'key', key_search, key_str.replace('\r', '').replace(
                '\n', '')))

    items = eval(row_search)
    if debug_parser:
        log.debug("[%s] Parser debug | Matched %d items for '%s' query '%s'" %
                  (provider, len(items), 'row', row_search))

    for item in items:
        if debug_parser:
            item_str = item.__str__()
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'row', row_search, item_str.replace(
                    '\r', '').replace('\n', '')))

        if not item:
            continue

        # A failure on one row is logged and must not abort the other rows
        try:
            name = eval(name_search) if name_search else ""
            torrent = eval(torrent_search) if torrent_search else ""
            size = eval(size_search) if size_search else ""
            seeds = eval(seeds_search) if seeds_search else ""
            peers = eval(peers_search) if peers_search else ""
            info_hash = eval(info_hash_search) if info_hash_search else ""
            referer = eval(referer_search) if referer_search else ""

            # Drop anything that precedes the magnet URI itself
            if 'magnet:?' in torrent:
                torrent = torrent[torrent.find('magnet:?'):]

            if debug_parser:
                log.debug(
                    "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                    % (provider, 'name', name_search, name))
                log.debug(
                    "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                    % (provider, 'torrent', torrent_search, torrent))
                log.debug(
                    "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                    % (provider, 'size', size_search, size))
                log.debug(
                    "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                    % (provider, 'seeds', seeds_search, seeds))
                log.debug(
                    "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                    % (provider, 'peers', peers_search, peers))
                if info_hash_search:
                    log.debug(
                        "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                        % (provider, 'info_hash', info_hash_search, info_hash))
                if referer_search:
                    # Label fixed: this entry reports the referer query
                    log.debug(
                        "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                        % (provider, 'referer', referer_search, referer))

            # Pass client cookies with torrent if private
            if not torrent.startswith('magnet'):
                user_agent = USER_AGENT

                if client.passkey:
                    torrent = torrent.replace('PASSKEY', client.passkey)
                elif client.token:
                    headers = {
                        'Authorization': client.token,
                        'User-Agent': user_agent
                    }
                    log.debug("[%s] Appending headers: %s" %
                              (provider, repr(headers)))
                    torrent = append_headers(torrent, headers)
                    log.debug("[%s] Torrent with headers: %s" %
                              (provider, repr(torrent)))
                else:
                    parsed_url = urlparse(torrent.split('|')[0])
                    cookie_domain = '{uri.netloc}'.format(uri=parsed_url)
                    # Raw string: '\d' and '\.' are invalid escapes otherwise
                    cookie_domain = re.sub(r'www\d*\.', '', cookie_domain)
                    cookies = []
                    for cookie in client._cookies:
                        if cookie_domain in cookie.domain:
                            cookies.append(cookie)
                    headers = {'User-Agent': user_agent}
                    if cookies:
                        log.debug("[%s] Cookies res: %s / %s" %
                                  (provider, repr(headers),
                                   repr(client.request_headers)))
                        if client.request_headers:
                            headers.update(client.request_headers)
                        if client.url:
                            headers['Referer'] = client.url
                            headers['Origin'] = client.url
                        # Need to set Cookie afterwards to avoid rewriting it with session Cookies
                        headers['Cookie'] = ";".join(
                            ["%s=%s" % (c.name, c.value) for c in cookies])

                    torrent = append_headers(torrent, headers)

            if name and torrent and needs_subpage and not torrent.startswith(
                    'magnet'):
                if not torrent.startswith('http'):
                    torrent = definition['root_url'] + torrent.encode('utf-8')
                # Threads are only collected here; they are started together
                # once every row has been processed.
                t = Thread(target=extract_subpage,
                           args=(q, name, torrent, size, seeds, peers,
                                 info_hash, referer))
                threads.append(t)
            else:
                yield (name, info_hash, torrent, size, seeds, peers)
        except Exception as e:
            log.error("[%s] Got an exception while parsing results: %s" %
                      (provider, repr(e)))

    if needs_subpage:
        log.debug("[%s] Starting subpage threads..." % provider)
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # All producers have been joined, so qsize() is stable at this point
        for i in range(q.qsize()):
            ret = q.get_nowait()
            log.debug("[%s] Queue %d got: %s" % (provider, i, repr(ret)))
            yield ret
Пример #2
0
def extract_from_api(provider, client):
    """ Main API parsing generator for API-based providers

    An almost clever API parser, mostly just for YTS, RARBG and T411

    Decodes ``client.content`` as JSON, walks down the dotted
    ``api_format['results']`` path to reach the results, optionally flattens
    ``api_format['subresults']`` into one entry per sub-result, and maps each
    result dict to a tuple using the field names declared in ``api_format``.

    Args:
        provider  (str): Provider ID
        client (Client): Client class instance

    Yields:
        tuple: A torrent result, as
            ``(name, info_hash, torrent, size, seeds, peers)``
    """
    try:
        data = json.loads(client.content)
    except Exception:
        # Best effort: an undecodable or missing payload means "no results".
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
        data = []
    log.debug("[%s] JSON response from API: %s" %
              (unquote(provider), repr(data)))

    definition = definitions[provider]
    definition = get_alias(definition, get_setting("%s_alias" % provider))
    api_format = definition['api_format']

    result_keys = api_format['results'].split('.')
    log.debug("[%s] result_keys: %s" % (provider, repr(result_keys)))
    for key in result_keys:
        # Only descend into mappings: on a str or list, `key in data` is a
        # substring/membership test and data[key] would raise TypeError.
        if isinstance(data, dict) and key in data:
            data = data[key]
        else:
            data = []
    results = data
    log.debug("[%s] results: %s" % (provider, repr(results)))

    if 'subresults' in api_format:
        from copy import deepcopy
        for result in results:  # A little too specific to YTS but who cares...
            result['name'] = result[api_format['name']]
        subresults = []
        subresults_keys = api_format['subresults'].split('.')
        for key in subresults_keys:
            for result in results:
                if key in result:
                    # One flattened entry per sub-result: a copy of the parent
                    # overlaid with the sub-result's own fields
                    for subresult in result[key]:
                        sub = deepcopy(result)
                        sub.update(subresult)
                        subresults.append(sub)
        results = subresults
        log.debug("[%s] with subresults: %s" % (provider, repr(results)))

    for result in results:
        if not result or not isinstance(result, dict):
            continue
        name = ''
        info_hash = ''
        torrent = ''
        size = ''
        seeds = ''
        peers = ''
        if 'name' in api_format:
            name = result[api_format['name']]
        if 'description' in api_format:
            if name:
                name += ' '
            name += result[api_format['description']]
        if 'torrent' in api_format:
            torrent = result[api_format['torrent']]
            if 'download_path' in definition:
                torrent = definition['base_url'] + definition[
                    'download_path'] + torrent
            if client.token:
                user_agent = USER_AGENT
                headers = {
                    'Authorization': client.token,
                    'User-Agent': user_agent
                }
                log.debug("[%s] Appending headers: %s" %
                          (provider, repr(headers)))
                torrent = append_headers(torrent, headers)
                log.debug("[%s] Torrent with headers: %s" %
                          (provider, repr(torrent)))
        if 'info_hash' in api_format:
            info_hash = result[api_format['info_hash']]
        if 'quality' in api_format:  # Again quite specific to YTS...
            name = "%s - %s" % (name, result[api_format['quality']])
        if 'size' in api_format:
            size = result[api_format['size']]
            # bool is an int subclass; exclude it so JSON true/false is left
            # untouched, as with the previous exact-type check
            if isinstance(size, (long, int)) and not isinstance(size, bool):
                size = sizeof(size)
            elif isinstance(size, (str, unicode)) and size.isdigit():
                size = sizeof(int(size))
        if 'seeds' in api_format:
            seeds = result[api_format['seeds']]
            if isinstance(seeds, (str, unicode)) and seeds.isdigit():
                seeds = int(seeds)
        if 'peers' in api_format:
            peers = result[api_format['peers']]
            if isinstance(peers, (str, unicode)) and peers.isdigit():
                peers = int(peers)
        yield (name, info_hash, torrent, size, seeds, peers)
Пример #3
0
def extract_torrents(provider, client):
    """ Main torrent extraction generator for non-API based providers

    Feeds ``client.content`` to ``Html()``, evaluates the provider's parser
    queries (row, name, torrent, infohash, size, seeds, peers) and yields one
    result per matched row. When the definition enables ``subpage``,
    non-magnet links are resolved by fetching each result's subpage in its
    own thread; those results are yielded after all threads have finished.

    Args:
        provider  (str): Provider ID
        client (Client): Client class instance

    Yields:
        tuple: A torrent result, as
            ``(name, info_hash, torrent, size, seeds, peers)``
    """
    definition = definitions[provider]
    definition = get_alias(definition, get_setting("%s_alias" % provider))
    log.debug("Extracting torrents from %s using definitions: %s" %
              (provider, repr(definition)))

    if not client.content:
        # A plain return ends the generator. Raising StopIteration inside a
        # generator is turned into RuntimeError by PEP 479 (Python 3.7+).
        return

    # Loop-invariant for a single extraction run, so look them up only once.
    debug_parser = get_setting("use_debug_parser", bool)
    use_cloudhole = get_setting("use_cloudhole", bool)

    dom = Html().feed(client.content)

    row_search = "dom." + definition['parser']['row']
    name_search = definition['parser']['name']
    torrent_search = definition['parser']['torrent']
    info_hash_search = definition['parser']['infohash']
    size_search = definition['parser']['size']
    seeds_search = definition['parser']['seeds']
    peers_search = definition['parser']['peers']

    log.debug("[%s] Parser: %s" % (provider, repr(definition['parser'])))

    q = Queue()
    threads = []
    needs_subpage = 'subpage' in definition and definition['subpage']

    if needs_subpage:

        def extract_subpage(q, name, torrent, size, seeds, peers, info_hash):
            """ Thread target: resolve the final torrent link from a subpage
            and put the resulting tuple on ``q``
            """
            try:
                log.debug("[%s] Getting subpage at %s" %
                          (provider, repr(torrent)))
            except Exception as e:
                import traceback
                log.error("[%s] Subpage logging failed with: %s" %
                          (provider, repr(e)))
                # Explicit loop: map() is lazy on Python 3 and would log nothing
                for line in traceback.format_exc().split("\n"):
                    log.debug(line)

            # New client instance, otherwise it's race conditions all over the place
            subclient = Client()
            subclient.passkey = client.passkey

            if use_cloudhole:
                subclient.clearance = get_setting('clearance')
                subclient.user_agent = get_setting('user_agent')

            uri = torrent.split('|')  # Split cookies for private trackers
            subclient.open(uri[0].encode('utf-8'))

            if 'bittorrent' in subclient.headers.get('content-type', ''):
                log.debug('[%s] bittorrent content-type for %s' %
                          (provider, repr(torrent)))
                # NOTE(review): torrent is unchanged here and still carries
                # its '|...' suffix, so this looks like it appends uri[1] a
                # second time - confirm against the consumer before changing.
                if len(uri) > 1:  # Stick back cookies if needed
                    torrent = '%s|%s' % (torrent, uri[1])
            else:
                try:
                    torrent = extract_from_page(provider, subclient.content)
                    if torrent and not torrent.startswith('magnet') and len(
                            uri) > 1:  # Stick back cookies if needed
                        torrent = '%s|%s' % (torrent, uri[1])
                except Exception as e:
                    import traceback
                    log.error(
                        "[%s] Subpage extraction for %s failed with: %s" %
                        (provider, repr(uri[0]), repr(e)))
                    for line in traceback.format_exc().split("\n"):
                        log.debug(line)

            ret = (name, info_hash, torrent, size, seeds, peers)
            q.put_nowait(ret)

    if not dom:
        return

    # SECURITY: the parser queries are passed to eval() and run with this
    # function's locals in scope, so only trusted provider definitions must
    # be loaded. The row query references `dom`; the per-row queries
    # presumably reference `item` - keep those local names stable.
    # Evaluated once and reused for both the debug count and the iteration.
    items = eval(row_search)

    if debug_parser:
        log.debug(
            "[%s] Parser debug | Page content: %s" %
            (provider, client.content.replace('\r', '').replace('\n', '')))
        log.debug("[%s] Parser debug | Matched %d items for '%s' query '%s'" %
                  (provider, len(items), 'row', row_search))

    for item in items:
        if debug_parser:
            item_str = item.__str__()
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'row', row_search, item_str.replace(
                    '\r', '').replace('\n', '')))

        if not item:
            continue
        name = eval(name_search)
        torrent = eval(torrent_search) if torrent_search else ""
        size = eval(size_search) if size_search else ""
        seeds = eval(seeds_search) if seeds_search else ""
        peers = eval(peers_search) if peers_search else ""
        info_hash = eval(info_hash_search) if info_hash_search else ""

        if debug_parser:
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'name', name_search, name))
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'torrent', torrent_search, torrent))
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'size', size_search, size))
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'seeds', seeds_search, seeds))
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'peers', peers_search, peers))
            log.debug(
                "[%s] Parser debug | Matched '%s' iteration for query '%s': %s"
                % (provider, 'info_hash', info_hash_search, info_hash))

        # Pass client cookies with torrent if private
        if (definition['private']
                or use_cloudhole) and not torrent.startswith('magnet'):
            user_agent = USER_AGENT
            if use_cloudhole:
                user_agent = get_setting("user_agent")

            if client.passkey:
                torrent = torrent.replace('PASSKEY', client.passkey)
            elif client.token:
                headers = {
                    'Authorization': client.token,
                    'User-Agent': user_agent
                }
                log.debug("[%s] Appending headers: %s" %
                          (provider, repr(headers)))
                torrent = append_headers(torrent, headers)
                log.debug("[%s] Torrent with headers: %s" %
                          (provider, repr(torrent)))
            else:
                log.debug("[%s] Cookies: %s" %
                          (provider, repr(client.cookies())))
                parsed_url = urlparse(definition['root_url'])
                cookie_domain = '{uri.netloc}'.format(uri=parsed_url).replace(
                    'www.', '')
                cookies = []
                log.debug("[%s] cookie_domain: %s" % (provider, cookie_domain))
                for cookie in client._cookies:
                    log.debug(
                        "[%s] cookie for domain: %s (%s=%s)" %
                        (provider, cookie.domain, cookie.name, cookie.value))
                    if cookie_domain in cookie.domain:
                        cookies.append(cookie)
                if cookies:
                    headers = {
                        'Cookie':
                        ";".join(
                            ["%s=%s" % (c.name, c.value) for c in cookies]),
                        'User-Agent':
                        user_agent
                    }
                    log.debug("[%s] Appending headers: %s" %
                              (provider, repr(headers)))
                    torrent = append_headers(torrent, headers)
                    log.debug("[%s] Torrent with headers: %s" %
                              (provider, repr(torrent)))

        # Magnet links are already final: never prefix them with root_url or
        # fetch a subpage for them, yield them directly instead.
        if name and torrent and needs_subpage and not torrent.startswith(
                'magnet'):
            if not torrent.startswith('http'):
                torrent = definition['root_url'] + torrent.encode('utf-8')
            # Threads are only collected here; they are started together once
            # every row has been processed.
            t = Thread(target=extract_subpage,
                       args=(q, name, torrent, size, seeds, peers, info_hash))
            threads.append(t)
        else:
            yield (name, info_hash, torrent, size, seeds, peers)

    if needs_subpage:
        log.debug("[%s] Starting subpage threads..." % provider)
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        log.debug("[%s] Threads returned: %s" % (provider, repr(threads)))

        # All producers have been joined, so qsize() is stable at this point
        for i in range(q.qsize()):
            ret = q.get_nowait()
            log.debug("[%s] Queue %d got: %s" % (provider, i, repr(ret)))
            yield ret