Example #1
def upload_to_libgen(paperpath, doi):
    """
    Store the paper on libgen.
    """
    # need to provide some credentials to libgen
    authfragment = build_libgen_auth_fragment()

    files = {"uploadedfile": ("derp.pdf", paperpath)}

    data = {"doi": doi}

    kwargs = {"auth": authfragment, "files": files, "data": data}

    log.debug("Uploading to libgen doi {} path {}".format(doi, paperpath))
    response = requests.post("http://libgen.org/scimag/librarian/form.php", **kwargs)

    # parse returned html
    tree = parse_html(response.content)

    # build dict with all named fields from html
    formp = {el.get("name"): el.get("value") for el in tree.xpath("//input[@name]")}

    log.debug("Submitting form back to libgen.")
    # form fields belong in the query string of a GET request, not the body
    response = requests.get("http://libgen.org/scimag/librarian/register.php", params=formp, auth=authfragment)

    urldoi = make_libgen_doi_url(doi)
    log.debug("Completed libgen upload: {}".format(urldoi))

    return urldoi
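
The second request hinges on harvesting every named <input> from the HTML that form.php returns. A self-contained sketch of that extraction step, using lxml.html directly (an assumption: the parse_html helper above is not shown, and the field names here are made up):

import lxml.html

# stand-in for the html that form.php might return
html = b"""
<form action="register.php">
  <input name="doi" value="10.1000/xyz123"/>
  <input name="session" value="abc"/>
  <input type="submit" value="Go"/>
</form>
"""

tree = lxml.html.fromstring(html)

# collect every named input into {name: value}, mirroring formp above;
# the nameless submit button is skipped by the [@name] predicate
formp = {el.get("name"): el.get("value") for el in tree.xpath("//input[@name]")}
assert formp == {"doi": "10.1000/xyz123", "session": "abc"}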
Example #2
def upload_to_libgen(paperpath, doi):
    """
    Store the paper on libgen.
    """
    # need to provide some credentials to libgen
    authfragment = build_libgen_auth_fragment()

    # requests needs a file object (or bytes) here; a bare path string would
    # be uploaded literally as the file's content
    files = {
        "uploadedfile": ("derp.pdf", open(paperpath, "rb")),
    }

    data = {
        "doi": doi,
    }

    kwargs = {
        "auth": authfragment,
        "files": files,
        "data": data,
    }

    log.debug("Uploading to libgen doi {} path {}".format(doi, paperpath))
    response = requests.post("http://libgen.org/scimag/librarian/form.php",
                             **kwargs)

    # parse returned html
    tree = parse_html(response.content)

    # build dict with all named fields from html
    formp = {el.get("name"): el.get("value")
             for el in tree.xpath("//input[@name]")}

    log.debug("Submitting form back to libgen.")
    # form fields belong in the query string of a GET request, not the body
    response = requests.get("http://libgen.org/scimag/librarian/register.php",
                            params=formp,
                            auth=authfragment)

    urldoi = make_libgen_doi_url(doi)
    log.debug("Completed libgen upload: {}".format(urldoi))

    return urldoi
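
In requests, each files entry may be a (filename, fileobj) tuple, which is why the fix above opens the file instead of passing its path. A minimal sketch of that shape against an assumed echo endpoint (httpbin.org, standing in for libgen):

import io
import requests

# a tiny in-memory "pdf" stands in for the real file on disk
fake_pdf = io.BytesIO(b"%PDF-1.4 minimal")

files = {"uploadedfile": ("derp.pdf", fake_pdf)}
data = {"doi": "10.1000/xyz123"}

# httpbin echoes the multipart request back, handy for checking the shape
response = requests.post("https://httpbin.org/post", files=files, data=data)
print(sorted(response.json()["files"]))  # ['uploadedfile']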
Example #3
def download(url, paper=None):
    """
    Main entry point for executing paperbot's primary function, paper fetching.
    The given url may be to a pdf file, which should be archived, or it may be
    to an academic publisher's website which points to a paper. The paper needs
    to be downloaded and the metadata should be stored.

    Returns a tuple of (paper, json_path, pdf_path, logpath).

    :param url: url to fetch and examine
    :type url: str
    """
    # store logs in tempfile
    (templogpath, loghandler) = loghijack()

    if paper is None:
        paper = Paper.create({})

    # clean up url if necessary
    url = run_url_fixers(url)

    # whether or not metadata has already been populated
    populated_metadata = False

    for (url2, response) in iterdownload(url, paper=paper):
        if is_response_pdf(response):
            log.debug("Got pdf.")
            pdfcontent = remove_watermarks(response.content)
            paper.pdf = pdfcontent
            store(paper)
            break

        paper.html = response.content

        # Was not pdf. Attempt to parse the HTML based on normal expected
        # HTML elements. The HTML elements may say that the actual pdf url
        # is something else. If this happens, then attempt to download that
        # pdf url instead and then break out of this loop.

        # no reason to get same metadata on every iteration of loop
        if not populated_metadata:
            tree = parse_html(response.content)

            # most publishers expose paper metadata in html in a similar way
            # (e.g. citation_* meta tags), so one generic parser covers them
            populate_metadata_from_tree(tree, paper)

            # TODO: better way to check if populate_metadata_from_tree did
            # anything useful?
            if paper.title in [None, ""]:
                log.debug("# TODO: parse metadata from html using plugins here")
            else:
                populated_metadata = True

        # can't try anything else if the url is still bad
        if paper.pdf_url in [None, ""]:
            continue

        # Normalize the two urls. The url from the metadata on the page
        # might be different from the url that was originally passed in,
        # even though both urls might still refer to the same resource.
        if is_same_url(url, paper.pdf_url):
            # pdf_url is same as original url, no pdf found yet. This
            # happens when the pdf url is correct, but the publisher is
            # returning html instead. And the html happens to reference the
            # url that was originally requested in the first place. Argh.
            continue

        log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))

        # paper pdf is stored at a different url. Attempt to fetch that
        # url now. Only do this if pdf_url != url because otherwise
        # this will be an endless loop.
        for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
            if is_response_pdf(response2):
                log.debug("Got pdf on second-level page.")
                # use response2 here: response still holds the outer page's html
                pdfcontent = remove_watermarks(response2.content)
                paper.pdf = pdfcontent
                store(paper)
                break
        else:
            log.debug("Couldn't download pdf from {}".format(paper.pdf_url))

        break

    # was pdf downloaded?
    if (hasattr(paper, "pdf") and paper.pdf not in [None, ""]) or os.path.exists(paper.file_path_pdf):
        fetched = True
    else:
        fetched = False

    hasdoi = (paper.doi not in [None, ""])

    if hasdoi:
        # check if libgen has this paper already
        libgenhas = check_libgen_has_paper(paper.doi)

        if fetched and not libgenhas:
            # upload if libgen doesn't already have it
            upload_to_libgen(paper.file_path_pdf, paper.doi)
        elif not fetched and libgenhas:
            urldoi = make_libgen_doi_url(paper.doi)

            # get from libgen
            log.debug("Haven't yet fetched paper. Have doi. Also, libgenhas.")
            log.debug("HTTP GET {}".format(urldoi))
            response = requests.get(urldoi, headers=DEFAULT_HEADERS)

            if is_response_pdf(response):
                log.debug("Got pdf from libgen.")

                # skip pdfparanoia because it's from libgen
                pdfcontent = response.content
                paper.pdf = pdfcontent

                store(paper)

                fetched = True
            else:
                log.debug("libgen lied about haspdf :(")
    else:
        log.debug("Don't know doi, can't check if libgen has this paper.")
        libgenhas = None

    # store(paper) usually handles json but in case of failure there needs to
    # be an explicit save of paper metadata.
    if not fetched:
        store_json(paper)

    # move logs into position
    logpath = store_logs(paper, templogpath)

    # remove loghandler from logger
    mainlogger = logging.getLogger("paperbot")
    mainlogger.handlers.remove(loghandler)

    return (paper, paper.file_path_json, paper.file_path_pdf, logpath)
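
A minimal sketch of calling download, assuming the surrounding paperbot module is importable; the url is a placeholder:

paper, json_path, pdf_path, log_path = download("https://example.com/some-paper")

if getattr(paper, "pdf", None):
    print("pdf stored at", pdf_path)
else:
    print("no pdf fetched; metadata json at", json_path)
print("logs at", log_path)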
Example #4
def download(url, paper=None):
    """
    Main entry point for executing paperbot's primary function, paper fetching.
    The given url may be to a pdf file, which should be archived, or it may be
    to an academic publisher's website which points to a paper. The paper needs
    to be downloaded and the metadata should be stored.

    Returns a tuple of (paper, json_path, pdf_path, logpath).

    :param url: url to fetch and examine
    :type url: str
    """
    # store logs in tempfile
    (templogpath, loghandler) = loghijack()

    if paper is None:
        paper = Paper.create({})

    # clean up url if necessary
    url = run_url_fixers(url)

    # whether or not metadata has already been populated
    populated_metadata = False

    for (url2, response) in iterdownload(url, paper=paper):
        if is_response_pdf(response):
            log.debug("Got pdf.")
            pdfcontent = remove_watermarks(response.content)
            paper.pdf = pdfcontent
            store(paper)
            break

        paper.html = response.content

        # Was not pdf. Attempt to parse the HTML based on normal expected
        # HTML elements. The HTML elements may say that the actual pdf url
        # is something else. If this happens, then attempt to download that
        # pdf url instead and then break out of this loop.

        # no reason to get same metadata on every iteration of loop
        if not populated_metadata:
            tree = parse_html(response.content)

            # most publishers expose paper metadata in html in a similar way
            # (e.g. citation_* meta tags), so one generic parser covers them
            populate_metadata_from_tree(tree, paper)

            # TODO: better way to check if populate_metadata_from_tree did
            # anything useful?
            if paper.title in [None, ""]:
                log.debug(
                    "# TODO: parse metadata from html using plugins here")
            else:
                populated_metadata = True

        # can't try anything else if the url is still bad
        if paper.pdf_url in [None, ""]:
            continue

        # Normalize the two urls. The url from the metadata on the page
        # might be different from the url that was originally passed in,
        # even though both urls might still refer to the same resource.
        if is_same_url(url, paper.pdf_url):
            # pdf_url is same as original url, no pdf found yet. This
            # happens when the pdf url is correct, but the publisher is
            # returning html instead. And the html happens to reference the
            # url that was originally requested in the first place. Argh.
            continue

        log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))

        # paper pdf is stored at a different url. Attempt to fetch that
        # url now. Only do this if pdf_url != url because otherwise
        # this will be an endless loop.
        for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
            if is_response_pdf(response2):
                log.debug("Got pdf on second-level page.")
                # use response2 here: response still holds the outer page's html
                pdfcontent = remove_watermarks(response2.content)
                paper.pdf = pdfcontent
                store(paper)
                break
        else:
            log.debug("Couldn't download pdf from {}".format(paper.pdf_url))

        break

    # was pdf downloaded?
    if (hasattr(paper, "pdf") and paper.pdf not in [None, ""]) or \
       os.path.exists(paper.file_path_pdf):
        fetched = True
    else:
        fetched = False

    hasdoi = (paper.doi not in [None, ""])

    if hasdoi:
        # check if libgen has this paper already
        libgenhas = check_libgen_has_paper(paper.doi)

        if fetched and not libgenhas:
            # upload if libgen doesn't already have it
            upload_to_libgen(paper.file_path_pdf, paper.doi)
        elif not fetched and libgenhas:
            urldoi = make_libgen_doi_url(paper.doi)

            # get from libgen
            log.debug("Haven't yet fetched paper. Have doi. Also, libgenhas.")
            log.debug("HTTP GET {}".format(urldoi))
            response = requests.get(urldoi, headers=DEFAULT_HEADERS)

            if is_response_pdf(response):
                log.debug("Got pdf from libgen.")

                # skip pdfparanoia because it's from libgen
                pdfcontent = response.content
                paper.pdf = pdfcontent

                store(paper)

                fetched = True
            else:
                log.debug("libgen lied about haspdf :(")
    else:
        log.debug("Don't know doi, can't check if libgen has this paper.")
        libgenhas = None

    # store(paper) usually handles json but in case of failure there needs to
    # be an explicit save of paper metadata.
    if not fetched:
        store_json(paper)

    # move logs into position
    logpath = store_logs(paper, templogpath)

    # remove loghandler from logger
    mainlogger = logging.getLogger("paperbot")
    mainlogger.handlers.remove(loghandler)

    return (paper, paper.file_path_json, paper.file_path_pdf, logpath)
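
Both download loops above lean on Python's for/else: the else branch runs only when the loop finishes without hitting break, i.e. when no pdf response turned up. A standalone illustration of the idiom:

def first_pdf(urls):
    for url in urls:
        if url.endswith(".pdf"):
            break
    else:
        # the loop never hit break, so nothing matched
        return None
    return url

assert first_pdf(["a.html", "b.pdf"]) == "b.pdf"
assert first_pdf(["a.html", "c.html"]) is None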