Example #1
def remove_watermarks(pdfcontent):
    """
    Use pdfparanoia to remove watermarks from the pdf.
    """
    log.debug("Removing pdf watermarks.")
    pdfcontent = pdfparanoia.scrub(StringIO(pdfcontent))
    return pdfcontent
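A minimal usage sketch for the helper above, assuming pdfparanoia.scrub accepts a file-like object and returns the scrubbed PDF bytes (the signature every example here relies on); the file paths are illustrative:

import logging
from StringIO import StringIO  # Python 2, as in the examples

import pdfparanoia

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("paperbot")

def remove_watermarks(pdfcontent):
    """
    Use pdfparanoia to remove watermarks from the pdf.
    """
    log.debug("Removing pdf watermarks.")
    return pdfparanoia.scrub(StringIO(pdfcontent))

# hypothetical input/output paths, for illustration only
with open("paper.pdf", "rb") as handle:
    scrubbed = remove_watermarks(handle.read())
with open("paper-scrubbed.pdf", "wb") as handle:
    handle.write(scrubbed)
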
Example #2
def remove_watermarks(pdfcontent):
    """
    Use pdfparanoia to remove watermarks from the pdf.
    """
    log.debug("Removing pdf watermarks.")
    pdfcontent = pdfparanoia.scrub(StringIO(pdfcontent))
    return pdfcontent
Example #3
def download_url(url, _log=nullLog, **kwargs):
    paperbot_download_request_obj = paperbot_download_request()
    paperbot_download_request_obj._log = _log
    response_generator = paperbot_download_request_obj.get(
        url, use_generator=True, headers={"User-Agent": "origami-pdf"})
    cc = 0
    for response in response_generator:
        _log('using generator for %s time' % cc)
        cc += 1
        paperbot_download_request_obj2 = paperbot_download_request()
        paperbot_download_request_obj2._log = _log
        content = response.content
        # response = requests.get(url, headers={"User-Agent": "origami-pdf"}, **kwargs)
        # content = response.content

        # just make up a default filename
        title = "%0.2x" % random.getrandbits(128)

        # default extension
        extension = ".txt"

        if "pdf" in response.headers["content-type"]:
            extension = ".pdf"
        elif check_if_html(response):
            # parse the html string with lxml.etree
            tree = parse_html(content)

            # extract some metadata with xpaths
            citation_pdf_url = find_citation_pdf_url(tree, url)
            citation_title = find_citation_title(tree)

            # aip.org sucks, citation_pdf_url is wrong
            if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
                citation_pdf_url = None

            if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
                content = requests.get(citation_pdf_url).content
                tree = parse_html(content)
                # citation_title = ...

            # wow, this seriously needs to be cleaned up
            if citation_pdf_url and citation_title and \
               "ieeexplore.ieee.org" not in citation_pdf_url:
                citation_title = citation_title.encode("ascii", "ignore")
                response = requests.get(citation_pdf_url,
                                        headers=HEADERS_DEFENSE)
                content = response.content
                if "pdf" in response.headers["content-type"]:
                    extension = ".pdf"
                    title = citation_title
            else:
                if "sciencedirect.com" in url and "ShoppingCart" not in url:
                    _log('download_url got a sciencedirect URL')
                    try:
                        try:
                            title_xpath = "//h1[@class='svTitle']"
                            title = tree.xpath(title_xpath)[0].text
                            pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                        except IndexError:
                            title = tree.xpath("//title")[0].text
                            pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]

                        if 'http' not in pdf_url:
                            main_url_split = response.url.split('//')
                            http_prefix = main_url_split[0]
                            if 'http' in http_prefix:
                                domain_url = main_url_split[1].split('/')[0]
                                slash = '/' if pdf_url[0] != '/' else ''
                                pdf_url = http_prefix + '//' + domain_url + slash + pdf_url
                        gen = paperbot_download_request_obj2.get(
                            pdf_url,
                            use_generator=False,
                            headers={"User-Agent": "sdf-macross"})
                        # this is stupidly ugly
                        for genresponse in gen:
                            new_response, extension = genresponse
                        new_content = new_response.content
                        _log(
                            'paperbot_download_request_obj2 content-type: %s' %
                            new_response.headers["content-type"])
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                            break
                    except Exception:
                        _log(traceback.format_exc())
                    else:
                        content = new_content
                        response = new_response
                elif "jstor.org/" in url:
                    # clean up the url
                    if "?" in url:
                        url = url[0:url.find("?")]

                    # not all pages have the <input type="hidden" name="ppv-title"> element
                    try:
                        title = tree.xpath("//div[@class='hd title']")[0].text
                    except Exception:
                        try:
                            input_xpath = "//input[@name='ppv-title']/@value"
                            title = tree.xpath(input_xpath)[0]
                        except Exception:
                            pass

                    # get the document id
                    document_id = None
                    if url[-1] != "/":
                        # if "stable/" in url:
                        # elif "discover/" in url:
                        # elif "action/showShelf?candidate=" in url:
                        # elif "pss/" in url:
                        document_id = url.split("/")[-1]

                    if document_id and document_id.isdigit():
                        try:
                            pdf_url = make_jstor_url(document_id)
                            new_response = requests.get(pdf_url,
                                                        headers=HEADERS_TM_11)
                            new_content = new_response.content
                            if "pdf" in new_response.headers["content-type"]:
                                extension = ".pdf"
                        except Exception:
                            pass
                        else:
                            content = new_content
                            response = new_response
                elif ".aip.org/" in url:
                    try:
                        title = tree.xpath("//title/text()")[0].split(" | ")[0]
                        pdf_url = [
                            link for link in tree.xpath("//a/@href")
                            if "getpdf" in link
                        ][0]
                        new_response = requests.get(pdf_url,
                                                    headers=HEADERS_TM_1)
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "ieeexplore.ieee.org" in url:
                    try:
                        pdf_url = [
                            url for url in tree.xpath("//frame/@src")
                            if "pdf" in url
                        ][0]
                        new_response = requests.get(pdf_url,
                                                    headers=HEADERS_TM_2)
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "h1 class=\"articleTitle" in content:
                    try:
                        title_xpath = "//h1[@class='articleTitle']"
                        title = tree.xpath(title_xpath)[0].text
                        title = title.encode("ascii", "ignore")
                        url_xpath = "//a[@title='View the Full Text PDF']/@href"
                        pdf_url = tree.xpath(url_xpath)[0]
                    except Exception:
                        pass
                    else:
                        if pdf_url.startswith("/"):
                            url_start = url[:url.find("/", 8)]
                            pdf_url = url_start + pdf_url
                        response = requests.get(pdf_url,
                                                headers=HEADERS_TEAPOT)
                        content = response.content
                        if "pdf" in response.headers["content-type"]:
                            extension = ".pdf"
                # raise Exception("problem with citation_pdf_url or citation_title")
                # well, at least save the contents from the original url
                pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")

    path = os.path.join(ARCHIVE_DIR, title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            content = pdfparanoia.scrub(StringIO(content))
        except Exception:
            # this is to avoid a PDFNotImplementedError
            pass

    with open(path, "wb") as file_handler:
        file_handler.write(content)

    title = title.encode("ascii", "ignore")
    url = ARCHIVE_BASE + requests.utils.quote(title) + extension

    return url
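Example #3 factors the literals of Example #5 below into module-level names (ARCHIVE_DIR, HEADERS_TM_11, make_jstor_url, and so on). A sketch of plausible definitions, reconstructed from the inline values in Example #5; the exact originals may differ:

# Reconstructed from the inline literals in Example #5; treat the exact
# values as assumptions rather than the canonical paperbot definitions.
ARCHIVE_DIR = "/home/bryan/public_html/papers2/paperbot/"
ARCHIVE_BASE = "http://diyhpl.us/~bryan/papers2/paperbot/"

HEADERS_DEFENSE = {"User-Agent": "pdf-defense-force"}
HEADERS_TEAPOT = {"User-Agent": "pdf-teapot"}
HEADERS_TM_1 = {"User-Agent": "time-machine/1.0"}
HEADERS_TM_11 = {"User-Agent": "time-machine/1.1"}
HEADERS_TM_2 = {"User-Agent": "time-machine/2.0"}

def make_jstor_url(document_id):
    # Example #5 inlines this exact URL when fetching from jstor
    return ("http://www.jstor.org/stable/pdfplus/" +
            document_id + ".pdf?acceptTC=true")
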
Example #4
def download(phenny, input, verbose=True):
    """
    Downloads a paper.
    """
    if logchannel:
        _log = lambda x: phenny.msg("#%s" % logchannel, x)
    else:
        _log = lambda x: None
    # only accept requests in a channel
    if not input.sender.startswith('#'):
        # unless the user is an admin, of course
        if not input.admin:
            phenny.say("i only take requests in the ##hplusroadmap channel.")
            return
        else:
            # just give a warning message to the admin.. not a big deal.
            phenny.say(
                "okay i'll try, but please send me requests in ##hplusroadmap in the future."
            )

    # get the input
    line = input.group()

    # was this an explicit command?
    explicit = False
    if line.startswith(phenny.nick):
        explicit = True
        line = line[len(phenny.nick):]

        if line.startswith(",") or line.startswith(":"):
            line = line[1:]

    if line.startswith(" "):
        line = line.strip()

    # don't bother if there's nothing there
    if len(line) < 5 or ("http://" not in line and "https://" not in line) or \
       not line.startswith("http"):
        return
    for line in re.findall(URL_REGEX, line):
        # fix an UnboundLocalError problem
        shurl = None

        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = "http://localhost:1969/web"

        headers = {
            "Content-Type": "application/json",
        }

        data = {"url": line, "sessionid": "what"}

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if "DOI" in item:
                _log("Translator DOI")
                lgre = requests.post(LIBGEN_FORM, data={"doi": item["DOI"]})
                tree = parse_html(lgre.content)
                if tree.xpath("//h1")[0].text != "No file selected":
                    phenny.say("http://libgen.info/scimag/get.php?doi=%s" %
                               urllib.quote_plus(item["DOI"]))
                    return

            if "attachments" in item:
                pdf_url = None
                for attachment in item["attachments"]:
                    if "mimeType" in attachment and \
                       "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = USER_AGENT
                    # send the bot user agent, not the JSON headers left over
                    # from the translator call above
                    headers = {"User-Agent": user_agent}
                    paperbot_download_request_obj = paperbot_download_request()
                    paperbot_download_request_obj._log = _log
                    gen = paperbot_download_request_obj.get(
                        pdf_url, use_generator=False, headers=headers)
                    # this is stupidly ugly
                    for genresponse in gen:
                        response, extension = genresponse

                    # detect failure
                    if response.status_code != 200:
                        shurl, _ = modules.scihub.scihubber(pdf_url)
                        if shurl:
                            if "libgen" in shurl:
                                phenny.say(
                                    "http://libgen.info/scimag/get.php?doi=%s"
                                    % urllib.quote_plus(item["DOI"]))
                            elif "pdfcache" not in shurl:
                                phenny.say(shurl)
                            else:
                                pdfstr = modules.scihub.scihub_dl(shurl)
                                phenny.say(
                                    modules.scihub.libgen(pdfstr, item["DOI"]))
                        return

                    data = response.content

                    if "pdf" in response.headers["content-type"]:
                        try:
                            data = pdfparanoia.scrub(StringIO(data))
                            try:
                                _log('after pdfparanoia.scrub')
                                requests.get(
                                    'http://localhost:8500/remoteprint',
                                    headers={'msg': 'after pdfparanoia.scrub'})
                            except Exception:
                                pass
                            break
                        except Exception:
                            # this is to avoid a PDFNotImplementedError
                            pass

                    if "DOI" in item:
                        phenny.say(modules.scihub.libgen(data, item["DOI"]))
                        return

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join(ARCHIVE_DIR, title + ".pdf")

                    with open(path, "wb") as file_handler:
                        file_handler.write(data)

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"

                    phenny.say(url)
                    continue
                elif verbose and explicit:
                    _log("Translation server PDF fail")
                    shurl, doi = modules.scihub.scihubber(line)
                    continue
            elif verbose and explicit:
                _log("Translation server PDF fail")
                shurl, doi = modules.scihub.scihubber(line)
                phenny.say(download_url(line, _log))
                continue
        elif verbose and explicit:
            _log("Translation server fail")
            shurl, doi = modules.scihub.scihubber(line)
            _log("Scihubber -> (%s, %s)" % (shurl, doi))
        if shurl:
            if "pdfcache" in shurl:
                if doi:
                    pdfstr = modules.scihub.scihub_dl(shurl)
                    phenny.say(modules.scihub.libgen(pdfstr, doi))
                else:
                    phenny.say(
                        download_url(shurl,
                                     _log,
                                     cookies=modules.scihub.shcookie))
            else:
                phenny.say(shurl)
        elif verbose and explicit:
            _log("All approaches failed")
            phenny.say(download_url(line, _log))
    return
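The first step of the command above is a plain HTTP call to a Zotero translation server. A standalone sketch of that exchange, assuming a translation server is listening on localhost:1969 as the code expects; the target URL is illustrative:

import json

import requests

# assumes a Zotero translation server on localhost:1969, as in the examples
translation_url = "http://localhost:1969/web"
headers = {"Content-Type": "application/json"}
payload = json.dumps({"url": "http://example.com/some-paper",
                      "sessionid": "what"})

response = requests.post(translation_url, data=payload, headers=headers)
if response.status_code == 200 and response.content != "[]":
    item = json.loads(response.content)[0]
    print(item.get("title"))
    # the examples look for a PDF attachment in the translator metadata
    for attachment in item.get("attachments", []):
        if "application/pdf" in attachment.get("mimeType", ""):
            print(attachment["url"])
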
Example #5
def download_url(url, _log=nullLog, **kwargs):
    paperbot_download_request_obj = paperbot_download_request()
    paperbot_download_request_obj._log = _log
    response_generator = paperbot_download_request_obj.get(url, use_generator=True, headers={"User-Agent": "origami-pdf"})
    cc = 0
    for response in response_generator:
        _log('using generator for %s time' % cc)
        cc += 1
        paperbot_download_request_obj2 = paperbot_download_request()
        paperbot_download_request_obj2._log = _log
        content = response.content
        #response = requests.get(url, headers={"User-Agent": "origami-pdf"}, **kwargs)
        #content = response.content
        
        # just make up a default filename
        title = "%0.2x" % random.getrandbits(128)

        # default extension
        extension = ".txt"

        if "pdf" in response.headers["content-type"]:
            extension = ".pdf"
        elif check_if_html(response):
            # parse the html string with lxml.etree
            tree = parse_html(content)

            # extract some metadata with xpaths
            citation_pdf_url = find_citation_pdf_url(tree, url)
            citation_title = find_citation_title(tree)

            # aip.org sucks, citation_pdf_url is wrong
            if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
                citation_pdf_url = None

            if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
                content = requests.get(citation_pdf_url).content
                tree = parse_html(content)
                # citation_title = ...

            # wow, this seriously needs to be cleaned up
            if citation_pdf_url and citation_title and "ieeexplore.ieee.org" not in citation_pdf_url:
                citation_title = citation_title.encode("ascii", "ignore")
                response = requests.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
                content = response.content
                if "pdf" in response.headers["content-type"]:
                    extension = ".pdf"
                    title = citation_title
            else:
                if "sciencedirect.com" in url and not "ShoppingCart" in url:
                    _log('download_url got a sciencedirect URL')
                    try:
                        try:
                            title = tree.xpath("//h1[@class='svTitle']")[0].text
                            pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                        except IndexError: 
                            title = tree.xpath("//title")[0].text
                            pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                        
                        if 'http' not in pdf_url:
                            main_url_split = response.url.split('//')
                            http_prefix = main_url_split[0]
                            if 'http' in http_prefix:
                                domain_url = main_url_split[1].split('/')[0]
                                pdf_url = http_prefix + '//' + domain_url + ('/' if pdf_url[0]!='/' else '') + pdf_url
                        gen = paperbot_download_request_obj2.get(pdf_url, use_generator=False, headers={"User-Agent": "sdf-macross"})
                        #this is stupidly ugly
                        for genresponse in gen:
                            new_response, extension = genresponse
                        new_content = new_response.content
                        _log('paperbot_download_request_obj2 content-type: %s' % new_response.headers["content-type"])
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                            break
                    except Exception as e:
                        _log(traceback.format_exc())
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "jstor.org/" in url:
                    # clean up the url
                    if "?" in url:
                        url = url[0:url.find("?")]

                    # not all pages have the <input type="hidden" name="ppv-title"> element
                    try:
                        title = tree.xpath("//div[@class='hd title']")[0].text
                    except Exception:
                        try:
                            title = tree.xpath("//input[@name='ppv-title']/@value")[0]
                        except Exception:
                            pass

                    # get the document id
                    document_id = None
                    if url[-1] != "/":
                        #if "stable/" in url:
                        #elif "discover/" in url:
                        #elif "action/showShelf?candidate=" in url:
                        #elif "pss/" in url:
                        document_id = url.split("/")[-1]

                    if document_id and document_id.isdigit():
                        try:
                            pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
                            new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
                            new_content = new_response.content
                            if "pdf" in new_response.headers["content-type"]:
                                extension = ".pdf"
                        except Exception:
                            pass
                        else:
                            content = new_content
                            response = new_response
                elif ".aip.org/" in url:
                    try:
                        title = tree.xpath("//title/text()")[0].split(" | ")[0]
                        pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
                        new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "ieeexplore.ieee.org" in url:
                    try:
                        pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
                        new_response = requests.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "h1 class=\"articleTitle" in content:
                    try:
                        title = tree.xpath("//h1[@class='articleTitle']")[0].text
                        title = title.encode("ascii", "ignore")
                        pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
                    except Exception:
                        pass
                    else:
                        if pdf_url.startswith("/"):
                            url_start = url[:url.find("/",8)]
                            pdf_url = url_start + pdf_url
                        response = requests.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
                        content = response.content
                        if "pdf" in response.headers["content-type"]:
                            extension = ".pdf"
                # raise Exception("problem with citation_pdf_url or citation_title")
                # well, at least save the contents from the original url
                pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")

    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            content = pdfparanoia.scrub(StringIO(content))
        except Exception:
            # this is to avoid a PDFNotImplementedError
            pass

    with open(path, "wb") as file_handler:
        file_handler.write(content)

    title = title.encode("ascii", "ignore")
    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + requests.utils.quote(title) + extension

    return url
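The manual scheme-and-domain splitting in the sciencedirect branch (here and in Example #3) can also be written with the standard library. A sketch that behaves the same for the root-relative paths the code actually handles:

from urlparse import urljoin  # urllib.parse in Python 3

def absolutize(base_url, pdf_url):
    # urljoin returns pdf_url unchanged when it is already absolute and
    # otherwise resolves it against the scheme and host of base_url
    return urljoin(base_url, pdf_url)

# absolutize("http://www.sciencedirect.com/science/article/pii/X",
#            "/science/pdf?id=Y")
# -> "http://www.sciencedirect.com/science/pdf?id=Y"
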
Example #6
def download(phenny, input, verbose=True):
    """
    Downloads a paper.
    """
    if logchannel:
        _log = lambda x: phenny.msg("#%s" % logchannel, x)
    else:
        _log = lambda x: None
    # only accept requests in a channel
    if not input.sender.startswith('#'):
        # unless the user is an admin, of course
        if not input.admin:
            phenny.say("i only take requests in the ##hplusroadmap channel.")
            return
        else:
            # just give a warning message to the admin.. not a big deal.
            phenny.say("okay i'll try, but please send me requests in ##hplusroadmap in the future.")

    # get the input
    line = input.group()

    # was this an explicit command?
    explicit = False
    if line.startswith(phenny.nick):
        explicit = True
        line = line[len(phenny.nick):]

        if line.startswith(",") or line.startswith(":"):
            line = line[1:]

    if line.startswith(" "):
        line = line.strip()

    # don't bother if there's nothing there
    if len(line) < 5 or ("http://" not in line and "https://" not in line) or not line.startswith("http"):
        return
    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
        # fix an UnboundLocalError problem
        shurl = None

        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = "http://localhost:1969/web"

        headers = {
            "Content-Type": "application/json",
        }

        data = {
            "url": line,
            "sessionid": "what"
        }

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if item.has_key("DOI"):
                _log("Translator DOI")
                lgre = requests.post("http://libgen.org/scimag/librarian/form.php", data={"doi":item["DOI"]})
                tree = parse_html(lgre.content)
                if tree.xpath("//h1")[0].text != "No file selected":
                    phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
                    return

            if item.has_key("attachments"):
                pdf_url = None
                for attachment in item["attachments"]:
                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
                    """
                proxies_left_to_try = len(proxy_list)
                request_iteration = 0
                proxy_url_index=0
                _log('before while proxies_left_to_try')
                while proxies_left_to_try:
                    headers = {
                        "User-Agent": user_agent,
                    }
                    response = None
                    proxy_url = proxy_list[proxy_url_index]['proxy_url']
                    proxy_type = proxy_list[proxy_url_index]['proxy_type']
                    _log('proxies_left_to_try: %d' % proxies_left_to_try)
                    #perform default behaviour if proxy is None
                    if proxy_url is None:
                        if pdf_url.startswith("https://"):
                            response = requests.get(pdf_url, headers=headers, verify=False)
                        else:
                            response = requests.get(pdf_url, headers=headers)
                    else:

                        #check type of proxy
                        if proxy_type == 'custom_flask_json':
                            
                            headers["Content-Type"] = "application/json"
                            data = {'pdf_url' : pdf_url,
                                    'request_iteration' : request_iteration
                                    }
                            
                            request_iteration+=1
                            response = requests.get(proxy_url, data=json.dumps(data), headers=headers)
                        elif proxy_type == 'normal':
                            #i'm not even checking if http or https is in the pdf_url, since the default proxy of None is already being tried in this loop
                            proxies = { 
                              "http": proxy_url,
                              "https": proxy_url,
                            }
                            response = requests.get(pdf_url, headers=headers, proxies=proxies)
                    """
                    # send the bot user agent, not the JSON headers left over
                    # from the translator call above
                    headers = {"User-Agent": user_agent}
                    paperbot_download_request_obj = paperbot_download_request()
                    paperbot_download_request_obj._log = _log
                    gen = paperbot_download_request_obj.get(pdf_url, use_generator=False, headers=headers)
                    #this is stupidly ugly
                    for genresponse in gen:
                        response, extension = genresponse

                    # detect failure
                    if response.status_code != 200:
                        shurl, _ = modules.scihub.scihubber(pdf_url)
                        if shurl:
                            if "libgen" in shurl:
                                phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
                            elif "pdfcache" not in shurl:
                                phenny.say(shurl)
                            else:
                                phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"]))
                        return

                    data = response.content

                    if "pdf" in response.headers["content-type"]:
                        try:
                            data = pdfparanoia.scrub(StringIO(data))
                            try:
                                _log('after pdfparanoia.scrub')
                                requests.get('http://localhost:8500/remoteprint', headers={'msg':'after pdfparanoia.scrub'})
                            except Exception:
                                pass
                            break
                        except Exception:
                            """
                            #check for custom_flask_json proxy response, which indicates if the given custom proxy has more internal proxies to try with
                            if 'proxies_remaining' in response.headers:
                                #decrement the index if the custom proxy doesn't have any more internal proxies to try
                                if response.headers['proxies_remaining'] == 0:
                                    proxies_left_to_try-=1
                                    proxy_url_index+=1
                                    request_iteration=0
                            else:    
                                #decrement the index to move on to the next proxy in our proxy_list
                                proxies_left_to_try-=1
                                proxy_url_index+=1
                            """
                            # this is to avoid a PDFNotImplementedError
                            pass

                    if item.has_key("DOI"):
                        phenny.say(modules.scihub.libgen(data, item["DOI"]))
                        return

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")

                    with open(path, "wb") as file_handler:
                        file_handler.write(data)

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"

                    phenny.say(url)
                    continue
                elif verbose and explicit:
                    _log("Translation server PDF fail")
                    shurl, doi = modules.scihub.scihubber(line)
                    continue
            elif verbose and explicit:
                _log("Translation server PDF fail")
                shurl, doi = modules.scihub.scihubber(line)
                phenny.say(download_url(line, _log))
                continue
        elif verbose and explicit:
            _log("Translation server fail")
            shurl, doi = modules.scihub.scihubber(line)
            _log("Scihubber -> (%s, %s)" % (shurl, doi))
        if shurl:
            if "pdfcache" in shurl:
                if doi:
                    phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), doi))
                else:
                    phenny.say(download_url(shurl, _log, cookies=modules.scihub.shcookie))
            else:
                phenny.say(shurl)
        elif verbose and explicit:
            _log("All approaches failed")
            phenny.say(download_url(line, _log))
    return
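The triple-quoted strings above preserve an older inline proxy-rotation loop. A condensed sketch of what that loop did; the proxy_list entry format and the custom_flask_json convention are assumptions carried over from those comments:

import json

import requests

def fetch_via_proxies(pdf_url, proxy_list, user_agent, _log):
    # walk the proxy list; a proxy_url of None means "fetch directly"
    request_iteration = 0
    for entry in proxy_list:
        proxy_url = entry["proxy_url"]
        proxy_type = entry["proxy_type"]
        headers = {"User-Agent": user_agent}
        try:
            if proxy_url is None:
                # default behaviour, skipping cert verification on https
                verify = not pdf_url.startswith("https://")
                response = requests.get(pdf_url, headers=headers,
                                        verify=verify)
            elif proxy_type == "custom_flask_json":
                # hand the target URL to a bespoke proxy as a JSON body
                headers["Content-Type"] = "application/json"
                body = json.dumps({"pdf_url": pdf_url,
                                   "request_iteration": request_iteration})
                request_iteration += 1
                response = requests.get(proxy_url, data=body, headers=headers)
            elif proxy_type == "normal":
                proxies = {"http": proxy_url, "https": proxy_url}
                response = requests.get(pdf_url, headers=headers,
                                        proxies=proxies)
            else:
                continue
        except requests.exceptions.ConnectionError:
            _log("proxy failed: %s" % proxy_url)
            continue
        if response.status_code == 200:
            return response
    return None
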
Example #7
def download(phenny, input, verbose=True):
    """
    Downloads a paper.
    """
    if logchannel:
        _log = lambda x: phenny.msg("#%s" % logchannel, x)
    else:
        _log = lambda x: None
    # only accept requests in a channel
    if not input.sender.startswith('#'):
        # unless the user is an admin, of course
        if not input.admin:
            phenny.say("i only take requests in the ##hplusroadmap channel.")
            return
        else:
            # just give a warning message to the admin.. not a big deal.
            phenny.say("okay i'll try, but please send me requests in ##hplusroadmap in the future.")

    # get the input
    line = input.group()

    # was this an explicit command?
    explicit = False
    if line.startswith(phenny.nick):
        explicit = True
        line = line[len(phenny.nick):]

        if line.startswith(",") or line.startswith(":"):
            line = line[1:]

    if line.startswith(" "):
        line = line.strip()

    # don't bother if there's nothing there
    if len(line) < 5 or ("http://" not in line and "https://" not in line) or not line.startswith("http"):
        return
    for line in re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line):
        # fix an UnboundLocalError problem
        shurl = None

        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = "http://localhost:1969/web"

        headers = {
            "Content-Type": "application/json",
        }

        data = {
            "url": line,
            "sessionid": "what"
        }

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if item.has_key("DOI"):
                _log("Translator DOI")
                lgre = requests.post("http://libgen.org/scimag/librarian/form.php", data={"doi":item["DOI"]})
                tree = parse_html(lgre.content)
                if tree.xpath("//h1")[0].text != "No file selected":
                    phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
                    return

            if item.has_key("attachments"):
                pdf_url = None
                for attachment in item["attachments"]:
                    if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"

                    headers = {
                        "User-Agent": user_agent,
                    }

                    response = None
                    if pdf_url.startswith("https://"):
                        response = requests.get(pdf_url, headers=headers, verify=False)
                    else:
                        response = requests.get(pdf_url, headers=headers)

                    # detect failure
                    if response.status_code != 200:
                        shurl, _ = modules.scihub.scihubber(pdf_url)
                        if shurl:
                            if "libgen" in shurl:
                                phenny.say("http://libgen.org/scimag/get.php?doi=%s" % urllib.quote_plus(item["DOI"]))
                            elif "pdfcache" not in shurl:
                                phenny.say(shurl)
                            else:
                                phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), item["DOI"]))
                        return

                    data = response.content

                    if "pdf" in response.headers["content-type"]:
                        try:
                            data = pdfparanoia.scrub(StringIO(data))
                        except Exception:
                            # this is to avoid a PDFNotImplementedError
                            pass

                    if item.has_key("DOI"):
                        phenny.say(modules.scihub.libgen(data, item["DOI"]))
                        return

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join("/home/bryan/public_html/papers2/paperbot/", title + ".pdf")

                    with open(path, "wb") as file_handler:
                        file_handler.write(data)

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"

                    phenny.say(url)
                    continue
                elif verbose and explicit:
                    _log("Translation server PDF fail")
                    shurl, doi = modules.scihub.scihubber(line)
                    continue
            elif verbose and explicit:
                _log("Translation server PDF fail")
                shurl, doi = modules.scihub.scihubber(line)
                phenny.say(download_url(line))
                continue
        elif verbose and explicit:
            _log("Translation server fail")
            shurl, doi = modules.scihub.scihubber(line)
            _log("Scihubber -> (%s, %s)" % (shurl, doi))
        if shurl:
            if "pdfcache" in shurl:
                if doi:
                    phenny.say(modules.scihub.libgen(modules.scihub.scihub_dl(shurl), doi))
                else:
                    phenny.say(download_url(shurl, cookies=modules.scihub.shcookie))
            else:
                phenny.say(shurl)
        elif verbose and explicit:
            _log("All approaches failed")
            phenny.say(download_url(line))
    return
Example #8
def download_url(url, proxy, last_resort=False):
    sys.stderr.write("attempting direct for %s through %s\n" % (url,
      proxy))

    session = requests.Session()
    session.proxies = {
        'http': proxy,
        'https': proxy}

    try:
        response = session.get(url, headers={"User-Agent": "origami-pdf"})
    except requests.exceptions.ConnectionError:
        sys.stderr.write("network failure on download " +
            str(url) + "\n")
        return 1

    content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)

        # aip.org sucks, citation_pdf_url is wrong
        if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
            citation_pdf_url = None

        if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
            content = session.get(citation_pdf_url).content
            tree = parse_html(content)
            # citation_title = ...

        # wow, this seriously needs to be cleaned up
        if citation_pdf_url and citation_title and "ieeexplore.ieee.org" not in citation_pdf_url:
            citation_title = citation_title.encode("ascii", "ignore")
            response = session.get(citation_pdf_url, headers={"User-Agent": "pdf-defense-force"})
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            if "sciencedirect.com" in url and not "ShoppingCart" in url:
                try:
                    title = tree.xpath("//h1[@class='svTitle']")[0].text
                    pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                    new_response = session.get(pdf_url, headers={"User-Agent": "sdf-macross"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "jstor.org/" in url:
                # clean up the url
                if "?" in url:
                    url = url[0:url.find("?")]

                # not all pages have the <input type="hidden" name="ppv-title"> element
                try:
                    title = tree.xpath("//div[@class='hd title']")[0].text
                except Exception:
                    try:
                        title = tree.xpath("//input[@name='ppv-title']/@value")[0]
                    except Exception:
                        pass

                # get the document id
                document_id = None
                if url[-1] != "/":
                    #if "stable/" in url:
                    #elif "discover/" in url:
                    #elif "action/showShelf?candidate=" in url:
                    #elif "pss/" in url:
                    document_id = url.split("/")[-1]

                if document_id and document_id.isdigit():
                    try:
                        pdf_url = "http://www.jstor.org/stable/pdfplus/" + document_id + ".pdf?acceptTC=true"
                        new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.1"})
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
            elif ".aip.org/" in url:
                try:
                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
                    pdf_url = [link for link in tree.xpath("//a/@href") if "getpdf" in link][0]
                    new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/1.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "ieeexplore.ieee.org" in url:
                try:
                    pdf_url = [url for url in tree.xpath("//frame/@src") if "pdf" in url][0]
                    new_response = session.get(pdf_url, headers={"User-Agent": "time-machine/2.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "h1 class=\"articleTitle" in content:
                try:
                    title = tree.xpath("//h1[@class='articleTitle']")[0].text
                    title = title.encode("ascii", "ignore")
                    pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
                except Exception:
                    pass
                else:
                    if pdf_url.startswith("/"):
                        url_start = url[:url.find("/",8)]
                        pdf_url = url_start + pdf_url
                    response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
                    content = response.content
                    if "pdf" in response.headers["content-type"]:
                        extension = ".pdf"
            # raise Exception("problem with citation_pdf_url or citation_title")
            # well, at least save the contents from the original url
            pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")
    title = title.replace(" ", "_")
    title = title[:params.maxlen]

    path = os.path.join(params.folder, title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            sys.stderr.write("got it! " +
                str(url) + "\n")
            content = pdfparanoia.scrub(StringIO(content))
        except Exception:
            # this is to avoid a PDFNotImplementedError
            pass

    with open(path, "wb") as file_handler:
        file_handler.write(content)

    title = title.encode("ascii", "ignore")
    url = params.url + requests.utils.quote(title) + extension

    if extension in [".pdf", "pdf"]:
        print(url)
        return 0
    else:
        sys.stderr.write("couldn't find it, dump: %s\n" % url)
        if last_resort:
            print("couldn't find it, dump: %s" % url)
        else:
            return 1
    return 0
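The examples lean on a handful of undefined HTML helpers (parse_html, check_if_html, find_citation_pdf_url, find_citation_title). Plausible reconstructions, based on the citation_* meta-tag convention the names point at; the real paperbot versions may differ:

import lxml.etree

def parse_html(content):
    # tolerant HTML parsing, as the examples need for scraped pages
    return lxml.etree.fromstring(content, parser=lxml.etree.HTMLParser())

def check_if_html(response):
    return "text/html" in response.headers.get("content-type", "")

def find_citation_pdf_url(tree, url):
    # <meta name="citation_pdf_url" content="..."> is the common scholarly
    # metadata convention; ignoring the base url here is an assumption
    hits = tree.xpath("//meta[@name='citation_pdf_url']/@content")
    return hits[0] if hits else None

def find_citation_title(tree):
    hits = tree.xpath("//meta[@name='citation_title']/@content")
    return hits[0] if hits else None
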
Example #9
def download_proxy(line, zotero, proxy, verbose=True):
    sys.stderr.write("tente de télécharger %s through %s and %s\n" %
        (line, zotero, proxy))

    headers = {
        "Content-Type": "application/json",
    }

    data = {
        "url": line,
        "sessionid": "what"
    }

    data = json.dumps(data)

    response = requests.post(zotero, data=data, headers=headers)

    if response.status_code != 200 or response.content == "[]":
        sys.stderr.write("no valid reply from zotero\n")
        sys.stderr.write("status %d\n" % response.status_code)
        sys.stderr.write("content %s\n" % response.content)
        return -1 # fatal

    sys.stderr.write("content %s\n" % response.content)
    # see if there are any attachments
    content = json.loads(response.content)
    item = content[0]
    title = item["title"]

    if not item.has_key("attachments"):
        sys.stderr.write("no attachement with this proxy\n")
        return 1 # try another proxy

    pdf_url = None
    for attachment in item["attachments"]:
        if attachment.has_key("mimeType") and "application/pdf" in attachment["mimeType"]:
            pdf_url = attachment["url"]
            break

    if not pdf_url:
        sys.stderr.write("no PDF attachement with this proxy\n")
        return 1 # try another proxy

    user_agent = "Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"

    headers = {
        "User-Agent": user_agent,
    }

    sys.stderr.write("try retrieving " +
        str(pdf_url) + " through proxy " + proxy + "\n")
    response = None
    session = requests.Session()
    session.proxies = {
        'http': proxy,
        'https': proxy}

    try:
        if pdf_url.startswith("https://"):
            response = session.get(pdf_url, headers=headers, verify=False)
        else:
            response = session.get(pdf_url, headers=headers)
    except requests.exceptions.ConnectionError:
        sys.stderr.write("network failure on download " +
            str(pdf_url) + "\n")
        return 1

    # detect failure
    if response.status_code == 401:
        sys.stderr.write("HTTP 401 unauthorized when trying to fetch " +
            str(pdf_url) + "\n")
        return 1
    elif response.status_code != 200:
        sys.stderr.write("HTTP " + str(response.status_code)
        + " when trying to fetch " + str(pdf_url) + "\n")
        return 1

    data = response.content

    if "pdf" in response.headers["content-type"]:
        try:
            data = pdfparanoia.scrub(StringIO(data))
        except Exception:
            # this is to avoid a PDFNotImplementedError
            pass

    # grr..
    title = title.encode("ascii", "ignore")
    title = title.replace(" ", "_")
    title = title[:params.maxlen]

    path = os.path.join(params.folder, title + ".pdf")

    with open(path, "wb") as file_handler:
        file_handler.write(data)

    filename = requests.utils.quote(title)

    # Remove an ending period, which sometimes happens when the
    # title of the paper has a period at the end.
    if filename[-1] == ".":
        filename = filename[:-1]

    url = params.url + filename + ".pdf"

    print(url)
    return 0
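
download_proxy reports its outcome through return codes: 0 on success, 1 to move on to the next proxy, -1 when the Zotero reply itself is unusable. A sketch of the driver loop those codes imply; the proxy URLs are illustrative:

def download_with_fallback(line, zotero="http://localhost:1969/web",
                           proxies=("http://127.0.0.1:3128",
                                    "http://127.0.0.1:8118")):
    # hypothetical driver, inferred from the return codes above
    for proxy in proxies:
        status = download_proxy(line, zotero, proxy)
        if status == 0:
            return True   # got the PDF
        if status == -1:
            return False  # zotero gave no usable reply; retrying won't help
        # status == 1: no usable attachment via this proxy, try the next
    return False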