# Example 1
    def set_version_and_license(self, r=None):
        """Decide ``scrape_version`` and ``scrape_license`` for this page.

        :param r: optional response object for the fetched PDF; must expose
            ``content_big()`` and be accepted by ``convert_pdf_to_txt``.
            NOTE(review): its exact type isn't visible in this file —
            presumably an HTTP-response wrapper; verify against callers.
        Side effects: sets ``self.updated`` and ``self.scrape_version``,
        may set ``self.scrape_license``, and appends to ``self.error``
        when PDF text extraction fails.
        """
        self.updated = datetime.datetime.utcnow().isoformat()

        # PMC-hosted pages are classified entirely by their own routine.
        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = "submittedVersion"

        # Local metadata may refine the defaults before the PDF is inspected.
        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself

        # No response object means there is no PDF to inspect: keep whatever
        # the local metadata produced and stop here.
        if not r:
            logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            # A CrossMark URL in the raw PDF bytes is taken as evidence of a
            # publisher-produced file.
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            # max_pages caps extraction cost — presumably for speed on huge PDFs.
            text = convert_pdf_to_txt(r, max_pages=25)

            # logger.info(text)

            # Only search for published-version fingerprints if nothing above
            # already upgraded the version.
            if text and self.scrape_version == "submittedVersion":
                # Copyright lines, rights statements, and open-access phrasing
                # typically appear only in the publisher's typeset copy.
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL)
                    ]

                # Any single match is treated as proof of a published version.
                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            # A license found earlier wins; otherwise try the PDF text itself.
            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            # PDF parsing is best-effort: record the failure and fall through
            # so the version/license decided so far is still returned.
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
# Example 2
def pdf_to_qa_result(abs_file_path):
    """Convert a QA-report PDF into a CSV row string.

    :param abs_file_path: path to the PDF file to parse.
    :returns: ``'<csv data>,<filename>'`` when the file is recognized as an
        SNC MapCheck report; ``False`` when the PDF cannot be parsed;
        ``None`` when it parses but is not a MapCheck report.
    """
    try:
        text = convert_pdf_to_txt(abs_file_path).split('\n')
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; any parse failure is reported as False.
        return False

    if is_file_snc_mapcheck(text):
        return MapcheckResult(text).data_to_csv() + ',' + basename(
            abs_file_path)
    # Parsed fine but not a MapCheck report: explicit None (falsy), matching
    # the original implicit fall-through.
    return None
# Example 3
def main():
    """Build 'output.xml', an OJS issue-import file, from PDFs on argv.

    Issue metadata comes from getIssueData(); one <article> element is added
    per PDF path given on the command line.
    """
    baseEl = etree.Element('issue', published="true", current="false")
    issueData = getIssueData()
    etree.SubElement(baseEl, 'title').text = '0'
    etree.SubElement(baseEl, 'volume').text = issueData["volume"]
    etree.SubElement(baseEl, 'number').text = issueData["number"]
    etree.SubElement(baseEl, 'year').text = issueData["year"]
    etree.SubElement(baseEl,
                     'date_published').text = issueData["date_published"]
    etree.SubElement(baseEl, 'access_date').text = issueData["access_date"]
    articlesEl = etree.SubElement(baseEl, 'section')
    etree.SubElement(articlesEl, 'title', locale="en_US").text = 'Articles'
    etree.SubElement(articlesEl, 'abbr', locale="en_US").text = 'ART'
    if (len(sys.argv) > 1):
        for pdf_path in sys.argv[1:]:  # add an article tag for each file
            fileText = convert_pdf_to_txt(pdf_path)
            # Context manager closes the handle (the original leaked it via
            # open(...).read()) and the loop variable no longer shadows the
            # builtin 'file'.
            with open(pdf_path, "rb") as pdf_file:
                fileBinary = pdf_file.read().encode("base64")
            fileXml = articleToXml(fileText, pdf_path, fileBinary,
                                   issueData["date_published"])
            articlesEl.insert(-1, fileXml)
    tree = etree.ElementTree(baseEl)
    tree.write('output.xml',
               pretty_print=True)  # write end result to 'output.xml'
# Example 4
    def set_version_and_license(self, r=None):
        """Decide ``scrape_version`` and ``scrape_license`` for this page.

        :param r: optional response object for the fetched PDF; must expose
            ``content_big()`` and be accepted by ``convert_pdf_to_txt``.
            NOTE(review): exact type isn't visible in this file — presumably
            an HTTP-response wrapper; verify against callers.
        Side effects: sets ``self.updated`` and ``self.scrape_version``,
        may set ``self.scrape_license``, and appends to ``self.error``
        when PDF text extraction fails.
        """
        self.updated = datetime.datetime.utcnow().isoformat()

        # PMC-hosted pages are classified entirely by their own routine.
        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = "submittedVersion"

        # Local metadata may refine the defaults before the PDF is inspected.
        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself

        # No response object means there is no PDF to inspect: keep whatever
        # the local metadata produced and stop here.
        if not r:
            logger.info(
                u"before scrape returning {} with scrape_version: {}, license {}"
                .format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            # A CrossMark URL in the raw PDF bytes is taken as evidence of a
            # publisher-produced file.
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(),
                          re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            # max_pages caps extraction cost — presumably for speed on huge PDFs.
            text = convert_pdf_to_txt(r, max_pages=25)
            # logger.info(text)

            # Only search for published-version fingerprints if nothing above
            # already upgraded the version.
            if text and self.scrape_version != "publishedVersion":
                # Copyright lines, rights statements, and open-access phrasing
                # typically appear only in the publisher's typeset copy.
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(
                        ur"received.{0,100}revised.{0,100}accepted.{0,100}publication",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(
                        ur"This article is distributed under the terms of the Creative Commons",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(
                        ur"This article is licensed under a Creative Commons",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article",
                               re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(
                        ur"This article is brought to you for free and open access by Works.",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                ]

                # Any single match is treated as proof of a published version.
                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(
                            u'found {}, decided PDF is published version'.
                            format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            # A license found earlier wins; otherwise try the PDF text itself.
            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(
                        u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            # PDF parsing is best-effort: record the failure and fall through
            # so the version/license decided so far is still returned.
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(
                self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        logger.info(
            u"scrape returning {} with scrape_version: {}, license {}".format(
                self.url, self.scrape_version, self.scrape_license))
# Example 5
    def set_version_and_license(self, r=None):
        """Decide ``scrape_version`` (submitted/accepted/published) and
        ``scrape_license`` for this page.

        :param r: optional response object for the fetched PDF; must expose
            ``content_big()`` and ``url``, and be accepted by
            ``convert_pdf_to_txt``.  NOTE(review): exact type isn't visible
            in this file — presumably an HTTP-response wrapper.
        Side effects: sets ``self.updated`` and ``self.scrape_version``,
        may set ``self.scrape_license``, and appends to ``self.error``
        when PDF text extraction fails.
        """
        self.updated = datetime.datetime.utcnow().isoformat()

        # PMC-hosted pages are classified entirely by their own routine.
        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = self.default_version()

        # Local metadata may refine the defaults before the PDF is inspected.
        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself
        # If the OAI-PMH record's <dc:type> names the current version exactly,
        # treat the repository metadata as authoritative and skip PDF sniffing.
        version_is_from_strict_metadata = self.pmh_record and self.pmh_record.api_raw and re.compile(
            ur"<dc:type>{}</dc:type>".format(self.scrape_version), re.IGNORECASE | re.MULTILINE | re.DOTALL
        ).findall(self.pmh_record.api_raw)

        if version_is_from_strict_metadata or not r:
            logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            # A CrossMark URL in the raw PDF bytes is taken as evidence of a
            # publisher-produced file.
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            # max_pages caps extraction cost — presumably for speed on huge PDFs.
            text = convert_pdf_to_txt(r, max_pages=25)
            # logger.info(text)

            # Pass 1: published-version fingerprints (copyright lines, rights
            # statements, open-access phrasing from the publisher's copy).
            if text and self.scrape_version != "publishedVersion" and not version_is_from_strict_metadata:
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is brought to you for free and open access by Works.", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    ]

                # Any single match is treated as proof of a published version.
                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            # Pass 2: accepted-manuscript fingerprints.  Note this runs even
            # after pass 1 decided "publishedVersion", so an accepted-version
            # phrase downgrades a pattern-based published decision.
            if text and self.scrape_version != 'acceptedVersion':
                patterns = [
                    re.compile(ur'This is a post-peer-review, pre-copyedit version', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'This is the peer reviewed version of the following article', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'The present manuscript as of \d\d \w+ \d\d\d\d has been accepted', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'Post-peer-review, pre-copyedit version of accepted manuscript', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is accepted version'.format(pattern.pattern))
                        self.scrape_version = "acceptedVersion"

                # Special case: '61RMIT_INST' looks like an institutional
                # repository identifier in the URL — TODO confirm which repo.
                if r and r.url and '61RMIT_INST' in r.url:
                    if 'Version: Accepted' in text:
                        logger.info(u'found Version: Accepted, decided PDF is accepted version')
                        self.scrape_version = "acceptedVersion"

                # A heading within the first 50 characters of the extracted
                # text can also mark an accepted manuscript.
                heading_text = text[0:50].lower()
                accepted_headings = [
                    "final accepted version",
                    "accepted manuscript",
                ]

                for heading in accepted_headings:
                    if heading in heading_text:
                        logger.info(u'found {} in heading, decided PDF is accepted version'.format(heading))
                        self.scrape_version = "acceptedVersion"
                        break

            # A license found earlier wins; otherwise try the PDF text itself.
            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            # PDF parsing is best-effort: record the failure and fall through
            # so the version/license decided so far is still returned.
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        # Manual per-record overrides always win over the heuristics above.
        if self.pmh_record:
            self.scrape_version = _scrape_version_override().get(self.pmh_record.bare_pmh_id, self.scrape_version)

        logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
# Example 6
def main():
    if (len(sys.argv) > 1):
        baseEl = etree.Element('issue', published="true", current="false")
        issueData = getIssueData()
        etree.SubElement(baseEl, 'title').text = '0'
        etree.SubElement(baseEl, 'volume').text = issueData["volume"]
        etree.SubElement(baseEl, 'number').text = issueData["number"]
        etree.SubElement(baseEl, 'year').text = issueData["year"]
        etree.SubElement(baseEl,
                         'date_published').text = issueData["date_published"]
        etree.SubElement(baseEl, 'access_date').text = issueData["access_date"]
        articlesEl = etree.SubElement(baseEl, 'section')
        etree.SubElement(articlesEl, 'title', locale="en_US").text = 'Articles'
        etree.SubElement(articlesEl, 'abbr', locale="en_US").text = 'ART'
        while (True):
            print("Article information ####################\n")
            articleName = raw_input(
                "What is the article name? You can enter a temporary filename if you want."
            )
            articleStart = raw_input(
                "What is the start page of this article? ")
            while (not articleStart.isdigit()):
                articleStart = raw_input(
                    "Please enter an integer. What is the start page of this article? "
                )
            articleEnd = raw_input("What is the end page of this article? ")
            while (not articleEnd.isdigit()):
                articleEnd = raw_input(
                    "Please enter an integer. What is the end page of this article? "
                )
            articleText = convert_pdf_to_txt(sys.argv[1], int(articleStart),
                                             int(articleEnd))
            split_article(sys.argv[1], articleName, int(articleStart),
                          int(articleEnd))
            articleBinary = open(articleName, "rb").read().encode("base64")
            articleXml = articleToXml(articleText, articleName, articleBinary,
                                      issueData["date_published"])
            articlesEl.insert(-1, articleXml)
            if (raw_input(
                    "Parse another article? Type yes to continue or any other character to quit. "
            ) != "yes"):
                break

        if (raw_input(
                "Are there book reviews to parse? Type yes to continue. ")):
            bookReviewsElement = etree.SubElement(baseEl, 'section')
            etree.SubElement(bookReviewsElement, 'title',
                             locale="en_US").text = 'Book Reviews'
            etree.SubElement(bookReviewsElement, 'abbr',
                             locale="en_US").text = 'BKRV'
            while (True):
                articleName = raw_input(
                    "What is the article name? You can enter a temporary filename if you want."
                )
                articleStart = raw_input(
                    "What is the start page of this article? ")
                while (not articleStart.isdigit()):
                    articleStart = raw_input(
                        "Please enter an integer. What is the start page of this article? "
                    )
                articleEnd = raw_input(
                    "What is the end page of this article? ")
                while (not articleEnd.isdigit()):
                    articleEnd = raw_input(
                        "Please enter an integer. What is the end page of this article? "
                    )

                split_article(sys.argv[1], articleName, int(articleStart),
                              int(articleEnd))
                articleBinary = open(articleName, "rb").read().encode("base64")
                articleXml = bookReviewToXml(articleText, articleName,
                                             articleBinary,
                                             issueData["date_published"])
                bookReviewsElement.insert(-1, articleXml)
                if (raw_input(
                        "Parse another article? Type yes to continue or any other character to quit. "
                ) != "yes"):
                    break

        tree = etree.ElementTree(baseEl)
        tree.write('output.xml', pretty_print=True)

    else:
        print "Not enough input entered"