Python PDFParser示例，catalogue.pdf_parser.PDFParser Python示例

示例#1

0

显示文件

文件： test_pdf_parser.py 项目： kingsdigitallab/chopin-django

class TestPDFToText(TestCase):
    """Exercise the PDFParser accessors against one sample catalogue PDF."""

    def setUp(self):
        # A parser for the sample PDF, plus the impression code expected
        # from it.
        self.parser = PDFParser('catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf')
        self._expected_code = u'1–1-BRZ'

    def test_get_text_content(self):
        # The extracted text must equal the reference below after the
        # reference has been stripped and passed through _d.
        actual = self.parser.get_text_content()
        reference = '''1–1-BRZ

RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur
à Varsovie chez A. Brzezina

Contents           6 leaves: p. [1] lith ITP, pp. 2–11 lith text, p. [12] blank.
Sub-caption        p. 2: Rondo.

Comments           PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP
                   but, according to advt in KW, cost was 3 złp.
Errors             TP: ‘Proprieté’, ‘l’editeur’.

Copies

D-Dl       Mus. 5565-T-530 – 274 x 342 mm.
PL-Wn      Mus.III.127.998 Cim.  237 x 324 mm (v). TP: signature ‘Helena Turno1830’.
PL-Wnifc   D/508 – 243 x 323 mm. TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11.

'''
        self.assertEqual(actual, _d(reference.strip()))

    def test_get_impression_code(self):
        # The parser must report the impression code recorded in setUp.
        self.assertEqual(self._expected_code,
                         self.parser.get_impression_code())

    def test_get_title(self):
        # The title spans two lines; the line break is part of the value.
        actual = self.parser.get_title()
        reference = '''RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur
à Varsovie chez A. Brzezina'''
        self.assertEqual(actual, _d(reference))

    def test_get_comments(self):
        # The continuation line keeps its leading indentation in the
        # expected value.
        actual = self.parser.get_comments()
        reference = '''PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP
                   but, according to advt in KW, cost was 3 złp.'''
        self.assertEqual(actual, _d(reference))

    def test_get_copies(self):
        # get_copies() is expected to return a mapping from the left-hand
        # code to the description that follows it.
        actual = self.parser.get_copies()
        reference = {
            u'D-Dl': u'Mus. 5565-T-530 – 274 x 342 mm.',
            u'PL-Wn': u'Mus.III.127.998 Cim.  237 x 324 mm (v). TP: signature ‘Helena Turno1830’.',
            u'PL-Wnifc': u'D/508 – 243 x 323 mm. TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11.',
        }
        self.assertEqual(actual, reference)

示例#2

0

显示文件

文件： test_pdf_parser.py 项目： kcl-ddh/chopin-online

class TestPDFToText(TestCase):
    """Compare what PDFParser extracts from a sample PDF with known values."""

    def setUp(self):
        # Parser under test and the impression code the sample should give.
        sample_pdf = 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf'
        self.parser = PDFParser(sample_pdf)
        self._expected_code = u'1–1-BRZ'

    def test_get_text_content(self):
        # Whole-document check: the reference is stripped, then run
        # through _d before the comparison.
        got = self.parser.get_text_content()
        want_raw = '''1–1-BRZ

RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur
à Varsovie chez A. Brzezina

Contents           6 leaves: p. [1] lith ITP, pp. 2–11 lith text, p. [12] blank.
Sub-caption        p. 2: Rondo.

Comments           PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP
                   but, according to advt in KW, cost was 3 złp.
Errors             TP: ‘Proprieté’, ‘l’editeur’.

Copies

D-Dl       Mus. 5565-T-530 – 274 x 342 mm.
PL-Wn      Mus.III.127.998 Cim.  237 x 324 mm (v). TP: signature ‘Helena Turno1830’.
PL-Wnifc   D/508 – 243 x 323 mm. TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11.
'''
        want = _d(want_raw.strip())
        self.assertEqual(got, want)

    def test_get_impression_code(self):
        # The code parsed from the PDF must match the one set in setUp.
        self.assertEqual(self._expected_code,
                         self.parser.get_impression_code())

    def test_get_title(self):
        # Two-line title; the embedded newline is significant.
        got = self.parser.get_title()
        want = _d(
            '''RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur
à Varsovie chez A. Brzezina''')
        self.assertEqual(got, want)

    def test_get_comments(self):
        # The second line of the comment keeps its leading spaces.
        got = self.parser.get_comments()
        want = _d(
            '''PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP
                   but, according to advt in KW, cost was 3 złp.''')
        self.assertEqual(got, want)

    def test_get_copies(self):
        # Expected result is a dict keyed by the code in the left column.
        got = self.parser.get_copies()
        want = {
            u'D-Dl':
                u'Mus. 5565-T-530 – 274 x 342 mm.',
            u'PL-Wn':
                u'Mus.III.127.998 Cim.  237 x 324 mm (v). TP: signature ‘Helena Turno1830’.',
            u'PL-Wnifc':
                u'D/508 – 243 x 323 mm. TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11.',
        }
        self.assertEqual(got, want)

示例#3

0

显示文件

文件： pdf_import_utils.py 项目： kcl-ddh/chopin-online

def import_library(file_path, index_page):
    """Import a library description PDF as a ``Library`` under *index_page*.

    The PDF text is read with ``PDFParser``. Its first line is the heading,
    expected to look like ``CODE   Country, City, Name`` (code and metadata
    separated by three spaces). When the heading has no three-space
    separator, the whole heading is treated as metadata and the code is
    taken from the second line of the text. For 'United States of America'
    one extra field (the state) sits between the city and the library name.

    Missing ``Country`` and ``City`` rows are created. The ``Library`` is
    looked up by slug: a new one is added as a child of *index_page*, an
    existing one has its city and name updated. In both cases a new
    ``Document`` holding the PDF file is saved, tagged 'library' and
    attached as ``library.pdf``.

    Args:
        file_path: Path of the PDF to import.
        index_page: Object whose ``add_child(instance=...)`` receives a
            newly created library (presumably the page-tree parent of all
            libraries -- confirm against the caller).

    Returns:
        None. Returns early, after a debug log entry, when the PDF yields
        no text.

    Raises:
        IndexError: If the metadata has fewer than two comma-separated
            fields, or the code must come from a second line that does
            not exist.
    """
    # Lazy logger arguments: nothing is formatted unless DEBUG is enabled,
    # and the path is never pushed through str.format.
    logger.debug('Importing %s', file_path)
    parser = PDFParser(file_path)
    content = parser.get_text_content()
    if not content:
        logger.debug('Found no content in the PDF')
        return

    # Split once; the heading and the fallback code line both come from it.
    lines = content.split('\n')

    # gets the library heading
    heading_parts = lines[0].split('   ')

    # gets the library code, the first value before the spaces
    if len(heading_parts) < 2:
        # No separator in the heading: the code sits on the next line.
        # Stripped so it is cleaned the same way as the heading-derived code.
        code = lines[1].strip()
    else:
        code = heading_parts[0].strip()

    # the information after the spaces
    metadata_parts = heading_parts[-1].split(',')

    # the country name is the first element in the metadata
    country_name = metadata_parts[0].strip()
    # and the city the second
    city_name = metadata_parts[1].strip()

    # if the country is usa
    if country_name == 'United States of America':
        # the library name is after the state
        name = ','.join(metadata_parts[3:]).strip()
    else:
        # otherwise the library name comes after the city
        name = ','.join(metadata_parts[2:]).strip()

    logger.debug(u'%s %s %s %s', code, country_name, city_name, name)

    # gets the country
    country = Country.objects.filter(name=country_name).first()

    # if the country is not in the db yet
    if not country:
        # creates a new country object
        country = Country(name=country_name)
        country.save()

    # gets the city
    city = City.objects.filter(name=city_name, country=country).first()

    # if the city is not in the db yet
    if not city:
        # creates a new city object
        city = City(country=country, name=city_name)
        city.save()

    # gets the library
    slug = slugify(code)[:50]
    # Use the slug for lookups, because there are case differences in
    # some references that are meant to be the same.
    library = Library.objects.filter(slug=slug).first()

    # if the library is not in the db
    if not library:
        # creates a new library
        library = Library(title=code, city=city, name=name)
        library.slug = slug
        index_page.add_child(instance=library)
    else:
        logger.warning('Duplicate library')
        # otherwise update the library
        library.city = city
        library.name = name

    # Create a Library PDF Document.
    document = Document(title=code)
    with open(file_path, 'rb') as fh:
        pdf_file = File(fh)
        document.file.save(os.path.basename(file_path), pdf_file)
    document.tags.add('library')
    library.pdf = document
    library.save()

示例#4

0

显示文件

文件： test_pdf_parser.py 项目： kcl-ddh/chopin-online

 def setUp(self):
     # Build the parser for the sample PDF and record the expected
     # impression code.
     self.parser = PDFParser('catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf')
     self._expected_code = u'1–1-BRZ'

示例#5

0

显示文件

文件： test_pdf_parser.py 项目： kingsdigitallab/chopin-django

 def setUp(self):
     # Parser over the sample PDF, then the impression code expected
     # from it.
     sample_pdf = 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf'
     self.parser = PDFParser(sample_pdf)
     self._expected_code = u'1–1-BRZ'

示例#6

0

显示文件

文件： pdf_import_utils.py 项目： kingsdigitallab/chopin-django

def import_library(file_path, index_page):
    """Import a library description PDF as a ``Library`` under *index_page*.

    The PDF text is read with ``PDFParser``. Its first line is the heading,
    expected to look like ``CODE   Country, City, Name`` (code and metadata
    separated by three spaces). When the heading has no three-space
    separator, the whole heading is treated as metadata and the code is
    taken from the second line of the text. For 'United States of America'
    one extra field (the state) sits between the city and the library name.

    Missing ``Country`` and ``City`` rows are created. The ``Library`` is
    looked up by slug: a new one is added as a child of *index_page*, an
    existing one has its city and name updated. In both cases a new
    ``Document`` holding the PDF file is saved, tagged 'library' and
    attached as ``library.pdf``.

    Args:
        file_path: Path of the PDF to import.
        index_page: Object whose ``add_child(instance=...)`` receives a
            newly created library (presumably the page-tree parent of all
            libraries -- confirm against the caller).

    Returns:
        None. Returns early, after a debug log entry, when the PDF yields
        no text.

    Raises:
        IndexError: If the metadata has fewer than two comma-separated
            fields, or the code must come from a second line that does
            not exist.
    """
    # Lazy logger arguments: nothing is formatted unless DEBUG is enabled,
    # and the path is never pushed through str.format.
    logger.debug('Importing %s', file_path)
    parser = PDFParser(file_path)
    content = parser.get_text_content()
    if not content:
        logger.debug('Found no content in the PDF')
        return

    # Split once; the heading and the fallback code line both come from it.
    lines = content.split('\n')

    # gets the library heading
    heading_parts = lines[0].split('   ')

    # gets the library code, the first value before the spaces
    if len(heading_parts) < 2:
        # No separator in the heading: the code sits on the next line.
        # Stripped so it is cleaned the same way as the heading-derived code.
        code = lines[1].strip()
    else:
        code = heading_parts[0].strip()

    # the information after the spaces
    metadata_parts = heading_parts[-1].split(',')

    # the country name is the first element in the metadata
    country_name = metadata_parts[0].strip()
    # and the city the second
    city_name = metadata_parts[1].strip()

    # if the country is usa
    if country_name == 'United States of America':
        # the library name is after the state
        name = ','.join(metadata_parts[3:]).strip()
    else:
        # otherwise the library name comes after the city
        name = ','.join(metadata_parts[2:]).strip()

    logger.debug(u'%s %s %s %s', code, country_name, city_name, name)

    # gets the country
    country = Country.objects.filter(name=country_name).first()

    # if the country is not in the db yet
    if not country:
        # creates a new country object
        country = Country(name=country_name)
        country.save()

    # gets the city
    city = City.objects.filter(name=city_name, country=country).first()

    # if the city is not in the db yet
    if not city:
        # creates a new city object
        city = City(country=country, name=city_name)
        city.save()

    # gets the library
    slug = slugify(code)[:50]
    # Use the slug for lookups, because there are case differences in
    # some references that are meant to be the same.
    library = Library.objects.filter(slug=slug).first()

    # if the library is not in the db
    if not library:
        # creates a new library
        library = Library(title=code, city=city, name=name)
        library.slug = slug
        index_page.add_child(instance=library)
    else:
        logger.warning('Duplicate library')
        # otherwise update the library
        library.city = city
        library.name = name

    # Create a Library PDF Document.
    document = Document(title=code)
    with open(file_path, 'rb') as fh:
        pdf_file = File(fh)
        document.file.save(os.path.basename(file_path), pdf_file)
    document.tags.add('library')
    library.pdf = document
    library.save()