# Test case for PDFParser, run against the PDF fixture
# 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf'.
#
# setUp builds a PDFParser for that fixture and stores the expected code
# (u'1–1-BRZ') on the instance. Each test then calls one parser accessor and
# compares the result with a literal expected value:
#   - test_get_text_content    -> get_text_content()
#   - test_get_impression_code -> get_impression_code()
#   - test_get_title           -> get_title()
#   - test_get_comments        -> get_comments()
#   - test_get_copies          -> get_copies(), compared against a dict keyed
#                                 by u'D-Dl', u'PL-Wn' and u'PL-Wnifc'
#
# NOTE(review): the definition below has lost its line structure, and the
# u'PL-Wnifc' value is split across two physical lines inside a single-quoted
# literal. Presumably an extraction/formatting artifact -- confirm the
# intended whitespace of every expected string before relying on these
# fixtures.
class TestPDFToText(TestCase): def setUp(self): file_path = 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf' self.parser = PDFParser(file_path) self._expected_code = u'1–1-BRZ' def test_get_text_content(self): content = self.parser.get_text_content() expected_content = _d('''1–1-BRZ RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur à Varsovie chez A. Brzezina Contents 6 leaves: p. [1] lith ITP, pp. 2–11 lith text, p. [12] blank. Sub-caption p. 2: Rondo. Comments PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP but, according to advt in KW, cost was 3 złp. Errors TP: ‘Proprieté’, ‘l’editeur’. Copies D-Dl Mus. 5565-T-530 – 274 x 342 mm. PL-Wn Mus.III.127.998 Cim. 237 x 324 mm (v). TP: signature ‘Helena Turno1830’. PL-Wnifc D/508 – 243 x 323 mm. TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11. '''.strip()) self.assertEqual(content, expected_content) def test_get_impression_code(self): expected = self._expected_code impression_code = self.parser.get_impression_code() self.assertEqual(expected, impression_code) def test_get_title(self): title = self.parser.get_title() expected_title = _d( '''RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur à Varsovie chez A. Brzezina''') self.assertEqual(title, expected_title) def test_get_comments(self): comments = self.parser.get_comments() expected_comments = _d( '''PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP but, according to advt in KW, cost was 3 złp.''') self.assertEqual(comments, expected_comments) def test_get_copies(self): copies = self.parser.get_copies() expected_copies = { u'D-Dl': u'Mus. 5565-T-530 – 274 x 342 mm.', u'PL-Wn': u'Mus.III.127.998 Cim. 237 x 324 mm (v). TP: signature ‘Helena Turno1830’.', u'PL-Wnifc': u'D/508 – 243 x 323 mm. 
TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11.' } self.assertEqual(copies, expected_copies)
# Test case for PDFParser, run against the PDF fixture
# 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf'.
#
# setUp builds a PDFParser for that fixture and stores the expected code
# (u'1–1-BRZ') on the instance. The tests call get_text_content(),
# get_impression_code(), get_title(), get_comments() and get_copies() and
# compare each result with a literal expected value (get_copies() against a
# dict keyed by u'D-Dl', u'PL-Wn' and u'PL-Wnifc').
#
# NOTE(review): a class with this same name appears more than once in this
# file; in Python the later definition replaces the earlier one -- confirm
# whether the duplication is intended.
# NOTE(review): the definition below has lost its line structure, and the
# u'PL-Wnifc' value is split across two physical lines inside a single-quoted
# literal. Presumably an extraction/formatting artifact -- confirm the
# intended whitespace of every expected string.
class TestPDFToText(TestCase): def setUp(self): file_path = 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf' self.parser = PDFParser(file_path) self._expected_code = u'1–1-BRZ' def test_get_text_content(self): content = self.parser.get_text_content() expected_content = _d('''1–1-BRZ RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur à Varsovie chez A. Brzezina Contents 6 leaves: p. [1] lith ITP, pp. 2–11 lith text, p. [12] blank. Sub-caption p. 2: Rondo. Comments PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP but, according to advt in KW, cost was 3 złp. Errors TP: ‘Proprieté’, ‘l’editeur’. Copies D-Dl Mus. 5565-T-530 – 274 x 342 mm. PL-Wn Mus.III.127.998 Cim. 237 x 324 mm (v). TP: signature ‘Helena Turno1830’. PL-Wnifc D/508 – 243 x 323 mm. TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11. '''.strip()) self.assertEqual(content, expected_content) def test_get_impression_code(self): expected = self._expected_code impression_code = self.parser.get_impression_code() self.assertEqual(expected, impression_code) def test_get_title(self): title = self.parser.get_title() expected_title = _d('''RONDEAUcomposé pour lePIANOFORTEet dédiéà M de LindePARFREDERIC CHOPINProprieté de l’editeur à Varsovie chez A. Brzezina''') self.assertEqual(title, expected_title) def test_get_comments(self): comments = self.parser.get_comments() expected_comments = _d('''PFE published without plate number, printed on grey paper. PD: 2/6/1825 (KW No. 129). No price appears on TP but, according to advt in KW, cost was 3 złp.''') self.assertEqual(comments, expected_comments) def test_get_copies(self): copies = self.parser.get_copies() expected_copies = { u'D-Dl': u'Mus. 5565-T-530 – 274 x 342 mm.', u'PL-Wn': u'Mus.III.127.998 Cim. 237 x 324 mm (v). TP: signature ‘Helena Turno1830’.', u'PL-Wnifc': u'D/508 – 243 x 323 mm. 
TP: stamp ‘a Leopol [illegible]’. Reduction in size resulted in loss of pagination on pp. 2, 5, 6, 10, 11.' } self.assertEqual(copies, expected_copies)
def import_library(file_path, index_page):
    """Import one library PDF and create or update the matching Library.

    The PDF text is parsed for a library code, country, city and library
    name. Missing Country and City rows are created. The Library is looked
    up by the slug of its code: it is created under ``index_page`` when
    absent and updated otherwise. A new Document holding the PDF file is
    then attached to the library, which is saved.

    Args:
        file_path: Path of the PDF file to import.
        index_page: Parent object that receives a new library through
            ``add_child(instance=...)``.

    Returns:
        None. Nothing is imported (a message is logged instead) when the PDF
        has no text content or when its heading cannot be parsed.
    """
    # Lazy %-style arguments: the message is only built if DEBUG is enabled.
    logger.debug('Importing %s', file_path)

    parser = PDFParser(file_path)
    content = parser.get_text_content()
    if not content:
        logger.debug('Found no content in the PDF')
        return

    parsed = _parse_library_heading(content)
    if parsed is None:
        # Previously this surfaced as a bare IndexError from list indexing.
        logger.warning('Could not parse the library heading in %s', file_path)
        return
    code, country_name, city_name, name = parsed

    logger.debug(u'%s %s %s %s', code, country_name, city_name, name)

    # Fetch the country, creating it when it is not in the db yet.
    country = Country.objects.filter(name=country_name).first()
    if country is None:
        country = Country(name=country_name)
        country.save()

    # Fetch the city within that country, creating it when missing.
    city = City.objects.filter(name=city_name, country=country).first()
    if city is None:
        city = City(country=country, name=city_name)
        city.save()

    slug = slugify(code)[:50]

    # Use the slug for lookups, because there are case differences in
    # some references that are meant to be the same.
    library = Library.objects.filter(slug=slug).first()
    if library is None:
        # Not in the db yet: create it as a child of the index page.
        library = Library(title=code, city=city, name=name)
        library.slug = slug
        index_page.add_child(instance=library)
    else:
        logger.warning('Duplicate library')
        # Already known: refresh its location and name.
        library.city = city
        library.name = name

    # Create a Library PDF Document.
    document = Document(title=code)
    with open(file_path, 'rb') as fh:
        # The file handle must stay open while the field copies its content.
        document.file.save(os.path.basename(file_path), File(fh))

    document.tags.add('library')
    library.pdf = document
    library.save()


def _parse_library_heading(content):
    """Extract ``(code, country, city, name)`` from the PDF text, or None.

    The first line is the heading. Its first space-separated part is the
    library code and its last part is comma-separated metadata: country,
    city, then the library name (for 'United States of America' the name
    starts one field later, after the state). When the heading has fewer
    than two parts, the code is taken from the second line instead.

    Returns None when a required line or metadata field is missing.
    """
    lines = content.split('\n')
    # NOTE(review): the original comments speak of "spaces" (plural) between
    # code and metadata -- confirm that a single space is the real delimiter.
    heading_parts = lines[0].split(' ')

    if len(heading_parts) < 2:
        if len(lines) < 2:
            return None
        code = lines[1]
    else:
        code = heading_parts[0].strip()

    metadata_parts = heading_parts[-1].split(',')
    if len(metadata_parts) < 2:
        return None

    country_name = metadata_parts[0].strip()
    city_name = metadata_parts[1].strip()

    # For the USA the library name comes after the state, otherwise
    # directly after the city.
    name_start = 3 if country_name == 'United States of America' else 2
    name = ','.join(metadata_parts[name_start:]).strip()

    return code, country_name, city_name, name
def setUp(self):
    """Build the PDFParser for the Opus 1 fixture and store the expected code."""
    fixture_path = 'catalogue/tests/docs/Opus 1/1–1-BRZ.v.1.pdf'
    self.parser = PDFParser(fixture_path)
    # Code value kept on the instance for later assertions.
    self._expected_code = u'1–1-BRZ'
def import_library(file_path, index_page):
    """Import one library PDF and create or update the matching Library.

    The PDF text is parsed for a library code, country, city and library
    name. Missing Country and City rows are created. The Library is looked
    up by the slug of its code: it is created under ``index_page`` when
    absent and updated otherwise. A new Document holding the PDF file is
    then attached to the library, which is saved.

    Args:
        file_path: Path of the PDF file to import.
        index_page: Parent object that receives a new library through
            ``add_child(instance=...)``.

    Returns:
        None. Nothing is imported (a message is logged instead) when the PDF
        has no text content or when its heading cannot be parsed.
    """
    # Lazy %-style arguments: the message is only built if DEBUG is enabled.
    logger.debug('Importing %s', file_path)

    parser = PDFParser(file_path)
    content = parser.get_text_content()
    if not content:
        logger.debug('Found no content in the PDF')
        return

    parsed = _parse_library_heading(content)
    if parsed is None:
        # Previously this surfaced as a bare IndexError from list indexing.
        logger.warning('Could not parse the library heading in %s', file_path)
        return
    code, country_name, city_name, name = parsed

    logger.debug(u'%s %s %s %s', code, country_name, city_name, name)

    # Fetch the country, creating it when it is not in the db yet.
    country = Country.objects.filter(name=country_name).first()
    if country is None:
        country = Country(name=country_name)
        country.save()

    # Fetch the city within that country, creating it when missing.
    city = City.objects.filter(name=city_name, country=country).first()
    if city is None:
        city = City(country=country, name=city_name)
        city.save()

    slug = slugify(code)[:50]

    # Use the slug for lookups, because there are case differences in
    # some references that are meant to be the same.
    library = Library.objects.filter(slug=slug).first()
    if library is None:
        # Not in the db yet: create it as a child of the index page.
        library = Library(title=code, city=city, name=name)
        library.slug = slug
        index_page.add_child(instance=library)
    else:
        logger.warning('Duplicate library')
        # Already known: refresh its location and name.
        library.city = city
        library.name = name

    # Create a Library PDF Document.
    document = Document(title=code)
    with open(file_path, 'rb') as fh:
        # The file handle must stay open while the field copies its content.
        document.file.save(os.path.basename(file_path), File(fh))

    document.tags.add('library')
    library.pdf = document
    library.save()


def _parse_library_heading(content):
    """Extract ``(code, country, city, name)`` from the PDF text, or None.

    The first line is the heading. Its first space-separated part is the
    library code and its last part is comma-separated metadata: country,
    city, then the library name (for 'United States of America' the name
    starts one field later, after the state). When the heading has fewer
    than two parts, the code is taken from the second line instead.

    Returns None when a required line or metadata field is missing.
    """
    lines = content.split('\n')
    # NOTE(review): the original comments speak of "spaces" (plural) between
    # code and metadata -- confirm that a single space is the real delimiter.
    heading_parts = lines[0].split(' ')

    if len(heading_parts) < 2:
        if len(lines) < 2:
            return None
        code = lines[1]
    else:
        code = heading_parts[0].strip()

    metadata_parts = heading_parts[-1].split(',')
    if len(metadata_parts) < 2:
        return None

    country_name = metadata_parts[0].strip()
    city_name = metadata_parts[1].strip()

    # For the USA the library name comes after the state, otherwise
    # directly after the city.
    name_start = 3 if country_name == 'United States of America' else 2
    name = ','.join(metadata_parts[name_start:]).strip()

    return code, country_name, city_name, name