def get_pdf_metadata(self, pdf_file_stream):
    """Extract author, title and year from a PDF's document information.

    Args:
        pdf_file_stream: File-like object handed to ``PDFParser``
            (presumably a seekable binary stream -- confirm against callers).

    Returns:
        dict: ``'author'``, ``'title'`` and ``'year'`` entries. Each entry
        keeps its ``UNKNOWN_*`` placeholder when the PDF does not supply a
        usable value for it.
    """
    metadata = {
        'author': 'UNKNOWN_AUTHOR',
        'title': 'UNKNOWN_TITLE',
        'year': 'UNKNOWN_YEAR',
    }

    pdf_parser = PDFParser(pdf_file_stream)
    pdf_doc = PDFDocument(pdf_parser)

    # ``info`` is indexed below, so an empty sequence (presumably a PDF that
    # carries no document information at all) would raise IndexError.
    # Fall back to the placeholders instead.
    if not pdf_doc.info:
        return metadata
    pdf_metadata = pdf_doc.info[0]

    author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
    if author:
        metadata['author'] = author

    title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
    if title:
        metadata['title'] = title

    # The year is derived from the modification date entry.
    year = pdf_metadata_moddate_to_year(
        make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
    if year:
        metadata['year'] = year

    return metadata
def get_pdf_metadata(pdf):
    """Get PDF metadata from raw PDF content.

    The content is spooled into a temporary file so it can be handed to
    ``PDFParser`` as a file object. Metadata extraction is best-effort:
    if reading the document information fails, the failure is logged and
    whatever fields were already resolved are returned, with the remaining
    ones left at their ``UNKNOWN_*`` placeholders.

    Args:
        pdf: PDF content (in bytes)

    Returns:
        metadata: PDF metadata dictionary with ``'author'``, ``'title'``
        and ``'year'`` keys.
    """
    # Imported locally so this function does not rely on a module-level
    # ``import logging`` being present in the file.
    import logging

    metadata = {
        'author': 'UNKNOWN_AUTHOR',
        'title': 'UNKNOWN_TITLE',
        'year': 'UNKNOWN_YEAR',
    }

    # The context manager guarantees the temporary file is closed even when
    # writing or parser construction raises.
    with tempfile.TemporaryFile() as temp_pdf_file:
        temp_pdf_file.write(pdf)
        pdf_parser = PDFParser(temp_pdf_file)

        try:
            pdf_doc = PDFDocument(pdf_parser)
            pdf_metadata = pdf_doc.info[0]

            author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
            if author:
                metadata['author'] = author

            title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
            if title:
                metadata['title'] = title

            # The year is derived from the modification date entry.
            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
            if year:
                metadata['year'] = year
        except Exception:
            # Deliberately best-effort: keep the placeholders (and any
            # fields already filled in) but leave a trace of the failure
            # instead of swallowing it silently.
            logging.getLogger(__name__).warning(
                'Could not read PDF metadata; using placeholder values',
                exc_info=True)

    return metadata