Exemplo n.º 1
0
def main():
    """Rename each PDF in ``args.path`` to ``[author] title.pdf`` in ``args.dest``.

    Author and title are read from the PDF document-information dictionary.
    A missing or empty author becomes ``'Unknown'``; a missing or empty title
    falls back to the source file name without its extension. Every planned
    rename is printed; when ``args.dryrun`` is set nothing is renamed.
    """
    args = do_cmd_args_line()

    for f in os.listdir(args.path):
        if not f.endswith('.pdf'):
            continue

        fname = os.path.join(args.path, f)

        # Read the metadata inside ``with`` so the handle is closed before the
        # rename; renaming a file that is still open fails on some platforms.
        with open(fname, 'rb') as stream:
            info = PdfFileReader(stream).getDocumentInfo()
            # getDocumentInfo() returns None when the PDF has no /Info entry.
            author = info.author if info is not None else None
            title = info.title if info is not None else None

        if not author:
            author = 'Unknown'
        if not title:
            title = os.path.splitext(f)[0]

        tgtfname = '[{0}] {1}.pdf'.format(author, title)
        ftgtname = os.path.join(args.dest, tgtfname)

        print('renaming {0} -> {1}'.format(fname, ftgtname))
        if not args.dryrun:
            try:
                os.rename(fname, ftgtname)
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on
                # with the remaining files.
                print(e)
def get_pdf_title(pdf_file_path):
    """Return the ``/Title`` entry of a PDF, or ``None`` when it has none.

    ``None`` is also returned when the PDF carries no document-information
    dictionary at all (``getDocumentInfo()`` returns ``None`` in that case).
    """
    # Must be opened as 'rb', otherwise PyPDF2 raises
    # "PdfReadError: EOF marker not found".
    with open(pdf_file_path, 'rb') as f:
        info = PdfFileReader(f).getDocumentInfo()
        # Index with [] rather than .get(): the membership test plus item
        # access is what the original did, and item access is kept as-is.
        if info is not None and '/Title' in info:
            return info['/Title']
        return None
Exemplo n.º 3
0
    def iter_pdf_page_text(self, filename):
        """Yield the extracted text of every page of ``filename``.

        Each page's text is ASCII-encoded (unencodable characters dropped) and
        has its newlines replaced by spaces. ``self.filename`` is set to
        ``filename`` as a side effect. The file stays open only while the
        generator is being consumed.
        """
        self.filename = filename
        with open(filename, "rb") as stream:
            reader = PdfFileReader(stream)
            num_pages = reader.getNumPages()
            logging.info("pdf scanner found %d pages in %s" % (num_pages, filename))

            metadata = reader.getDocumentInfo()
            logging.info("METADATA: " + str(metadata))

            try:
                if '/CreationDate' in metadata:
                    # PDF dates have the form "D:YYYYMMDDHHmmSS..."; skip the
                    # "D:" prefix and take 4/2/2 characters. The original
                    # slices ([2:5], [6:7], [8:9]) each dropped a character.
                    raw = metadata['/CreationDate']
                    mydate = raw[2:6] + "-" + raw[6:8] + "-" + raw[8:10]
                else:
                    mydate = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")
            except Exception:
                # hack ... sometimes /CreationDate is malformed (or there is
                # no metadata at all); fall back to the current time.
                mydate = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
            # NOTE(review): mydate is computed but not used by the visible
            # code; confirm whether it should be stored or returned.

            for pgnum in range(num_pages):
                text = reader.getPage(pgnum).extractText()
                text = text.encode('ascii', 'ignore')
                text = text.replace('\n', ' ')
                yield text
Exemplo n.º 4
0
def get_file_info(fn):
    """
    Read the metadata embedded in a PNG or PDF file.

    For ``.png`` files the image's ``info`` mapping is returned. For ``.pdf``
    files the ``/Keywords`` entry is decoded as JSON. ``None`` is returned for
    any other extension, when ``/Keywords`` is absent, or when it is not valid
    JSON. ``ImportError`` is raised when the library needed for the file type
    is unavailable.

    """
    suffix = os.path.splitext(fn)[1].lower()

    if suffix == ".pdf":
        if PdfFileReader is None:
            raise ImportError("PyPDF2 must be installed to read "
                              "metadata from PDF files.")
        with open(fn, "rb") as handle:
            doc_info = PdfFileReader(handle).getDocumentInfo()
            if "/Keywords" in doc_info:
                try:
                    return json.loads(doc_info["/Keywords"])
                except ValueError:
                    # Keywords present but not JSON: treated as "no metadata".
                    pass
        return None

    if suffix == ".png":
        if Image is None:
            raise ImportError("PIL or pillow must be installed to read "
                              "metadata from PNG files.")
        return Image.open(fn).info

    return None
Exemplo n.º 5
0
 def _merge_pdf_images(self, docf, stream, outlines):
     """Copy the PDF at ``docf.name`` to ``stream`` with images and bookmarks.

     Every entry of ``self._pdf_images`` whose ``page`` (1-based) matches a
     page is loaded as a one-page PDF from ``img.fname``, scaled to fit
     ``img.width`` x ``img.height`` and merged at ``(img.x, img.y)``.
     ``outlines`` is an iterable of ``(pageno, level, header)`` tuples used to
     build the bookmark tree; ``pageno`` is 1-based.
     """
     pdfin = PdfFileReader(docf.name)

     pdfout = PdfFileWriter()
     # getDocumentInfo() returns None when the source has no /Info dictionary.
     pdfout._info.getObject().update(pdfin.getDocumentInfo() or {})

     # embed images into file
     for pageno, page in enumerate(pdfin.pages, 1):
         for img in self._pdf_images:
             if img.page != pageno:
                 continue

             # Load image
             imgpage = PdfFileReader(img.fname).getPage(0)
             scale = min(img.width / imgpage.mediaBox[2].as_numeric(),
                         img.height / imgpage.mediaBox[3].as_numeric())

             page.mergeScaledTranslatedPage(imgpage, scale, img.x, img.y)

         pdfout.addPage(page)

     # create outlines
     stack = []
     for pageno, level, header in outlines:
         # Keep only the ancestors of this entry; the last one left is its
         # direct parent. (The original used stack[0], which attached every
         # nested entry to the top-level bookmark.)
         stack = stack[:level]
         parent = stack[-1] if stack else None
         stack.append(pdfout.addBookmark(header.strip(), pageno - 1, parent))

     pdfout.write(stream)
Exemplo n.º 6
0
    def __init__(self, file_abs_path):
        """
        Load a PDF and record its encryption status and document info.

        Arguments:
            - file_abs_path: (string) Absolute file path.

        Raises:
            Any exception raised while reading the PDF, unless its message
            contains 'encode' (those are ignored).
        """

        self.absolute_path = file_abs_path
        self.name = os.path.basename(self.absolute_path)

        application_messages.print_file_name(self.name)
        application_messages.print_document_info('Path', self.absolute_path)

        try:
            # ``with`` closes the handle even when parsing fails; the
            # original used the Python-2-only file() and never closed it.
            with open(self.absolute_path, 'rb') as stream:
                document = PdfFileReader(stream)

                self.__get_encrypted_status(document)

                document_info = document.getDocumentInfo()
                if document_info:
                    self.__parse_document_info(document_info)

        except Exception as ex:
            # Errors mentioning 'encode' are tolerated; everything else is
            # re-raised unchanged so the original type and traceback survive.
            if 'encode' not in str(ex):
                raise
Exemplo n.º 7
0
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build an ``uploader.BookMeta`` for an uploaded PDF.

    When the module-level ``use_pdf_meta`` flag is true, author, title and
    subject are taken from the PDF's document information. Otherwise, or when
    the PDF has none, the author is "Unknown", the title is
    ``original_file_name`` and the description is empty.
    """
    author = "Unknown"
    title = original_file_name
    subject = ""

    if use_pdf_meta:
        # Read everything needed while the file is open, then close it.
        with open(tmp_file_path, 'rb') as stream:
            doc_info = PdfFileReader(stream).getDocumentInfo()
            if doc_info is not None:
                pdf_author = doc_info.author
                pdf_title = doc_info.title
                if pdf_author is not None:
                    author = pdf_author
                if pdf_title is not None:
                    title = pdf_title
                # As before, subject is passed through even when it is None.
                subject = doc_info.subject

    return uploader.BookMeta(
        file_path = tmp_file_path,
        extension = original_file_extension,
        title = title,
        author = author,
        cover = pdf_preview(tmp_file_path, original_file_name),
        description = subject,
        tags = "",
        series = "",
        series_id="")
Exemplo n.º 8
0
    def check_file_for_processing(self, ev_path):
        """
        Decide whether the file at ``ev_path`` should be converted.

        :param ev_path: Fully qualified path to file to check
        :return: True if it should be converted. False if not
        """
        # Cheap name-based rejections first: not a PDF, or already an OCR output.
        if not ev_path.endswith(".pdf") or ev_path.endswith("_ocr.pdf"):
            return False

        suffix = self.archive_suffix
        if suffix and ev_path.endswith(suffix):
            return False

        try:
            with open(ev_path, "rb") as handle:
                info = PdfFileReader(handle).getDocumentInfo()
                # A '/PyPDFOCR' entry marks a file that was OCR'ed before.
                already_ocred = info is not None and '/PyPDFOCR' in info
        except IOError:
            return False
        except PdfReadError:
            return False

        return not already_ocred
Exemplo n.º 9
0
    def test_backlog_list(self):
        """Printing a backlog's stories yields a 6-page PDF in A4 and letter."""
        user = factories.UserFactory.create(
            email='*****@*****.**', password='******')
        backlog = factories.create_project_sample_backlog(user)
        for _ in range(0, 10):
            factories.create_sample_story(user, backlog=backlog)
        # special printing of -1 points
        story = factories.UserStory.objects.all()[0]
        story.points = -1
        story.save()
        url_plus = "{0}?backlog_id={1}".format(
            reverse("print_stories"), backlog.pk)
        # Anonymous access is redirected.
        self.app.get(url_plus, status=302)

        def print_all_stories(side, paper):
            """Select every story, submit the print form, check the common
            assertions and return the parsed PDF."""
            page = self.app.get(url_plus, user=user)
            pdf_form = page.forms['print_pdf_form']
            for field_name, _ in pdf_form.fields.items():
                if field_name and "story-" in field_name:
                    pdf_form[field_name] = True
            pdf_form['print-side'] = side
            pdf_form['print-format'] = paper
            result = pdf_form.submit()
            self.assertEqual(result['Content-Type'], "application/pdf")
            document = PdfFileReader(StringIO.StringIO(result.content))
            meta = document.getDocumentInfo()
            self.assertEqual(document.getNumPages(), 6)
            self.assertEqual("backlogman.com", meta['/Author'])
            return document

        pdf = print_all_stories("long", "a4")
        # A4 is not "round" in PDF unit format real value are
        # approximately : [0, 0, 841.88980, 595.27560]
        self.assertEqual([0, 0, 841, 595],
                         [int(x) for x in pdf.getPage(0)["/MediaBox"]])

        pdf = print_all_stories("short", "letter")
        self.assertEqual([0, 0, 792, 612], pdf.getPage(0)["/MediaBox"])
Exemplo n.º 10
0
  def getContentInformation(self):
    """Returns the information about the PDF document with pdfinfo.
    """
    if not self.hasData():
      return dict()
    # Return a copy of self._content_information when that attribute exists
    # (presumably a cache set elsewhere); AttributeError means it is not set.
    try:
      return self._content_information.copy()
    except AttributeError:
      pass
    # Write the document data to a named temporary file so the external
    # pdfinfo command can be given a path (tmp.name).
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)
    command_result = None
    try:

      # First, we use pdfinfo to get standard metadata
      command = ['pdfinfo', '-meta', '-box', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        # ENOENT means the pdfinfo executable itself could not be found.
        if e.errno == errno.ENOENT:
          raise ConversionError('pdfinfo was not found')
        raise

      # Each output line is split on ':'; the text before the first colon is
      # the key and everything after it (further colons included) the value.
      result = {}
      for line in command_result.splitlines():
        item_list = line.split(':')
        key = item_list[0].strip()
        value = ':'.join(item_list[1:]).strip()
        result[key] = value

      # Then we use PyPDF2 to get extra metadata
      try:
        from PyPDF2 import PdfFileReader
        from PyPDF2.utils import PdfReadError
      except ImportError:
        # if PyPDF2 not found, pass
        pass
      else:
        try:
          pdf_file = PdfFileReader(tmp)
          for info_key, info_value in (pdf_file.getDocumentInfo() or {}).iteritems():
            info_key = info_key.lstrip("/")
            if isinstance(info_value, unicode):
              info_value = info_value.encode("utf-8")

            # Ignore values that cannot be pickled ( such as AAPL:Keywords )
            try:
              pickle.dumps(info_value)
            except pickle.PicklingError:
              LOG("PDFDocument.getContentInformation", INFO,
                "Ignoring non picklable document info on %s: %s (%r)" % (
                self.getRelativeUrl(), info_key, info_value))
            else:
              # setdefault keeps a value pdfinfo already gave for this key.
              result.setdefault(info_key, info_value)
        except PdfReadError:
          LOG("PDFDocument.getContentInformation", PROBLEM,
            "PyPDF2 is Unable to read PDF, probably corrupted PDF here : %s" % \
            (self.getRelativeUrl(),))
    # NOTE(review): the except/finally clause closing the outer ``try`` and
    # the final return are not visible in this excerpt.
Exemplo n.º 11
0
def get_title(path):
    """Return the whitespace-stripped title of the PDF at ``path``.

    An empty string is returned when the PDF has no document information or
    no title.
    """
    with open(path, "rb") as stream:
        metadata = PdfFileReader(stream).getDocumentInfo()
        title = metadata.title if metadata is not None else None
        if title is not None:
            # Call strip() on the value itself: str.strip(value) rejects
            # unicode titles under Python 2.
            return title.strip()
    return ""
Exemplo n.º 12
0
def extract_title_pdf(filename):
    """Return the ``/Title`` entry of the PDF at ``filename``.

    ``None`` is returned when the file cannot be opened or read (``IOError``),
    when the PDF has no document-information dictionary, or when that
    dictionary has no ``/Title``.
    """
    try:
        with open(filename, 'rb') as f:
            info = PdfFileReader(f).getDocumentInfo()
            # getDocumentInfo() returns None for PDFs without /Info; the
            # original then failed with AttributeError on ``info.get``.
            if info is None:
                return None
            return info.get('/Title')
    except IOError:
        return None
Exemplo n.º 13
0
 def test_docs_contents(self, cnf_about_pg):
     '''Download every linked document and parse its PDF metadata.'''
     #
     # Disabling for https://bugzilla.redhat.com/show_bug.cgi?id=1026943
     #
     for doc_link in cnf_about_pg.docs_links:
         response = requests.get(doc_link['text_url'], verify=False)
         reader = PdfFileReader(StringIO.StringIO(response.content))
         # No assertions here: the call only has to succeed.
         reader.getDocumentInfo()
Exemplo n.º 14
0
 def getMetadataField(self, pdf_filename, field_name):
     """Return one document-information field of a PDF, or ``None``.

     ``field_name`` is given without the leading slash (e.g. ``'Title'``).
     ``None`` is returned when the field is absent or the PDF has no
     document-information dictionary.
     """
     key = '/' + field_name
     with open(pdf_filename, 'rb') as file_input:
         document_info = PdfFileReader(file_input).getDocumentInfo()
         # getDocumentInfo() may return None; guard before the lookup.
         if document_info is not None and key in document_info:
             return document_info[key]
     return None
def printMeta():
    """Print the document-information entries of every PDF under ``doc_pdf``.

    The directory tree is walked recursively; files are selected by a
    case-insensitive ``pdf`` extension.
    """
    for dirpath, dirnames, files in os.walk("doc_pdf"):
        for name in files:
            ext = name.lower().rsplit('.', 1)[-1]
            if ext not in ['pdf']:
                continue
            pdf_path = dirpath+os.path.sep+name
            print("[+] Metadata for file: %s " % (pdf_path,))
            # ``with`` closes each file; the original leaked one handle per
            # PDF through the Python-2-only file() builtin.
            with open(pdf_path, 'rb') as stream:
                docInfo = PdfFileReader(stream).getDocumentInfo()
                # docInfo is None when the PDF has no /Info dictionary.
                for metaItem in docInfo or {}:
                    # %-formatting also copes with non-string values.
                    print('[+] %s:%s' % (metaItem, docInfo[metaItem]))
Exemplo n.º 16
0
  def getPyPDF2Info(self):
    """Populate author, title, subject and year from the downloaded PDF.

    The file read is ``self.dir_downloads + self.filename + '.pdf'``.
    ``self.year`` is the string form of the XMP creation date.

    Raises:
        AttributeError: if the PDF has no document information or no XMP
            metadata (both accessors return None in that case).
    """
    pdf_path = self.dir_downloads+self.filename+'.pdf'
    # Keep the file open only for the duration of the reads.
    with open(pdf_path, 'rb') as stream:
      pdf_file = PdfFileReader(stream)
      pdf_info = pdf_file.getDocumentInfo()

      self.author = pdf_info.author
      self.title = pdf_info.title
      self.subject = pdf_info.subject

      pdf_meta = pdf_file.getXmpMetadata()
      self.year = str(pdf_meta.xmp_createDate)
Exemplo n.º 17
0
def get_author(path):
    """Return the author recorded in the PDF at ``path``.

    ``None`` is returned when the PDF has no document information (or no
    author entry).
    """
    with open(path, "rb") as stream:
        metadata = PdfFileReader(stream).getDocumentInfo()
        if metadata is not None:
            return metadata.author
    return None
Exemplo n.º 18
0
def get_pdf_metadata(fn):
    """Return a dict with author, creator, producer, subject and title.

    Any error while opening or reading the PDF is printed together with the
    file name, and the entries collected so far (possibly none) are returned.
    """
    meta = dict()

    try:
        with open(os.path.abspath(fn), "rb") as pdf_file:
            # Fetch the info dictionary once instead of once per field.
            info = PdfFileReader(pdf_file).getDocumentInfo()
            meta['author'] = info.author
            meta['creator'] = info.creator
            meta['producer'] = info.producer
            meta['subject'] = info.subject
            meta['title'] = info.title

    except Exception as e:
        # Deliberately best-effort: report and return what was collected.
        print(e)
        print('file: %s' % fn)

    return meta
Exemplo n.º 19
0
def test_contents(guides, soft_assert):
    """Check that each guide's PDF title mentions CloudForms and its link text."""
    pytest.sel.force_navigate("about")
    for guide in guides:
        locator = getattr(about, guide)
        href = pytest.sel.get_attribute(locator, "href")
        download = requests.get(href, verify=False)
        reader = PdfFileReader(StringIO(download.content))
        pdf_info = reader.getDocumentInfo()
        soft_assert("CloudForms" in pdf_info["/Title"], "CloudForms is not in the title!")
        link_in_title = pytest.sel.text(locator) in pdf_info["/Title"]
        failure_message = "{} not in {}".format(
            pytest.sel.text(locator), pdf_info["/Title"])
        soft_assert(link_in_title, failure_message)
    def get_info_for_file(filepath):
        """Return ``(key, value)`` pairs for the non-empty info entries of a PDF.

        Errors while reading the info dictionary are ignored and whatever was
        collected so far is returned; errors opening or parsing the file
        itself still propagate.
        """
        out = []
        with open(filepath, 'rb') as stream:
            reader = PdfFileReader(stream)
            try:
                info = reader.getDocumentInfo()
                for key in info:
                    value = info[key]
                    if value:
                        out.append((key, value))
            except Exception:
                # Deliberately best-effort (also covers info being None).
                pass

        return out
Exemplo n.º 21
0
 def process(self, content, mimetype='application/pdf'):
     """Process a PDF document.
     Args:
         content: Binary content of the document.
         mimetype: Id of MIME type (content ignored if it isn't `application/pdf`).
     Returns:
         Tuple:
             Relevancy of the document (based on keywords)
             Metadata extracted from the document (dictionary).
     """
     metadata = {}
     if mimetype != 'application/pdf':
         metadata['_relevancy'] = 0
         return 0, metadata

     relevancy = 0
     # Obtain metadata
     doc = PdfFileReader(BytesIO(content))
     info = doc.getDocumentInfo()
     if info:
         for k in info:
             metadata[k] = info.getText(k)
     # Extra metadata
     num_pages = doc.getNumPages()
     metadata['_num_pages'] = num_pages
     # Process title, subject and metadata keywords
     # TODO guess title from page text when not provided
     if self.keywords:
         keywords = [word.lower() for word in self.keywords]
         # getText() may store None for an entry that exists but is not a
         # text string; ``or ''`` keeps the join from failing on it.
         relevant = ' '.join(
             metadata.get(field) or ''
             for field in ('/Title', '/Subject', '/Keywords')).lower()
         for word in keywords:
             if word in relevant:
                 # Each relevant keyword increases relevancy in 10 points
                 relevancy += 10
         # Process pages.
         distance_factor = 1
         for p in range(num_pages):
             # Break if factor is too low
             if distance_factor < 0.01:
                 break
             try:
                 text = doc.getPage(p).extractText().lower()
                 for word in keywords:
                     relevancy += distance_factor * text.count(word)
             except Exception:
                 # Some bad formed PDFs raise decoding errors. Skip page.
                 pass
             # Each new page reduces relevancy factor in a half
             distance_factor /= 2
     # Relevancy is significant by the nearest tenth
     relevancy = round(relevancy, 1)
     metadata['_relevancy'] = relevancy
     return relevancy, metadata
Exemplo n.º 22
0
def getMetacsv(folder):	
  # Writes a CSV file at METADATA: a header row, then rows built from the
  # document-information dictionary of the entries of ``folder`` whose name
  # contains '.pdf'.
  # NOTE(review): the indentation below mixes tabs and spaces and the ``try``
  # has no visible except/finally clause, so this excerpt does not parse as
  # shown; confirm the intended nesting against the original file.
  # NOTE(review): csvfile is opened here and no close() is visible.
  csvfile = open(METADATA,'w')	
  csvwriter = csv.writer(csvfile, dialect = 'excel')
  csvwriter.writerow(['FILENAME','Author', 'Company','Producer','Title','Creator','Creation Date','Modified Date','Subject','Keywords'])
	
  for filename in os.listdir(folder):
    metadata=[]
    try:
      if '.pdf' in filename:		
        pdfFile = PdfFileReader(file(folder+'/'+filename, 'rb'))
        docInfo = pdfFile.getDocumentInfo()
        # Each check below appends exactly one value (the stripped field, or
        # '' when absent), in the same order as the nine header columns.
	  if '/Author' in docInfo:
	    metadata.append(docInfo['/Author'].strip())
	  else:
	    metadata.append('')
	  if '/Company' in docInfo:
	    metadata.append(docInfo['/Company'].strip())
	  else:
	    metadata.append('')		
	  if '/Producer' in docInfo:
		metadata.append(docInfo['/Producer'].strip())
	  else:
	    metadata.append('')
	  if '/Title' in docInfo:
	    metadata.append(docInfo['/Title'].strip())
	  else:
	    metadata.append('')
	  if '/Creator' in docInfo:
		metadata.append(docInfo['/Creator'].strip())
	  else:
		metadata.append('')
	  if '/CreationDate' in docInfo:
		metadata.append(docInfo['/CreationDate'].strip())
	  else:
		metadata.append('')	
	  if '/ModDate' in docInfo:
		metadata.append(docInfo['/ModDate'].strip())
	  else:
		metadata.append('')
	  if '/Subject' in docInfo:
		metadata.append(docInfo['/Subject'].strip())
	  else:
		metadata.append('')	
	  if '/Keywords' in docInfo:
        metadata.append(docInfo['/Keywords'].strip())
      else:
        metadata.append('')	
				
      # The row is the file name followed by the collected values.
      csvwriter.writerow([filename]+metadata)
Exemplo n.º 23
0
 def printMetaData(self):
     """Print the document-information entries of every PDF under ``pdfs``.

     The tree is walked recursively. An error while handling a directory's
     files prints a short message and moves on to the next directory.
     """
     for dirpath, dirnames, files in os.walk("pdfs"):
         try:
             for name in files:
                 ext = name.lower().rsplit('.', 1)[-1]
                 if ext not in ['pdf']:
                     continue
                 pdf_path = dirpath+os.path.sep+name
                 print("[+] Metadata for file: %s " % (pdf_path,))
                 # ``with`` closes each file; the original leaked a handle
                 # per PDF through the Python-2-only file() builtin.
                 with open(pdf_path, 'rb') as stream:
                     docInfo = PdfFileReader(stream).getDocumentInfo()
                     for metaItem in docInfo:
                         print('[+] %s:%s' % (metaItem, docInfo[metaItem]))
                 print("\n")
         except Exception:
             # Deliberately best-effort: report and continue with the walk.
             print("Error to Obtain PDF METADATA")
Exemplo n.º 24
0
def printMeta(fileName):
    """Open a PDF and print its document-information entries.

    Prints a header line with the file name, then one line per metadata
    entry, or a "No Document Info" line when the PDF has none.
    """
    # Open the PDF; ``with`` makes sure the handle is closed afterwards.
    with open(fileName, 'rb') as stream:
        # Fetch the PDF's document information.
        docInfo = PdfFileReader(stream).getDocumentInfo()
        print('[*] PDF MetaData For: %s' % str(fileName))
        if docInfo:
            # Values are read while the file is still open.
            for metaItem in docInfo:
                print('[+] %s:%s' % (metaItem, docInfo[metaItem]))
        else:
            print('[+] No Document Info')
Exemplo n.º 25
0
def pdf_parser(s):
    """Parse PDF content ``s`` and return its document info as a dict.

    Keys are the info-dictionary names without the leading slash. An empty
    dict is returned when the PDF has no document information or when
    decrypting it with an empty password is not supported.
    """
    s = s.strip()
    # The devnull handle is passed as ``warndest`` to suppress warning
    # messages. It stays open for the whole function so that anything the
    # reader writes to it after construction does not hit a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() returns None for PDFs without /Info.
        if meta is None:
            return {}
        result = {}
        for key in meta.keys():
            result[key[1:]] = meta.get(key)
        return result
Exemplo n.º 26
0
def get_metadata_from_file(x):
    """Get metadata for file x.

    Returns a tuple (author, title); either element is None when it cannot
    be read from the PDF's document information."""
    # ``with`` closes the file even when PdfFileReader raises.
    with open(x, 'rb') as pdf_file:
        dictinfo = PdfFileReader(pdf_file).getDocumentInfo()
        try:
            author = dictinfo['/Author']
        except Exception:
            # Missing key, or no info dictionary at all (dictinfo is None).
            author = None
        try:
            title = dictinfo['/Title']
        except Exception:
            title = None
    return (author, title)
Exemplo n.º 27
0
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """
    Get_info(file_path)
        Opens the pdf file for reading, prints its metadata and optionally
        logs it. Any error is reported through __Print_error.
    Args:
        - file_path: (string) Absolute file path.
        - plain_log: (None | string) Log file in plain text.
        - csv_log: (None | string) Log file in csv format.
        - analyzed_files: not used; the original incremented this local
          copy, which the caller never saw.
        - total_files: not used by this function.
    """

    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)

    encrypted = 'No'

    try:  # Try to open not password encrypted pdf files and pdf files
          # encrypted with a blank password.
        # ``with`` closes the file on every path; the original used the
        # Python-2-only file() builtin and never closed it.
        with open(file_path, 'rb') as stream:
            pdf_file = PdfFileReader(stream)
            if pdf_file.getIsEncrypted() is True:
                dec_res = pdf_file.decrypt('')
                if dec_res == 1:
                    encrypted = 'Yes'

            # Get and parse metadata
            doc_info = pdf_file.getDocumentInfo()
            title, author, creator, subject, producer, c_date, m_date \
                = __Parse_doc_info(doc_info)

            num_pages = pdf_file.getNumPages()

        # Group info
        pdf_meta = pdf_metadata(file_name, title, author, creator,
                                subject, producer, c_date, m_date,
                                encrypted, num_pages, file_size)

        __Print_metadata(pdf_meta)

        if plain_log:
            Log(file_name, pdf_meta, plain_log, 'txt')
        if csv_log:
            # NOTE(review): this logs to f_log_csv, not csv_log; f_log_csv is
            # not defined in this function - confirm it is a module global.
            Log(file_name, pdf_meta, f_log_csv, 'csv')

    except Exception as e:
        error = file_name + ' ' + str(e)
        __Print_error(error)
Exemplo n.º 28
0
def extract_creation_date(filename):
    """Return the ``/CreationDate`` of a PDF as a naive ``datetime``.

    Only the ``YYYYMMDDHHmmSS`` part of the PDF date is used; the timezone
    offset is ignored.

    Raises:
        KeyError: if the document information has no ``/CreationDate``.
        AttributeError: if the value does not start with ``D:`` followed by
            14 digits (the regex finds no match).
    """
    with open(filename, "rb") as stream:
        # strict=False avoids 'PdfReadWarning: Xref table not zero-indexed.
        # ID numbers for objects will be corrected.'
        pdf_toread = PdfFileReader(stream, strict=False)
        # Avoids the "file has not been decrypted" error, see
        # https://github.com/mstamy2/PyPDF2/issues/51
        if pdf_toread.isEncrypted:
            pdf_toread.decrypt('')
        pdf_info = pdf_toread.getDocumentInfo()
        # PDF Reference, 3.8.3 Dates: a date is an ASCII string of the form
        # (D:YYYYMMDDHHmmSSOHH'mm'), e.g. D:20170508085336+02'00'
        raw_date = pdf_info['/CreationDate']
    # Raw string so that \d is a regex escape, not a string escape.
    date_str = re.search(r'^D:(\d{14})', raw_date).group(1)
    return datetime.strptime(date_str, "%Y%m%d%H%M%S")
Exemplo n.º 29
0
 def run(self):
     """
     Thread that waits on a queue and rips metadata from the pdfs and updates
     their entries in rethinkdb

     Fields missing from a PDF (or all of them, when the PDF has no document
     information) are stored as empty strings.
     """
     while True:
         docid = self.in_queue.get()
         document = self.index.get({'id': docid})
         # ``with`` closes each file; the original leaked one handle per
         # queue item through the Python-2-only file() builtin.
         with open(document['path'], 'rb') as stream:
             doc_info = PdfFileReader(stream).getDocumentInfo()
             if doc_info is None:
                 # No /Info dictionary: store blanks instead of crashing
                 # the worker thread.
                 pdf_info = {
                     'title': '',
                     'author': '',
                     'creator': '',
                     'producer': ''
                 }
             else:
                 pdf_info = {
                     'title': doc_info.title or '',
                     'author': doc_info.author or '',
                     'creator': doc_info.creator or '',
                     'producer': doc_info.producer or ''
                 }
         r.table('documents').filter(
             {'id': docid}).update(
                 {'pdfinfo': pdf_info}).run(self.index.rdb)
Exemplo n.º 30
0
def get_file_info(fn):
    """
    Return the metadata embedded in a PNG or PDF file, or ``None``.

    PNG files yield the image's ``info`` mapping. PDF files yield the
    JSON-decoded ``/Keywords`` entry. Other extensions, a missing
    ``/Keywords`` entry and undecodable JSON all yield ``None``.

    """
    _, extension = os.path.splitext(fn)
    extension = extension.lower()

    if extension == ".png":
        return Image.open(fn).info

    if extension != ".pdf":
        return None

    with open(fn, "rb") as stream:
        doc_info = PdfFileReader(stream).getDocumentInfo()
        if "/Keywords" not in doc_info:
            return None
        try:
            payload = json.loads(doc_info["/Keywords"])
        except ValueError:
            payload = None
        return payload
Exemplo n.º 31
0
def get_info_pdf(filename):
    """Open ``filename`` and query several properties through PdfFileReader.

    The values are only bound to local names; nothing is returned.
    """
    # Open the file; ``with`` closes it again (the original never did).
    with open(filename, 'rb') as file_stream:
        # Create a reader instance for the PDF file.
        pdf_reader = PdfFileReader(file_stream)

        # Document information of the PDF.
        document_info = pdf_reader.getDocumentInfo()

        # Total number of pages.
        pdf_page_nums = pdf_reader.getNumPages()

        # Fetch a single page as a PageObject.
        # NOTE(review): index 1 is passed here; confirm whether the first
        # page (index 0) was intended.
        single_page = pdf_reader.getPage(1)

        # Page layout of the document.
        pdf_layout = pdf_reader.getPageLayout()

        # Look up the page number of the given PageObject.
        page_num = pdf_reader.getPageNumber(single_page)
Exemplo n.º 32
0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's metadata and collect distinct authors/producers/creators.

    New ``/Author``, ``/Producer`` and ``/Creator`` values are appended to the
    module-level ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array`` lists; those three lists are then appended to
    ``metadata_files``.
    """
    # Info key -> module-level list collecting its distinct values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # ``with`` closes the file; the original used the Python-2-only file()
    # builtin and never closed it.
    with open(filename, 'rb') as stream:
        metadata = PdfFileReader(stream).getDocumentInfo()
        print(' - Document: ' + str(filename))
        # metadata is None when the PDF has no /Info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): every call appends the same three list objects again.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Exemplo n.º 33
0
def get_info(path):
    """Print the document information of the PDF at ``path``.

    The page count and title are read as well but only bound to local names;
    nothing is returned.
    """
    # Open the file the caller asked for; the original ignored ``path`` and
    # always opened the hard-coded 'Bank Balance Statement.pdf'.
    with open(path, 'rb') as f:

        pdf = PdfFileReader(f)

        info = pdf.getDocumentInfo()

        number_of_pages = pdf.getNumPages()

        # Read while the file is still open.
        title = info.title

    print(info)
Exemplo n.º 34
0
def printMeta(
    ruta
):
    """Print the metadata of every PDF file found under the directory ``ruta``.

    The tree is walked recursively. For each PDF a red header line with its
    path is printed, followed by one line per document-information entry and
    a blank separator.
    """
    for dirpath, dirnames, files in os.walk(ruta):
        for name in files:  # go through the candidate files
            ext = name.lower().rsplit('.', 1)[-1]
            if ext not in ['pdf']:
                continue
            pdf_path = dirpath + os.path.sep + name
            # Header with the file path, wrapped in the ANSI sequences
            # ESC[0;31m (red) and ESC[0m (reset).
            print(chr(27) + "[0;31m" + "[+] Metadata for file: %s " % (
                pdf_path,) + chr(27) + "[0m")
            # ``with`` closes each file; the original leaked one handle per
            # PDF through the Python-2-only file() builtin.
            with open(pdf_path, 'rb') as stream:
                # Dictionary with the collected information (None when the
                # PDF has no /Info entry).
                docInfo = PdfFileReader(stream).getDocumentInfo()
                for metaItem in docInfo or {}:
                    print('[+] %s:%s' % (metaItem, docInfo[metaItem]))
            print("\n")
Exemplo n.º 35
0
def extract_information(pdf_path):
    """Print a summary of a PDF's document information and return it.

    The summary (when the PDF has document information) and the first page
    object are printed. The return value is the result of
    ``getDocumentInfo()``, which may be ``None``.
    """
    with open(pdf_path, 'rb') as f:
        # One reader serves both metadata and page access; the original
        # opened the file a second time by path and never closed it.
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        page = pdf.getPage(0)

        if information is not None:
            # The original had ``txt = f`` followed by a bare string: an
            # f-string whose prefix was split from its literal, so the
            # summary was never built.
            txt = f"""
    Information about {pdf_path}:

    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
            print(txt)

        print(page)
    return information
Exemplo n.º 36
0
    def info(self):
        """Collect metadata about the PDF at ``self.filepath`` into a dict.

        Document-information fields (plus file path and page count) are added
        when the PDF has an info dictionary; XMP fields are added when XMP
        metadata is present, with datetimes rendered as ``"<date> <time>"``.
        Encrypted files are decrypted with ``self.password`` first.
        """
        collected = {}

        with open(self.filepath, 'rb') as stream:
            reader = PdfFileReader(stream)

            if reader.isEncrypted:
                reader.decrypt(self.password)

            # Document information dictionary, XMP (Extensible Metadata
            # Platform) data and page count.
            doc_info = reader.getDocumentInfo()
            xmp = reader.getXmpMetadata()
            page_count = reader.getNumPages()

        if doc_info is not None:
            collected['filepath'] = self.filepath
            collected['author'] = doc_info.author
            collected['creator'] = doc_info.creator
            collected['producer'] = doc_info.producer
            collected['subject'] = doc_info.subject
            collected['title'] = doc_info.title
            collected['number_of_pages'] = page_count

        if xmp is not None:
            xmp_fields = (
                ('format', xmp.dc_format),
                ('createDate', xmp.xmp_createDate),
                ('modifyDate', xmp.xmp_modifyDate),
                ('metadataDate', xmp.xmp_metadataDate),
                ('creatorTool', xmp.xmp_creatorTool),
            )
            for name, value in xmp_fields:
                if isinstance(value, datetime):
                    value = '{} {}'.format(value.date(), value.time())
                collected[name] = value

        return collected
Exemplo n.º 37
0
async def main():
    """Print one JSON object per task code with the form's title.

    Unknown task codes print an error object. For known ones the form's PDF
    is fetched and its ``/Subject`` becomes ``form_title``; failures are
    reported in ``error_message``. ``code`` and ``url`` are removed from the
    form dict before the final print.
    """
    forms: Dict[str, Form] = await get_forms()
    for task_code in task1_data:
        parsed_task_code: str = parse_task_code(task_code)

        if parsed_task_code not in forms:
            missing = {
                'error': True,
                'message': f'task code "{parsed_task_code}" does not exist',
            }
            print(json.dumps(missing))
            continue

        # The session is bound to the PdfPage class attribute for the
        # duration of the block.
        async with ClientSession() as PdfPage.session:
            form = forms[parsed_task_code]
            pdf_page: PdfPage = PdfPage(form.url())
            content: bytes = await pdf_page.content()
            form_dict: Dict[str, Any] = form.to_dict()
            form_dict['form_title'] = ''
            form_dict['form_number'] = task_code

            if content:
                try:
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        info = PdfFileReader(BytesIO(content)).getDocumentInfo()
                        try:
                            form_dict['form_title'] = info['/Subject']
                        except:
                            form_dict[
                                'error_message'] = 'PDF error, /Subject does not exist'
                except:
                    form_dict['error_message'] = 'PDF reading error'
            else:
                form_dict['error_message'] = 'could not fetch PDF content'
                print(json.dumps(form_dict))

            form_dict.pop('code')
            form_dict.pop('url')
            print(json.dumps(form_dict))  # PRINT RESULT
Exemplo n.º 38
0
def get_metadata(path):
    """Return ``(info, number_of_pages)`` for the PDF at *path*.

    ``info`` is whatever ``PdfFileReader.getDocumentInfo()`` returns.  When
    the reader reports no document information, a plain dict
    ``{'/Author': ''}`` is returned in its place so callers can always index
    ``'/Author'``.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f, strict=False)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

    if info is None:
        info = {'/Author': ''}

    return (info, number_of_pages)
Exemplo n.º 39
0
    def extract_metadata(self, file_path):
        """Feed document-info and XMP metadata of the PDF at *file_path* into the result.

        Values are handed to ``self.update`` under the keys ``title``,
        ``author``, ``generator``, ``id``, ``created_at`` and ``modified_at``;
        the subject is appended to ``self.result.keywords`` and languages to
        ``self.result.languages``.  Either metadata source may be absent.
        """
        # (result key, attribute on the document-info object), in update order.
        info_fields = (
            ('title', 'title'),
            ('author', 'author'),
            ('generator', 'creator'),
            ('generator', 'producer'),
        )
        with open(file_path, 'rb') as stream:
            reader = PdfFileReader(stream, strict=False)

            doc_info = reader.getDocumentInfo()
            if doc_info is not None:
                for result_key, attr_name in info_fields:
                    self.update(result_key, getattr(doc_info, attr_name))
                if doc_info.subject:
                    self.result.keywords.append(doc_info.subject)

            xmp_info = reader.getXmpMetadata()
            if xmp_info is None:
                return

            self.update('id', xmp_info.xmpmm_documentId)
            # dc_title maps a language tag to the title in that language.
            for language, localized_title in xmp_info.dc_title.items():
                self.update('title', localized_title)
                self.result.languages.append(language)
            self.update('generator', xmp_info.pdf_producer)
            self.update('created_at', xmp_info.xmp_createDate)
            self.update('modified_at', xmp_info.xmp_modifyDate)
            self.result.languages.extend(xmp_info.dc_language)
Exemplo n.º 40
0
def get_info(path):
    """OCR every page of the PDF at *path* and return its text plus metadata.

    Each page image produced by ``convert_from_path(path, 200)`` is saved as
    ``page_<n>.jpg`` in the current working directory and run through
    Tesseract.  The text of all pages is joined with newlines (an empty
    string when there are no pages).

    Returns:
        list: ``[path, text, author, creator, producer, subject, title]``.
    """
    page_texts = []
    pages = convert_from_path(path, 200)
    for page_number, page in enumerate(pages, start=1):
        filename = "page_" + str(page_number) + ".jpg"
        page.save(filename, 'JPEG')
        with Image.open(filename) as image:
            text = str(pytesseract.image_to_string(image))
        # Re-join words that were hyphenated across a line break.
        page_texts.append(text.replace('-\n', ''))
    finaltext = "\n".join(page_texts)

    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        # Read the fields while the stream is still open.
        fields = [
            info.author, info.creator, info.producer, info.subject, info.title
        ]
    return [path, finaltext] + fields
Exemplo n.º 41
0
def extract_data(request):
    """Store the title and author of an uploaded PDF as a ``Data`` row.

    Expects the upload under ``request.FILES['file']``.  Encrypted documents
    are tried with the empty password.  Responds with a JSON body
    ``{"status": 1, "message": "Data saved"}`` on success, or an empty
    response when the request carries no files.
    """
    import json  # local import: only needed to serialise the response body

    if not request.FILES:
        return HttpResponse('')

    uploaded = request.FILES['file']
    reader = PdfFileReader(uploaded)
    if reader.isEncrypted:
        reader.decrypt('')

    info = reader.getDocumentInfo()
    title = info.title
    author = info.author

    # objects.create() already saves the row; no extra save() is required.
    Data.objects.create(title=title, author=author)

    response = {'status': 1, 'message': 'Data saved'}
    # HttpResponse iterates a dict (yielding only its keys), so the payload
    # has to be serialised explicitly to produce valid JSON.
    return HttpResponse(json.dumps(response), content_type='application/json')
Exemplo n.º 42
0
    def saveDocumentInfos(self, pdfDocument: PdfFileReader,
                          fileDescriptor: FileDescriptor,
                          dbQuerier: DbQuerier) -> None:
        """Persist the document entity of a PDF together with its metadata values.

        The document information is first normalised by
        ``self.mapDocumentInfos``.  A document entity is requested from
        *dbQuerier* with title, author, page/word/character counts, and then
        one meta-value entity is requested per metadata value (list and set
        values contribute one entity per element).
        """
        metas = self.mapDocumentInfos(pdfDocument.getDocumentInfo())

        def lookup(name: str) -> Any:
            # Absent metadata is reported as None instead of raising.
            return metas[name] if name in metas else None

        # Document entity
        wordCount, characterCount = self.getDocumentWordAndCharacterCount(
            pdfDocument)
        entity = dbQuerier.getFileDocumentEntity(
            fileDescriptor=fileDescriptor,
            title=lookup('title'),
            author=lookup('author'),
            pageCount=pdfDocument.getNumPages(),
            wordCount=wordCount,
            characterCount=characterCount)

        # Document chapters: TODO

        # Document metas
        for metaName, metaValue in metas.items():
            nameEntity = dbQuerier.getMetaNameEntity(metaName)
            if isinstance(metaValue, (list, set)):
                values = metaValue
            else:
                values = (metaValue, )
            for value in values:
                dbQuerier.getMetaValueEntity(entity, nameEntity, value)
Exemplo n.º 43
0
def _search(keyword, metaLocat, archive):
    """Assisting function to searchFile(). Searches through given archive and returns
       a list of the file(s) that match the keyword in the given metadata location.

       The archive is extracted into ``<progLocation>/_tempFolder``.  An existing
       folder is reused unless it is older than 180 seconds, in which case it is
       wiped and the archive is extracted again.  PDFs without document
       information, or without the ``metaLocat`` entry, are skipped.  The
       comparison is case-insensitive."""

    global progLocation
    retList = []
    tempFolderPath = os.path.join(
        progLocation,
        "_tempFolder")  # Temporary folder the zip is extracted to

    if os.path.exists(tempFolderPath):
        creTime = int(os.path.getctime(tempFolderPath))
        curTime = int(time.time())

        # Only a stale folder is rebuilt; a fresh one is used as-is.
        if curTime - creTime > 180:
            shutil.rmtree(tempFolderPath)

            os.makedirs(tempFolderPath)
            archive.extractall(tempFolderPath)
    else:
        os.makedirs(tempFolderPath)
        archive.extractall(tempFolderPath)

    needle = keyword.lower()
    for fileName in os.listdir(tempFolderPath):
        if not fileName.endswith(".pdf"):
            continue

        # Close every PDF after inspecting it instead of leaking the handle.
        with open(os.path.join(tempFolderPath, fileName), "rb") as pdfFile:
            infoDict = PdfFileReader(pdfFile).getDocumentInfo()
            # Index (rather than .get) while the stream is open so the reader
            # can resolve the value the same way the original lookup did.
            if infoDict is not None and metaLocat in infoDict:
                metaValue = infoDict[metaLocat]
            else:
                metaValue = None

        if metaValue is not None and needle in metaValue.lower():
            retList.append(fileName)

    return retList
Exemplo n.º 44
0
def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

        Every non-directory entry of the zip at DOCS_PATH is read as a PDF.
        Its document information, the first path component (stored as
        "Country") and the cleaned text of all pages are collected and written
        to INTER_PATH/pdf_files.json, keyed by the file's base name.
    """
    logger = logging.getLogger(__name__)
    logger.info('Making pdf_files.json from base pdf files')

    with ZipFile(DOCS_PATH) as myzip:
        # List files inside zip
        filenames = [
            entry.filename for entry in myzip.infolist() if not entry.is_dir()
        ]
        pdf_dict = defaultdict(dict)
        for file in filenames:
            logger.info(f"Processing {file}...")
            try:
                pdfReader = PdfFileReader(BytesIO(myzip.read(file)))  # read file
            except Exception as e:  # In case the file is corrupted
                logger.warning(e)
                logger.info(f"Attempting to recover {file}...")
                pdfReader = file_recovery(file, myzip)  # attempting to recover file
            # doc_dict holds the attributes of each pdf file; the leading "/"
            # of every info key is dropped.
            doc_dict = {i[1:]: str(j) for i, j in pdfReader.getDocumentInfo().items()}
            doc_dict["Country"] = file.split("/")[0]
            page_texts = []
            for page in range(pdfReader.numPages):
                try:
                    page_text = pdfReader.getPage(page).extractText()  # extracting pdf text
                except TypeError as e:
                    # Only this page is dropped; remaining pages are still read.
                    logger.warning(e)
                    logger.info(f"Skipping {file}...")
                    continue
                page_texts.append(text_cleaning(page_text))  # clean pdf text
            doc_dict["Text"] = "".join(page_texts)
            pdf_dict[os.path.splitext(os.path.basename(file))[0]] = doc_dict

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # not depend on the platform default.
    with open(os.path.join(INTER_PATH, 'pdf_files.json'), 'w',
              encoding='utf-8') as outfile:
        json.dump(pdf_dict, outfile, ensure_ascii=False, indent=4)
Exemplo n.º 45
0
def pdfMetaData(file_path, save=True):
    '''Get PDF document metadata, takes 2 arguments, file_path and save (boolean, default is True)

    The report (a timestamp header, every document-info entry and the file's
    stat times/owner) is printed and, when ``save`` is true, handed to
    ``saveResult`` under ``<file name>.txt``.  Encrypted documents are tried
    with the empty password; the process exits when decryption fails or the
    document info cannot be iterated.
    '''
    with open(file_path, "rb") as pdf_file:
        pdf_doc = PdfFileReader(pdf_file)

        if pdf_doc.isEncrypted:
            # Keep sys.exit() outside the try block: SystemExit must not be
            # caught and re-reported as an "unsupported algorithm" failure.
            try:
                decrypt_result = pdf_doc.decrypt("")
            except Exception:
                sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
            # 0 means the password failed; 1 (user) and 2 (owner) both succeed.
            if decrypt_result == 0:
                sys.exit("target pdf document is encrypted... exiting...")

        doc_info = pdf_doc.getDocumentInfo()
        stats = os.stat(file_path)
        now = dt.now()
        file_name = getFileName(file_path)
        metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (
            now.year, now.month, now.day, now.hour, now.minute, now.second,
            file_name[:-4])
        try:
            # Keys start with "/", which is stripped for display.
            for md in doc_info:
                metadata += str(md[1:]) + " : " + pretifyPyPDF2Time(str(md[1:]), str(doc_info[md])) + "\n"
        except TypeError:
            sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")

    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (
        dt.fromtimestamp(stats.st_ctime),
        dt.fromtimestamp(stats.st_mtime),
        dt.fromtimestamp(stats.st_atime),
        stats.st_uid)
    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")

    if save:
        tgt = file_name + ".txt"

        saveResult(tgt, metadata)
Exemplo n.º 46
0
def extract_info(document: Document):
    """Fill ``document.num_pages`` and ``document.info`` from the underlying file.

    For PDFs the page count and the author/creator/producer/subject/title
    entries are read from the file; any entry that is missing or empty
    (including the case where the PDF has no document information at all)
    becomes ``"unknown"``.  Non-PDF documents get one page and ``"unknown"``
    for every field.  ``get_filename`` is called first when the document has
    no filename yet.
    """
    info_fields = ("author", "creator", "producer", "subject", "title")

    if document.filename is None:
        get_filename(document)

    values = dict.fromkeys(info_fields)
    if document.is_pdf:
        with open(document.path, 'rb') as f:
            pdf = PdfFileReader(f, strict=False)
            # TODO: Handle encrypted files

            document.num_pages = pdf.getNumPages()
            informations = pdf.getDocumentInfo()
            if informations is not None:
                # Read the fields while the stream is still open.
                for field in info_fields:
                    values[field] = getattr(informations, field)
    else:
        document.num_pages = 1

    for field in info_fields:
        setattr(document.info, field, values[field] or "unknown")
Exemplo n.º 47
0
 def get_pdf_exif(self, pdf_file):
     """Return the document-info entries of *pdf_file* as a plain dict.

     The dict is empty when the PDF carries no document information.  Any
     exception raised while reading is logged through ``Logger`` and the
     exception object itself is returned instead of being raised.
     """
     Logger.printMessage(
         message='{methodName}'.format(methodName='get_pdf_exif'),
         description=pdf_file,
         debug_module=True)
     try:
         with open(pdf_file, 'rb') as stream:
             reader = PdfFileReader(stream)
             doc_info = reader.getDocumentInfo()
             # Result is unused; the call is kept so that a failure to count
             # pages is still reported through the except branch.
             reader.getNumPages()
         if not doc_info:
             return {}
         return {key: doc_info[key] for key in doc_info}
     except Exception as error:
         Logger.printMessage(
             message='{methodName}'.format(methodName='exception'),
             description=error,
             debug_module=True)
         return error
Exemplo n.º 48
0
def print_pdf(file_full_path, color_mode):
	"""Print the document-info metadata of a .pdf file.

	Python 2 code (print statements, ``file()`` built-in).

	file_full_path -- path of the PDF to inspect
	color_mode     -- when true, output goes through ``cprint`` with colors;
	                  otherwise plain ``print`` is used

	Returns None. Returns early, after printing a message, when the file
	cannot be read or cannot be decrypted with the empty password.
	"""
	# Header with file path
	if color_mode: cprint("\n[+] Metadata for file: %s" % (file_full_path), "green", attrs=["bold"])
	else: print "\n[+] Metadata for file: %s" % (file_full_path)
	# Open the file
	# NOTE(review): the handle passed to PdfFileReader is never closed, and the
	# bare except below hides every failure, not only unreadable PDFs.
	try: 
		pdf_file = PdfFileReader(file(file_full_path, "rb"))
	except: 
		if color_mode: cprint("Could not read this file. Sorry!", "red")
		else: print "Could not read this file. Sorry!"
		return
	if pdf_file.isEncrypted: # Temporary workaround, pdf encrypted with no pass
		try: 
			pdf_file.decrypt('')
		except: 
			if color_mode: cprint("\tCould not decrypt this file. Sorry!", "red")
			else: print "\tCould not decrypt this file. Sorry!"
			return
	# Data structure with document information
	pdf_info = pdf_file.getDocumentInfo()
	# Print metadata
	if pdf_info: 
		for metaItem in pdf_info: 
			# metaItem[1:] drops the first character of the key for display.
			try: 
				if color_mode: 
					cprint("\t-" + metaItem[1:] + ": ", "cyan", end="")
					cprint(pdf_info[metaItem])
				else: 
					# String concatenation raises TypeError for non-string
					# values; that case is reported by the handler below.
					print "\t-" + metaItem[1:] + ": " + pdf_info[metaItem]
			except TypeError: 
				if color_mode: cprint("\t-" + metaItem[1:] + ": " + "Error - Item not readable", "red")
				else: print "\t-" + metaItem[1:] + ": " + "Error - Item not readable"
	else:
		if color_mode: cprint("\t No data found", "red")
		else: print "\t No data found"
	# Trailing blank line after the report
	print ""
Exemplo n.º 49
0
    def add_completed_pdf_files(self, azubi_name, year, completed_pdf_folder,
                                pdfbanner_grid):
        """
        Show one banner per completed PDF report of a given year

        Sets the title of the ``finalizedbanner_screen`` and, for every file
        name in ``completed_pdf_folder``, reads the ``/Calendar_week`` entry
        from the PDF stored under
        ``<cwd>/Azubis/<azubi_name>/Nachweise_<year>/`` and adds a matching
        ``PdfBanner`` to ``pdfbanner_grid``.

        Parameters
        ----------
        azubi_name : str
            The name of the trainee
        year : str
            The apprenticeship year
        completed_pdf_folder : list
            File names of the completed PDF files in the trainee's folder
        pdfbanner_grid : GridLayout
            Container that receives one PdfBanner widget per file
        """
        screen_ids = self.root.ids['finalizedbanner_screen'].ids
        screen_ids['title'].text = f"Abgezeichnete Berichte - {year}. Lehrjahr"

        for pdf_name in completed_pdf_folder:
            pdf_path = os.path.join(os.getcwd(), 'Azubis', azubi_name,
                                    f'Nachweise_{year}', pdf_name)
            with open(pdf_path, 'rb') as stream:
                reader = PdfFileReader(stream)
                doc_info = reader.getDocumentInfo()
                calendar_week = doc_info['/Calendar_week']

            # The file number is the one or two characters in front of the
            # last four characters of the name (e.g. a ".pdf" suffix).
            if pdf_name[-6].isdigit():
                file_number = pdf_name[-6:-4]
            else:
                file_number = pdf_name[-5]

            banner = PdfBanner(azubi_name,
                               int(file_number),
                               int(year),
                               c_w_number=int(calendar_week))
            pdfbanner_grid.add_widget(banner)
Exemplo n.º 50
0
def pdfMetaData(file_path, pwd="", save=True):
    """Print (and optionally save) the metadata of the PDF at *file_path*.

    Args:
        file_path: path of the PDF document.
        pwd: password tried when the document is encrypted.
        save: when true, the report is passed to ``saveResult`` under
            ``<file name>.txt``.

    The process exits when the document is encrypted and *pwd* does not
    decrypt it.  A document without an info dictionary contributes no
    entries; the file-system times and owner are still reported.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_doc = PdfFileReader(pdf_file)

        if pdf_doc.isEncrypted:
            if pdf_doc.decrypt(pwd) == 0:
                sys.exit("target pdf document is encrypted... exiting...")

        doc_info = pdf_doc.getDocumentInfo()
        # Keys start with "/", which is stripped for display.
        entries = [
            str(md[1:]) + " : " + pretifyPyPDF2Time(
                str(md[1:]), str(doc_info[md])) + "\n"
            for md in (doc_info or ())
        ]

    stats = os.stat(file_path)
    metadata = "".join(entries)
    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (
        dt.fromtimestamp(stats.st_ctime), dt.fromtimestamp(
            stats.st_mtime), dt.fromtimestamp(stats.st_atime), stats.st_uid)
    print(metadata)

    if save:
        file_name = getFileName(file_path)
        tgt = file_name + ".txt"

        saveResult(tgt, metadata)
Exemplo n.º 51
0
    def extractData(extension, location):
        """Collect author/book name for a document and pass them to ``createOrUpdateBook``.

        ``meta_data['location']`` holds the last backslash-separated component
        of *location*.  For PDFs the author comes from the ``/Author`` info
        entry (empty string when absent) and the book name from the file's
        base name.  For ``docx``/``docs``/``doc`` the values are read from
        ``docProps/core.xml`` (``dc:creator`` / ``dc:title``); a missing or
        empty author becomes ``''`` and a missing or empty title falls back to
        the file's base name.
        """
        meta_data = {'location': location, 'extension': extension}
        meta_data['location'] = meta_data['location'].split('\\')[-1]
        if extension == 'pdf':
            with open(location, 'rb') as f:
                pdf_to_get = PdfFileReader(f)
                file_info = pdf_to_get.getDocumentInfo()
                print(file_info)
                # file_info is None when the PDF has no info dictionary.
                if file_info is not None and '/Author' in file_info:
                    meta_data['author'] = file_info['/Author']
                else:
                    meta_data['author'] = ''
                meta_data['bookname'] = os.path.basename(f.name).split('.')[0]
                print(meta_data['bookname'])
        if extension in ('docx', 'docs', 'doc'):
            # The archive is closed as soon as core.xml has been parsed.
            with zipfile.ZipFile(location) as zf:
                doc = lxml.etree.fromstring(zf.read('docProps/core.xml'))
            ns = {'dc': 'http://purl.org/dc/elements/1.1/'}

            def first_text(expression):
                # Text of the first matching element, or None if none matches.
                nodes = doc.xpath(expression, namespaces=ns)
                return nodes[0].text if nodes else None

            meta_data['author'] = first_text('//dc:creator') or ''
            meta_data['bookname'] = (
                first_text('//dc:title')
                or os.path.basename(meta_data['location']).split('.')[0])

        # TODO: add extraction for PPTX files.
        # TODO: also search for a meta-data file when adding data.

        createOrUpdateBook(meta_data)
Exemplo n.º 52
0
def preprocess_pdf(pdf_name, document):
    """Create the per-document media folder and convert the uploaded PDF to images.

    The folder is ``<cwd>/media/CVS/<year>/<month>/<day>/<name>`` where the
    month is zero-padded to two digits and ``<name>`` is the base name of
    *pdf_name* up to its first dot.  Failure to create the folder is ignored.
    A copy of ``document._file`` taken before the PDF is parsed is passed on
    to ``convert_pdf_to_images``.
    """
    today = datetime.now()
    base_name = pdf_name.split('.')[0].split('/')[-1]
    dated_folder = 'CVS/{0}/{1}/{2}/'.format(today.year,
                                             '%02d' % today.month,
                                             today.day)
    target_dir = os.getcwd() + '/media/' + dated_folder + base_name

    # Copy the upload wrapper before the reader consumes the underlying file.
    upload_copy = copy(document._file)
    reader = PdfFileReader(document._file.file)
    # Results are unused; the calls are kept so an unreadable PDF still
    # raises here, before anything is written.
    reader.getDocumentInfo()
    reader.getNumPages()

    try:
        os.mkdir(target_dir)
    except OSError:
        # Best effort: the directory may already exist.
        pass

    convert_pdf_to_images(pdf_name, target_dir, base_name, upload_copy)
Exemplo n.º 53
0
class PdfHandler(object):
    """Wrapper around a ``PdfFileReader`` bound to an open file handle.

    The file stays open for the lifetime of the handler.  Call ``closePdf()``
    when done, or use the handler as a context manager::

        with PdfHandler(path) as handler:
            handler.getPdfMetadata()
    """

    def __init__(self, pdfFile):
        self.openFile(pdfFile)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.closePdf()
        return False  # never suppress exceptions

    def openFile(self, pdfFile):
        """Open *pdfFile* and cache its document info and page count."""
        self.pdfObj = open(pdfFile, 'rb')
        try:
            self.pdf = PdfFileReader(self.pdfObj)

            self.info = self.pdf.getDocumentInfo()
            self.number_of_pages = self.pdf.getNumPages()
        except Exception:
            # Do not leak the handle when the file is not a readable PDF.
            self.pdfObj.close()
            raise

    def closePdf(self):
        """Close the underlying file handle."""
        self.pdfObj.close()

    def getNumberOfPages(self):
        """Return the page count as reported by the reader."""
        return self.pdf.getNumPages()

    def getPdfMetadata(self):
        """Return page count and document info as a nested dict.

        ``info['default']`` is the raw document-info object (``None`` when the
        PDF has none); producer/author/creator are ``None`` in that case.
        """
        info = self.info
        return {
            "number_of_pages": self.number_of_pages,
            "info": {
                "default": info,
                "producer": getattr(info, "producer", None),
                "author": getattr(info, "author", None),
                "creator": getattr(info, "creator", None),
            },
        }

    def getText(self, pageNumber=4):
        """Return the extracted text of page index *pageNumber*.

        The default of 4 preserves the previously hard-coded page.
        """
        return self.pdf.getPage(pageNumber).extractText()
Exemplo n.º 54
0
def main():
    """Copy ``mdb.pdf`` page by page to ``mdbt.pdf``, then merge both files.

    Prints the page count and the document info of ``mdb.pdf``.  The merged
    result is written to ``mdb-merger.pdf``.  Every file is closed before the
    next step reads it.
    """
    with open('mdb.pdf', 'rb') as source:
        # Get a PdfFileReader object
        pdf_input = PdfFileReader(source)
        pge_num = pdf_input.getNumPages()  # number of pages
        print(pge_num)
        print(pdf_input.getDocumentInfo())  # document information

        # Get a PdfFileWriter object and add every page object to it
        pdf_output = PdfFileWriter()
        for i in range(pge_num):
            pdf_output.addPage(pdf_input.getPage(i))

        # Write the copy; pages are read from `source`, so it must stay open
        # until the write has finished.
        with open('mdbt.pdf', 'wb') as copy_out:
            pdf_output.write(copy_out)

    # Merge the two pdf files
    merger = PdfFileMerger()
    with open('mdb.pdf', 'rb') as first, open('mdbt.pdf', 'rb') as second:
        merger.append(PdfFileReader(first))
        merger.append(PdfFileReader(second))
        with open('mdb-merger.pdf', 'wb') as merged_out:
            merger.write(merged_out)
Exemplo n.º 55
0
def get_metadata(path):
    """Return metadata, page count and a text summary for the PDF at *path*.

    Returns:
        dict: keys ``author``, ``creator``, ``producer``, ``subject``,
        ``title``, ``numpages`` and ``summary`` (the result of
        ``generate_summary`` applied to the concatenated text of all pages).
    """
    # Checks if Scanned PDF (Needs to be added)

    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        # Pages and info fields must be read while the file is still open.
        text = "".join(
            pdf.getPage(page_index).extractText()
            for page_index in range(number_of_pages))

        metadata = {
            'author': info.author,
            'creator': info.creator,
            'producer': info.producer,
            'subject': info.subject,
            'title': info.title,
        }

    metadata['numpages'] = number_of_pages
    metadata['summary'] = generate_summary(text)
    return metadata
Exemplo n.º 56
0
def get_metadata(filename):
    """Pickle the document info of the PDF *filename* and pretty-print it.

    The pickled info is written to ``<filename>_Output.txt``.
    """
    with open(filename, 'rb') as fin:
        reader = PdfFileReader(fin)
        metadata = reader.getDocumentInfo()

        # pickle emits bytes, so the output file has to be opened in binary
        # mode; text mode raises TypeError on Python 3.
        with open(filename + "_Output.txt", "wb") as out_file:
            pickle.dump(metadata, out_file)

    pprint.pprint(metadata)
    print("Metadata has been saved to " + filename + "_Output.txt")
Exemplo n.º 57
0
def print_pdf(file_full_path):
    """Print the document info, page count and encryption flag of a PDF.

    Output goes through ``cprint``.  Each info key is shown without its first
    character; a value that cannot be printed is reported in red instead.
    """
    # Header with file path
    cprint("[+] Metadata for file: %s " % (file_full_path),
           "green",
           attrs=['bold'])
    # ``file()`` only exists on Python 2; ``open()`` works on both versions,
    # and the with-block closes the handle once everything has been read.
    with open(file_full_path, 'rb') as stream:
        pdf_file = PdfFileReader(stream)
        # Dictionary-like object with the document info
        pdf_info = pdf_file.getDocumentInfo()
        # Print metadata
        if pdf_info:
            for metaItem in pdf_info:
                try:
                    cprint('\t ' + metaItem[1:] + ': ', 'cyan', end="")
                    cprint(pdf_info[metaItem])
                except TypeError:
                    cprint(
                        '\t ' + metaItem[1:] + ': ' + 'Error - Item not redeable',
                        'red')
        else:
            cprint('Not data found', 'red')
        # Print other info
        cprint("\t Number of pages: %s" % pdf_file.getNumPages(), 'cyan')
        cprint("\t Is Encripted: %s" % pdf_file.getIsEncrypted(), 'cyan')
Exemplo n.º 58
0
    def parse_api(self, response):
        """Yield ``{"linkid": <pdf url>}`` for every issue in the API response.

        The response body is JSON with an ``issues`` list; each issue carrying
        an ``id`` is mapped to its ``complete.pdf`` URL.  The PDF is
        downloaded and its document info printed.
        """
        # Local imports: only this callback needs them.
        from io import BytesIO
        from urllib.request import urlopen

        data = json.loads(response.body)
        for issue in data["issues"]:
            if 'id' not in issue:
                continue

            link = 'https://www.myeblaettle.de/frontend/catalogs/' + str(
                issue['id']) + '/1/pdf/complete.pdf'
            print(link)

            # `link` is a URL, not a local path, so it cannot be passed to
            # open(); fetch the bytes and parse them from memory.
            # NOTE(review): this download blocks the callback -- consider
            # yielding a Request for the PDF and parsing it in its own callback.
            with urlopen(link) as remote:
                payload = remote.read()
            pdf = PdfFileReader(BytesIO(payload))
            info = pdf.getDocumentInfo()
            # date = datetime.strptime(info["/ModDate"],'%y/%m/%d %H:%M:%S')

            print(info)

            # One item per issue (previously only the last link was emitted).
            yield {"linkid": link}
Exemplo n.º 59
0
    def pdf_print(self):
        """Merge all generated PDFs into one temporary file and return its path.

        Each source PDF is added with a bookmark taken from its ``/Title``
        (the part before ``" - Pharmaship"``).  The source files are deleted
        after merging.  The returned file is not deleted automatically.
        """
        file_list = self.__generate_pdfs()
        merger = PdfFileMerger()

        for pdf in file_list:
            doc = PdfFileReader(pdf)
            info = doc.getDocumentInfo()
            merger.append(pdf, bookmark=info["/Title"].split(" - Pharmaship")[0])

        # Reserve a temporary file name and close the handle right away: the
        # merger reopens the path itself, and an open handle would leak (and
        # block the write on Windows).
        with tempfile.NamedTemporaryFile(
            prefix="pharmaship_all_",
            suffix=".pdf",
            delete=False
        ) as tmp_file:
            output_path = tmp_file.name

        merger.write(output_path)
        merger.close()

        # Cleanup
        for pdf in file_list:
            Path(pdf).unlink()

        return output_path
Exemplo n.º 60
0
    def verify(self):
        """Verify signature of the current document

        Returns None if there is no signature, False if it is invalid (or
        cannot be deserialized), and a human-readable summary string (name,
        date, address, transaction hash) if it is valid.
        """

        # Close the file once the signature has been read from the metadata.
        with open(self.file, 'rb') as file_in:
            pdf_reader = PdfFileReader(file_in)
            metadata = pdf_reader.getDocumentInfo().copy()
            if '/Signature' not in metadata:
                return None

            serialized_signature = metadata['/Signature']

            stored_signature = Signature.from_serialized(serialized_signature,
                                                         self.w3)

        if stored_signature is None:
            return False

        # Recompute the signature from the stored name/transaction and the
        # document's current hash; a mismatch means the signature is invalid.
        calculated_signature = Signature(
            stored_signature.name,
            self._get_hash(stored_signature.name),
            stored_signature.transaction,
        )

        if stored_signature == calculated_signature:
            return f'''Signed by **{stored_signature.name}**

            **Date and time**: {stored_signature.date}

            **From address**: {stored_signature.address}

            **Transaction hash**: {stored_signature.transaction}
            '''
        else:
            return False