Exemplo n.º 1
0
    def AddTags(self, pdfPath):
        """Merge the values returned by ``self.getData()`` into the PDF's Info.

        The file at ``pdfPath`` is rewritten in place and then read back so
        that the resulting Info dictionary can be pretty-printed.

        Args:
            pdfPath: path of the PDF to read and overwrite.
        """
        values = self.getData()
        document = PdfReader(pdfPath)

        # Only the first five entries of getData() are used, in this order.
        tags = PdfDict(
            Id=values[0],
            Data_Registo=values[1],
            CentroPesca=values[2],
            Registador=values[3],
            Info=values[4],
        )
        # NOTE(review): presumably Info is None for files without an Info
        # dictionary, which would make this call fail -- confirm with pdfrw.
        document.Info.update(tags)

        writer = PdfWriter()
        writer.write(pdfPath, document)

        # Re-open the rewritten file to show what is now stored in it.
        pprint(PdfReader(pdfPath).Info)
Exemplo n.º 2
0
 def load(self, sourcename):
     ''' Load a Form XObject from a uri, re-using the
         reader already stored in self.cached_pdfs for
         the same document name when there is one.
     '''
     view = ViewInfo(sourcename)
     docname = view.docname
     readers = self.cached_pdfs
     reader = readers.get(docname)
     if reader is None:
         # Not cached yet: read the document once and remember it.
         reader = PdfReader(docname, decompress=self.decompress)
         readers[docname] = reader
     return docxobj(view, reader, allow_compressed=not self.decompress)
Exemplo n.º 3
0
 def run_count_pages_from_file(self, filefield):
     """Return the number of pages of the PDF behind ``filefield``.

     This is a best-effort helper: any failure (no file attached, file
     that cannot be read or parsed, error while closing) results in
     ``None`` instead of an exception.

     Args:
         filefield: object exposing ``.file``, which in turn provides
             ``.name`` (the path handed to ``PdfReader``) and ``.close()``.

     Returns:
         The page count as an int, or ``None`` when it could not be
         determined.
     """
     try:
         handle = filefield.file
         try:
             return len(PdfReader(handle.name).pages)
         finally:
             # Release the handle even when parsing fails; previously it
             # was only closed after a successful read.
             handle.close()
     except Exception:
         # Deliberately silent: callers treat None as "count unavailable".
         return None
Exemplo n.º 4
0
    def paginas(self):
        """Return the total page count of the PDFs attached to this record.

        The result is cached in ``self._paginas`` and written to the
        record's table through ``run_sql``:

        * a positive cached value is returned as is;
        * a cached ``-1`` marks an earlier failed attempt and yields ``0``
          without trying again;
        * otherwise every field named in ``self.FIELDFILE_NAME`` is
          visited in order. Counting stops at the first unset field or at
          the first path containing ``'.doc'``; whatever was counted up to
          that point is what gets stored and returned.

        Returns:
            The page count, or ``0`` when counting failed.

        Raises:
            Exception: if ``FIELDFILE_NAME`` is empty or the record has no id.
        """
        if not self.FIELDFILE_NAME:
            raise Exception('FIELDFILE_NAME is empty; no file field to count')

        if not self.id:
            raise Exception('record has no id; cannot store the page count')

        if self._paginas > 0:
            return self._paginas
        if self._paginas == -1:
            return 0

        count_pages = 0
        try:
            for field in self.FIELDFILE_NAME:
                filefield = getattr(self, field)
                if not filefield:
                    break

                path = filefield.file.name

                if path.endswith('.pdf'):
                    try:
                        count_pages += len(PdfReader(path).pages)
                    finally:
                        # Close the handle even when the PDF cannot be parsed.
                        filefield.file.close()
                elif '.doc' in path:
                    break
        except Exception:
            # Remember the failure so later calls do not retry the parse.
            count_pages = -1

        # Persisting happens outside of a ``finally`` on purpose: a
        # ``return`` there would also swallow KeyboardInterrupt/SystemExit.
        self._paginas = count_pages
        # NOTE(review): the statement is assembled with str.format; the
        # values shown here come from model metadata and the instance id.
        run_sql(
            """update {}
                        set _paginas = {}
                        where id = {};""".format(
                '%s_%s' % (self._meta.app_label,
                           self._meta.model_name),
                count_pages,
                self.id
            ))
        return 0 if count_pages == -1 else count_pages
Exemplo n.º 5
0
def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' docxobj builds and returns a Form XObject for one page.
        It can be used on its own, or together with
        the CacheXObj class (below).

        pageinfo may be a ViewInfo or anything ViewInfo() accepts.
        doc, when given, is the already-read document to use.
    '''
    view = pageinfo if isinstance(pageinfo, ViewInfo) else ViewInfo(pageinfo)

    # Exactly one source of the document is allowed: the explicit
    # argument, the one already attached to the view, or -- when
    # neither exists -- a fresh read of the view's document name.
    if doc is None:
        doc = view.doc
        if doc is None:
            doc = PdfReader(view.docname, decompress=not allow_compressed)
            view.doc = doc
    else:
        assert view.doc is None
        view.doc = doc
    assert isinstance(doc, PdfReader)

    # Page numbers are 1-based; a missing page means the first one.
    page_index = (view.page or 1) - 1
    return pagexobj(doc.pages[page_index], view, allow_compressed)
Exemplo n.º 6
0
    def run_bi_files(self):
        """Print the total number of PDF pages stored per model.

        For every entry in the table below, either the method named in
        ``hook`` is called, or all records of ``model`` are visited and
        the pages of the file held in ``file_field`` are summed. Records
        whose file cannot be read are reported with ``print`` and skipped.
        A progress line is printed every 100 records and a total at the
        end of each model.
        """
        models = [
            {
                'model': MateriaLegislativa,
                'file_field': 'texto_original',
                'hook': '',  # 'run_bi_files_materias_legislativas'
            },
            {
                'model': DocumentoAcessorio,
                'file_field': 'arquivo',
                'hook': ''
            },
            {
                'model': NormaJuridica,
                'file_field': 'texto_integral',
                'hook': ''
            },
            {
                'model': AnexoNormaJuridica,
                'file_field': 'anexo_arquivo',
                'hook': ''
            },
            {
                'model': DocumentoAdministrativo,
                'file_field': 'texto_integral',
                'hook': ''
            },
            {
                'model': DocumentoAcessorioAdministrativo,
                'file_field': 'arquivo',
                'hook': ''
            },
            {
                'model': DiarioOficial,
                'file_field': 'arquivo',
                'hook': ''
            },
            {
                'model': VersaoDeMidia,
                'file_field': 'file',
                'hook': 'run_bi_files_midias'
            },
        ]

        for mt in models:  # mt = metadata
            if mt['hook']:
                # The hook must actually be invoked; a bare getattr only
                # looked the method up and silently skipped the model.
                getattr(self, mt['hook'])()
                continue

            model = mt['model']
            file_field = mt['file_field']
            count_pages = 0
            count_reg = 0
            for item in model.objects.all():
                count_reg += 1
                try:
                    ff = getattr(item, file_field)
                    if ff:
                        try:
                            count_pages += len(PdfReader(ff.file.name).pages)
                        finally:
                            # Close the handle even for unreadable PDFs.
                            ff.file.close()
                except Exception as e:
                    print(e)

                if count_reg % 100 == 0:
                    print(count_reg, model._meta.verbose_name_plural,
                          count_pages)

            print(model._meta.verbose_name_plural, count_pages)
Exemplo n.º 7
0
    def AddMetadata(self, lstPdfIn):
        """Write a copy of each input PDF with its Info replaced by app metadata.

        For every path in ``lstPdfIn`` the PDF is read, its Info dictionary
        is emptied and refilled with values taken from the form widgets and
        the running system, and the result is written under the directory
        returned by ``self.return_path``.

        Args:
            lstPdfIn: list of paths of the PDFs to process.

        Returns:
            A ``(status, paths)`` tuple:

            * ``(None, [])`` when ``lstPdfIn`` is an empty list;
            * ``(True, paths)`` when every file was written;
            * ``(False, paths)`` when something failed, ``paths`` holding
              what was written before the failure;
            * ``(False, None)`` when the destination text is ``None``.
        """
        lstNameOut = []
        if lstPdfIn == []:
            return None, lstNameOut

        # Tracks the file being processed so the failure handler never
        # touches an unbound name when the error happens before the loop.
        current_pdf = None
        try:
            # NOTE(review): the name is generated once, so every input
            # appears to be written to the same output path -- confirm
            # that this is intended.
            Mx_name = self.name_Generator()
            total = len(lstPdfIn)
            for idx, current_pdf in enumerate(lstPdfIn):
                file_name = self.extract_file_name(
                    os.path.basename(current_pdf))

                info = self.PTEInfo.toPlainText()
                if info == '':
                    info = 'N/A'

                now = datetime.today()
                trailer = PdfReader(current_pdf)  # open the PDF file
                metadata = PdfDict(
                    doc_num=str(idx + 1) + " Of " + str(total),
                    nome=file_name,
                    data=now.date().isoformat(),
                    horas=now.time().isoformat(),
                    treePath=self.CBToStore.currentText(),
                    doc_type=self.CBType.currentText(),
                    info=info,
                    systemEncoding=os.sys.getfilesystemencoding(),
                    os=platform.platform(),
                    numero=idx)
                # Drop whatever Info entries the file came with, then
                # store only our own metadata.
                trailer.Info.clear()
                trailer.Info.update(metadata)

                abs_path = self.CBToStore.currentText()
                if abs_path is None:
                    return False, None

                cat = self.CBType.currentText()
                file_path = self.return_path(abs_path, cat)
                namePath = os.path.join(file_path, Mx_name + ".pdf")
                PdfWriter().write(namePath, trailer)
                lstNameOut.append(namePath)
            bOK = True
        except Exception:
            bOK = False
            # Re-open the offending file to find out whether it is the
            # reason for the failure (corrupted or encrypted input).
            if current_pdf is not None:
                try:
                    PdfReader(current_pdf)
                except ValueError:
                    QT_msg.aviso(
                        txt='O Ficherio <i>' + current_pdf +
                        '</i> esta <b>Corrompido</b> ou <b>Encriptado</b>, queria por favor removê-lo e voltar a tentar.'
                    )
        return bOK, lstNameOut
Exemplo n.º 8
0
#! /usr/bin/env python

# ./gp.py -c hidost -s samples/seeds/ff0bfa347b60be403f3f13b8461d9e230570078b -e /Users/apple/EvadeML-master/samples/hidost_benign_3 -p 48 -g 20 -m 0.1 -x 0 -f 0 -t attack_hidost_hidost_benign_3 --round 1
# ./utils/detection_agent_server.py ./utils/36vms_sigs.pickle
from common import *
import pickle
import random
from pdfrw.pdfreader import PdfReader
from pdf_genome import PdfGenome
# Hard-coded sample locations used by this script.
ben_folder = '/Users/apple/EvadeML-master/samples/hidost_benign_3'
ben_path = '/Users/apple/EvadeML2.0/samples/hidost_benign_3/e23abe0df1bf1c01df7567ca11192f2576aaaf5c.pdf'
mal_path = '/Users/apple/EvadeML2.0/samples/seeds/ff0bfa347b60be403f3f13b8461d9e230570078b'

# The prints below use the parenthesised single-argument form, which
# produces the same output under the Python 2 print statement.

# Step 1: parse one benign sample directly; the reader itself is discarded.
file = ben_path
print("file: %s" % file)
PdfReader(file, slow_parsing=True)

# Step 2: load the same sample as a genome.
root = PdfGenome.load_genome(file, pickleable=False)
print("load: %s" % file)

# Step 3: list every file of the benign folder.
files = list_file_paths(ben_folder)
print("ben_folder: %s" % ben_folder)
print("ben_files: %s" % files)

# Step 4: load the whole folder as the external genome.
ext_genome = PdfGenome.load_external_genome(ben_folder, pickleable=False)
print("load_external_genome: %s" % ben_folder)