Exemplo n.º 1
0
def remove_metadata_text(infile, ext):
    """Blank out the document-level metadata of a PDF or DOCX file in place.

    Args:
        infile: Path of the file to clean. The cleaned document is written
            back to this same path.
        ext: File extension without the leading dot (``"pdf"`` or
            ``"docx"``), compared case-insensitively. Any other value
            leaves the file untouched.
    """
    kind = ext.lower()

    if kind == "pdf":
        trailer = PdfReader(infile)
        # Swap the whole Info dictionary for an empty one rather than
        # clearing individual keys.
        trailer.Info = {}
        PdfWriter(infile, trailer=trailer).write()
    elif kind == "docx":
        # Core properties that are overwritten with an empty string.
        meta_fields = (
            "author", "category", "comments", "content_status", "identifier",
            "keywords", "language", "subject", "title",
        )
        document = Document(infile)
        properties = document.core_properties
        for meta_field in meta_fields:
            setattr(properties, meta_field, "")
        # The document is saved once; it is not re-opened afterwards because
        # nothing consumed the re-parsed result.
        document.save(str(infile))
Exemplo n.º 2
0
 def set_pdf_field_value(self, pdf_file, field, fieldValue):
     """Set one Info-dictionary entry of a PDF and write a copy of the file.

     Args:
         pdf_file: Path of the PDF to read.
         field: Name of the Info entry; any ``/`` characters are removed
             before it is used as the attribute name (``"/Title"`` and
             ``"Title"`` are equivalent).
         fieldValue: Value to store; it is converted to ``str`` first.

     Returns:
         The path of the newly written PDF (same base name as ``pdf_file``,
         placed in ``output_dir``), or ``pdf_file`` unchanged when anything
         goes wrong.

     NOTE(review): ``output_dir`` is not defined in this method; it is
     presumably a module-level name -- confirm.
     """
     try:
         trailer = PdfReader(pdf_file)
         if trailer.Info is None:
             trailer.Info = pdfrw.objects.pdfdict.PdfDict()
         # Assign the attribute directly instead of building source code and
         # running it through exec(): quotes or backslashes in the value can
         # no longer break the statement or inject code.
         setattr(trailer.Info, field.replace('/', ''), str(fieldValue))
         new_pdf = os.path.split(pdf_file)[1]
         new_file = os.path.join(output_dir, new_pdf)
         PdfWriter(new_file, trailer=trailer).write()
         return new_file
     except Exception:
         # Best effort: report the failing file and hand back the original
         # path. KeyboardInterrupt/SystemExit are deliberately not caught.
         Logger.printMessage(pdf_file, is_error=True)
         return pdf_file
Exemplo n.º 3
0
def remove_metadata_text(infile, ext):
    """Strip descriptive metadata from a PDF or DOCX file, rewriting it in place.

    Args:
        infile: Path of the document; the result is saved over this path.
        ext: Extension of the document without the dot. ``"pdf"`` and
            ``"docx"`` (any letter case) are handled; everything else is
            a no-op.
    """
    extension = ext.lower()

    if extension == "pdf":
        trailer = PdfReader(infile)
        # The Info dictionary is replaced by an empty one.
        trailer.Info = {}
        PdfWriter(infile, trailer=trailer).write()
        return

    if extension != "docx":
        return

    document = Document(infile)
    properties = document.core_properties
    # Each of these core properties is reset to the empty string.
    for meta_field in ("author", "category", "comments", "content_status",
                       "identifier", "keywords", "language", "subject",
                       "title"):
        setattr(properties, meta_field, "")
    # Saving is the last step: the file is not re-parsed afterwards, since
    # the re-opened document was never used.
    document.save(str(infile))
Exemplo n.º 4
0
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict

# Usage: <script> [-o output.pdf] input.pdf underneath.pdf
#
# Places each page of underneath.pdf beneath the matching page of input.pdf
# and copies Title/Author/Subject from underneath.pdf into the result.
argv = sys.argv[1:]

# An optional "-o <file>" pair selects the output path; it is removed from
# argv so that only the two positional arguments remain.
if '-o' in argv:
    opt_index = argv.index('-o')
    if opt_index + 1 >= len(argv):
        sys.exit('error: -o requires an output file name')
    outfn = argv[opt_index + 1]
    del argv[opt_index:opt_index + 2]
else:
    outfn = 'output.pdf'

if len(argv) != 2:
    sys.exit('usage: %s [-o output.pdf] input.pdf underneath.pdf'
             % sys.argv[0])
inpfn, underfn = argv

under = PdfReader(underfn)
trailer = PdfReader(inpfn)
# zip() stops at the shorter document, so surplus pages on either side are
# left unmerged.
for page, upage in zip(trailer.pages, under.pages):
    PageMerge(page).add(upage, prepend=1).render()

# Metadata comes from underneath.pdf. It is only copied when that file has
# an Info dictionary at all; previously a missing one raised AttributeError.
if under.Info is not None:
    if trailer.Info is None:
        trailer.Info = IndirectPdfDict({})
    trailer.Info.Title = under.Info.Title
    trailer.Info.Author = under.Info.Author
    trailer.Info.Subject = under.Info.Subject

PdfWriter(outfn, trailer=trailer).write()
Exemplo n.º 5
0
def refingerprint_pdf(filename, directory, copy_count, suffix):
    """Write ``copy_count`` copies of a PDF, each with a new random document ID.

    The source file is ``/tmp/<directory>/<filename>``. Every copy gets a
    random ``randomMetaData`` Info entry and a fresh 32-character ID, and is
    written next to the source as ``<filename>[-<suffix>]-<n>.pdf``. A
    summary of the source and all copies is saved as ``file_info.json`` in
    the same directory, after which ``delete_refingerprint.delay`` is called
    with that directory.

    Args:
        filename: Name of the source PDF inside the working directory.
        directory: Name of the working directory below ``/tmp``.
        copy_count: Number of copies to generate.
        suffix: Optional text inserted into each copy's file name; ignored
            when empty or ``None``.

    Raises:
        Exception: Any error is re-raised after
            ``delete_refingerprint.delay`` has been called for the directory.
    """
    # Resolved outside the ``try`` so the error handler can always refer to it.
    base_dir = os.path.join('/tmp/', directory)

    try:
        base_file_path = os.path.join(base_dir, filename)

        # File size in KiB.
        file_size = round(os.path.getsize(base_file_path) / 1024)

        content = PdfReader(base_file_path)

        if content.ID is None:
            file_id = 'No ID'
        else:
            file_id = str(content.ID[0])
            for delimiter in '<>()':
                file_id = file_id.replace(delimiter, '')

        # Bad file IDs can contain characters that cannot be encoded.
        # ``str.encode`` raises UnicodeEncodeError, so catch the common
        # base class rather than UnicodeDecodeError.
        try:
            file_id.encode('utf-8')
        except UnicodeError:
            file_id = 'Unreadable'

        processed_files = []

        for copy_number in range(1, copy_count + 1):
            if suffix:
                save_filename = '%s-%s-%d.pdf' % (filename, suffix,
                                                  copy_number)
            else:
                save_filename = '%s-%d.pdf' % (filename, copy_number)

            file_path = os.path.join(base_dir, save_filename)

            download_link = '/fingerprinter/download/%s?file=%s' % (
                directory, save_filename)

            # Re-read the source for every copy so each one starts from the
            # unmodified document.
            content = PdfReader(base_file_path)

            # Add some random metadata.
            if not content.Info:
                content.Info = PDFInfo()

            content.Info.randomMetaData = binascii.b2a_hex(
                os.urandom(20)).upper()

            # Derive a random replacement ID from the file name, the current
            # time and random bytes. An MD5 hex digest is always exactly 32
            # characters, so no truncation or padding is required.
            md = md5(filename.strip().encode('utf-8'))
            md.update(str(time.time()).encode('utf-8'))
            md.update(os.urandom(10))
            new_id = md.hexdigest().upper()

            content.ID = [new_id, new_id]

            PdfWriter(file_path, trailer=content).write()

            processed_files.append({
                'filename': save_filename,
                'download_link': download_link,
                'id': content.ID[0]
            })

        file_info = {
            'filename': filename,
            'size': file_size,
            'id': file_id,
            'directory_name': directory,
            'processed_files': processed_files
        }

        # Save a copy of the info in the file directory; the context manager
        # closes the handle even if serialisation fails.
        with open(os.path.join(base_dir, 'file_info.json'), 'w') as out_file:
            json.dump(file_info, out_file, indent=4)

        # Hand the directory to the clean-up task.
        delete_refingerprint.delay(base_dir)

    except Exception:
        delete_refingerprint.delay(base_dir)

        # Bare raise keeps the original traceback intact.
        raise
Exemplo n.º 6
0
def cli(verbose, input, output):
    """
        input: input file or files

        output: output folder, will create if not found. 
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably shown as the command's help text by click -- confirm.
    #
    # Flow: collect the PDFs selected by `input`, prompt for a new Info.Title
    # for each one, and write the retitled copy into the `output` folder
    # under the same file name.
    if verbose:
        click.echo(f"Current args: {input} {output}")

    path = Path(input)
    folder = path.resolve()
    # '.' matches every "*.pdf" name, i.e. all PDFs when `input` is a folder.
    file_name = '.'

    if path.is_file():
        folder = path.absolute().parent
        file_name = path.name
    if verbose:
        click.echo(f"Current path: {path} {folder} {path.name}")

    files = [
        entry.path for entry in os.scandir(folder)
        if file_name in entry.name and entry.name.endswith('.pdf')
    ]

    if verbose:
        click.echo(f"Found {len(files)} files")

    out_path = os.path.realpath(output)
    # exist_ok covers the "created concurrently" race without errno checks.
    os.makedirs(out_path, exist_ok=True)

    for number, pdf_path in enumerate(files, start=1):
        # `pdf_path` is a full path from os.scandir; joining it as-is would
        # discard `out_path` and overwrite the input, so only the base name
        # is placed in the output folder.
        out_file = os.path.join(out_path, os.path.basename(pdf_path))
        trailer = PdfReader(pdf_path)

        if trailer.Info and trailer.Info.Title:
            click.echo(f'Current title: {trailer.Info.Title}')
        else:
            click.echo("Current file doesn't have an existing title")
            if not trailer.Info:
                # Seed an Info dictionary so the title can be assigned below.
                trailer.Info = IndirectPdfDict(
                    Title='your title goes here',
                    Author='Title change',
                    Subject='This is a file with a changed title',
                    Creator='Title Change 0.1',
                )

        trailer.Info.Title = click.prompt(
            f'Write the new metadata title for {pdf_path}', type=str)

        PdfWriter(out_file, trailer=trailer).write()

        if verbose:
            click.echo(
                f"Wrote {os.path.basename(pdf_path)}, {number}/{len(files)}")

    click.echo('Done!')
Exemplo n.º 7
0
def _fill_pdf_metadata(out_file,
                       issuer,
                       issuer_address,
                       column_fields,
                       data,
                       global_columns,
                       verify_issuer,
                       conf,
                       interactive=False):
    """Store issuer, metadata and owner details in a PDF's Info dictionary.

    The PDF at ``out_file`` is read, its Info dictionary is updated (or
    created) and the file is written back to the same path. When both
    ``__OWNER_PK__`` and ``__OWNER_ADDRESS__`` are present and non-empty in
    ``data``, the written file is hashed with SHA-256, the hash is signed
    through the node proxy and the signature is stored as ``owner_proof``
    in a second rewrite of the file.

    Args:
        out_file: Path of the PDF to update in place.
        issuer: Issuer name.
        issuer_address: Issuer address stored under ``identity.address``.
        column_fields: JSON text with a ``columns`` list of single-key
            objects, or a falsy value to skip per-row metadata.
        data: Mapping of column names to values for this document.
        global_columns: JSON text with a ``fields`` list of single-key
            objects, or a falsy value to skip global metadata.
        verify_issuer: JSON text whose ``methods`` entry is stored as the
            issuer's verification data.
        conf: Configuration object; ``testnet``, ``full_node_url``,
            ``full_node_rpc_user`` and ``full_node_rpc_password`` are read.
        interactive: When true, print a progress dot once finished.
    """
    # Version number written into the metadata.
    version = 2

    # Issuer object, serialised to JSON further down.
    issuer_obj = {
        "name": issuer,
        "identity": {
            "address": issuer_address,
            "verification": json.loads(verify_issuer)['methods']
        }
    }

    metadata = {}

    # Custom metadata: only columns that actually occur in `data` are kept,
    # each one carrying its value next to the declared properties.
    column_specs = json.loads(column_fields)['columns'] if column_fields else []
    for spec in column_specs:
        key = list(spec)[0]
        if key not in data:
            continue
        properties = spec[key]
        properties['value'] = data[key]
        metadata[key] = properties

    # Global fields are applied afterwards, so they override column data.
    global_specs = json.loads(global_columns)['fields'] if global_columns else []
    for spec in global_specs:
        key = list(spec)[0]
        metadata[key] = spec[key]

    # Special owner name/pubkey columns are handled explicitly in code.
    # TODO we should probably check if the public key is valid
    owner = None
    owner_address = None
    owner_pk = None
    owner_columns = ('__OWNER_PK__', '__OWNER_ADDRESS__')
    if all(column in data and data[column] for column in owner_columns):
        # TODO maybe just calculate address from public key?
        owner_address = data['__OWNER_ADDRESS__']
        owner_pk = data['__OWNER_PK__']
        owner = {
            "name": data['__OWNER_NAME__'],
            # TODO needed? - can be derived
            "owner_address": data['__OWNER_ADDRESS__'],
            "pk": owner_pk
        }

    # Without an owner the `owner` entry stays an empty string instead of a
    # JSON-encoded value.
    pdf_metadata = PdfDict(version=version,
                           issuer=json.dumps(issuer_obj),
                           metadata=json.dumps(metadata),
                           owner=json.dumps(owner) if owner else '',
                           owner_proof='',
                           chainpoint_proof='')

    reader = PdfReader(out_file)
    if reader.Info:
        reader.Info.update(pdf_metadata)
    else:
        reader.Info = pdf_metadata
    PdfWriter().write(out_file, reader)

    # If an owner exists, hash the PDF as just written, sign the hash via the
    # node and store the signature as owner_proof.
    if owner:
        with open(out_file, 'rb') as handle:
            sha256_hash = hashlib.sha256(handle.read()).hexdigest()

        setup('testnet' if conf.testnet else 'mainnet')

        # TODO: update when NodeProxy accepts full url!
        host, port = conf.full_node_url.split(':')
        proxy = NodeProxy(conf.full_node_rpc_user,
                          conf.full_node_rpc_password,
                          host,
                          port).get_proxy()

        # Due to an old unresolved issue still pending in Bitcoin v0.20.0
        # signmessage does not support signing with a bech32 key. To resolve
        # this the public key is used to get the base58check encoding that
        # signmessage is happy with.
        if owner_address.startswith(('bc', 'tb')):
            owner_address = PublicKey(owner_pk).get_address().to_string()

        # NOTE that the address (the encoding) might have changed here from
        # bech32 to legacy... take care if you use it again in this function!

        sig = proxy.signmessage(owner_address, sha256_hash)

        # Add owner_proof to the metadata and rewrite the file.
        proof_metadata = PdfDict(owner_proof=sig)
        signed = PdfReader(out_file)
        signed.Info.update(proof_metadata)
        PdfWriter().write(out_file, signed)

    if interactive:
        # Print progress.
        print('.', end="", flush=True)