Пример #1
0
def redact_(file_name, project):
    """Redact e-mail addresses and phone numbers in an image via Cloud DLP.

    The image at ``file_name`` is sent to the DLP ``redact_image`` API with
    the ``EMAIL_ADDRESS`` and ``PHONE_NUMBER`` info types and a minimum
    likelihood of ``POSSIBLE``. The redacted bytes are written next to the
    input as ``<name>_redacted<ext>``.

    Args:
        file_name: Path of the image file to redact.
        project: The Google Cloud project id to use as a parent resource.

    Returns:
        The path of the redacted file that was written.
    """
    import mimetypes
    import os

    import google.cloud.dlp

    # Split on the extension of the last path component only, so names with
    # several dots ("scan.v2.png"), dotted directories ("./out/a.png") and
    # names without any extension all produce a sensible output path.
    base_name, extension = os.path.splitext(file_name)
    output_file_name = base_name + '_redacted' + extension

    # NOTE(review): hard-coded, machine-specific credentials path that
    # overrides whatever the environment already provides; kept for
    # compatibility. Raw string so the backslashes are not parsed as
    # (invalid) escape sequences.
    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\gcp_credentials\elaborate-howl-285701-105c2e8355a8.json"
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Info types that have to be redacted, in the dict form the API accepts.
    info_types = [{"name": info_type}
                  for info_type in ('EMAIL_ADDRESS', 'PHONE_NUMBER')]

    # One redaction config per info type; no color is given.
    image_redaction_configs = [{"info_type": info_type}
                               for info_type in info_types]
    inspect_config = {
        "min_likelihood": 'POSSIBLE',
        "info_types": info_types,
    }

    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        "text/plain": 5,
    }
    # Declare the type that matches the file name instead of always claiming
    # JPEG. Unknown or unsupported types still fall back to 1 (JPEG), which
    # is what was previously sent for every file.
    guessed_type = mimetypes.guess_type(file_name)[0]
    content_type_index = supported_content_types.get(
        guessed_type or "application/octet-stream", 1)

    with open(file_name, mode="rb") as f:
        byte_item = {"type_": content_type_index, "data": f.read()}
    parent = f"projects/{project}"
    response = dlp.redact_image(
        request={
            "parent": parent,
            "inspect_config": inspect_config,
            "image_redaction_configs": image_redaction_configs,
            "byte_item": byte_item,
        })
    with open(output_file_name, mode="wb") as f:
        f.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image),
        filename=output_file_name))
    return output_file_name
Пример #2
0
def redact_image_all_text(
    project,
    filename,
    output_filename,
):
    """Redact every piece of text found in an image with the DLP API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: Path of the image to send for redaction.
        output_filename: Path the redacted image bytes are written to.

    Returns:
        None; a one-line summary of the written file is printed.
    """
    # The client library is imported lazily, on first use of this function.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Load the raw image bytes that go into the request's byte_item.
    with open(filename, mode="rb") as source:
        image_bytes = source.read()

    request = {
        # Full resource id derived from the bare project id.
        "parent": f"projects/{project}",
        # A single config asking DLP to redact all text in the image.
        "image_redaction_configs": [{"redact_all_text": True}],
        "byte_item": {
            "type": google.cloud.dlp_v2.FileType.IMAGE,
            "data": image_bytes,
        },
    }
    response = client.redact_image(request=request)

    # Persist the redacted image and report how many bytes were written.
    redacted_bytes = response.redacted_image
    with open(output_filename, mode="wb") as target:
        target.write(redacted_bytes)

    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image), filename=output_filename))
Пример #3
0
def redact_image(project,
                 filename,
                 output_filename,
                 info_types,
                 min_likelihood=None,
                 mime_type=None):
    """Redact the given info types from an image using the DLP API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: Path of the file to send for redaction.
        output_filename: Path the redacted image bytes are written to.
        info_types: Iterable of info type names (strings) to redact.
        min_likelihood: Minimum likelihood passed through to the inspect
            config unchanged; may be None.
        mime_type: MIME type of the file. When None it is guessed from
            ``filename`` with the ``mimetypes`` module.

    Returns:
        None; a one-line summary of the written file is printed.
    """
    # The client library is imported lazily, on first use of this function.
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # The API takes info types as dictionaries rather than bare names.
    wanted_types = [{'name': type_name} for type_name in info_types]

    # One redaction config per info type. No color is supplied, so whatever
    # default the service applies is used.
    redaction_configs = [{'info_type': entry} for entry in wanted_types]

    inspection = {
        'min_likelihood': min_likelihood,
        'info_types': wanted_types,
    }

    # Fall back to a filename-based guess when the caller gave no MIME type.
    if mime_type is None:
        guessed_type, _encoding = mimetypes.MimeTypes().guess_type(filename)
        mime_type = guessed_type or 'application/octet-stream'

    # Numeric content type codes; anything not listed maps to 0.
    type_codes = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    type_code = type_codes.get(mime_type, 0)

    with open(filename, mode='rb') as source:
        payload = {'type': type_code, 'data': source.read()}

    # Full resource id derived from the bare project id.
    project_path = client.project_path(project)

    response = client.redact_image(
        project_path,
        inspect_config=inspection,
        image_redaction_configs=redaction_configs,
        byte_item=payload)

    with open(output_filename, mode='wb') as target:
        target.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image), filename=output_filename))
Пример #4
0
def redact_image(project, filename, output_filename,
                 info_types, min_likelihood=None, mime_type=None):
    """Send an image to the Data Loss Prevention API and save the redacted copy.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: Path of the file to send for redaction.
        output_filename: Path the redacted image bytes are written to.
        info_types: A list of strings naming the info types to look for.
        min_likelihood: A string with the minimum likelihood threshold for a
            match. One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY',
            'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. When omitted it is inferred
            from the file name via the standard library's mimetypes module.
    Returns:
        None; a summary of the written file is printed to the terminal.
    """
    import google.cloud.dlp

    dlp_client = google.cloud.dlp.DlpServiceClient()

    # Turn each info type name into the dictionary form the API accepts.
    info_type_dicts = [{'name': name} for name in info_types]

    # Every requested info type gets its own redaction config; the config
    # carries no color entry.
    redaction_configs = []
    for info_type_dict in info_type_dicts:
        redaction_configs.append({'info_type': info_type_dict})

    # Infer the MIME type from the file name if the caller did not pass one.
    if mime_type is None:
        guess = mimetypes.MimeTypes().guess_type(filename)
        detected = guess[0]
        mime_type = detected if detected else 'application/octet-stream'

    # Translate the MIME type into the numeric code used in the request;
    # unsupported types are sent as 0 ("Unspecified").
    code_by_mime_type = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    if mime_type in code_by_mime_type:
        content_code = code_by_mime_type[mime_type]
    else:
        content_code = 0

    # Read the whole file; its bytes travel inside the byte_item.
    with open(filename, mode='rb') as image_file:
        raw_bytes = image_file.read()
    image_item = {'type': content_code, 'data': raw_bytes}

    result = dlp_client.redact_image(
        dlp_client.project_path(project),
        inspect_config={
            'min_likelihood': min_likelihood,
            'info_types': info_type_dicts,
        },
        image_redaction_configs=redaction_configs,
        byte_item=image_item)

    # Store the redacted bytes and report the size of what was written.
    with open(output_filename, mode='wb') as out_file:
        out_file.write(result.redacted_image)
    summary = "Wrote {byte_count} to {filename}".format(
        byte_count=len(result.redacted_image), filename=output_filename)
    print(summary)
Пример #5
0
def redact_image(
    project,
    filename,
    info_types,
    custom_regexes=None,
    min_likelihood=None,
    mime_type=None,
):
    """Redact protected data from every page of a PDF using the DLP API.

    Each page of the PDF is rendered to a PNG under
    ``output/<pdf name>/pdftoImage``, sent to the DLP ``redact_image`` call,
    stored under ``output/<pdf name>/redactedImage`` and finally all redacted
    pages are assembled, in page order, into
    ``output/<pdf name>/Redacted-<pdf name>.pdf``.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the PDF file to redact.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_regexes: Accepted so existing callers keep working; the value
            is not part of the request that is sent (the inspect config
            always carries the fixed ``PONumber`` regex info type).
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the page images. If not specified, the
            type is inferred via the Python standard library's mimetypes
            module from the rendered page file name.
    Returns:
        None; progress and the output location are printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{"name": info_type} for info_type in info_types]

    # One redaction config per info type. The color is omitted, so the
    # service default is used.
    image_redaction_configs = [{"info_type": info_type}
                               for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "min_likelihood": min_likelihood,
        "info_types": info_types,
        "custom_info_types": [{
            "info_type": {
                "name": "PONumber"
            },
            "regex": {
                # Raw string: "\d" is not a valid escape in a normal literal.
                "pattern": r"[\d]"
            },
            "likelihood": "POSSIBLE",
        }],
    }

    pdf_name = os.path.basename(filename)
    page_image_dir = "output/" + pdf_name + "/pdftoImage"
    redacted_image_dir = "output/" + pdf_name + "/redactedImage"

    # Render every page to a PNG. The paths are remembered in page order so
    # later steps neither depend on the (arbitrary) order of os.listdir nor
    # pick up stale images left behind by an earlier run.
    page_image_paths = []
    source_pdf = fitz.open(filename)
    try:
        os.makedirs(page_image_dir, exist_ok=True)
        os.makedirs(redacted_image_dir, exist_ok=True)
        num_pages = source_pdf.pageCount
        print("Your document has " + str(num_pages) + " pages.")
        for page_num in range(num_pages):
            pixmap = source_pdf.loadPage(page_num).getPixmap()
            page_image_path = (page_image_dir + "/page" +
                               str(page_num).zfill(3) + ".png")
            pixmap.writePNG(page_image_path)
            page_image_paths.append(page_image_path)
    finally:
        source_pdf.close()

    # If mime_type is not specified, guess it from the page image name. All
    # page images are written above with the same ".png" extension, so one
    # guess covers every page.
    if mime_type is None and page_image_paths:
        mime_guess = mimetypes.MimeTypes().guess_type(page_image_paths[0])
        mime_type = mime_guess[0] or "application/octet-stream"

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        "text/plain": 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    redacted_image_paths = []
    for index, page_image_path in enumerate(page_image_paths):
        # Construct the byte_item, containing the page image's byte data.
        with open(page_image_path, mode="rb") as image_file:
            byte_item = {"type": content_type_index,
                         "data": image_file.read()}

        # Call the API.
        response = dlp.redact_image(
            parent,
            inspect_config=inspect_config,
            image_redaction_configs=image_redaction_configs,
            byte_item=byte_item,
        )

        # Write out the redacted page under the same file name.
        redacted_path = (redacted_image_dir + "/" +
                         os.path.basename(page_image_path))
        with open(redacted_path, mode="wb") as redacted_file:
            redacted_file.write(response.redacted_image)
        redacted_image_paths.append(redacted_path)
        print("Redacted Page : " + str(index + 1))

    # Rebuild a PDF with one page per redacted image, in page order.
    redacted_pdf = fitz.open()
    for redacted_path in redacted_image_paths:
        img = fitz.open(redacted_path)  # open pic as document
        rect = img[0].rect  # pic dimension
        pdfbytes = img.convertToPDF()  # make a PDF stream
        img.close()  # no longer needed
        img_pdf = fitz.open("pdf", pdfbytes)  # open stream as PDF
        page = redacted_pdf.newPage(
            width=rect.width,  # new page with ...
            height=rect.height)  # pic dimension
        page.showPDFpage(rect, img_pdf, 0)  # image fills the page

    output_pdf_path = "output/" + pdf_name + "/Redacted-" + pdf_name + ".pdf"
    redacted_pdf.save(output_pdf_path)
    redacted_pdf.close()
    # Report the path that was actually written, including ".pdf".
    print("Document generated Successfully in " + output_pdf_path)
Пример #6
0
def redact_imageTypes(project,
                      filename,
                      output_filename,
                      info_types,
                      min_likelihood=None,
                      mime_type=None):
    """Redact an image with the DLP API using the ``ALL_BASIC`` info type.

    Relies on a ``dlp`` client object and the ``mimetypes`` module being
    available from the enclosing module scope.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: Path of the file to send for redaction.
        output_filename: Path the redacted image bytes are written to.
        info_types: Ignored. The value is overwritten with a single
            ``ALL_BASIC`` info type before the request is built.
        min_likelihood: Minimum likelihood passed through to the inspect
            config unchanged; may be None.
        mime_type: MIME type of the file. When None it is guessed from
            ``filename`` with the ``mimetypes`` module.

    Returns:
        None; the request payload and a confirmation are printed.
    """
    # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "kubernetes-e9dc8af4883c.json"

    # NOTE(review): the caller-supplied info_types is discarded here; confirm
    # whether honoring the argument is the intended behavior.
    info_types = [{"name": "ALL_BASIC"}]

    # One redaction config per info type. The color is omitted, so the
    # service default is used.
    image_redaction_configs = [{'info_type': info_type}
                               for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'min_likelihood': min_likelihood,
        'info_types': info_types
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        byte_item = {'type': content_type_index, 'data': f.read()}

    # NOTE(review): this prints the whole request payload, including the raw
    # (unredacted) file bytes, to stdout.
    print(byte_item)

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item)

    # Write out the results. The redact_image response carries the redacted
    # bytes in ``redacted_image``; it has no ``item`` field.
    with open(output_filename, mode='wb') as f:
        f.write(response.redacted_image)
    print('Written')