def redact_(file_name, project):
    """Redact e-mail addresses and phone numbers from an image via Cloud DLP.

    The file is sent to the Data Loss Prevention API and the redacted bytes
    are written next to the input as ``<stem>_redacted<ext>``.

    Args:
        file_name: Path of the image file to redact.
        project: The Google Cloud project id to use as a parent resource.

    Returns:
        The path of the redacted image that was written.
    """
    import mimetypes
    import os

    import google.cloud.dlp

    # splitext only splits off the final extension, so dotted directories or
    # stems (and files without any extension) produce a sensible output name.
    stem, extension = os.path.splitext(file_name)
    output_file_name = stem + '_redacted' + extension

    # Raw string: "\g" and "\e" are not valid escape sequences.
    # NOTE(review): hard-coded, machine-specific credentials path; this
    # overrides whatever the environment already provides.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
        r"C:\gcp_credentials\elaborate-howl-285701-105c2e8355a8.json")

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Info types to find and redact, in the dictionary form the API accepts.
    info_types = [
        {"name": name} for name in ('EMAIL_ADDRESS', 'PHONE_NUMBER')
    ]

    # One redaction config per info type; no colour is given, so the API
    # default is used.
    image_redaction_configs = [
        {"info_type": info_type} for info_type in info_types
    ]

    inspect_config = {
        "min_likelihood": 'POSSIBLE',
        "info_types": info_types,
    }

    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        "text/plain": 5,
    }
    # Label the bytes with the type guessed from the file name; anything
    # unrecognised keeps the previous behaviour of being sent as JPEG (1).
    mime_type = mimetypes.MimeTypes().guess_type(file_name)[0]
    content_type_index = supported_content_types.get(
        mime_type or "application/octet-stream", 1)

    with open(file_name, mode="rb") as f:
        byte_item = {"type_": content_type_index, "data": f.read()}

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    response = dlp.redact_image(
        request={
            "parent": parent,
            "inspect_config": inspect_config,
            "image_redaction_configs": image_redaction_configs,
            "byte_item": byte_item,
        })

    with open(output_file_name, mode="wb") as f:
        f.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image),
        filename=output_file_name))
    return output_file_name
def redact_image_all_text(
    project,
    filename,
    output_filename,
):
    """Uses the Data Loss Prevention API to redact all text in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.

    Returns:
        None; the redacted image is written to ``output_filename`` and a
        summary line is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the image_redaction_configs, indicating to DLP that all text
    # in the input image should be redacted.
    image_redaction_configs = [{"redact_all_text": True}]

    # Construct the byte_item, containing the file's byte data. The field is
    # spelled "type_" on the request-dict surface used below, matching the
    # other request-dict call in this file.
    # NOTE(review): FileType and ByteContentItem.BytesType are distinct
    # enums; confirm FileType.IMAGE maps to the intended bytes type.
    with open(filename, mode="rb") as f:
        byte_item = {
            "type_": google.cloud.dlp_v2.FileType.IMAGE,
            "data": f.read(),
        }

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API.
    response = dlp.redact_image(
        request={
            "parent": parent,
            "image_redaction_configs": image_redaction_configs,
            "byte_item": byte_item,
        })

    # Write out the results.
    with open(output_filename, mode="wb") as f:
        f.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image),
        filename=output_filename))
def redact_image(project, filename, output_filename, info_types,
                 min_likelihood=None, mime_type=None):
    """Uses the Data Loss Prevention API to redact protected data in an image.

    NOTE(review): this file appears to define ``redact_image`` more than
    once; if they all live at module level, the last definition wins.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: A list of strings representing info types to look for.
            ``None`` is treated as an empty list.
        min_likelihood: The minimum likelihood threshold that constitutes a
            match, passed through to the inspect configuration.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the redacted image is written to ``output_filename`` and a
        summary line is printed to the terminal.
    """
    import mimetypes

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted). Guard against None *before*
    # iterating; the previous None check ran too late to have any effect.
    info_types = [{'name': info_type} for info_type in info_types or []]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted in this sample, so the default (black) will be used.
    image_redaction_configs = [
        {'info_type': info_type} for info_type in info_types
    ]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'min_likelihood': min_likelihood,
        'info_types': info_types,
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Select the content type index from the list of supported types;
    # anything not listed is sent as "Unspecified".
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        byte_item = {'type': content_type_index, 'data': f.read()}

    # Convert the project id into a full resource id.
    # NOTE(review): project_path() and the positional parent argument look
    # like the pre-2.0 client surface; confirm against the installed
    # google-cloud-dlp version.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item)

    # Write out the results.
    with open(output_filename, mode='wb') as f:
        f.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image),
        filename=output_filename))
def redact_image(project, filename, output_filename, info_types,
                 min_likelihood=None, mime_type=None):
    """Uses the Data Loss Prevention API to redact protected data in an image.

    NOTE(review): this file appears to define ``redact_image`` more than
    once; if they all live at module level, the last definition wins.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
            ``None`` is treated as an empty list.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the redacted image is written to ``output_filename`` and a
        summary line is printed to the terminal.
    """
    import mimetypes

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted). The None guard has to happen
    # here, before iteration, to be of any use.
    requested_types = info_types if info_types is not None else []
    info_types = [{'name': type_name} for type_name in requested_types]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted in this sample, so the default (black) will be used.
    image_redaction_configs = [
        {'info_type': info_type} for info_type in info_types
    ]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'min_likelihood': min_likelihood,
        'info_types': info_types,
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        guessed_type, _ = mimetypes.MimeTypes().guess_type(filename)
        mime_type = guessed_type or 'application/octet-stream'

    # Select the content type index from the list of supported types;
    # anything not listed is sent as "Unspecified".
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        byte_item = {'type': content_type_index, 'data': f.read()}

    # Convert the project id into a full resource id.
    # NOTE(review): project_path() and the positional parent argument look
    # like the pre-2.0 client surface; confirm against the installed
    # google-cloud-dlp version.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item)

    # Write out the results.
    with open(output_filename, mode='wb') as f:
        f.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image),
        filename=output_filename))
def redact_image(
    project,
    filename,
    info_types,
    custom_regexes=None,
    min_likelihood=None,
    mime_type=None,
):
    """Redact protected data from every page of a PDF via the DLP API.

    The PDF is rendered page by page to PNG images, each image is sent to
    the Data Loss Prevention API for redaction, and the redacted images are
    assembled into a new PDF. All artefacts are written below
    ``output/<basename of filename>/``:

    * ``pdftoImage/pageNNN.png``      - rendered pages
    * ``redactedImage/pageNNN.png``   - redacted pages
    * ``Redacted-<basename>.pdf``     - the rebuilt document

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the PDF file to redact.
        info_types: A list of strings representing info types to look for.
            ``None`` is treated as an empty list.
        custom_regexes: Accepted for interface compatibility but not applied:
            the inspection always uses the fixed ``PONumber`` digit pattern
            configured below.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type used for the page images. If not specified,
            it is inferred from the first page image's file name.

    Returns:
        None; progress and the output location are printed to the terminal.
    """
    import mimetypes
    import os

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted). Guard against None before
    # iterating.
    info_types = [{"name": info_type} for info_type in info_types or []]

    # One redaction config per info type. The color is omitted, so the
    # default (black) will be used.
    image_redaction_configs = [
        {"info_type": info_type} for info_type in info_types
    ]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "min_likelihood": min_likelihood,
        "info_types": info_types,
        "custom_info_types": [{
            "info_type": {"name": "PONumber"},
            # Raw string: "\d" is not a valid str escape sequence.
            "regex": {"pattern": r"[\d]"},
            "likelihood": "POSSIBLE",
        }],
    }

    pdf_name = os.path.basename(filename)
    output_root = "output/" + pdf_name
    page_dir = output_root + "/pdftoImage"
    redacted_dir = output_root + "/redactedImage"
    os.makedirs(page_dir, exist_ok=True)
    os.makedirs(redacted_dir, exist_ok=True)

    # --- Step 1: render every PDF page to a PNG -------------------------
    # The written paths are remembered in page order instead of re-listing
    # the directory later: os.listdir() returns entries in arbitrary order
    # and would also pick up stale images left by an earlier run.
    # NOTE(review): the camelCase PyMuPDF names below are kept as in the
    # original; confirm they exist in the installed PyMuPDF version.
    page_images = []
    doc = fitz.open(filename)
    try:
        num_pages = doc.pageCount
        print("Your document has " + str(num_pages) + " pages.")
        for page_num in range(num_pages):
            pix = doc.loadPage(page_num).getPixmap()
            page_path = (page_dir + "/page" + str(page_num).zfill(3) +
                         ".png")
            pix.writePNG(page_path)
            page_images.append(page_path)
    finally:
        doc.close()

    # --- Step 2: redact each page image ----------------------------------
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        "text/plain": 5,
    }

    # Convert the project id into a full resource id.
    # NOTE(review): project_path() and the positional parent argument look
    # like the pre-2.0 client surface; confirm against the installed
    # google-cloud-dlp version.
    parent = dlp.project_path(project)

    redacted_images = []
    for index, page_path in enumerate(page_images):
        # If mime_type is not specified, guess it (once) from the file name.
        if mime_type is None:
            mime_guess = mimetypes.MimeTypes().guess_type(str(page_path))
            mime_type = mime_guess[0] or "application/octet-stream"
        content_type_index = supported_content_types.get(mime_type, 0)

        # Construct the byte_item, containing the file's byte data.
        with open(page_path, mode="rb") as page_file:
            byte_item = {"type": content_type_index,
                         "data": page_file.read()}

        # Call the API.
        response = dlp.redact_image(
            parent,
            inspect_config=inspect_config,
            image_redaction_configs=image_redaction_configs,
            byte_item=byte_item,
        )

        # Write out the results.
        redacted_path = redacted_dir + "/" + os.path.basename(page_path)
        with open(redacted_path, mode="wb") as redacted_file:
            redacted_file.write(response.redacted_image)
        redacted_images.append(redacted_path)
        print("Redacted Page : " + str(index + 1))

    # --- Step 3: assemble the redacted images into a new PDF -------------
    redacted_pdf_path = output_root + "/Redacted-" + pdf_name + ".pdf"
    redacted_doc = fitz.open()
    try:
        for redacted_path in redacted_images:
            img = fitz.open(redacted_path)  # open pic as document
            rect = img[0].rect  # pic dimension
            pdfbytes = img.convertToPDF()  # make a PDF stream
            img.close()  # no longer needed
            img_pdf = fitz.open("pdf", pdfbytes)  # open stream as PDF
            page = redacted_doc.newPage(
                width=rect.width,  # new page with ...
                height=rect.height)  # pic dimension
            page.showPDFpage(rect, img_pdf, 0)  # image fills the page
        redacted_doc.save(redacted_pdf_path)
    finally:
        redacted_doc.close()

    print("Document generated Successfully in " + redacted_pdf_path)
def redact_imageTypes(project, filename, output_filename, info_types,
                      min_likelihood=None, mime_type=None):
    """Redact every ``ALL_BASIC`` finding from an image using the DLP API.

    NOTE(review): this function does not create a client; it relies on a
    ``dlp`` name being defined outside of it. Confirm that a module-level
    client exists.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: Ignored; the info types are always replaced by the single
            ``ALL_BASIC`` entry below.
        min_likelihood: The minimum likelihood threshold that constitutes a
            match, passed through to the inspect configuration.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the redacted image is written to ``output_filename`` and
        'Written' is printed to the terminal.
    """
    import mimetypes

    # The caller-supplied info_types are deliberately overridden here.
    info_types = [{"name": "ALL_BASIC"}]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted in this sample, so the default (black) will be used.
    image_redaction_configs = [
        {'info_type': info_type} for info_type in info_types
    ]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'min_likelihood': min_likelihood,
        'info_types': info_types
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Select the content type index from the list of supported types;
    # anything not listed is sent as "Unspecified".
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data. The raw
    # (unredacted) bytes are intentionally not echoed to the terminal.
    with open(filename, mode='rb') as f:
        byte_item = {'type': content_type_index, 'data': f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item)

    # Write out the results. The redacted bytes are read from
    # ``redacted_image``, the same response field every other redaction
    # helper in this file uses.
    with open(output_filename, mode='wb') as f:
        f.write(response.redacted_image)
    print('Written')