def sendToDLP(transcript, projectID):
    """De-identify sensitive data in a transcript with the Cloud DLP API.

    Spoken e-mail addresses of the form "name at example.com" are first
    rewritten to "name@example.com", then the text is submitted to the DLP
    ``deidentify_content`` call configured with ``replace_with_info_type_config``
    for the info types listed below. The text returned by DLP is printed to
    stdout.

    Note: the " at " rewrite is a heuristic; any "<word> at <lowercase
    domain>" sequence (e.g. "look at example.com") is rewritten as well.

    Args:
        transcript: Text to inspect, e.g. a speech-to-text transcript.
        projectID: Google Cloud project ID used to build the DLP parent path.

    Returns:
        None. The de-identified text is printed.
    """
    dlpClient = dlp.DlpServiceClient()
    parent = dlpClient.project_path(projectID)

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [
        'PHONE_NUMBER',
        'EMAIL_ADDRESS',
        'CREDIT_CARD_NUMBER',
        'US_SOCIAL_SECURITY_NUMBER',
    ]
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'replace_with_info_type_config': {}
                }
            }]
        }
    }

    # Rewrite "local at domain" as "local@domain". The pattern must not
    # consume the character in front of the local part: the previous version
    # began with a bare "." and replaced that character with a space, which
    # ate the first letter of an address at the very start of the transcript
    # and overwrote punctuation/newlines preceding an address.
    regex = (
        r"([A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+"
        r"(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*)"  # group 1: local part
        r"(\sat\s+)"                                 # group 2: spoken " at "
        r"((?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+"
        r"[a-z0-9](?:[a-z0-9-]*[a-z0-9]))"           # group 3: domain
    )
    updatedTranscript = re.sub(regex, r"\1@\3", transcript)

    item = {'value': updatedTranscript}

    # Call the API
    dlpResponse = dlpClient.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item)

    # Print out the results.
    print(dlpResponse.item.value)
# User-configurable constants for the DLP findings setup. The bare string
# literals below are used by the original author as descriptive notes for the
# constant that follows each of them.
"""The maximum number of findings to report (0 = server maximum)"""
MAX_FINDINGS = 0
"""The infoTypes of information to match"""
"""For more info visit: https://cloud.google.com/dlp/docs/concepts-infotypes"""
INFO_TYPES = [
    'FIRST_NAME', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'US_SOCIAL_SECURITY_NUMBER'
]
# Placeholders: replace the bracketed values before use.
PROJECT_ID = '[PROJECT_ID_FOR_DLP_FINDINGS]'
DATASET_ID = '[DATASET_ID_FOR_DLP_FINDINGS]'
TABLE_ID = '[TABLE_ID_FOR_DLP_FINDINGS]'
# End of User-configurable Constants
# ----------------------------------
# Initialize the Google Cloud client libraries
# NOTE(review): the next line rebinds the name `dlp` to a client instance.
# If `dlp` was the imported module (not visible here), any later
# `dlp.DlpServiceClient()` call in this module would fail -- confirm this is
# intended.
dlp = dlp.DlpServiceClient()
storage_client = storage.Client()
publisher = pubsub.PublisherClient()
subscriber = pubsub.SubscriberClient()


def create_DLP_job(data, done):
    """This function is triggered by new files uploaded to the designated
    Cloud Storage quarantine/staging bucket. It creates a dlp job for the
    uploaded file.

    Args:
        data: The Cloud Storage Event
        done: Not described by the original author; its role is not visible
            in this part of the function -- TODO confirm.

    Returns:
        None. Debug information is printed to the log.
    """
    # Get the targeted file in the quarantine bucket
def deidentify(file_name, projectID):
    """Transcribe an audio file and de-identify sensitive data in the text.

    The audio is sent to Speech-to-Text (configured for FLAC, 16 kHz,
    'en-US'). Spoken e-mail addresses of the form "name at example.com" in
    the transcript are rewritten to "name@example.com", and the result is
    submitted to the DLP ``deidentify_content`` call configured with
    ``replace_with_info_type_config`` for the info types listed below. The
    original transcript, the reformatted transcript and the text returned by
    DLP are each printed to stdout.

    Note: the " at " rewrite is a heuristic; any "<word> at <lowercase
    domain>" sequence (e.g. "look at example.com") is rewritten as well.

    Args:
        file_name: Path of the audio file to transcribe.
        projectID: Google Cloud project ID used to build the DLP parent path.

    Returns:
        None. All results are printed.
    """
    # Instantiates the clients
    speechClient = speech.SpeechClient()
    dlpClient = dlp.DlpServiceClient()
    parent = dlpClient.project_path(projectID)

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [
        'PHONE_NUMBER',
        'EMAIL_ADDRESS',
        'CREDIT_CARD_NUMBER',
        'US_SOCIAL_SECURITY_NUMBER',
    ]
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'replace_with_info_type_config': {}
                }
            }]
        }
    }

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file; the top alternative of every result
    # is concatenated (no separator) into one transcript.
    response = speechClient.recognize(config, audio)
    transcript = "".join(
        result.alternatives[0].transcript for result in response.results
    )
    print('Original Transcript: {}'.format(transcript))

    # Check transcription for email address, since speech-to-text returns
    # " at " instead of "@". Format with regex before sending to DLP api.
    # Currently social security numbers and credit card numbers are
    # interpreted as phone numbers.
    #
    # The pattern must not consume the character in front of the local part:
    # the previous version began with a bare "." and replaced that character
    # with a space, which ate the first letter of an address at the very
    # start of the transcript and overwrote punctuation preceding an address.
    regex = (
        r"([A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+"
        r"(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*)"  # group 1: local part
        r"(\sat\s+)"                                 # group 2: spoken " at "
        r"((?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+"
        r"[a-z0-9](?:[a-z0-9-]*[a-z0-9]))"           # group 3: domain
    )
    updatedTranscript = re.sub(regex, r"\1@\3", transcript)
    print('Email addresses reformatted: {}'.format(updatedTranscript))

    # Construct item
    item = {'value': updatedTranscript}

    # Call the API
    dlpResponse = dlpClient.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item)

    # Print out the results.
    print('Final Result with sensitive content redacted: {}'.format(
        dlpResponse.item.value))