def validate_obfuscation(self): """Validates obfuscation workflow.""" output_target = self.get_targets_from_remote_path(self.test_out, "*.tar.gz.gpg")[0] output_filename = os.path.basename(output_target.path) temp_output_filepath = os.path.join(self.temporary_dir, output_filename) with output_target.open("r") as input_file: with open(temp_output_filepath, "w") as output_file: copy_file_to_file(input_file, output_file) decrypted_filepath = temp_output_filepath[: -len(".gpg")] fs.decrypt_file(temp_output_filepath, decrypted_filepath, "insecure_secret.key") with tarfile.open(decrypted_filepath, "r:gz") as tfile: tfile.extractall(self.temporary_dir) # Validate package metadata info. metadata_filepath = os.path.join(self.temporary_dir, "metadata_file.json") with open(metadata_filepath) as metadata_file: metadata_info = json.load(metadata_file) self.assertItemsEqual(metadata_info["format_version"], self.FORMAT_VERSION) self.assertItemsEqual(metadata_info["pipeline_version"], self.PIPELINE_VERSION) self.validate_data_obfuscation() self.validate_events_obfuscation()
@contextmanager
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None, dir=None,
                        hadoop_counter_incr_func=DEFAULT_HADOOP_COUNTER_FUNC):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    Parameters:
        output_file: a file object, opened for writing.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients: an optional list of recipients to be loaded. If not specified, uses all loaded keys.
        progress: a function that is called periodically as progress is made.
        dir: an optional parent directory in which to create the temporary working directory.
        hadoop_counter_incr_func: a callback to a function that can generate MR counters, so that
            non-critical GPG messages can be promoted to a visible section of the MR run log.
    """
    with make_temp_directory(prefix="encrypt", dir=dir) as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg_instance=gpg, key_file_targets=key_file_targets,
                          hadoop_counter_incr_func=hadoop_counter_incr_func)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)

        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]
        with open(temp_input_filepath, 'r') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)

        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
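# A minimal usage sketch of make_encrypted_file(), assuming it is wrapped with
# contextlib.contextmanager (as the yield above and the `with` usage elsewhere in these snippets
# suggest). The key file URL and the output path below are hypothetical placeholders.
def write_encrypted_report_example():
    """Write some plaintext lines that are GPG-encrypted into the output file on exit."""
    key_file_targets = [get_target_from_url('s3://example-bucket/gpg-keys/analyst@example.com.key')]
    with open('/tmp/report.csv.gpg', 'w') as output_file:
        with make_encrypted_file(output_file, key_file_targets) as plaintext_file:
            # Everything written here is encrypted to the loaded keys when the block exits.
            plaintext_file.write('user_id,count\n')
            plaintext_file.write('12345,7\n')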
def run(self):
    with self.output().open('w') as output_file:
        with self.input()['data'][0].open('r') as input_file:
            with make_temp_directory(prefix='obfuscate-course.') as tmp_directory:
                with tempfile.TemporaryFile() as temp_input_file:
                    # We cannot seek in HDFS streams, so copy the file to the local disk before extracting.
                    copy_file_to_file(input_file, temp_input_file)
                    temp_input_file.flush()
                    temp_input_file.seek(0)

                    with tarfile.open(mode='r:gz', fileobj=temp_input_file) as course_archive:
                        course_archive.extractall(tmp_directory)

                    course_dir = os.listdir(tmp_directory)[0]
                    root_dir = os.path.join(tmp_directory, course_dir)

                    self.clean_drafts(root_dir)

                    course_package_ref = self.read_course_package_ref(root_dir)
                    policy_file_path = os.path.join(root_dir, 'policies', course_package_ref, 'policy.json')
                    self.clean_course_policy(course_package_ref, policy_file_path)

                    self.clean_xml_files(root_dir)

                    with tarfile.open(mode='w:gz', fileobj=output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
def validate_obfuscation(self):
    """Validates obfuscation workflow."""
    output_target = PathSetTask([self.test_out], ['*.tar.gz.gpg']).output()[0]
    output_filename = os.path.basename(output_target.path)
    output_filepath = os.path.join(self.temporary_dir, output_filename)
    if output_target.path.startswith('s3://'):
        output_target = get_target_from_url(output_target.path.replace('s3://', 's3+https://'))

    with output_target.open('r') as input_file:
        with open(output_filepath, 'w') as output_file:
            copy_file_to_file(input_file, output_file)

    decrypted_filepath = output_filepath[:-len('.gpg')]
    fs.decrypt_file(output_filepath, decrypted_filepath, 'insecure_secret.key')

    with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
        tfile.extractall(self.temporary_dir)

    # Validate package metadata info.
    metadata_filepath = os.path.join(self.temporary_dir, 'metadata_file.json')
    with open(metadata_filepath) as metadata_file:
        metadata_info = json.load(metadata_file)
    self.assertItemsEqual(metadata_info['format_version'], self.FORMAT_VERSION)
    self.assertItemsEqual(metadata_info['pipeline_version'], self.PIPELINE_VERSION)

    self.validate_data_obfuscation()
    self.validate_events_obfuscation()
@contextmanager
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    Parameters:
        output_file: a file object, opened for writing.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients: an optional list of recipients to be loaded. If not specified, uses all loaded keys.
        progress: a function that is called periodically as progress is made.
    """
    with make_temp_directory(prefix="encrypt") as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg, key_file_targets)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)

        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]
        with open(temp_input_filepath, 'r') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)

        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
def run(self):
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])

    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get the path without the urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculate the target's path relative to course_files_path by taking the substring
                # that occurs after the course_files_path substring in the target's path.
                # This is needed because target.path includes the urlscheme for an S3 target but
                # not for an HDFS target.
                # Examples:
                #   target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                #   relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                #   target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                #   relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(output_file, key_file_targets, progress=report_encrypt_progress,
                                     dir=self.temporary_dir) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
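# A small, self-contained illustration of the relative-path computation above, using the example
# paths from the comment. The course_files_url values are hypothetical; the slicing is the same
# find/len/lstrip sequence used in run(). Assumes the Python 2 urlparse module imported by the
# surrounding code.
def relative_path_examples():
    """Print the relative path derived from each (course_files_url, target path) pair."""
    examples = [
        ('hdfs:///pipeline/output/edX_Demo_Course',
         '/pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz'),
        ('s3://some_bucket/output/edX_Demo_Course',
         's3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql'),
    ]
    for course_files_url, target_path in examples:
        course_files_path = urlparse.urlparse(course_files_url).path
        r_index = target_path.find(course_files_path) + len(course_files_path)
        print(target_path[r_index:].lstrip('/'))
        # -> events/edX_Demo_Course-events-2015-08-30.log.gz
        # -> state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql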