def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    This is a generator that yields exactly once: the yielded object is a temporary file
    that the caller fills with plaintext.  When the generator is resumed, the plaintext is
    encrypted and the encrypted bytes are copied into `output_file`.  It is presumably
    meant to be driven as a context manager (e.g. through `contextlib.contextmanager`
    applied outside this block) -- confirm against the decorator at the definition site.

    Parameters:
        output_file:  a file object, opened for writing.
        key_file_targets:  a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.
    """
    with make_temp_directory(prefix="encrypt") as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg, key_file_targets)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # delete=False keeps the file on disk after it is closed, so that it can be
        # reopened below for encryption.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # The plaintext was written through NamedTemporaryFile's default binary mode, so
        # read it back as bytes too: text mode could mangle or fail to decode payloads
        # that are not plain text (e.g. compressed archives).
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        _copy_file_to_open_file(temp_encrypted_filepath, output_file, progress)
def make_encrypted_file(output_file, key_file_targets, recipients=None):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    This is a generator that yields exactly once: the yielded object is a temporary file
    that the caller fills with plaintext.  When the generator is resumed, the plaintext is
    encrypted and the encrypted bytes are copied into `output_file`.  It is presumably
    meant to be driven as a context manager (e.g. through `contextlib.contextmanager`
    applied outside this block) -- confirm against the decorator at the definition site.

    Parameters:
        output_file:  a file object, opened for writing.
        key_file_targets:  a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
    """
    with make_temp_directory(prefix="encrypt") as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg, key_file_targets)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # delete=False keeps the file on disk after it is closed, so that it can be
        # reopened below for encryption.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # The plaintext was written through NamedTemporaryFile's default binary mode, so
        # read it back as bytes too: text mode could mangle or fail to decode payloads
        # that are not plain text (e.g. compressed archives).
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        _copy_file_to_open_file(temp_encrypted_filepath, output_file)
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None, dir=None,
                        hadoop_counter_incr_func=DEFAULT_HADOOP_COUNTER_FUNC):
    """
    Creates a file object to be written to, whose contents will afterwards be encrypted.

    This is a generator that yields exactly once: the yielded object is a temporary file
    that the caller fills with plaintext.  When the generator is resumed, the plaintext is
    encrypted and the encrypted bytes are copied into `output_file`.  It is presumably
    meant to be driven as a context manager (e.g. through `contextlib.contextmanager`
    applied outside this block) -- confirm against the decorator at the definition site.

    Parameters:
        output_file:  a file object, opened for writing.
        key_file_targets:  a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.
        dir:  passed through to make_temp_directory as its `dir` argument (presumably the parent
            directory in which the temporary working directory is created -- confirm).
        hadoop_counter_incr_func:  A callback to a function that can generate MR counters so that
            non-critical GPG messages can be promoted to a visible section of the MR run log.
    """
    with make_temp_directory(prefix="encrypt", dir=dir) as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(
            gpg_instance=gpg,
            key_file_targets=key_file_targets,
            hadoop_counter_incr_func=hadoop_counter_incr_func,
        )

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # delete=False keeps the file on disk after it is closed, so that it can be
        # reopened below for encryption.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # The plaintext was written through NamedTemporaryFile's default binary mode, so
        # read it back as bytes too: text mode could mangle or fail to decode payloads
        # that are not plain text (e.g. compressed archives).
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
def run(self):
    """
    Repackage the input course archive after cleaning its contents.

    The gzipped tar read from the first 'data' input is unpacked into a temporary
    directory, the clean-up methods (drafts, course policy, XML files) are run over the
    extracted tree, and the tree is then written to the task output as a new gzipped tar.
    """
    with self.output().open('w') as output_file, self.input()['data'][0].open('r') as input_file:
        with make_temp_directory(prefix='obfuscate-course.') as tmp_directory:
            with tempfile.TemporaryFile() as local_copy:
                # We cannot seek in HDFS streams, so copy the file to the local disk before extracting
                copy_file_to_file(input_file, local_copy)
                local_copy.flush()
                local_copy.seek(0)
                with tarfile.open(mode='r:gz', fileobj=local_copy) as archive:
                    archive.extractall(tmp_directory)

            # The first entry of the extraction directory is used as the course root.
            course_root = os.path.join(tmp_directory, os.listdir(tmp_directory)[0])

            self.clean_drafts(course_root)

            package_ref = self.read_course_package_ref(course_root)
            self.clean_course_policy(
                package_ref,
                os.path.join(course_root, 'policies', package_ref, 'policy.json'),
            )

            self.clean_xml_files(course_root)

            # arcname='' stores the tree relative to the temp directory rather than under its path.
            with tarfile.open(mode='w:gz', fileobj=output_file) as repacked_archive:
                repacked_archive.add(tmp_directory, arcname='')
def get_decrypted_data(self, input_file, key_file_target):
    """
    Decrypts the contents of `input_file` and returns the result.

    Parameters:
        input_file:  the file object handed to gpg's `decrypt_file`.
        key_file_target:  the single key file target to import before decrypting.

    Returns:
        The object returned by `decrypt_file` (called with `always_trust=True`).
    """
    with make_temp_directory(prefix="decrypt") as keyring_dir:
        # A throwaway directory serves as the gpg home, so keys are imported there only.
        gpg = gnupg.GPG(gnupghome=keyring_dir)
        _import_key_files(gpg, [key_file_target])
        return gpg.decrypt_file(input_file, always_trust=True)
def run(self):
    """
    Bundle all course files into a gzipped tar archive and write it, encrypted, to the output.

    Every target found under `self.course_files_url` is copied into a local temporary
    directory, preserving its path relative to the course files location.  The directory
    is then tarred and gzipped into the file yielded by `make_encrypted_file`, which
    writes the encrypted result to this task's output.

    The recipients are `self.recipient` plus `self.gpg_master_key` when one is set; the key
    file for each recipient is looked up by name under `self.gpg_key_dir`.
    """
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])

    # Get path without urlscheme.  It is the same for every target, so compute it once
    # instead of re-parsing the url on each loop iteration.
    course_files_path = urlparse.urlparse(self.course_files_url).path

    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # NOTE(review): find() returns -1 when course_files_path does not occur in
                # target.path; that case is not checked here -- confirm it cannot happen.
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    # An already-existing directory is expected when several files share it.
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file,
                key_file_targets,
                progress=report_encrypt_progress,
                dir=self.temporary_dir
            ) as encrypted_output_file:
                # arcname='' stores the files relative to the temp directory rather than under its path.
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
def run(self):
    """
    Archive the course files and write the archive, encrypted, to this task's output.

    Each target found under `self.course_files_url` is first copied to a local temporary
    directory at its path relative to the course files location.  That directory is then
    added to a gzipped tar written into the file yielded by `make_encrypted_file`, which
    is given this task's output file as its destination.

    Keys are loaded for every entry of `self.recipient`, plus `self.gpg_master_key` when it
    is not None; each key file is resolved by recipient name under `self.gpg_key_dir`.
    """
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])

    # Get path without urlscheme.  This value does not depend on the target being
    # processed, so it is parsed once here rather than once per file.
    course_files_path = urlparse.urlparse(self.course_files_url).path
    prefix_length = len(course_files_path)

    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # NOTE(review): a target.path that does not contain course_files_path makes
                # find() return -1, which is not checked here -- confirm it cannot happen.
                r_index = target.path.find(course_files_path) + prefix_length
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    # Only "directory already exists" is tolerated; anything else is re-raised.
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file,
                key_file_targets,
                progress=report_encrypt_progress,
                dir=self.temporary_dir
            ) as encrypted_output_file:
                # arcname='' keeps archive member names relative to the temp directory.
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')