def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None):
    """
    Yields a temporary file object to be written to, whose contents are encrypted afterwards.

    This is a generator that yields exactly once.  The code after the yield only runs when the
    generator is resumed, so it is presumably meant to be driven as a context manager (the
    wrapping decorator is not visible in this block -- confirm).

    Parameters:
        output_file:  a file object, opened for writing.  Receives the encrypted data.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.  It is handed on
            unchanged to _copy_file_to_open_file.
    """
    with make_temp_directory(prefix="encrypt") as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg, key_file_targets)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # delete=False keeps the file on disk after it is closed, so it can be reopened below.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # The temp file was written in binary mode (the NamedTemporaryFile default), so it is
        # read back in binary mode too.  Text mode can corrupt or fail to decode arbitrary bytes.
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        _copy_file_to_open_file(temp_encrypted_filepath, output_file, progress)
def make_encrypted_file(output_file, key_file_targets, recipients=None):
    """
    Yields a temporary file object to be written to, whose contents are encrypted afterwards.

    The function is a single-yield generator: everything before the yield prepares a gpg home
    and a scratch file, everything after it encrypts that scratch file and copies the result
    into `output_file`.  It is presumably wrapped as a context manager outside this block
    (decorator not visible here -- confirm).

    Parameters:
        output_file:  a file object, opened for writing.  Receives the encrypted data.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
    """
    with make_temp_directory(prefix="encrypt") as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg, key_file_targets)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # It must survive being closed (delete=False) because it is reopened for encryption.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # Reopen in binary mode to match how the data was written (NamedTemporaryFile defaults
        # to binary); text mode is not safe for arbitrary byte content.
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)
        _copy_file_to_open_file(temp_encrypted_filepath, output_file)
예제 #3
0
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None, dir=None,
                        hadoop_counter_incr_func=DEFAULT_HADOOP_COUNTER_FUNC):
    """
    Yields a temporary file object to be written to, whose contents are encrypted afterwards.

    This is a generator that yields exactly once.  Encryption and the copy into `output_file`
    happen only when the generator is resumed after the yield, so it is presumably meant to be
    driven as a context manager (the wrapping decorator is not visible in this block -- confirm).

    Parameters:
        output_file:  a file object, opened for writing.  Receives the encrypted data.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.
        dir:  passed to make_temp_directory as its `dir` argument when creating the working directory.
        hadoop_counter_incr_func:  A callback to a function that can generate MR counters so that non-critical GPG
            messages can be promoted to a visible section of the MR run log.
    """
    with make_temp_directory(prefix="encrypt", dir=dir) as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg_instance=gpg, key_file_targets=key_file_targets,
                          hadoop_counter_incr_func=hadoop_counter_incr_func)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # delete=False keeps the file on disk after it is closed, so it can be reopened below.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # The temp file was written in binary mode (the NamedTemporaryFile default), so it is
        # read back in binary mode too.  Text mode can corrupt or fail to decode arbitrary bytes.
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)

        # NOTE(review): the encrypted file is still opened in the default (text) mode.  If
        # _encrypt_file writes non-armored output this should probably be 'rb' as well --
        # confirm against the mode output_file is opened in before changing it.
        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
    def run(self):
        """
        Unpack the input course archive, run the cleaning steps on it, and re-pack it.

        The gzipped tar read from the first 'data' input is copied to a local temporary file,
        extracted into a temporary directory, passed through clean_drafts, clean_course_policy
        and clean_xml_files, and finally written to the task output as a new gzipped tar.
        """
        with self.output().open('w') as archive_out:
            with self.input()['data'][0].open('r') as archive_in:
                with make_temp_directory(prefix='obfuscate-course.') as work_dir:
                    with tempfile.TemporaryFile() as local_copy:
                        # HDFS streams are not seekable, so the archive is first copied to
                        # local disk and rewound before tarfile reads it.
                        copy_file_to_file(archive_in, local_copy)
                        local_copy.flush()
                        local_copy.seek(0)

                        with tarfile.open(mode='r:gz', fileobj=local_copy) as source_tar:
                            source_tar.extractall(work_dir)

                        # The first entry listed in the extraction directory is used as the
                        # course root.
                        top_level_name = os.listdir(work_dir)[0]
                        course_root = os.path.join(work_dir, top_level_name)

                        self.clean_drafts(course_root)

                        package_ref = self.read_course_package_ref(course_root)
                        policy_path = os.path.join(course_root, 'policies', package_ref, 'policy.json')
                        self.clean_course_policy(package_ref, policy_path)

                        self.clean_xml_files(course_root)

                        # Archive the whole working directory; arcname='' stores its contents
                        # without the temporary directory's own path as a prefix.
                        with tarfile.open(mode='w:gz', fileobj=archive_out) as result_tar:
                            result_tar.add(work_dir, arcname='')
예제 #5
0
    def run(self):
        """
        Produce a cleaned copy of the course archive found in the first 'data' input.

        Steps: copy the gzipped tar to a seekable local file, extract it into a scratch
        directory, apply clean_drafts / clean_course_policy / clean_xml_files to the extracted
        course, then write the scratch directory to the task output as a gzipped tar.
        """
        # The four resources are acquired in the order listed and released in reverse.
        with self.output().open('w') as destination, \
                self.input()['data'][0].open('r') as source, \
                make_temp_directory(prefix='obfuscate-course.') as scratch_dir, \
                tempfile.TemporaryFile() as seekable_source:
            # We cannot seek in HDFS streams, so the archive is staged on local disk first.
            copy_file_to_file(source, seekable_source)
            seekable_source.flush()
            seekable_source.seek(0)

            with tarfile.open(mode='r:gz', fileobj=seekable_source) as incoming:
                incoming.extractall(scratch_dir)

            # Course root = first entry that os.listdir reports for the scratch directory.
            extracted_entries = os.listdir(scratch_dir)
            root = os.path.join(scratch_dir, extracted_entries[0])

            self.clean_drafts(root)

            ref = self.read_course_package_ref(root)
            self.clean_course_policy(ref, os.path.join(root, 'policies', ref, 'policy.json'))

            self.clean_xml_files(root)

            with tarfile.open(mode='w:gz', fileobj=destination) as outgoing:
                outgoing.add(scratch_dir, arcname='')
예제 #6
0
def make_encrypted_file(output_file, key_file_targets, recipients=None, progress=None, dir=None,
                        hadoop_counter_incr_func=DEFAULT_HADOOP_COUNTER_FUNC):
    """
    Yields a temporary file object to be written to, whose contents are encrypted afterwards.

    The function is a single-yield generator: the part before the yield sets up a gpg home and a
    scratch file, the part after it encrypts the scratch file and copies the result into
    `output_file`.  It is presumably wrapped as a context manager outside this block (decorator
    not visible here -- confirm).

    Parameters:
        output_file:  a file object, opened for writing.  Receives the encrypted data.
        key_file_targets: a list of luigi.Target objects defining the gpg public key files to be loaded.
        recipients:  an optional list of recipients to be loaded.  If not specified, uses all loaded keys.
        progress:  a function that is called periodically as progress is made.
        dir:  forwarded to make_temp_directory as its `dir` argument for the working directory.
        hadoop_counter_incr_func:  A callback to a function that can generate MR counters so that non-critical GPG
            messages can be promoted to a visible section of the MR run log.
    """
    with make_temp_directory(prefix="encrypt", dir=dir) as temp_dir:
        # Use temp directory to hold gpg keys.
        gpg = gnupg.GPG(gnupghome=temp_dir)
        gpg.encoding = 'utf-8'
        _import_key_files(gpg_instance=gpg, key_file_targets=key_file_targets,
                          hadoop_counter_incr_func=hadoop_counter_incr_func)

        # Create a temp file to contain the unencrypted output, in the same temp directory.
        # It must survive being closed (delete=False) because it is reopened for encryption.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as temp_input_file:
            temp_input_filepath = temp_input_file.name
            log.info('Writing data to temporary file: %s', temp_input_filepath)
            yield temp_input_file

        # Encryption produces a second file in the same temp directory.
        temp_encrypted_filepath = "{filepath}.gpg".format(filepath=temp_input_filepath)
        if recipients is None:
            recipients = [key['keyid'] for key in gpg.list_keys()]

        # Reopen in binary mode to match how the data was written (NamedTemporaryFile defaults
        # to binary); text mode is not safe for arbitrary byte content.
        with open(temp_input_filepath, 'rb') as temp_input_file:
            _encrypt_file(gpg, temp_input_file, temp_encrypted_filepath, recipients)

        # NOTE(review): the ciphertext is read back in the default (text) mode.  Whether 'rb'
        # is appropriate depends on the output format of _encrypt_file and on the mode of
        # output_file, neither of which is visible here -- confirm before changing.
        with open(temp_encrypted_filepath) as temp_encrypted_file:
            copy_file_to_file(temp_encrypted_file, output_file, progress)
 def get_decrypted_data(self, input_file, key_file_target):
     """
     Decrypts the contents of input_file and returns the result.

     The key from key_file_target is imported into a gpg home created inside a temporary
     directory, and the value returned is whatever gpg's decrypt_file call produces.
     """
     with make_temp_directory(prefix="decrypt") as gpg_home:
         # The gpg keyring lives in the temporary directory.
         gpg = gnupg.GPG(gnupghome=gpg_home)
         _import_key_files(gpg, [key_file_target])
         return gpg.decrypt_file(input_file, always_trust=True)
 def get_decrypted_data(self, input_file, key_file_target):
     """
     Returns the result of decrypting input_file with the key loaded from key_file_target.

     A throwaway gpg home is created in a temporary directory to hold the imported key.
     """
     with make_temp_directory(prefix="decrypt") as keyring_dir:
         decryptor = gnupg.GPG(gnupghome=keyring_dir)
         key_targets = [key_file_target]
         _import_key_files(decryptor, key_targets)
         # always_trust=True is passed straight through to gpg's decrypt_file.
         result = decryptor.decrypt_file(input_file, always_trust=True)
         return result
    def run(self):
        """
        Stage all course files locally, then write them to the output as an encrypted tar.gz.

        Key files are looked up under self.gpg_key_dir for every entry in self.recipient plus
        self.gpg_master_key when it is set.  Each file matched under self.course_files_url is
        copied into a temporary directory at its path relative to that URL, and the directory
        is then archived through make_encrypted_file into the task output.
        """
        key_names = set(self.recipient)
        if self.gpg_master_key is not None:
            key_names.add(self.gpg_master_key)
        key_file_targets = []
        for key_name in key_names:
            key_file_targets.append(get_target_from_url(url_path_join(self.gpg_key_dir, key_name)))

        source_files = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as staging_dir:
            for target in source_files.output():
                with target.open('r') as remote_file:
                    # Path component of the source URL, i.e. without the url scheme.
                    url_path = urlparse.urlparse(self.course_files_url).path
                    # The relative path is whatever follows url_path inside target.path.  A
                    # substring search is used because target.path carries the url scheme for
                    # s3 targets but not for hdfs targets.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # NOTE(review): find() returns -1 when url_path is absent, which would
                    # silently shift the slice start -- confirm that cannot happen.
                    prefix_end = target.path.find(url_path) + len(url_path)
                    relative_path = target.path[prefix_end:].lstrip('/')

                    local_path = os.path.join(staging_dir, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_path))
                    except OSError as mkdir_error:
                        # An already-existing directory is fine; anything else is re-raised.
                        if mkdir_error.errno != errno.EEXIST:
                            raise
                    with open(local_path, 'w') as local_file:
                        copy_file_to_file(remote_file, local_file)

            def log_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(output_file, key_file_targets, progress=log_progress,
                                         dir=self.temporary_dir) as plaintext_sink:
                    with tarfile.open(mode='w:gz', fileobj=plaintext_sink) as archive:
                        archive.add(staging_dir, arcname='')
    def run(self):
        """
        Collect the course files into a local directory and emit them as an encrypted archive.

        The gpg key targets are built from self.recipient (plus self.gpg_master_key if set)
        under self.gpg_key_dir.  Files found under self.course_files_url are copied locally,
        preserving their path relative to that URL, then tarred and gzipped into the file
        object yielded by make_encrypted_file, which writes to the task output.
        """
        def report_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        all_recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            all_recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, name)) for name in all_recipients
        ]

        course_files = PathSetTask([self.course_files_url], ['*.*']).output()
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as local_root:
            for target in course_files:
                with target.open('r') as source:
                    # Strip the url scheme from the configured location.
                    base_path = urlparse.urlparse(self.course_files_url).path
                    # target.path includes the url scheme for s3targets and omits it for
                    # hdfstargets, so the relative path is taken as the text that follows
                    # base_path within target.path.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    start = target.path.find(base_path) + len(base_path)
                    relative_path = target.path[start:].lstrip('/')

                    destination_path = os.path.join(local_root, relative_path)
                    destination_dir = os.path.dirname(destination_path)
                    try:
                        os.makedirs(destination_dir)
                    except OSError as error:
                        # Only "directory already exists" is tolerated.
                        if error.errno != errno.EEXIST:
                            raise
                    with open(destination_path, 'w') as destination:
                        copy_file_to_file(source, destination)

            with self.output().open('w') as output_file:
                encrypted = make_encrypted_file(
                    output_file, key_file_targets, progress=report_progress, dir=self.temporary_dir
                )
                with encrypted as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as tarball:
                        tarball.add(local_root, arcname='')