def validate_obfuscation(self):
    """Validates the obfuscation workflow."""
    output_target = PathSetTask([self.test_out], ['*.tar.gz.gpg']).output()[0]
    output_filename = os.path.basename(output_target.path)
    output_filepath = os.path.join(self.temporary_dir, output_filename)

    if output_target.path.startswith('s3://'):
        output_target = get_target_from_url(output_target.path.replace('s3://', 's3+https://'))

    with output_target.open('r') as input_file:
        with open(output_filepath, 'w') as output_file:
            copy_file_to_file(input_file, output_file)

    decrypted_filepath = output_filepath[:-len('.gpg')]
    fs.decrypt_file(output_filepath, decrypted_filepath, 'insecure_secret.key')

    with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
        tfile.extractall(self.temporary_dir)

    # Validate package metadata info.
    metadata_filepath = os.path.join(self.temporary_dir, 'metadata_file.json')
    with open(metadata_filepath) as metadata_file:
        metadata_info = json.load(metadata_file)
    self.assertItemsEqual(metadata_info['format_version'], self.FORMAT_VERSION)
    self.assertItemsEqual(metadata_info['pipeline_version'], self.PIPELINE_VERSION)

    self.validate_data_obfuscation()
    self.validate_events_obfuscation()
def run(self):
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get the path without the URL scheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path

                # Calculate the target's path relative to course_files_path by taking the substring
                # that follows course_files_path within the target's path. This is needed because
                # target.path includes the URL scheme for an S3 target but not for an HDFS target.
                # Examples:
                #   target.path:   /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                #   relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                #   target.path:   s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                #   relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(output_file, key_file_targets, progress=report_encrypt_progress,
                                     dir=self.temporary_dir) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
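The relative-path arithmetic in the comments above is easiest to verify standalone. A minimal sketch, using the illustrative paths from those comments and assuming course_files_url points at the course directory:

import urlparse  # Python 2; use urllib.parse on Python 3

# Illustrative values taken from the comments above.
course_files_url = 's3://some_bucket/output/edX_Demo_Course'
course_files_path = urlparse.urlparse(course_files_url).path  # '/output/edX_Demo_Course'

target_path = 's3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql'
r_index = target_path.find(course_files_path) + len(course_files_path)
print(target_path[r_index:].lstrip('/'))  # state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql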
def get_targets_from_remote_path(remote_path, pattern='*'):
    output_targets = PathSetTask([remote_path], [pattern]).output()
    modified = [modify_target_for_local_server(output_target) for output_target in output_targets]
    return modified
def __init__(self, *args, **kwargs):
    super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting the latest file. Right now we expect a date, so we use that.
    dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # This should raise an error if no data is found, rather than a cryptic 'index out of range' error.
    if len(dates) == 0:
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    latest_date = sorted(dates)[-1]
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
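The rsplit('/', 2)[-2] idiom extracts the date directory from each dump path, and ISO-formatted dates sort correctly as plain strings, which is why sorted(dates)[-1] yields the most recent one. A minimal sketch with illustrative paths:

# Illustrative dump paths; the date is the second-to-last path component.
paths = [
    '/data/edX_Demo_Course/state/2015-11-24/edX-Demo-Course-auth_userprofile-prod-analytics.sql',
    '/data/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_userprofile-prod-analytics.sql',
]
dates = [path.rsplit('/', 2)[-2] for path in paths]  # ['2015-11-24', '2015-11-25']
latest_date = sorted(dates)[-1]                      # '2015-11-25'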
def obfuscate_directory(self, input_dir, output_dir):
    if output_dir is not None:
        create_directory(output_dir)
    if self.parameters['wiki']:
        for filepath in glob.glob(os.path.join(input_dir, '*wiki_articlerevision-prod-analytics.sql')):
            self.obfuscate_wiki_file(filepath, output_dir)
    if self.parameters['courseware']:
        for filepath in glob.glob(os.path.join(input_dir, '*courseware_studentmodule-prod-analytics.sql')):
            self.obfuscate_courseware_file(filepath, output_dir)
    if self.parameters['forum']:
        for filepath in glob.glob(os.path.join(input_dir, '*.mongo')):
            self.obfuscate_forum_file(filepath, output_dir)
    if self.parameters['event']:
        # This is generalized beyond localfs/glob.
        task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
        requirements = task.requires()
        for requirement in requirements:
            self.obfuscate_event_file(requirement.output(), output_dir)
def requires(self):
    base_reqs = {
        # We want to process files that are zero-length.
        'data': PathSetTask([self.data_directory], [self.file_pattern], include_zero_length=True)
    }
    base_reqs.update(self.user_info_requirements())
    return base_reqs
def __init__(self, *args, **kwargs):
    super(LoadInternalReportingUserActivityToWarehouse, self).__init__(*args, **kwargs)
    path = url_path_join(self.warehouse_path, 'internal_reporting_user_activity')
    path_targets = PathSetTask([path]).output()
    paths = list(set([os.path.dirname(target.path) for target in path_targets]))
    dates = [path.rsplit('/', 2)[-1] for path in paths]
    latest_date = sorted(dates)[-1]
    self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
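Note that the Hive-style dt= partition prefix is passed to strptime as a literal part of the format string, so it never needs to be stripped separately. For example (the date value is illustrative):

import datetime

load_date = datetime.datetime.strptime('dt=2016-05-01', 'dt=%Y-%m-%d').date()
print(load_date)  # 2016-05-01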
def read_dfs_directory(url):
    """Given the URL of a directory, read all of the files in it and concatenate them."""
    output_targets = PathSetTask([url], ['*']).output()
    raw_output = []
    for output_target in output_targets:
        if isinstance(output_target, S3HdfsTarget):
            output_target = get_target_from_url(get_jenkins_safe_url(output_target.path))
        raw_output.append(output_target.open('r').read())
    return ''.join(raw_output)
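A hypothetical usage, assuming a Hadoop job has written part files under the given URL (both the URL and the record handling are illustrative):

# Hypothetical usage of read_dfs_directory; the URL is illustrative.
report = read_dfs_directory('s3://some_bucket/output/report/dt=2016-05-01')
for record in report.splitlines():
    pass  # handle each record here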
def clean_xml_files(self, root_dir):
    """Find all of the XML files in the package and remove any unrecognized or known sensitive fields from them."""
    log.debug('Cleaning XML files')
    xml_file_paths = [target.path for target in PathSetTask([root_dir], ['*.xml']).output()]
    for xml_file_path in xml_file_paths:
        document = xml.etree.ElementTree.parse(xml_file_path)
        element = document.getroot()
        self.clean_element(element)
        document.write(xml_file_path)
def test_end_to_end_without_vertica(self):
    # Similar to test_end_to_end, but excludes the Vertica part and checks data values,
    # not just data shape.
    table_name = 'reconciled_order_transactions'
    output_root = url_path_join(self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE) + '/'
    self.task.launch([
        'ReconcileOrdersAndTransactionsTask',
        '--import-date', self.UPPER_BOUND_DATE,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
        '--output-root', output_root,
    ])
    final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
        import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
    )
    columns = [x[0] for x in final_output_task.columns]

    output_targets = PathSetTask([output_root], ['*']).output()
    raw_output = ""
    for output_target in output_targets:
        output_target = get_target_from_url(get_jenkins_safe_url(output_target.path))
        raw_output += output_target.open('r').read()

    expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
    expected = pandas.read_csv(expected_output_csv, parse_dates=True)

    output = StringIO(raw_output.replace('\t\\N', '\t'))
    data = pandas.read_table(output, header=None, names=columns, parse_dates=True)

    # Re-order the dataframes for a consistent comparison:
    for frame in (data, expected):
        frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
        frame.reset_index(drop=True, inplace=True)

    try:
        assert_frame_equal(data, expected)
    except AssertionError:
        pandas.set_option('display.max_columns', None)
        print('----- The report generated this data: -----')
        print(data)
        print('----- vs expected: -----')
        print(expected)
        if data.shape != expected.shape:
            print("Data shapes differ.")
        else:
            for index, series in data.iterrows():
                # Try to print a more helpful/localized difference message:
                try:
                    assert_series_equal(data.iloc[index, :], expected.iloc[index, :])
                except AssertionError:
                    print("First differing row: {index}".format(index=index))
                    raise
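The replace('\t\\N', '\t') step rewrites Hive-style \N null markers as empty fields so that pandas reads them as NaN. A minimal sketch with made-up values:

import pandas
from StringIO import StringIO  # io.StringIO on Python 3

# Illustrative tab-separated output; the second row has a \N null in its last field.
raw = 'order-1\tsale\t100.00\norder-2\tsale\t\\N\n'
cleaned = StringIO(raw.replace('\t\\N', '\t'))
frame = pandas.read_table(cleaned, header=None, names=['ref_id', 'type', 'amount'])
print(frame['amount'].isnull().tolist())  # [False, True]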
def requires(self):
    return PathSetTask(self.src, self.include, self.manifest)
def requires(self):
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    event_files_url = url_path_join(self.dump_root, filename_safe_course_id, 'events')
    return PathSetTask([event_files_url], ['*'])
def requires(self):
    results = {
        'events': PathSetTask(self.src, self.include, self.manifest),
        'geoloc_data': ExternalURL(self.geolocation_data),
    }
    return results
def requires(self):
    return PathSetTask(self.dump_root)