def run(self):
    """
    Copy the course files to a local temp directory, then write them out as a single
    GPG-encrypted gzipped tar archive to this task's output target.

    Encrypts to every recipient in self.recipient, plus the master key if configured,
    using public keys read from self.gpg_key_dir (one key file per recipient).
    """
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    # One key-file target per recipient; file name is assumed to match the recipient id.
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]
    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get path without urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')
                # Mirror the remote layout inside the temp directory.
                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    # Parent directory may already exist from an earlier file; anything else is fatal.
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        # Stream the temp directory into a gzipped tar, encrypting on the way out.
        # arcname='' makes archive members relative (no temp-dir prefix).
        with self.output().open('w') as output_file:
            with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress,
                    dir=self.temporary_dir) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
def __init__(self, *args, **kwargs):
    """
    Compute the pull/selection intervals from the most recent cybersource data
    already present under the warehouse 'payments' directory.

    Raises:
        Exception: if no existing cybersource files are found for the merchant,
            so the failure is explicit rather than a cryptic IndexError.
    """
    super(IntervalPullFromCybersourceTask, self).__init__(*args, **kwargs)
    # Provide default for output_root at this level.
    if self.output_root is None:
        self.output_root = self.warehouse_path
    path = url_path_join(self.warehouse_path, 'payments')
    file_pattern = '*cybersource_{}.tsv'.format(self.merchant_id)
    path_targets = PathSetTask([path], include=[file_pattern], include_zero_length=True).output()
    # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
    paths = list(set([os.path.dirname(target.path) for target in path_targets]))
    dates = [path.rsplit('/', 2)[-1] for path in paths]
    # Fail with a clear message instead of an 'index out of range' error when no data exists yet.
    if not dates:
        raise Exception('Missing cybersource data files in {}'.format(path))
    latest_date = max(dates)
    latest_completion_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
    # Resume pulling the day after the last completed date.
    run_date = latest_completion_date + datetime.timedelta(days=1)
    # Limit intervals to merchant account close date (if any).
    if self.merchant_close_date:
        run_date = min(run_date, self.merchant_close_date)
        self.interval_end = min(self.interval_end, self.merchant_close_date)
    self.selection_interval = date_interval.Custom(self.interval_start, run_date)
    self.run_interval = date_interval.Custom(run_date, self.interval_end)
def __init__(self, *args, **kwargs):
    """
    Locate the most recent 'state' dump for the course under dump_root and set
    data_directory (input) and output_directory (obfuscated output) accordingly.

    Raises:
        Exception: if no auth_userprofile dump file is found for the course.
    """
    super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
    dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # Raise an explicit error if no data is found, rather than a cryptic 'index out of range' error.
    if not dates:
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    # Dates are ISO-formatted (YYYY-MM-DD), so lexicographic max is the latest.
    latest_date = max(dates)
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def requires(self):
    """Build the requirement dict: the data file set plus any user-info requirements."""
    requirements = {
        # We want to process files that are zero-length.
        'data': PathSetTask([self.data_directory], [self.file_pattern], include_zero_length=True),
    }
    requirements.update(self.user_info_requirements())
    return requirements
def get_targets_from_remote_path(remote_path, pattern='*'):
    """Return the targets found under remote_path (matching pattern), adjusted for the local server."""
    remote_targets = PathSetTask([remote_path], [pattern]).output()
    return [modify_target_for_local_server(remote_target) for remote_target in remote_targets]
def __init__(self, *args, **kwargs):
    """Set up the pull interval, resuming from the most recent previously-pulled report if one exists."""
    super(IntervalPullFromAffiliateWindowTask, self).__init__(*args, **kwargs)
    # Provide default for output_root at this level.
    if self.output_root is None:
        self.output_root = url_path_join(self.warehouse_path, 'fees', 'affiliate_window')

    report_targets = PathSetTask(
        [self.output_root], include=['*affiliate_window.tsv'], include_zero_length=True
    ).output()
    if not report_targets:
        # If this is the first run, start from the beginning
        print(
            "Couldn't find last completed date, defaulting to start date: {}"
            .format(self.interval_start))
    else:
        # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
        partition_dates = {
            os.path.dirname(report_target.path).rsplit('/', 2)[-1]
            for report_target in report_targets
        }
        latest_date = max(partition_dates)
        last_completed = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
        # Resume the day after the most recent completed pull.
        self.interval_start = last_completed + datetime.timedelta(days=1)
        print("Found previous reports to {}".format(latest_date))

    self.run_interval = date_interval.Custom(self.interval_start, self.interval_end)
    print("Running reports from interval {}".format(self.run_interval))
def run(self):
    """
    Copy the course files into a local temp directory, then emit them as one
    GPG-encrypted gzipped tar archive to this task's output target.

    Encrypts to every recipient in self.recipient, plus the master key if configured,
    using public keys read from self.gpg_key_dir (one key file per recipient).
    """
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    # One key-file target per recipient; file name is assumed to match the recipient id.
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]
    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get path without urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')
                # Mirror the remote layout inside the temp directory.
                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    # Parent directory may already exist from an earlier file; anything else is fatal.
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        # Stream the temp directory into a gzipped tar, encrypting on the way out.
        # arcname='' makes archive members relative (no temp-dir prefix).
        with self.output().open('w') as output_file:
            with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress,
                    dir=self.temporary_dir) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
def clean_xml_files(self, root_dir):
    """Find all of the XML files in the package and remove any unrecognized or known sensitive fields from them."""
    log.debug('Cleaning XML files')
    for xml_target in PathSetTask([root_dir], ['*.xml']).output():
        # Parse, scrub in place, then write back over the same file.
        tree = xml.etree.ElementTree.parse(xml_target.path)
        self.clean_element(tree.getroot())
        tree.write(xml_target.path)
def __init__(self, *args, **kwargs):
    """Point the load at the most recent dated partition of course_block_records."""
    super(LoadInternalReportingCourseStructureToSnowflake, self).__init__(*args, **kwargs)
    partition_root = url_path_join(self.warehouse_path, 'course_block_records')
    partition_targets = PathSetTask([partition_root]).output()
    # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
    partition_dirs = {os.path.dirname(partition_target.path) for partition_target in partition_targets}
    partition_dates = [partition_dir.rsplit('/', 2)[-1] for partition_dir in partition_dirs]
    self.load_date = datetime.datetime.strptime(max(partition_dates), "dt=%Y-%m-%d").date()
def obfuscate_directory(self, input_dir, output_dir):
    """Obfuscate each enabled file type found in input_dir, writing results into output_dir."""
    if output_dir is not None:
        create_directory(output_dir)
    # Glob-based file types, in the same order as before: wiki, courseware, forum.
    glob_handlers = [
        ('wiki', '*wiki_articlerevision-prod-analytics.sql', self.obfuscate_wiki_file),
        ('courseware', '*courseware_studentmodule-prod-analytics.sql', self.obfuscate_courseware_file),
        ('forum', '*.mongo', self.obfuscate_forum_file),
    ]
    for parameter_name, pattern, handler in glob_handlers:
        if self.parameters[parameter_name]:
            for filepath in glob.glob(os.path.join(input_dir, pattern)):
                handler(filepath, output_dir)
    if self.parameters['event']:
        # This is generalized beyond localfs/glob.
        event_task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
        for requirement in event_task.requires():
            self.obfuscate_event_file(requirement.output(), output_dir)
def __init__(self, *args, **kwargs):
    """Point the load at the most recent dated partition of internal_reporting_user_activity."""
    super(LoadInternalReportingUserActivityToWarehouse, self).__init__(*args, **kwargs)
    partition_root = url_path_join(self.warehouse_path, 'internal_reporting_user_activity')
    partition_targets = PathSetTask([partition_root]).output()
    # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
    partition_dirs = {os.path.dirname(partition_target.path) for partition_target in partition_targets}
    partition_dates = [partition_dir.rsplit('/', 2)[-1] for partition_dir in partition_dirs]
    self.load_date = datetime.datetime.strptime(max(partition_dates), "dt=%Y-%m-%d").date()
def __init__(self, *args, **kwargs):
    """Point the load at the most recent dated partition of course_enrollment_summary."""
    super(ExternalCourseEnrollmentSummaryPartitionTask, self).__init__(*args, **kwargs)
    # Find the most recent data for the source.
    partition_root = url_path_join(self.warehouse_path, 'course_enrollment_summary')
    partition_targets = PathSetTask([partition_root]).output()
    # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
    partition_dirs = {os.path.dirname(partition_target.path) for partition_target in partition_targets}
    partition_dates = [partition_dir.rsplit('/', 2)[-1] for partition_dir in partition_dirs]
    self.load_date = datetime.datetime.strptime(max(partition_dates), "dt=%Y-%m-%d").date()
def obfuscate_directory(self, input_dir, output_dir):
    """Obfuscate each enabled file type found in input_dir, writing results into output_dir."""
    if output_dir is not None:
        create_directory(output_dir)
    # Glob-based file types, in the same order as before: wiki, courseware, forum.
    glob_handlers = [
        ('wiki', '*wiki_articlerevision-prod-analytics.sql', self.obfuscate_wiki_file),
        ('courseware', '*courseware_studentmodule-prod-analytics.sql', self.obfuscate_courseware_file),
        ('forum', '*.mongo', self.obfuscate_forum_file),
    ]
    for parameter_name, pattern, handler in glob_handlers:
        if self.parameters[parameter_name]:
            for filepath in glob.glob(os.path.join(input_dir, pattern)):
                handler(filepath, output_dir)
    if self.parameters['event']:
        # This is generalized beyond localfs/glob.
        event_task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
        for requirement in event_task.requires():
            self.obfuscate_event_file(requirement.output(), output_dir)
def __init__(self, *args, **kwargs):
    """When enabled, resolve the latest dated partition of the source table to load from."""
    super(LoadHiveTableToVertica, self).__init__(*args, **kwargs)
    # Find the most recent data for the source if load from latest partition is enabled.
    if self.load_from_latest_partition:
        table_root = url_path_join(self.warehouse_path, self.table_name)
        partition_targets = PathSetTask([table_root]).output()
        # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
        partition_dirs = {os.path.dirname(partition_target.path) for partition_target in partition_targets}
        partition_dates = [partition_dir.rsplit('/', 2)[-1] for partition_dir in partition_dirs]
        self.latest_date = datetime.datetime.strptime(max(partition_dates), "dt=%Y-%m-%d").date()
        log.debug('Loading data for table %s from partition %s', self.table_name, self.latest_date)
def __init__(self, *args, **kwargs):
    """Derive the selection and run intervals from the most recent paypal data already pulled."""
    super(PaypalTransactionsIntervalTask, self).__init__(*args, **kwargs)
    # Provide default for output_root at this level.
    if self.output_root is None:
        self.output_root = self.warehouse_path
    payments_root = url_path_join(self.warehouse_path, 'payments')
    paypal_targets = PathSetTask([payments_root], include=['*paypal.tsv']).output()
    # Partition directory names carry the date: .../dt=YYYY-MM-DD/<file>.
    partition_dates = {
        os.path.dirname(paypal_target.path).rsplit('/', 2)[-1]
        for paypal_target in paypal_targets
    }
    last_completed = datetime.datetime.strptime(max(partition_dates), "dt=%Y-%m-%d").date()
    # Resume the day after the most recent completed pull.
    run_date = last_completed + datetime.timedelta(days=1)
    self.selection_interval = date_interval.Custom(self.interval_start, run_date)
    self.run_interval = date_interval.Custom(run_date, self.interval_end)
def requires(self):
    """Depend on the configured source path set."""
    source_path_set = PathSetTask(self.src, self.include, self.manifest)
    return source_path_set
def requires(self):
    """Depend on the path set rooted at dump_root."""
    dump_path_set = PathSetTask(self.dump_root)
    return dump_path_set
def requires(self):
    """Depend on the event files dumped for this course under dump_root."""
    safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    events_url = url_path_join(self.dump_root, safe_course_id, 'events')
    return PathSetTask([events_url], ['*'])