def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('course_list_raw', partition_value=self.partition_value),
            'course_list.json'
        )
    )

def output(self):
    return get_target_from_url(url_path_join(
        self.output_root,
        'transaction',
        'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
        'transactions.csv'
    ))

def credentials(self):
    """The credentials for connecting to the database, read from a URL."""
    if not hasattr(self, '_credentials'):
        with get_target_from_url(self.vertica_creds_url).open('r') as credentials_file:
            self._credentials = json.load(credentials_file)
    return self._credentials

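# A minimal sketch of the JSON this property expects to find at
# self.vertica_creds_url.  The key names below are an assumption (typical for
# database credential files of this kind), not a documented schema:
#
#     {
#         "host": "warehouse.example.com",
#         "port": 5433,
#         "username": "analytics",
#         "password": "secret"
#     }
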
def _get_required_tasks(self):
    """Internal method to actually calculate required tasks once."""
    start_date = self.interval.date_a  # pylint: disable=no-member
    end_date = self.interval.date_b  # pylint: disable=no-member
    table_name = "student_courseenrollment"
    source_root = url_path_join(self.warehouse_path, table_name)
    today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')

    current_date = start_date
    while current_date <= end_date:
        datestring = current_date.strftime('%Y-%m-%d')
        current_date += datetime.timedelta(days=1)

        src_datestring = "dt={}".format(datestring)
        source_dir = url_path_join(source_root, src_datestring)
        target = get_target_from_url(source_dir)
        output_dir = url_path_join(self.output_root, datestring)
        if datestring == today_datestring:
            yield CreateEnrollmentValidationEventsForTodayTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
                credentials=self.credentials,
            )
        elif target.exists():
            yield CreateEnrollmentValidationEventsTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
            )

def output(self):
    if len(self.input()['data']) == 0:
        raise IOError("Course File '{filename}' not found for course '{course}'".format(
            filename=self.file_pattern, course=self.course
        ))
    output_filename = os.path.basename(self.input()['data'][0].path)
    return get_target_from_url(url_path_join(self.output_directory, output_filename))

def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('program_course_order', partition_value=self.date),
            '{0}.tsv'.format('program_course_order')
        )
    )

def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting.
    """
    _date_string, org_id = key
    recipients = self.recipients_for_org_id[org_id]
    log.info('Encryption recipients: %s', str(recipients))

    def report_progress(num_bytes):
        """Update hadoop counters as the file is written."""
        self.event_export_counter(counter_title='Bytes Written to Output', incr_value=num_bytes)

    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    try:
        with make_encrypted_file(output_file, key_file_targets, progress=report_progress,
                                 hadoop_counter_incr_func=self.event_export_counter) as encrypted_output_file:
            outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
            try:
                for value in values:
                    outfile.write(value.strip())
                    outfile.write('\n')
                    # WARNING: This line ensures that Hadoop knows that our process is not
                    # sitting in an infinite loop.  Do not remove it.
                    self.event_export_counter(counter_title='Raw Bytes Written', incr_value=(len(value) + 1))
            finally:
                outfile.close()
    except IOError as err:
        log.error("Error encountered while encrypting and gzipping Organization: %s file: %s Exception: %s",
                  org_id, key_file_targets, err)
        # This counter is set when there is an error during the generation of the encryption file
        # for an organization for any reason, including encryption errors related to an expired
        # GPG key.
        self.event_export_counter(counter_title="{} org with Errors".format(org_id), incr_value=1)

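# A self-contained illustration of the stream layering used above: gzip writes
# into an underlying binary stream, so compression happens before whatever that
# stream does next.  Here a plain local file stands in for the encrypted stream;
# the path is hypothetical.
import gzip

with open('/tmp/example.gz', 'wb') as underlying:
    gz = gzip.GzipFile(mode='wb', fileobj=underlying)
    try:
        gz.write(b'line one\n')
    finally:
        # Close the gzip wrapper first so its trailer is flushed to the
        # underlying stream before that stream closes.
        gz.close()
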
def output(self):
    output_root = url_path_join(
        self.warehouse_path,
        self.partition_task.hive_table_task.table,
        self.partition.path_spec + '/'
    )
    return get_target_from_url(output_root, marker=True)

def run_job(self, job):
    job.init_hadoop()
    job.init_mapper()
    map_output = StringIO.StringIO()
    input_targets = luigi.task.flatten(job.input_hadoop())
    for input_target in input_targets:
        # If the file is a directory, then assume that it's Hadoop output,
        # and actually loop through its contents:
        if os.path.isdir(input_target.path):
            filenames = os.listdir(input_target.path)
            for filename in filenames:
                url = url_path_join(input_target.path, filename)
                input_targets.append(get_target_from_url(url.strip()))
            continue
        with input_target.open('r') as input_file:
            # S3 files not yet supported since they don't support tell() and seek().
            if input_target.path.endswith('.gz'):
                input_file = gzip.GzipFile(fileobj=input_file)
            elif input_target.path.endswith('.manifest'):
                for url in input_file:
                    input_targets.append(get_target_from_url(url.strip()))
                continue
            os.environ['map_input_file'] = input_target.path
            try:
                outputs = job._map_input((line[:-1] for line in input_file))
                job.internal_writer(outputs, map_output)
            finally:
                del os.environ['map_input_file']
    map_output.seek(0)

    reduce_input = self.group(map_output)
    try:
        reduce_output = job.output().open('w')
    except Exception:
        reduce_output = StringIO.StringIO()

    try:
        job._run_reducer(reduce_input, reduce_output)
    finally:
        try:
            reduce_output.close()
        except Exception:
            pass

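# Note on the input loop above: appending to input_targets while iterating over
# it is deliberate.  A Python for-loop over a list re-checks the list length on
# every step, so directory contents and manifest entries expanded mid-loop are
# still visited before the loop ends.
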
def remove_manifest_target_if_exists(manifest_id):
    """Given an id and configuration, construct a target that can check and remove a manifest file."""
    manifest_file_path = get_manifest_file_path(manifest_id)
    # We don't need the mixin in order to check for existence or to remove the manifest file.
    manifest_target = get_target_from_url(manifest_file_path)
    if manifest_target.exists():
        log.info('Removing existing manifest found at %s', manifest_target.path)
        manifest_target.remove()

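# Hypothetical usage: clear any stale manifest before a task regenerates it.
# The manifest id string here is made up for illustration.
remove_manifest_target_if_exists('user_activity_manifest')
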
def requires_hadoop(self):
    # Check first if running locally with Sqoop output.
    target = get_target_from_url(self.source_dir)
    if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
        files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
        for filename in files:
            yield ExternalURL(url_path_join(self.source_dir, filename))
    else:
        yield ExternalURL(self.source_dir)

def manifest_file_list(self):
    """Write each individual path to a manifest file and yield the path to that file."""
    manifest_target = get_target_from_url(self.manifest)
    if not manifest_target.exists():
        with manifest_target.open('w') as manifest_file:
            for external_url_task in self.generate_file_list():
                manifest_file.write(external_url_task.url + '\n')
    yield ExternalURL(self.manifest)

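# The manifest written above is just a newline-delimited list of input URLs,
# one per ExternalURL task, e.g. (hypothetical paths):
#
#     s3://bucket/warehouse/student_courseenrollment/dt=2016-01-01/part-00000
#     s3://bucket/warehouse/student_courseenrollment/dt=2016-01-02/part-00000
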
def run(self):
    # Remove the marker file.
    self.remove_output_on_overwrite()
    # Also remove actual output files in case of overwrite.
    if self.overwrite:
        for date in self.overwrite_interval:
            url = self.output_path_for_key(date.isoformat())
            target = get_target_from_url(url)
            if target.exists():
                target.remove()

    super(UserVideoViewingByDateTask, self).run()

    # Make sure an output file exists for each day within the interval.
    for date in self.overwrite_interval:
        url = self.output_path_for_key(date.isoformat())
        target = get_target_from_url(url)
        if not target.exists():
            target.open("w").close()  # touch the file

def read_config_file(filename):
    """Read a config file from either an external source (S3, HDFS, etc.) or the "share" directory of this repo."""
    if os.path.basename(filename) != filename:
        target = get_target_from_url(filename)
        with target.open('r') as config_file:
            yield config_file
    else:
        file_path = os.path.join(sys.prefix, 'share', 'edx.analytics.tasks', filename)
        with open(file_path, 'r') as config_file:
            yield config_file

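# Hedged usage sketch: read_config_file is a generator, so callers iterate it to
# get the open file handle.  The filename is hypothetical, and ConfigParser is
# the Python 2 module name, matching the Python 2 idioms (StringIO, urlparse)
# used elsewhere in these snippets.
import ConfigParser

config = ConfigParser.ConfigParser()
for config_file in read_config_file('default.cfg'):
    config.readfp(config_file)
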
def output(self):
    """
    Output is set up so that it can be read as a Hive table with partitions.

    The form is {warehouse_path}/course_catalog/subjects/dt={CCYY-mm-dd}/subjects.tsv.
    """
    date_string = self.date.strftime('%Y-%m-%d')  # pylint: disable=no-member
    partition_path_spec = HivePartition('dt', date_string).path_spec
    url_with_filename = url_path_join(self.warehouse_path, "course_catalog", "subjects",
                                      partition_path_spec, "subjects.tsv")
    return get_target_from_url(url_with_filename)

def run(self):
    self.remove_output_on_overwrite()
    super(LastDailyIpAddressOfUserTask, self).run()

    # This makes sure that an output file exists for each date in the interval,
    # as downstream tasks require that they exist (as provided by downstream_input_tasks()).
    for date in self.interval:
        url = self.output_path_for_key(date.isoformat())
        target = get_target_from_url(url)
        if not target.exists():
            target.open("w").close()  # touch the file

def output(self):
    """
    Output is set up so it can be read in as a Hive table with partitions.

    The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv.
    """
    date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
    partition_path_spec = HivePartition('dt', date_string).path_spec
    filename = "cybersource_{}.tsv".format(self.merchant_id)
    url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename)
    return get_target_from_url(url_with_filename)

def output(self):
    """Output is in the form {output_root}/cybersource/{CCYY-mm}/cybersource_{merchant}_{CCYYmmdd}.csv."""
    month_year_string = self.run_date.strftime('%Y-%m')  # pylint: disable=no-member
    date_string = self.run_date.strftime('%Y%m%d')  # pylint: disable=no-member
    filename = "cybersource_{merchant_id}_{date_string}.{report_format}".format(
        merchant_id=self.merchant_id,
        date_string=date_string,
        report_format=self.REPORT_FORMAT,
    )
    url_with_filename = url_path_join(self.output_root, "cybersource", month_year_string, filename)
    return get_target_from_url(url_with_filename)

def __init__(self, *args, **kwargs):
    super(MultiOutputMapReduceJobTask, self).__init__(*args, **kwargs)
    if self.delete_output_root:
        # If requested, make sure that the output directory is empty.  This gets rid
        # of any generated data files from a previous run (that might not get
        # regenerated in this run).  It also makes sure that the marker file
        # (i.e. the output target) will be removed, so that external functionality
        # will know that the generation of data files is not complete.
        output_dir_target = get_target_from_url(self.output_root)
        for target in [self.output(), output_dir_target]:
            if target.exists():
                target.remove()

def run(self):
    # Remove the marker file.
    self.remove_output_on_overwrite()
    # Also remove actual output files in case of overwrite.
    if self.overwrite:
        for date in self.interval:
            url = self.output_path_for_key(date.isoformat())
            target = get_target_from_url(url)
            if target.exists():
                target.remove()
    return super(UserActivityTask, self).run()

def reducer(self, key, values):
    """Write out values from each key into different output files."""
    output_path = self.output_path_for_key(key)
    if output_path:
        log.info('Writing output file: %s', output_path)
        output_file_target = get_target_from_url(output_path)
        with output_file_target.open('w') as output_file:
            self.multi_output_reducer(key, values, output_file)

    # Luigi requires the reducer to return an iterable.
    return iter(tuple())

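# A minimal sketch of the companion hook the reducer above delegates to.  A
# concrete subclass of MultiOutputMapReduceJobTask would override it along these
# lines (and also override output_path_for_key, as used above); the class name
# and body here are illustrative, not any particular task's implementation.
class ExampleMultiOutputTask(MultiOutputMapReduceJobTask):

    def multi_output_reducer(self, key, values, output_file):
        """Write each reduced value to the per-key output file."""
        for value in values:
            output_file.write(value)
            output_file.write('\n')
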
def run(self):
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get the path without the URL scheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculate the target's path relative to course_files_path by taking the
                # substring that follows course_files_path in the target's path.  This is
                # needed because target.path includes the URL scheme for an S3 target but
                # not for an HDFS target.  Examples:
                #
                #   target.path:   /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                #   relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                #
                #   target.path:   s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                #   relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
            ) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')

def output(self):
    return get_target_from_url(url_path_join(self.output_root, 'temp', 'CountCourseEnrollments/'))

def output(self):
    output_name = u'answer_distribution_per_course_{name}/'.format(name=self.name)
    return get_target_from_url(url_path_join(self.dest, output_name))

def output(self):
    return get_target_from_url(self.partition_location.rstrip('/') + '/')

def get_table_metadata_target(self):
    """Returns target for metadata file from the given dump."""
    # Find the .metadata file in the source directory.
    metadata_path = url_path_join(self.s3_location_for_table, METADATA_FILENAME)
    return get_target_from_url(metadata_path)

def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path(self.table_name, partition_value=self.date),
            '{0}.tsv'.format(self.table_name)
        )
    )

def output(self):
    return get_target_from_url(
        url_path_join(
            self.destination,
            "incremental_users_and_enrollments_{0}.csv".format(self.name)
        )
    )

def output(self):
    marker_url = url_path_join(self.marker, str(hash(self)))
    return get_target_from_url(marker_url)

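# Note: luigi tasks hash by their task_id, so str(hash(self)) above yields a
# marker name that is stable for a given set of task parameters under Python 2,
# where string hashing is deterministic.  (Python 3 randomizes str hashes per
# process unless PYTHONHASHSEED is pinned, so this scheme would not survive a
# port as-is.)
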
def output(self):
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    return get_target_from_url(
        url_path_join(self.obfuscated_output_root, self.format_version, filename_safe_course_id,
                      'metadata_file.json')
    )

def complete(self):
    """The task is complete if the output_root/_SUCCESS file is present."""
    return get_target_from_url(url_path_join(self.output_root, '_SUCCESS')).exists()

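# Background for the check above: Hadoop's FileOutputCommitter writes an empty
# _SUCCESS file into a job's output directory when the job finishes cleanly, so
# the marker file doubles as a completeness signal for downstream tasks.
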
def get_target_for_local_server(url):
    # The machine running the acceptance test suite may not have hadoop installed on it, so
    # convert S3 paths (which are normally handled by the hadoop DFS client) to S3+https
    # paths, which are handled by the python native S3 client.
    return get_target_from_url(url.replace('s3://', 's3+https://'))

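# Hypothetical usage in an acceptance test: read a result back over the native
# S3 client (bucket and key are made up for illustration).
target = get_target_for_local_server('s3://example-bucket/report/output.csv')
with target.open('r') as result_file:
    contents = result_file.read()
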
def upload_file_with_content(self, remote_file_path, content):
    log.debug('Writing %s from string', remote_file_path)
    with get_target_from_url(remote_file_path).open('w') as remote_file:
        remote_file.write(content)

def upload_file(self, local_file_name, remote_file_path):
    log.debug('Uploading %s to %s', local_file_name, remote_file_path)
    with get_target_from_url(remote_file_path).open('w') as remote_file:
        with open(local_file_name, 'r') as local_file:
            shutil.copyfileobj(local_file, remote_file)

def output(self):
    return get_target_from_url(self.table_location)

def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('programs_raw', partition_value=self.date),
            'programs.json'
        )
    )

def metadata_output(self):
    """Return target to which metadata about the task execution can be written."""
    return get_target_from_url(url_path_join(self.destination, METADATA_FILENAME))

def output(self):
    output_url = self.hive_partition_path('active_users_per_week', self.interval.date_b)
    return get_target_from_url(output_url)

def output(self):
    output_name = u'problem_check_events_{name}/'.format(name=self.name)
    return get_target_from_url(url_path_join(self.dest, output_name))

def complete(self):
    if self.overwrite and not self.attempted_removal:
        return False
    else:
        return get_target_from_url(url_path_join(self.output_url(), '_SUCCESS')).exists()

def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('course_catalog_raw', partition_value=self.date),
            'course_catalog.json'
        )
    )

def output(self):
    return get_target_from_url(url_path_join(self.output_root, 'temp/CountProgramCohortEnrollments/'))

def output(self):
    return get_target_from_url(
        url_path_join(
            self.destination,
            "daily_registrations_enrollments_{0}.csv".format(self.name)
        )
    )

def output(self):
    return get_target_from_url(
        url_path_join(self.output_root, self.filename_safe_course_id + '.tar.gz.gpg')
    )

def output(self):
    url_with_filename = url_path_join(self.destination, self.filename)
    return get_target_from_url(url_with_filename)

def output(self):
    return get_target_from_url(self.s3_location_for_table)

def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('discovery_api_raw', partition_value=self.date),
            'courses.json'
        )
    )

def output(self):  # pragma: no cover
    output_root = url_path_join(self.warehouse_path,
                                self.partition_task.hive_table_task.table,
                                self.partition.path_spec + '/')
    return get_target_from_url(output_root, marker=True)

def output(self):
    return get_target_from_url(self.output_root)

def output(self):
    return get_target_from_url(
        url_path_join(
            self.hive_partition_path('program_course_order', partition_value=self.date),
            '{0}.tsv'.format('program_course_order')
        )
    )

def output(self):
    """Expose the data location target as the output."""
    return get_target_from_url(self.output_root)

def output(self):
    return get_target_from_url(self.destination + '/')

def complete(self):
    """The task is complete if the output_root is present."""
    return get_target_from_url(self.output_root).exists()

def marker_output(self):
    """Return target for _SUCCESS marker indicating the task was successfully completed."""
    return get_target_from_url(url_path_join(self.destination, "_SUCCESS"))

def input_hadoop(self):
    # NOTE: The hadoop job needs the raw data to use as input, not the hive partition
    # metadata, which is the output of the partition task.
    return get_target_from_url(self.requires().output_root)

def output(self):
    """Use the marker location as an indicator of task "completeness"."""
    return get_target_from_url(self.marker)