Exemplo n.º 1
0
    def run_job(self, job):
        """
        Run a map-reduce job in-process: map every input, group, then reduce.

        Each input target of ``job`` is fed through ``job._map_input`` and the
        mapper output is buffered in an in-memory ``StringIO``.  That buffer is
        then passed through ``self.group`` and handed to ``job._run_reducer``.

        Directories and ``.manifest`` files are not mapped themselves; they are
        expanded into further targets, which are appended to the list that is
        being iterated so that this same loop processes them later.

        Arguments:
            job: the job to run.  It must provide ``init_hadoop``, ``init_mapper``,
                ``input_hadoop``, ``_map_input``, ``internal_writer``, ``output``
                and ``_run_reducer``.
        """
        job.init_hadoop()
        job.init_mapper()
        # All mapper output is accumulated in memory before the reduce phase.
        map_output = StringIO.StringIO()
        input_targets = luigi.task.flatten(job.input_hadoop())
        # NOTE: input_targets is used as a work queue -- the loop body appends
        # to it, and the loop relies on seeing those appended entries.
        for input_target in input_targets:
            # If the path is a local directory, assume that it is Hadoop output
            # and queue each of its entries as a separate target instead of
            # opening the directory itself.
            if os.path.isdir(input_target.path):
                filenames = os.listdir(input_target.path)
                for filename in filenames:
                    url = url_path_join(input_target.path, filename)
                    input_targets.append(get_target_from_url(url.strip()))
                continue

            with input_target.open('r') as input_file:

                # S3 files not yet supported since they don't support tell() and seek()
                if input_target.path.endswith('.gz'):
                    # Read through a decompressing wrapper.  Only the name is
                    # rebound; the `with` statement still manages the object
                    # originally returned by open().
                    input_file = gzip.GzipFile(fileobj=input_file)
                elif input_target.path.endswith('.manifest'):
                    # A manifest holds one URL per line: queue each URL as a
                    # new target and skip mapping the manifest itself.
                    for url in input_file:
                        input_targets.append(get_target_from_url(url.strip()))
                    continue

                # Publish the current input path through the environment for
                # the duration of the map call.
                # NOTE(review): presumably mirrors the variable Hadoop streaming
                # exposes to mappers -- confirm.
                os.environ['map_input_file'] = input_target.path
                try:
                    # line[:-1] drops the last character of every line, which is
                    # expected to be the newline.
                    # NOTE(review): a final line without a trailing newline
                    # would lose a real character -- confirm inputs always end
                    # with a newline.
                    outputs = job._map_input(
                        (line[:-1] for line in input_file))
                    job.internal_writer(outputs, map_output)
                finally:
                    # Always remove the variable, even if the mapper raised.
                    del os.environ['map_input_file']

        # Rewind the buffer so that grouping reads from the beginning.
        map_output.seek(0)

        reduce_input = self.group(map_output)
        try:
            reduce_output = job.output().open('w')
        except Exception:
            # If the job output cannot be opened for any reason, fall back to an
            # in-memory buffer.  Nothing in this method reads that buffer back.
            reduce_output = StringIO.StringIO()

        try:
            job._run_reducer(reduce_input, reduce_output)
        finally:
            # Best-effort close: errors raised while closing are ignored.
            try:
                reduce_output.close()
            except Exception:
                pass
Exemplo n.º 2
0
    def manifest_file_list(self):
        """
        Yield a single ExternalURL task that points at the manifest file.

        When nothing exists yet at ``self.manifest``, the manifest is created
        first, holding the URL of every task from ``generate_file_list()``, one
        per line.  An existing manifest is left untouched.
        """
        target = get_target_from_url(self.manifest)
        manifest_is_missing = not target.exists()
        if manifest_is_missing:
            with target.open('w') as out:
                for url_task in self.generate_file_list():
                    entry = url_task.url + '\n'
                    out.write(entry)

        yield ExternalURL(self.manifest)
Exemplo n.º 3
0
def remove_manifest_target_if_exists(manifest_id):
    """Delete the manifest file that belongs to `manifest_id`, if one is present."""
    # A plain target is sufficient here: checking for existence and removing
    # the manifest file do not need the mixin.
    target = get_target_from_url(get_manifest_file_path(manifest_id))
    if not target.exists():
        return
    log.info('Removing existing manifest found at %s', target.path)
    target.remove()
Exemplo n.º 4
0
 def __init__(self, *args, **kwargs):
     """Initialise the task and, when `delete_output_root` is set, remove existing output."""
     super(MultiOutputMapReduceJobTask, self).__init__(*args, **kwargs)
     if not self.delete_output_root:
         return

     # Cleanup was requested: empty the output directory so that data files
     # left over from an earlier run (which this run might not regenerate) do
     # not survive.  Removing the marker file (i.e. the output target) as well
     # lets external functionality know that the generation of data files is
     # not complete.
     root_target = get_target_from_url(self.output_root)
     stale_targets = (self.output(), root_target)
     for stale in stale_targets:
         if stale.exists():
             stale.remove()
Exemplo n.º 5
0
    def reducer(self, key, values):
        """
        Send the values of one key to that key's own output file.

        Nothing is written when `output_path_for_key` returns a falsy path.
        The result is always an empty iterator, because Luigi requires the
        reducer to return an iterable.
        """
        destination = self.output_path_for_key(key)
        if not destination:
            return iter(())

        log.info('Writing output file: %s', destination)
        destination_target = get_target_from_url(destination)
        with destination_target.open('w') as sink:
            self.multi_output_reducer(key, values, sink)

        return iter(())
Exemplo n.º 6
0
 def complete(self):
     """
     Report whether a ``_SUCCESS`` entry exists directly under ``output_root``.
     """
     success_url = url_path_join(self.output_root, '_SUCCESS')
     success_target = get_target_from_url(success_url)
     return success_target.exists()
Exemplo n.º 7
0
 def output(self):
     """Return the target located at ``output_root`` (the data location)."""
     data_location_target = get_target_from_url(self.output_root)
     return data_location_target
Exemplo n.º 8
0
 def output(self):
     """Return the target for ``course_list.json`` in the ``course_list_raw`` partition path."""
     partition_dir = self.hive_partition_path(
         'course_list_raw', partition_value=self.partition_value)
     json_url = url_path_join(partition_dir, 'course_list.json')
     return get_target_from_url(json_url)
Exemplo n.º 9
0
 def output(self):
     """Return a marker target under ``self.marker`` named after this task's hash."""
     marker_root = self.marker
     task_hash = str(hash(self))
     return get_target_from_url(url_path_join(marker_root, task_hash))
Exemplo n.º 10
0
 def output(self):
     """Return the target located at ``output_root``."""
     root_target = get_target_from_url(self.output_root)
     return root_target
Exemplo n.º 11
0
 def output(self):  # pragma: no cover
     """
     Return the marker-style target for this partition's directory.

     The URL is built from the warehouse path, the Hive table name and the
     partition's path spec, with a trailing '/'.
     """
     warehouse = self.warehouse_path
     table_name = self.partition_task.hive_table_task.table
     partition_dir = self.partition.path_spec + '/'
     partition_url = url_path_join(warehouse, table_name, partition_dir)
     return get_target_from_url(partition_url, marker=True)
Exemplo n.º 12
0
 def output(self):
     """Return the target for the partition location, normalised to end in exactly one '/'."""
     location = self.partition_location.rstrip('/')
     return get_target_from_url(location + '/')