Example 1
def demo_pipeline():
    """Returns a demo pipeline"""
    from data_integration.commands import bash, python
    from data_integration.pipelines import Pipeline, Task

    pipeline = Pipeline(id='demo',
                        description='A small pipeline that demonstrates the interplay between pipelines, tasks and commands')

    pipeline.add(Task(id='ping_localhost', description='Pings localhost',
                      commands=[bash.RunBash('ping -c 3 localhost')]))

    sub_pipeline = Pipeline(id='sub_pipeline', description='Pings a number of hosts')

    for host in ['google', 'amazon', 'facebook']:
        sub_pipeline.add(Task(id=f'ping_{host}', description=f'Pings {host}',
                              commands=[bash.RunBash(f'ping -c 3 {host}.com'),
                                        python.RunFunction(lambda: 1)]))
    sub_pipeline.add_dependency('ping_amazon', 'ping_facebook')
    sub_pipeline.add(Task(id='ping_foo', description='Pings foo',
                          commands=[bash.RunBash('ping foo')]), ['ping_amazon'])

    pipeline.add(sub_pipeline, ['ping_localhost'])

    pipeline.add(Task(id='sleep', description='Sleeps for 2 seconds',
                      commands=[bash.RunBash('sleep 2')]), ['sub_pipeline'])

    return pipeline
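
The demo pipeline can also be run headlessly. A minimal sketch, assuming the package exposes the same run_pipeline helper as its successor mara-pipelines (adjust the import to your installed version):

from data_integration.ui.cli import run_pipeline  # assumed location, as in mara-pipelines

pipeline = demo_pipeline()
run_pipeline(pipeline)  # runs all nodes in dependency order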
Example 2
    def parallel_commands(self, file_name: str) -> [pipelines.Command]:
        # read the file; for incremental read modes, additionally record that it
        # has been processed, together with its last modification timestamp
        return [self.read_command(file_name)] + ([
            python.RunFunction(
                function=lambda: _processed_files.track_processed_file(
                    self.path(), file_name,
                    self._last_modification_timestamp(file_name)))
        ] if self.read_mode != ReadMode.ALL else [])
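
Two details carry the weight here: the lambda defers the call to track_processed_file until the command actually runs, and the `[a] + ([b] if condition else [])` shape appends the tracking command only for incremental read modes. A self-contained sketch of the same pattern (the names are illustrative, not from the library):

def build_commands(file_name: str, incremental: bool) -> list:
    read = f'read {file_name}'    # stand-in for self.read_command(file_name)
    track = f'track {file_name}'  # stand-in for the RunFunction tracking command
    # append the tracking step only when reading incrementally
    return [read] + ([track] if incremental else [])

print(build_commands('a.csv', incremental=True))   # ['read a.csv', 'track a.csv']
print(build_commands('a.csv', incremental=False))  # ['read a.csv']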
Example 3
    def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
        parameters = self.parameter_function()

        if not isinstance(parameters, list):
            raise ValueError(
                f'parameter function should return a list, got "{repr(parameters)}"'
            )

        for parameter in parameters:
            sub_pipeline.add(
                pipelines.Task(
                    id=str(parameter).lower().replace(' ', '_').replace('-', '_'),
                    description=f'Runs the function with parameters {repr(parameter)}',
                    commands=[
                        # bind the current parameter as a default argument so that
                        # each lambda keeps its own value rather than the loop variable
                        python.RunFunction(
                            lambda args=parameter: self.function(args))
                    ]))
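
The `lambda args=parameter: ...` idiom is worth dwelling on: default values are evaluated when the lambda is defined, so each task captures its own parameter. A plain closure would be evaluated lazily, and every task would see the loop variable's final value. A quick illustration in plain Python:

params = ['a', 'b', 'c']

late = [lambda: p for p in params]       # all three close over the same variable
early = [lambda p=p: p for p in params]  # each binds the current value at definition

print([f() for f in late])   # ['c', 'c', 'c']
print([f() for f in early])  # ['a', 'b', 'c']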
Example 4
    def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
        files = []  # A list of (file_name, date_or_file_name) tuples
        data_dir = config.data_dir()
        first_date = config.first_date()

        for file in glob.iglob(str(data_dir / self.file_pattern)):
            file = str(pathlib.Path(file).relative_to(data_dir))
            if self.date_regex:
                match = re.match(self.date_regex, file)
                if not match:
                    raise Exception(
                        f'file name "{file}" \ndoes not match date regex "{self.date_regex}"'
                    )
                date = datetime.date(*[int(group) for group in match.groups()])
                if date >= first_date:
                    files.append((file, date))
            else:
                files.append((file, file))

        # sort by date when a regex is provided, by file name otherwise
        files.sort(key=lambda x: x[1], reverse=True)

        # skip the latest file when requested
        if self.read_mode == ReadMode.ONLY_NEW_EXCEPT_LATEST:
            files = files[1:]

        # take only the latest file when requested
        if files and self.read_mode == ReadMode.ONLY_LATEST:
            files = files[:1]

        # for incremental loading, determine which files already have been processed
        # reprocess all when file dependencies changed
        if (self.read_mode not in (ReadMode.ALL, ReadMode.ONLY_LATEST) and
            (not self.file_dependencies or not _file_dependencies.is_modified(
                self.path(), 'ParallelReadFile', self.parent.base_path(),
                self.file_dependencies))):
            processed_files = _processed_files.already_processed_files(
                self.path())

            files = [
                x for x in files
                if x[0] not in processed_files  # everything not yet read
                or (self.read_mode == ReadMode.ONLY_CHANGED  # everything modified since
                    and self._last_modification_timestamp(x[0]) > processed_files[x[0]])
            ]

        if not files:
            logger.log('No newer files', format=logger.Format.ITALICS)
            return

        if self.read_mode != ReadMode.ALL and self.file_dependencies:

            def update_file_dependencies():
                _file_dependencies.update(self.path(), 'ParallelReadFile',
                                          self.parent.base_path(),
                                          self.file_dependencies)
                return True

            sub_pipeline.final_node.commands.append(
                python.RunFunction(update_file_dependencies))

        # aim for roughly twice as many chunks as there are parallel task slots
        chunk_size = math.ceil(
            len(files) / (2 * config.max_number_of_parallel_tasks()))

        if self.partition_target_table_by_day_id:
            if not isinstance(mara_db.dbs.db(self.db_alias),
                              mara_db.dbs.PostgreSQLDB):
                raise NotImplementedError(
                    f'Partitioning by day_id has only been implemented for PostgreSQL so far, \n'
                    f'not for {mara_db.postgresql.engine(self.db_alias).name}')
            # group files by day so each partition can be filled and analyzed together
            files_per_day = {}
            for (file, date) in files:
                files_per_day.setdefault(date, []).append(file)

            sql_statement = ''
            for date in files_per_day.keys():
                sql_statement += f'CREATE TABLE IF NOT EXISTS {self.target_table}_{date.strftime("%Y%m%d")}'
                sql_statement += f' ( CHECK (day_id = {date.strftime("%Y%m%d")}) ) INHERITS ({self.target_table});\n'

            create_partitions_task = pipelines.Task(
                id='create_partitions',
                description='Creates required target table partitions',
                commands=[
                    sql.ExecuteSQL(sql_statement=sql_statement,
                                   echo_queries=False,
                                   db_alias=self.db_alias)
                ])

            sub_pipeline.add(create_partitions_task)

            for n, chunk in enumerate(
                    more_itertools.chunked(files_per_day.items(), chunk_size)):
                task = pipelines.Task(
                    id=str(n), description='Reads a portion of the files')
                for (day, day_files) in chunk:  # avoid shadowing the outer 'files'
                    target_table = self.target_table + '_' + day.strftime('%Y%m%d')
                    for file in day_files:
                        task.add_commands(self.parallel_commands(file))
                    task.add_command(
                        sql.ExecuteSQL(
                            sql_statement=f'ANALYZE {target_table}'))
                sub_pipeline.add(task, ['create_partitions'])
        else:
            for n, chunk in enumerate(more_itertools.chunked(files, chunk_size)):
                sub_pipeline.add(
                    pipelines.Task(
                        id=str(n),
                        description=f'Reads {len(chunk)} files',
                        # flatten the per-file command lists into one list of commands
                        commands=sum(
                            [self.parallel_commands(x[0]) for x in chunk], [])))
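
Example 4 spreads the files over roughly twice as many chunks as there are parallel task slots, so no single task dominates the schedule. A quick sketch of that arithmetic with more_itertools.chunked (the values are illustrative):

import math
import more_itertools

files = [f'file_{i}.csv' for i in range(10)]
max_parallel = 2  # stand-in for config.max_number_of_parallel_tasks()

chunk_size = math.ceil(len(files) / (2 * max_parallel))  # ceil(10 / 4) = 3
chunks = list(more_itertools.chunked(files, chunk_size))

print(chunk_size)   # 3
print(len(chunks))  # 4 chunks of sizes 3, 3, 3, 1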