def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
    """Populate `sub_pipeline` with tasks that read all matching data files in parallel.

    Globs `config.data_dir() / self.file_pattern`, filters the matches according to
    `self.read_mode` (and, when `self.date_regex` is set, by `config.first_date()`),
    then splits the remaining files into chunks of tasks. When
    `self.partition_target_table_by_day_id` is set (PostgreSQL only), per-day
    child partitions of `self.target_table` are created first and each chunk
    ANALYZEs the partitions it filled.

    Args:
        sub_pipeline: the (initially empty) pipeline to which the read tasks are added

    Raises:
        Exception: when a file name does not match `self.date_regex`
        NotImplementedError: when day-id partitioning is requested on a non-PostgreSQL db
    """
    # A list of (file_name, sort_key) tuples; the sort key is a datetime.date when
    # date_regex is provided, otherwise the file name itself
    files = []
    data_dir = config.data_dir()
    first_date = config.first_date()
    for file in glob.iglob(str(pathlib.Path(data_dir / self.file_pattern))):
        # store paths relative to the data dir (that is how processed files are tracked)
        file = str(pathlib.Path(file).relative_to(pathlib.Path(data_dir)))
        if self.date_regex:
            match = re.match(self.date_regex, file)
            if not match:
                # NOTE(review): a generic Exception; a ValueError would be more precise,
                # but kept as-is in case callers catch Exception explicitly
                raise Exception(
                    f'file name "{file}" \ndoes not match date regex "{self.date_regex}"')
            # the regex groups are expected to capture (year, month, day)
            date = datetime.date(*[int(group) for group in match.groups()])
            if date >= first_date:  # ignore files before the configured first date
                files.append((file, date))
        else:
            files.append((file, file))

    # sort by date when regex provided or by filename otherwise (newest first)
    files.sort(key=lambda x: x[1], reverse=True)

    # remove latest file when requested
    if self.read_mode == ReadMode.ONLY_NEW_EXCEPT_LATEST:
        files = files[1:]

    # take only latest file when requested
    if files and self.read_mode == ReadMode.ONLY_LATEST:
        files = files[:1]

    # for incremental loading, determine which files already have been processed
    # reprocess all when file dependencies changed
    if (self.read_mode not in (ReadMode.ALL, ReadMode.ONLY_LATEST)
            and (not self.file_dependencies
                 or not _file_dependencies.is_modified(self.path(), 'ParallelReadFile',
                                                       self.parent.base_path(),
                                                       self.file_dependencies))):
        processed_files = _processed_files.already_processed_files(self.path())
        files = [x for x in files
                 if x[0] not in processed_files  # everything not yet read
                 or (self.read_mode == ReadMode.ONLY_CHANGED  # everything modified
                     and self._last_modification_timestamp(x[0]) > processed_files[x[0]])]

    if not files:
        logger.log('No newer files', format=logger.Format.ITALICS)
        return

    # record the current state of the file dependencies once the sub pipeline succeeded,
    # so that the next run can detect whether they changed
    if self.read_mode != ReadMode.ALL and self.file_dependencies:
        def update_file_dependencies():
            _file_dependencies.update(self.path(), 'ParallelReadFile',
                                      self.parent.base_path(), self.file_dependencies)
            return True

        sub_pipeline.final_node.commands.append(python.RunFunction(update_file_dependencies))

    # aim for roughly two task chunks per available parallel worker
    chunk_size = math.ceil(len(files) / (2 * config.max_number_of_parallel_tasks()))

    if self.partition_target_table_by_day_id:
        if not isinstance(mara_db.dbs.db(self.db_alias), mara_db.dbs.PostgreSQLDB):
            raise NotImplementedError(
                f'Partitioning by day_id has only been implemented for postgresql so far, \n'
                f'not for {mara_db.postgresql.engine(self.db_alias).name}')

        # group file names by their date so each day ends up in its own partition
        # (assumes date_regex is set when partitioning is requested — the sort keys
        # must be dates here; TODO confirm this invariant is enforced upstream)
        files_per_day = {}
        for (file, date) in files:
            files_per_day.setdefault(date, []).append(file)

        # one CHECK-constrained child table per day, inheriting from the target table
        sql_statement = ''
        for date in files_per_day:
            sql_statement += f'CREATE TABLE IF NOT EXISTS {self.target_table}_{date.strftime("%Y%m%d")}'
            sql_statement += f' ( CHECK (day_id = {date.strftime("%Y%m%d")}) ) INHERITS ({self.target_table});\n'

        create_partitions_task = pipelines.Task(
            id='create_partitions',
            description='Creates required target table partitions',
            commands=[sql.ExecuteSQL(sql_statement=sql_statement, echo_queries=False,
                                     db_alias=self.db_alias)])
        sub_pipeline.add(create_partitions_task)

        # each chunk reads a set of whole days and then ANALYZEs the touched partitions;
        # all chunks depend on the partitions having been created first
        for n, chunk in enumerate(more_itertools.chunked(files_per_day.items(), chunk_size)):
            task = pipelines.Task(id=str(n), description='Reads a portion of the files')
            for (day, files) in chunk:
                target_table = self.target_table + '_' + day.strftime("%Y%m%d")
                for file in files:
                    task.add_commands(self.parallel_commands(file))
                task.add_command(sql.ExecuteSQL(sql_statement=f'ANALYZE {target_table}'))
            sub_pipeline.add(task, ['create_partitions'])
    else:
        # no partitioning: just spread the files across independent read tasks
        for n, chunk in enumerate(more_itertools.chunked(files, chunk_size)):
            sub_pipeline.add(
                pipelines.Task(id=str(n),
                               description=f'Reads {len(chunk)} files',
                               commands=sum([self.parallel_commands(x[0]) for x in chunk], [])))
def _last_modification_timestamp(self, file_name):
    """Return the last-modification time of `file_name` (relative to the data dir)
    as a naive local datetime."""
    full_path = pathlib.Path(config.data_dir()) / file_name
    mtime = full_path.stat().st_mtime  # seconds since the epoch
    return datetime.datetime.fromtimestamp(mtime)