def test_copy(self):
    # Copying a LocalTarget must leave the source in place and create a
    # second file whose content reads back identically.
    t = LocalTarget(self.path)
    test_data = 'test'
    with t.open('w') as f:
        f.write(test_data)
    self.assertTrue(os.path.exists(self.path))
    self.assertFalse(os.path.exists(self.copy))

    t.copy(self.copy)

    self.assertTrue(os.path.exists(self.path))
    self.assertTrue(os.path.exists(self.copy))
    # Read both files inside context managers so the handles are closed
    # deterministically instead of being left for the garbage collector.
    with t.open('r') as original, LocalTarget(self.copy).open('r') as duplicate:
        self.assertEqual(original.read(), duplicate.read())
def test_format_newline(self):
    # Write bytes with mixed line endings through a SysNewLine-formatted
    # target, then compare both the formatted read and the raw file bytes
    # against the expected values.
    target = LocalTarget(self.path, luigi.format.SysNewLine)

    with target.open("w") as writer:
        writer.write(b"a\rb\nc\r\nd")

    # Content as returned when reading back through the target's format.
    with target.open("r") as reader:
        via_target = reader.read()

    # Content as physically stored in the file.
    with open(self.path, "rb") as raw:
        on_disk = raw.read()

    self.assertEqual(b"a\nb\nc\nd", via_target)
    self.assertEqual(b"a\r\nb\r\nc\r\nd", on_disk)
def test_format_newline(self):
    # Round-trip a byte string containing "\r", "\n" and "\r\n" through a
    # target that uses luigi.format.SysNewLine and check what comes back
    # from the target versus what was stored in the file.
    newline_target = LocalTarget(self.path, luigi.format.SysNewLine)

    with newline_target.open('w') as sink:
        sink.write(b'a\rb\nc\r\nd')

    with newline_target.open('r') as source:
        read_back = source.read()

    with open(self.path, 'rb') as raw_file:
        stored = raw_file.read()

    # Reading through the format yields "\n" separators ...
    self.assertEqual(b'a\nb\nc\nd', read_back)
    # ... while the stored bytes are expected to use "\r\n".
    self.assertEqual(b'a\r\nb\r\nc\r\nd', stored)
def test_move_across_filesystems(self):
    # Simulate os.rename failing with EXDEV (cross-device link) and check
    # that LocalTarget.move still ends up with the file at the destination.
    t = LocalTarget(self.path)
    with t.open('w') as f:
        f.write('test_data')

    def rename_across_filesystems(src, dst):
        # Mimic the error os.rename raises when src and dst are on
        # different filesystems.
        err = OSError()
        err.errno = EXDEV
        raise err

    real_rename = os.rename

    def mockrename(src, dst):
        # Only sources that already carry the '-across-fs' marker are
        # really renamed; every other rename fails with EXDEV.
        if '-across-fs' in src:
            real_rename(src, dst)
        else:
            rename_across_filesystems(src, dst)

    copy = '%s-across-fs' % self.copy
    with mock.patch('os.rename', mockrename):
        t.move(copy)

    self.assertFalse(os.path.exists(self.path))
    self.assertTrue(os.path.exists(copy))
    # Close the read handle explicitly rather than leaking it.
    with LocalTarget(copy).open('r') as moved:
        self.assertEqual('test_data', moved.read())
def dump(target: luigi.LocalTarget, obj):
    """Serialize ``obj`` into ``target``.

    When the target's path ends in ``.pkl`` the object is pickled with
    protocol 4; for any other extension ``str(obj)`` is written instead.
    The target is always opened with mode ``'w'``.
    """
    _, suffix = os.path.splitext(target.path)
    as_pickle = suffix == '.pkl'
    with target.open('w') as sink:
        payload = pickle.dumps(obj, protocol=4) if as_pickle else str(obj)
        sink.write(payload)
def write_output(self, test_type: str, test_file: str, output_file: TextIO, test_output: LocalTarget):
    """Copy the non-empty lines of ``test_output`` into ``output_file``.

    Each copied line is written as ``"<test_type> <test_file> <line>"``
    followed by a newline. Lines are obtained by splitting the content of
    ``test_output`` on ``"\\n"``; empty lines are skipped.
    """
    with test_output.open("r") as source:
        content = source.read()
    non_empty_lines = (entry for entry in content.split("\n") if entry)
    for entry in non_empty_lines:
        output_file.write("%s %s %s\n" % (test_type, test_file, entry))
def valid_io_modes(self, *a, **kw):
    """Return the set of modes that ``io.FileIO`` accepts for an existing file.

    Every mode produced by ``self.theoretical_io_modes(*a, **kw)`` is tried
    against a freshly created temporary target. A mode counts as valid when
    opening succeeds or fails only because the file already exists
    (``EEXIST``); modes rejected with ``ValueError`` are dropped and any
    other ``IOError`` is re-raised.
    """
    accepted = set()
    scratch = LocalTarget(is_tmp=True)
    # Make sure the file exists before probing the modes.
    scratch.open('w').close()
    for candidate in self.theoretical_io_modes(*a, **kw):
        try:
            io.FileIO(scratch.path, candidate).close()
        except ValueError:
            # Mode string rejected outright: not a valid mode.
            continue
        except IOError as exc:
            if exc.errno != EEXIST:
                raise
        accepted.add(candidate)
    return accepted
def test_tmp(self):
    # A temporary target must not exist on disk until its writer is closed,
    # must be readable afterwards, and its file must be gone once the
    # target object itself has been deleted.
    tmp_target = LocalTarget(is_tmp=True)
    self.assertFalse(tmp_target.exists())
    self.assertFalse(os.path.exists(tmp_target.path))

    writer = tmp_target.open('w')
    print('test', file=writer)
    # Still nothing at the final path while the writer is open.
    self.assertFalse(tmp_target.exists())
    self.assertFalse(os.path.exists(tmp_target.path))
    writer.close()
    self.assertTrue(tmp_target.exists())
    self.assertTrue(os.path.exists(tmp_target.path))

    reader = tmp_target.open('r')
    self.assertEqual(reader.readline(), 'test\n')
    reader.close()

    path = tmp_target.path
    del tmp_target  # should remove the underlying file
    self.assertFalse(os.path.exists(path))
def test_open_modes(self):
    # Every mode reported as valid must open (and close) without error on a
    # temporary target; every mode reported as invalid must raise.
    t = LocalTarget(is_tmp=True)

    valid_groups = (
        ('Valid write mode:', self.valid_write_io_modes_for_luigi),
        ('Valid read mode:', self.valid_read_io_modes_for_luigi),
    )
    for label, list_modes in valid_groups:
        print(label, end=' ')
        for mode in list_modes():
            print(mode, end=' ')
            t.open(mode).close()
        print()

    print('Invalid mode:', end=' ')
    for mode in self.invalid_io_modes_for_luigi():
        print(mode, end=' ')
        with self.assertRaises(Exception):
            t.open(mode)
    print()
def test_format_chain_reverse(self):
    # Write gzip-compressed UTF-8 bytes (containing "\r\n") directly with
    # the gzip module and check what a UTF8 >> Gzip target reads back.
    t = LocalTarget(self.path, luigi.format.UTF8 >> luigi.format.Gzip)

    # Context manager guarantees the gzip stream is flushed and closed even
    # if the write fails.
    with gzip.open(self.path, 'wb') as f:
        f.write(b'\xe6\x88\x91\xc3\xa9\r\n\xc3\xa7\xd1\x84')

    with t.open('r') as f:
        b = f.read()

    self.assertEqual(u'我é\nçф', b)
def test_format_chain_reverse(self):
    # Produce the gzip file with the standard gzip module, then read it
    # through a target whose format chain is UTF8 >> Gzip and compare the
    # decoded text with the expected string.
    t = LocalTarget(self.path, luigi.format.UTF8 >> luigi.format.Gzip)

    # Use a context manager so the gzip handle cannot leak when the write
    # raises.
    with gzip.open(self.path, "wb") as f:
        f.write(b"\xe6\x88\x91\xc3\xa9\r\n\xc3\xa7\xd1\x84")

    with t.open("r") as f:
        b = f.read()

    self.assertEqual(u"我é\nçф", b)
def test_move(self):
    # Moving a LocalTarget must remove the file at the source path and
    # create it at the destination path.
    t = LocalTarget(self.path)
    test_data = 'test'
    # Context manager closes the writer even if the write fails.
    with t.open('w') as f:
        f.write(test_data)
    self.assertTrue(os.path.exists(self.path))
    self.assertFalse(os.path.exists(self.copy))

    t.move(self.copy)

    self.assertFalse(os.path.exists(self.path))
    self.assertTrue(os.path.exists(self.copy))
def test_format_chain(self):
    # Write text through a (UTF-8, "\r\n" newline) >> Gzip format chain and
    # verify the decompressed bytes with the standard gzip module.
    UTF8WIN = luigi.format.TextFormat(encoding="utf8", newline="\r\n")
    t = LocalTarget(self.path, UTF8WIN >> luigi.format.Gzip)
    a = u"我é\nçф"

    with t.open("w") as f:
        f.write(a)

    # Context manager closes the gzip reader even if reading fails.
    with gzip.open(self.path, "rb") as f:
        b = f.read()

    self.assertEqual(b"\xe6\x88\x91\xc3\xa9\r\n\xc3\xa7\xd1\x84", b)
def test_format_chain(self):
    # Text written through a TextFormat(utf8, "\r\n") >> Gzip target must
    # decompress to the expected UTF-8 bytes with "\r\n" line endings.
    UTF8WIN = luigi.format.TextFormat(encoding='utf8', newline='\r\n')
    t = LocalTarget(self.path, UTF8WIN >> luigi.format.Gzip)
    a = u'我é\nçф'

    with t.open('w') as f:
        f.write(a)

    # Read back via the gzip module; the handle is closed by the context
    # manager rather than by an explicit close() that a failure could skip.
    with gzip.open(self.path, 'rb') as f:
        b = f.read()

    self.assertEqual(b'\xe6\x88\x91\xc3\xa9\r\n\xc3\xa7\xd1\x84', b)
def test_bzip2(self):
    # Data written through a Bzip2-formatted target must only appear on
    # disk after close(), and must be readable both with the bz2 module and
    # with luigi's own Bzip2 reader.
    t = LocalTarget(self.path, luigi.format.Bzip2)
    p = t.open('w')
    test_data = b'test'
    p.write(test_data)
    print(self.path)
    # The writer is deliberately kept open here: the file must not exist at
    # its final path before close().
    self.assertFalse(os.path.exists(self.path))
    p.close()
    self.assertTrue(os.path.exists(self.path))

    # Using bzip module as validation
    with bz2.BZ2File(self.path, 'r') as f:
        self.assertEqual(test_data, f.read())

    # Verifying our own bzip2 reader
    with LocalTarget(self.path, luigi.format.Bzip2).open('r') as f:
        self.assertEqual(test_data, f.read())
def test_gzip_with_module(self):
    # Data written through a Gzip-formatted target must only appear on disk
    # after close(), and must be readable both with the gzip module and
    # with luigi's own Gzip reader.
    t = LocalTarget(self.path, luigi.format.Gzip)
    p = t.open('w')
    test_data = b'test'
    p.write(test_data)
    print(self.path)
    # The writer is deliberately kept open here: the file must not exist at
    # its final path before close().
    self.assertFalse(os.path.exists(self.path))
    p.close()
    self.assertTrue(os.path.exists(self.path))

    # Using gzip module as validation
    with gzip.open(self.path, 'r') as f:
        self.assertEqual(test_data, f.read())

    # Verifying our own gzip reader
    with LocalTarget(self.path, luigi.format.Gzip).open('r') as f:
        self.assertEqual(test_data, f.read())
def sort_file(tgt: LocalTarget):
    """Parse the JSON document stored in ``tgt`` and return the result.

    NOTE(review): despite the name, this function performs no sorting; it
    only opens the target for reading and decodes its JSON content.
    """
    with tgt.open("r") as handle:
        return json.load(handle)
def load(target: luigi.LocalTarget):
    """Unpickle and return the object stored in ``target``.

    The target is opened with mode ``'r'`` and the resulting file object is
    handed to ``pickle.load``.
    """
    with target.open('r') as source:
        restored = pickle.load(source)
    return restored
class StoppableTask(luigi.Task):
    """Luigi task that records its dependencies and timings on disk.

    Subclasses override :meth:`requires_tasks` and :meth:`run_task` rather
    than ``requires`` and ``run``. All bookkeeping files live below
    ``build_config().output_directory``:

    * ``TASK_FAILED`` holds the id of the first task that failed; when it
      exists, :meth:`run` raises ``StoppingFurtherExecution``.
    * ``dependencies/<task_id>/{requires,dynamic}.json`` hold one JSON
      document per line for each recorded dependency.
    * ``timers/state/...`` hold raw timestamps and durations, and
      ``timers/results/...`` hold the values computed in :meth:`on_success`.
    """

    logger = logging.getLogger('luigi-interface')

    def set_class_targets(self):
        """Set up the failure marker target and all bookkeeping paths."""
        self.failed_target = LocalTarget(build_config().output_directory + "/TASK_FAILED")
        self.create_dependencies_file()
        self.create_timer_files()

    def create_dependencies_file(self):
        """Create this task's dependency directory and compute its file paths."""
        self.dependencies_dir = pathlib.Path(
            build_config().output_directory).joinpath("dependencies")
        self.dependencies_task_dir = self.dependencies_dir.joinpath(
            self.task_id)
        self.dependencies_task_dir.mkdir(parents=True, exist_ok=True)
        self.dependencies_requires_file = self.dependencies_task_dir.joinpath(
            "requires.json")
        self.dependencies_dynamic_file = self.dependencies_task_dir.joinpath(
            "dynamic.json")

    def create_timer_files(self):
        """Compute all timer paths and create the timer state directories."""
        self.create_timer_base_directories()
        self.create_creation_timer_file()
        self.create_first_run_timer_file()
        self.create_run_timer_file()

    def create_timer_base_directories(self):
        """Compute the timer base paths (no directory is created here)."""
        self.timers_dir = pathlib.Path(
            build_config().output_directory).joinpath("timers")
        self.timers_state_dir = self.timers_dir.joinpath("state")
        self.timers_result_dir = self.timers_dir.joinpath("results")

    def create_creation_timer_file(self):
        """Create the ``creation`` state directory and set this task's file path."""
        self.creation_timer_state_dir = self.timers_state_dir.joinpath(
            "creation")
        self.creation_timer_state_dir.mkdir(parents=True, exist_ok=True)
        self.creation_timer_state_file = self.creation_timer_state_dir.joinpath(
            self.task_id)

    def create_first_run_timer_file(self):
        """Create the ``first_run`` state directory and set this task's file path."""
        self.first_run_timer_state_dir = self.timers_state_dir.joinpath(
            "first_run")
        self.first_run_timer_state_dir.mkdir(parents=True, exist_ok=True)
        self.first_run_timer_state_file = self.first_run_timer_state_dir.joinpath(
            self.task_id)

    def create_run_timer_file(self):
        """Create the ``run`` state directory and set this task's file path."""
        self.run_timer_state_dir = self.timers_state_dir.joinpath("run")
        self.run_timer_state_dir.mkdir(parents=True, exist_ok=True)
        self.run_timer_state_file = self.run_timer_state_dir.joinpath(
            self.task_id)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.set_class_targets()
        self.write_creation_time()

    def requires(self):
        """Return the tasks from :meth:`requires_tasks`, recording them once.

        A ``None`` result yields an empty list. A task, list, tuple or dict
        is handled as a plain value; anything else is treated as a
        generator.
        """
        tasks = self.requires_tasks()
        if tasks is None:
            return []
        if isinstance(tasks, (luigi.Task, list, tuple, dict)):
            return self.handle_requires_value(tasks)
        return self.handle_requires_generator(tasks)

    def handle_requires_value(self, tasks):
        """Record ``tasks`` in the requires file if it does not exist yet; return ``tasks``."""
        if not self.dependencies_requires_file.exists():
            with self.dependencies_requires_file.open(
                    "w") as dependencies_file:
                self.write_dependency(
                    dependencies_file=dependencies_file,
                    dependency_type=DependencyType.requires,
                    dependency_state=DependencyState.requested,
                    index=0,
                    value=tasks)
        return tasks

    def handle_requires_generator(self, tasks):
        """Record the generated tasks if the requires file does not exist yet.

        When the file is written, the generator is consumed and its elements
        are returned as a list; otherwise the generator is returned as is.
        """
        if not self.dependencies_requires_file.exists():
            with self.dependencies_requires_file.open(
                    "w") as dependencies_file:
                result = list(
                    self.write_dependencies_for_generator(
                        dependencies_file=dependencies_file,
                        task_generator=tasks,
                        dependency_type=DependencyType.requires))
            return result
        return tasks

    def requires_tasks(self):
        """Hook for subclasses: return the required tasks (default: ``None``)."""
        pass

    def write_creation_time(self):
        """Store the current timestamp as creation time unless already stored."""
        if not self.creation_timer_state_file.exists():
            with self.creation_timer_state_file.open("w") as f:
                f.write(str(datetime.now().timestamp()))

    def run(self):
        """Run :meth:`run_task`, recording dynamic dependencies and the run time.

        Raises ``StoppingFurtherExecution`` (via
        :meth:`fail_if_any_task_failed`) when the failure marker exists.
        """
        start_time = datetime.now()
        self.write_first_run_start_time(start_time)
        still_running_logger_thread = self.start_still_running_logger()
        try:
            self.fail_if_any_task_failed()
            with self.dependencies_dynamic_file.open("a") as dependencies_file:
                task_generator = self.run_task()
                if task_generator is not None:
                    yield from self.write_dependencies_for_generator(
                        dependencies_file=dependencies_file,
                        task_generator=task_generator,
                        dependency_type=DependencyType.dynamic)
        finally:
            # Stop the logger thread even when writing the run time fails,
            # otherwise the thread would be left running.
            try:
                self.write_run_time(start_time)
            finally:
                self.stop_still_running_logger(still_running_logger_thread)

    def write_dependencies_for_generator(self, dependencies_file,
                                         task_generator,
                                         dependency_type: DependencyType):
        """Re-yield the elements of ``task_generator``, recording each one.

        Every element is written to ``dependencies_file`` with a running
        index before it is yielded; the value sent back into this generator
        is forwarded to ``task_generator``.
        """
        index = 0
        try:
            element = next(task_generator)
            while True:
                self.write_dependency(
                    dependencies_file=dependencies_file,
                    dependency_type=dependency_type,
                    dependency_state=DependencyState.requested,
                    index=index,
                    value=element)
                result = yield element
                element = task_generator.send(result)
                index += 1
        except StopIteration:
            pass

    def write_dependency(self, dependencies_file,
                         dependency_type: DependencyType,
                         dependency_state: DependencyState, index: int,
                         value):
        """Write one JSON line per ``StoppableTask`` contained in ``value``."""
        for task in self.flatten_tasks(value):
            dependency = TaskDependency(source=self.get_task_description(),
                                        target=task.get_task_description(),
                                        type=dependency_type,
                                        index=index,
                                        state=dependency_state)
            dependencies_file.write(f"{dependency.to_json()}")
            dependencies_file.write("\n")

    def get_task_description(self) -> TaskDescription:
        """Return this task's id together with its string representation."""
        return TaskDescription(id=self.task_id, representation=str(self))

    def flatten_tasks(self, generator: Generator) -> List["StoppableTask"]:
        """Flatten ``generator`` with luigi and keep only ``StoppableTask`` instances."""
        return [
            task for task in luigi.task.flatten(generator)
            if isinstance(task, StoppableTask)
        ]

    def run_task(self) -> Generator:
        """Hook for subclasses: do the work, optionally yielding dynamic dependencies."""
        pass

    def write_first_run_start_time(self, start_time):
        """Store ``start_time`` as first-run timestamp unless already stored."""
        if not self.first_run_timer_state_file.exists():
            with self.first_run_timer_state_file.open("w") as f:
                f.write(str(start_time.timestamp()))

    def start_still_running_logger(self):
        """Start and return a ``StillRunningLoggerThread`` for this task."""
        # TODO use larger delay for this StillRunningLogger
        still_running_logger = StillRunningLogger(self.logger,
                                                  self.__repr__(), "task")
        still_running_logger_thread = StillRunningLoggerThread(
            still_running_logger)
        still_running_logger_thread.start()
        return still_running_logger_thread

    def write_run_time(self, start_time):
        """Append the seconds elapsed since ``start_time`` to the run timer file."""
        elapsed = datetime.now() - start_time
        with self.run_timer_state_file.open("a") as f:
            f.write(str(elapsed.total_seconds()))
            f.write("\n")

    def stop_still_running_logger(self, still_running_logger_thread):
        """Stop the given logger thread and wait for it to finish."""
        still_running_logger_thread.stop()
        still_running_logger_thread.join()

    def fail_if_any_task_failed(self):
        """Raise ``StoppingFurtherExecution`` if the failure marker exists."""
        if self.failed_target.exists():
            with self.failed_target.open("r") as f:
                failed_task = f.read()
            raise StoppingFurtherExecution(
                "Task %s failed. Stopping further execution." % failed_task)

    def on_success(self):
        """Log and persist the timing results, then defer to the base class."""
        now = datetime.now()
        self.log_time_since_first_run(now)
        self.log_time_since_creation(now)
        self.log_time_of_runs()
        # Propagate whatever the base implementation returns.
        return super().on_success()

    def _log_time_since(self, state_file, now, label):
        """Log and store the seconds between the timestamp in ``state_file`` and ``now``.

        Does nothing when ``state_file`` does not exist. The result is
        written to ``<task_id>_since_<label>`` in the timer results
        directory.
        """
        if not state_file.exists():
            return
        with state_file.open("r") as f:
            start_time_str = f.read()
        start_time = datetime.fromtimestamp(float(start_time_str))
        elapsed = now - start_time
        self.logger.info("Task %s: Time since " + label + " %s s",
                         self.__repr__(), elapsed.total_seconds())
        self.timers_result_dir.mkdir(parents=True, exist_ok=True)
        with self.timers_result_dir.joinpath(
                self.task_id + "_" + "since_" + label).open("w") as f:
            f.write(str(elapsed.total_seconds()))

    def log_time_since_creation(self, now):
        """Log and store the time elapsed since the task was created."""
        self._log_time_since(self.creation_timer_state_file, now, "creation")

    def log_time_since_first_run(self, now):
        """Log and store the time elapsed since the task was first run."""
        self._log_time_since(self.first_run_timer_state_file, now,
                             "first_run")

    def log_time_of_runs(self):
        """Log and store the sum of all recorded run durations."""
        if self.run_timer_state_file.exists():
            with self.run_timer_state_file.open("r") as f:
                total_runtime = self.calculate_total_runtime(f.readlines())
            self.logger.info("Task %s: Total runtime of run method %s s",
                             self.__repr__(), total_runtime)
            # Create the results directory here as well so this method does
            # not depend on one of the other log_* methods having run first.
            self.timers_result_dir.mkdir(parents=True, exist_ok=True)
            with self.timers_result_dir.joinpath(
                    self.task_id + "_" + "total_run").open("w") as f:
                f.write(str(total_runtime))

    def calculate_total_runtime(self, lines):
        """Return the sum of ``float(line)`` over ``lines`` (``0`` when empty)."""
        total_runtime = 0
        for line in lines:
            total_runtime += float(line)
        return total_runtime

    def on_failure(self, exception):
        """Write the failure marker (once) and defer to the base class.

        The marker is not written when ``exception`` is a
        ``StoppingFurtherExecution``, i.e. when this task only stopped
        because another task had already failed.
        """
        if not isinstance(exception, StoppingFurtherExecution):
            if not self.failed_target.exists():
                with self.failed_target.open("w") as f:
                    f.write("%s" % self.task_id)
        # Return the base implementation's result so the error explanation
        # it builds is not discarded.
        return super().on_failure(exception)

    def __repr__(self):
        """
        Build a task representation like `MyTask(param1=1.5, param2='5')`
        """
        params = self.get_params()
        param_values = self.get_param_values(params, [], self.param_kwargs)

        # Build up task id
        repr_parts = []
        param_objs = dict(params)
        for param_name, param_value in param_values:
            # Only significant, publicly visible parameters are included.
            if param_objs[param_name].significant and \
                    param_objs[param_name].visibility == luigi.parameter.ParameterVisibility.PUBLIC:
                repr_parts.append(
                    '%s=%s' %
                    (param_name,
                     param_objs[param_name].serialize(param_value)))

        task_str = '{}({})'.format(self.get_task_family(),
                                   ', '.join(repr_parts))
        return task_str
def read_company_search_officers(
        self, company_search_officers_target: LocalTarget) -> Series:
    """Unpickle and return the object stored in the given target.

    The target is opened in binary read mode (``'rb'``) and its content is
    decoded with ``pickle.load``.
    """
    with company_search_officers_target.open('rb') as pickled:
        officers = pickle.load(pickled)
    return officers
class ExternalFileTarget(luigi.target.FileSystemTarget):
    """File target that wraps either a ``LocalTarget`` or a ``RemoteTarget``.

    Which one is used depends on ``commons().is_remote``. ``open``,
    ``exists`` and ``remove`` are forwarded to the wrapped target. For
    remote targets with ``file_type == 'apk'`` the file is additionally
    fetched into a temporary local file, and that local path becomes this
    target's path.
    """

    # @property
    # def path(self):

    def __init__(self, path, file_type='regular', root_dir=None, format=None, **kwargs):
        """Build the wrapped target for ``path``.

        ``path`` is joined onto ``root_dir`` when given, otherwise onto
        ``commons().remote_root`` or ``commons().local_root`` depending on
        whether remote mode is active. ``format`` and any extra keyword
        arguments are passed on to the wrapped target; in remote mode the
        ``port`` keyword is set from ``commons().SSH_PORT``.
        """
        self.is_remote = commons().is_remote

        if root_dir:
            base_dir = root_dir
        elif self.is_remote:
            base_dir = commons().remote_root
        else:
            base_dir = commons().local_root
        full_path = os.path.join(base_dir, path)

        self.file_type = file_type
        self.format = format

        if not self.is_remote:
            self._target = LocalTarget(full_path, format=format, **kwargs)
            effective_path = self._target.path
        else:
            host = commons().SSH_HOST
            kwargs['port'] = commons().SSH_PORT
            self._target = RemoteTarget(full_path, host, format=format, **kwargs)
            if file_type == 'apk':
                # create temporary local copy
                self.local_path = os.path.join(
                    tempfile.gettempdir(),
                    'luigi-{}-{}.apk'.format(os.path.basename(path),
                                             random.randint(0, 999999999)))
                self._target.get(self.local_path)
                effective_path = self.local_path
            else:
                effective_path = self._target.path

        super(ExternalFileTarget, self).__init__(effective_path)
        # XXX: check if this is right

    def fs(self):
        """Return the wrapped target's ``fs``.

        NOTE(review): this is a plain method, not a property — confirm that
        callers invoke it as ``target.fs()``.
        """
        return self._target.fs

    def open(self, mode='r'):
        """Open the wrapped target with ``mode`` and return the result."""
        return self._target.open(mode)

    def exists(self):
        """Return whether the wrapped target exists."""
        return self._target.exists()

    def remove(self):
        """Remove the wrapped target."""
        return self._target.remove()

    def cleanup(self):
        """Delete the temporary local copy, if there is one.

        Best effort: a missing ``local_path`` attribute or a failing
        ``os.remove`` is ignored.
        """
        try:
            os.remove(self.local_path)
        except (OSError, AttributeError):
            pass