import hashlib
import os
import sys
from glob import glob
from os.path import isdir

# Project-level names used below (Dag, DagStorage, DagLibrary, File, Task,
# TaskStatus, TaskDependence, now, yaml_load, yaml_dump, merge_dicts_smart,
# INSTALL_DEPENDENCIES, control_requirements) come from the surrounding
# project and are assumed to be imported here.


def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
    # md5 -> file id for everything already stored in this project,
    # used to deduplicate file contents across dags.
    hashs = self.file_provider.hashs(dag.project)
    files = []
    all_files = []
    spec = self._build_spec(folder)

    for o in glob(os.path.join(folder, '**'), recursive=True):
        path = os.path.relpath(o, folder)
        if spec.match_file(path) or path == '.':
            continue

        if isdir(o):
            self.provider.add(
                DagStorage(dag=dag.id, path=path, is_dir=True)
            )
            continue

        with open(o, 'rb') as f:
            content = f.read()
        md5 = hashlib.md5(content).hexdigest()
        all_files.append(o)

        if md5 in hashs:
            # Identical content was stored before: reuse the existing row.
            file_id = hashs[md5]
        else:
            file = File(
                md5=md5,
                content=content,
                project=dag.project,
                dag=dag.id,
                created=now()
            )
            self.file_provider.add(file)
            file_id = file.id
            hashs[md5] = file.id
            files.append(o)

        self.provider.add(
            DagStorage(dag=dag.id, path=path, file=file_id, is_dir=False)
        )

    if INSTALL_DEPENDENCIES and control_reqs:
        reqs = control_requirements(folder, files=all_files)
        for name, rel, version in reqs:
            self.library_provider.add(
                DagLibrary(dag=dag.id, library=name, version=version)
            )
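
# --- Illustrative sketch (not part of the project) ----------------------
# A minimal, self-contained example of the content-addressed storage idea
# `upload` implements above: identical file contents collapse to a single
# blob keyed by md5, and each path only records which blob it points at.
# `dedup_folder` is a hypothetical name; the real code persists blobs
# through `file_provider` rather than a dict.
def dedup_folder(folder):
    store = {}   # md5 -> content, one copy per unique blob
    layout = {}  # relative path -> md5 of its content
    for root, _, names in os.walk(folder):
        for name in names:
            full = os.path.join(root, name)
            with open(full, 'rb') as fh:
                content = fh.read()
            md5 = hashlib.md5(content).hexdigest()
            store.setdefault(md5, content)  # keep the first copy only
            layout[os.path.relpath(full, folder)] = md5
    return store, layout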
def copy_from(self, src: int, dag: Dag):
    # Clone all storage entries and library records of the source dag for
    # the new one; the underlying File blobs are shared by id, not copied.
    storages = self.provider.query(DagStorage). \
        filter(DagStorage.dag == src). \
        all()
    libraries = self.library_provider.query(DagLibrary). \
        filter(DagLibrary.dag == src). \
        all()

    s_news = []
    for s in storages:
        s_new = DagStorage(
            dag=dag.id,
            file=s.file,
            path=s.path,
            is_dir=s.is_dir
        )
        s_news.append(s_new)

    l_news = []
    for l in libraries:
        l_new = DagLibrary(
            dag=dag.id,
            library=l.library,
            version=l.version
        )
        l_news.append(l_new)

    self.provider.add_all(s_news)
    self.library_provider.add_all(l_news)
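
# --- Illustrative sketch (not part of the project) ----------------------
# The cloning pattern `copy_from` relies on, shown with plain dicts in
# place of ORM models: only the `dag` foreign key changes, so the
# (potentially large) blobs referenced by `file` ids are shared between
# the source and the copy rather than duplicated. `clone_rows` is a
# hypothetical name.
def clone_rows(rows, new_dag_id):
    clones = []
    for row in rows:
        clone = dict(row)
        clone['dag'] = new_dag_id  # repoint the foreign key only
        clones.append(clone)
    return clones


# Example: two storage rows of dag 1 repointed at dag 2; the `file` ids
# stay untouched.
print(clone_rows(
    [{'dag': 1, 'file': 10, 'path': 'model.py', 'is_dir': False},
     {'dag': 1, 'file': None, 'path': 'configs', 'is_dir': True}],
    new_dag_id=2))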
def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
    self.log_info('upload started')
    hashs = self.file_provider.hashs(dag.project)
    self.log_info('hashes are retrieved')

    all_files = []
    spec = self._build_spec(folder)

    # The first glob is non-recursive; directories found at the top level
    # are expanded recursively below.
    files = glob(os.path.join(folder, '**'))
    for file in files[:]:
        path = os.path.relpath(file, folder)
        if spec.match_file(path) or path == '.':
            continue
        if os.path.isdir(file):
            # `file` is already prefixed with `folder`, so glob from it
            # directly instead of joining `folder` a second time.
            child_files = glob(os.path.join(file, '**'), recursive=True)
            files.extend(child_files)

    if self.max_count and len(files) > self.max_count:
        raise Exception(f'files count = {len(files)} '
                        f'But max count = {self.max_count}')

    self.log_info('list of files formed')

    folders_to_add = []
    files_to_add = []
    files_storage_to_add = []
    total_size_added = 0

    for o in files:
        path = os.path.relpath(o, folder)
        if spec.match_file(path) or path == '.':
            continue

        if isdir(o):
            folder_to_add = DagStorage(dag=dag.id, path=path, is_dir=True)
            folders_to_add.append(folder_to_add)
            continue

        with open(o, 'rb') as f:
            content = f.read()
        # sys.getsizeof includes the bytes object overhead; it is used
        # here as an approximation of the file size.
        size = sys.getsizeof(content)
        if self.max_file_size and size > self.max_file_size:
            raise Exception(
                f'file = {o} has size {size}.'
                f' But max size is set to {self.max_file_size}')

        md5 = hashlib.md5(content).hexdigest()
        all_files.append(o)

        if md5 not in hashs:
            # New content: the File object itself is kept in the map until
            # the bulk insert assigns it an id.
            file = File(md5=md5,
                        content=content,
                        project=dag.project,
                        dag=dag.id,
                        created=now())
            hashs[md5] = file
            files_to_add.append(file)
            total_size_added += size

        file_storage = DagStorage(dag=dag.id,
                                  path=path,
                                  file=hashs[md5],
                                  is_dir=False)
        files_storage_to_add.append(file_storage)

    self.log_info('inserting DagStorage folders')
    if len(folders_to_add) > 0:
        self.provider.bulk_save_objects(folders_to_add)

    self.log_info('inserting Files')
    if len(files_to_add) > 0:
        # return_defaults=True makes the ORM fetch the generated ids,
        # which are needed below to link DagStorage rows to File rows.
        self.file_provider.bulk_save_objects(files_to_add,
                                             return_defaults=True)

    self.log_info('inserting DagStorage Files')
    if len(files_storage_to_add) > 0:
        for file_storage in files_storage_to_add:
            if isinstance(file_storage.file, File):
                # noinspection PyUnresolvedReferences
                file_storage.file = file_storage.file.id
        self.provider.bulk_save_objects(files_storage_to_add)

    dag.file_size += total_size_added
    self.dag_provider.update()

    if INSTALL_DEPENDENCIES and control_reqs:
        reqs = control_requirements(folder, files=all_files)
        for name, rel, version in reqs:
            self.library_provider.add(
                DagLibrary(dag=dag.id, library=name, version=version))
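
# --- Illustrative sketch (not part of the project) ----------------------
# The deferred-id pattern used above, with in-memory stand-ins for the
# ORM: new File rows are collected first, persisted in one bulk call that
# assigns primary keys (what return_defaults=True does in SQLAlchemy), and
# only then are the storage rows' `file` references rewritten from objects
# to ids. `FakeFile` and `bulk_save` are hypothetical stand-ins.
class FakeFile:
    _next_id = 1

    def __init__(self, md5):
        self.md5 = md5
        self.id = None


def bulk_save(objects):
    # Stands in for bulk_save_objects(..., return_defaults=True): the
    # database would populate the primary keys here.
    for obj in objects:
        obj.id = FakeFile._next_id
        FakeFile._next_id += 1


new_files = [FakeFile('aa'), FakeFile('bb')]
storage_rows = [{'path': 'a.py', 'file': new_files[0]},
                {'path': 'b.py', 'file': new_files[1]}]
bulk_save(new_files)
for row in storage_rows:
    if isinstance(row['file'], FakeFile):
        row['file'] = row['file'].id  # object reference -> id
print(storage_rows)  # [{'path': 'a.py', 'file': 1}, {'path': 'b.py', 'file': 2}]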
def create_tasks(self):
    tasks = self.task_provider.by_dag(self.dag)
    tasks_new = []
    tasks_old = []

    for t in tasks:
        # Child tasks are created by their parents at run time;
        # only top-level tasks are cloned here.
        if t.parent:
            continue

        task = Task(
            name=t.name,
            status=TaskStatus.NotRan.value,
            computer=t.computer,
            gpu=t.gpu,
            gpu_max=t.gpu_max,
            cpu=t.cpu,
            executor=t.executor,
            memory=t.memory,
            steps=t.steps,
            dag=self.dag_db.id,
            debug=t.debug,
            type=t.type,
        )
        task.additional_info = t.additional_info
        tasks_new.append(task)
        tasks_old.append(t)

    # return_defaults=True fills in the generated ids so that the
    # old -> new id mapping can be built below.
    self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
    old2new = {
        t_old.id: t_new.id
        for t_new, t_old in zip(tasks_new, tasks_old)
    }

    dependencies = self.task_provider.get_dependencies(self.dag)
    dependencies_new = []
    for d in dependencies:
        d_new = TaskDependence(task_id=old2new[d.task_id],
                               depend_id=old2new[d.depend_id])
        dependencies_new.append(d_new)
    self.task_provider.bulk_save_objects(dependencies_new,
                                         return_defaults=False)

    changes = yaml_load(self.file_changes)
    storages = self.dag_storage_provider.by_dag(self.dag)
    storages_new = []

    for s, f in storages:
        if not isinstance(changes, dict):
            continue

        replace = self.find_replace(changes, s.path)
        if replace is not None and f:
            content = f.content.decode('utf-8')
            if s.path.endswith('.yml'):
                data = yaml_load(content)
                data = merge_dicts_smart(data, replace)
                content = yaml_dump(data)
            else:
                # `replace` maps literal substrings to their replacements;
                # iterate over its items, not its keys.
                for k, v in replace.items():
                    if k not in content:
                        raise Exception(f'{k} is not in the content')
                    content = content.replace(k, v)
            content = content.encode('utf-8')

            md5 = hashlib.md5(content).hexdigest()
            f = self.file_provider.by_md5(md5)
            if not f:
                f = File(content=content,
                         created=now(),
                         project=self.dag_db.project,
                         md5=md5,
                         dag=self.dag_db.id)
                self.file_provider.add(f)

        s_new = DagStorage(dag=self.dag_db.id,
                           file=f.id,
                           path=s.path,
                           is_dir=s.is_dir)
        storages_new.append(s_new)

    self.dag_storage_provider.bulk_save_objects(storages_new,
                                                return_defaults=False)
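
# --- Illustrative sketch (not part of the project) ----------------------
# The id-remapping step from create_tasks, reduced to plain data: after
# the cloned tasks receive fresh ids, every (task_id, depend_id) edge of
# the old dag is translated through the old -> new mapping so the
# dependency graph keeps its exact shape.
old_ids = [5, 6, 7]
new_ids = [101, 102, 103]  # ids assigned by the bulk insert
old2new = dict(zip(old_ids, new_ids))

old_edges = [(6, 5), (7, 6)]  # task 6 depends on 5, task 7 on 6
new_edges = [(old2new[t], old2new[d]) for t, d in old_edges]
print(new_edges)  # [(102, 101), (103, 102)]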