def check_or_install_fuse(self):
    """Install the preferred S3 FUSE implementation and report which one is usable.

    The preference is read from the CP_S3_FUSE_TYPE environment variable
    (defaults to goofys). If s3fs is preferred but its installation fails,
    goofys is installed as a fallback.

    :return: FUSE_GOOFYS_ID or FUSE_S3FS_ID on success, FUSE_NA_ID when no
             FUSE implementation could be installed or the type is unknown.
    """
    # Bug fix: the original body contained a stray shell token `fi` after the
    # s3fs fallback branch, which is a Python syntax error.
    fuse_type = os.getenv('CP_S3_FUSE_TYPE', FUSE_GOOFYS_ID)
    if fuse_type == FUSE_GOOFYS_ID:
        fuse_installed = self.execute_and_check_command('install_s3_fuse_goofys')
        return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
    elif fuse_type == FUSE_S3FS_ID:
        fuse_installed = self.execute_and_check_command('install_s3_fuse_s3fs')
        if fuse_installed:
            return FUSE_S3FS_ID
        else:
            # Preferred s3fs failed to install — fall back to goofys.
            Logger.warn(
                "FUSE {fuse_type} was preferred, but failed to install, will try to setup default goofys"
                .format(fuse_type=fuse_type), task_name=self.task_name)
            fuse_installed = self.execute_and_check_command('install_s3_fuse_goofys')
            return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
    else:
        Logger.warn(
            "FUSE {fuse_type} type is not defined for S3 fuse".format(fuse_type=fuse_type),
            task_name=self.task_name)
        return FUSE_NA_ID
def execute_mount(self, command, params):
    """Run a prepared mount command and log success or failure.

    ``params`` must contain 'path' and 'mount' keys used in the log messages.
    """
    exit_code = common.execute_cmd_command(command, silent=True)
    if exit_code != 0:
        Logger.warn('--> Failed mounting {path} to {mount}'.format(**params),
                    task_name=self.task_name)
        return
    Logger.info('-->{path} mounted to {mount}'.format(**params),
                task_name=self.task_name)
def create_directory(self, path):
    """Create ``path`` (including parents) via ``mkdir -p``.

    :return: True on success, False otherwise (a warning is logged).
    """
    exit_code = common.execute_cmd_command(
        'mkdir -p {path}'.format(path=path), silent=True)
    if exit_code == 0:
        return True
    Logger.warn('Failed to create mount directory: {path}'.format(path=path),
                task_name=self.task_name)
    return False
def main():
    """CLI entry point: wait for a cluster node to start, then print '<name> <ip>'."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--parameter', type=str, required=True, nargs='*')
    arg_parser.add_argument('--task-name', required=True)
    arg_parser.add_argument('--run-id', required=True, type=int)
    opts = arg_parser.parse_args()
    status = StatusEntry(TaskStatus.SUCCESS)
    try:
        node = WaitForNode().await_node_start(opts.parameter, opts.task_name, opts.run_id)
        print(node.name + " " + node.ip)
        exit(0)
    except Exception as e:
        # Record the failure instead of re-raising here so that a single,
        # uniform error is produced below.
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
def main():
    """CLI entry point: start worker nodes and build the MPI hostfile.

    Reads RUN_ID and DEFAULT_HOSTFILE from the environment. On any failure the
    partially-started workers are shut down and a RuntimeError is raised.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--nodes_number', type=int, required=True)
    opts = arg_parser.parse_args()
    run_id = os.environ['RUN_ID']
    hostfile = os.environ['DEFAULT_HOSTFILE']
    status = StatusEntry(TaskStatus.SUCCESS)
    workers = []
    try:
        workers = CreateWorkerNodes().await_workers_start(opts.nodes_number, run_id)
        BuildHostfile().run(workers, hostfile, run_id)
    except Exception as e:
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
        # Tear down whatever workers did come up before reporting failure.
        ShutDownCluster().run(workers, status)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
def child_run_active(self):
    """Return True if the launched child run is currently in RUNNING state.

    Polls the API up to RETRY_COUNT times, sleeping POLL_TIMEOUT between
    attempts; returns False immediately when no child run was launched.

    :raises RuntimeError: when all attempts to fetch the status fail.
    """
    if self.child_id is None:
        return False
    attempts = 0
    while attempts < self.RETRY_COUNT:
        try:
            run = self.api.load_run(self.child_id)
            return run['status'] == 'RUNNING'
        except Exception as e:
            # Bug fix: the format string had no placeholder for the run id
            # ("ID ''"), so the id landed in the message slot and the actual
            # error text was dropped.
            Logger.warn(
                "Failed to fetch child run ID '{}' status: {}.".format(
                    str(self.child_id), e.message),
                task_name=self.TASK_NAME)
            attempts = attempts + 1
            time.sleep(self.POLL_TIMEOUT)
    Logger.fail("Exceeded maximum attempts to fetch child run status.")
    raise RuntimeError("Exceeded maximum attempts to fetch child run status.")
def transfer_data(self, data_paths, log_task):
    """Schedule DTS transfers for ``data_paths`` and block until all complete.

    :param data_paths: paths to transfer; an empty list only logs a warning.
    :param log_task: task name used for all log records.
    :raises RuntimeError: if a transfer could not be scheduled or any transfer
                          finishes with FAILURE status.
    """
    if len(data_paths) > 0:
        Logger.info('Transferring %d path(s)' % len(data_paths), task_name=log_task)
        # Materialize explicitly: the result is iterated more than once below
        # (and map() would be a one-shot iterator on Python 3).
        transfers = list(map(self.__schedule_transfer_task, data_paths))
        for transfer in transfers:
            if transfer is None:
                raise RuntimeError('Upload via DTS failed')
        remaining_ids = [transfer['id'] for transfer in transfers]
        while remaining_ids:
            current_ids = list(remaining_ids)
            for transfer_id in current_ids:
                transfer_task = self.__get_transfer_task(transfer_id)
                source_path = transfer_task['source']['path']
                destination_path = transfer_task['destination']['path']
                if transfer_task['status'] == _TransferStatus.SUCCESS:
                    remaining_ids.remove(transfer_id)
                    # Bug fix: source and destination were swapped in this message.
                    Logger.info(
                        'Data transfer from source %s to destination %s has finished'
                        % (source_path, destination_path), task_name=log_task)
                elif transfer_task['status'] == _TransferStatus.FAILURE:
                    remaining_ids.remove(transfer_id)
                    reason = transfer_task['reason'] if 'reason' in transfer_task else 'No reason available'
                    Logger.fail(
                        "Data transfer from source %s to destination %s went bad due to the reason: '%s'"
                        % (source_path, destination_path, reason), task_name=log_task)
                    raise RuntimeError('Data transfer went bad for source %s' % source_path)
                else:
                    # Still in progress: wait before polling the next task.
                    time.sleep(self.pooling_delay)
            # Log progress only when this sweep actually completed something
            # and there is still work pending.
            if len(remaining_ids) != len(current_ids) and remaining_ids:
                Logger.info('%d data transfers are still being processed'
                            % len(remaining_ids), task_name=log_task)
        Logger.info('All data transfers have finished successfully', task_name=log_task)
    else:
        Logger.warn('No files for data transfer were found', task_name=log_task)
def get_running_samples(self):
    """Return the number of child runs currently in RUNNING state.

    Polls the API up to RETRY_COUNT times with POLL_TIMEOUT between attempts.

    :raises RuntimeError: when all attempts fail.
    """
    attempts = 0
    while attempts < self.RETRY_COUNT:
        try:
            child_runs = self.api.load_child_pipelines(self.run_id)
            # Idiom: count with a generator instead of a manual counter loop.
            return sum(1 for run in child_runs if run['status'] == 'RUNNING')
        except Exception as e:
            Logger.warn("Failed to fetch running samples: {}.".format(e.message),
                        task_name=self.TASK_NAME)
            attempts = attempts + 1
            time.sleep(self.POLL_TIMEOUT)
    Logger.fail("Exceeded maximum attempts to fetch running samples.")
    raise RuntimeError("Exceeded maximum attempts to fetch running samples.")
def main():
    """CLI entry point: wait for the master node to start, then print '<name> <ip>'."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--master-id', type=int, required=True)
    arg_parser.add_argument('--task-name', required=True)
    opts = arg_parser.parse_args()
    run_id = os.environ['RUN_ID']
    status = StatusEntry(TaskStatus.SUCCESS)
    workers = []
    try:
        master = MasterNode().await_master_start(opts.master_id, opts.task_name)
        print(master.name + " " + master.ip)
        exit(0)
    except Exception as e:
        # Record the failure so the uniform error below is raised.
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
def run(self, mount_root, tmp_dir):
    """Mount all remote data storages available to this run under ``mount_root``.

    :param mount_root: base directory under which storages are mounted.
    :param tmp_dir: scratch directory; an "s3fuse" subdirectory is created in
                    it for FUSE temporary files (falls back to /tmp).

    Never raises: any unhandled error is logged as a task failure.
    """
    try:
        Logger.info('Starting mounting remote data storages.', task_name=self.task_name)
        Logger.info('Fetching list of allowed storages...', task_name=self.task_name)
        available_storages = self.api.load_available_storages()
        if not available_storages:
            Logger.success('No remote storages are available', task_name=self.task_name)
            return
        Logger.info(
            'Found {} available storage(s). Checking mount options.'.format(len(available_storages)),
            task_name=self.task_name)
        fuse_tmp = os.path.join(tmp_dir, "s3fuse")
        if not self.create_directory(fuse_tmp):
            # Could not create the dedicated scratch dir — fall back to /tmp.
            fuse_tmp = '/tmp'
        # Returns the installed FUSE type id, or a "not available" marker.
        fuse_available = self.check_or_install_fuse()
        aws_default_region = os.getenv('AWS_DEFAULT_REGION', 'us-east-1')
        aws_region = os.getenv('AWS_REGION', aws_default_region)
        # CP_CAP_LIMIT_MOUNTS: optional comma-separated list of storage ids
        # restricting which storages get mounted for this run.
        limited_storages = os.getenv('CP_CAP_LIMIT_MOUNTS')
        if limited_storages:
            try:
                limited_storages_list = [int(x.strip()) for x in limited_storages.split(',')]
                available_storages = [x for x in available_storages if x.id in limited_storages_list]
                Logger.info(
                    'Run is launched with mount limits ({}) Only {} storages will be mounted'
                    .format(limited_storages, len(available_storages)),
                    task_name=self.task_name)
            except Exception as limited_storages_ex:
                # Malformed limit value: log and proceed with the full list.
                Logger.warn(
                    'Unable to parse CP_CAP_LIMIT_MOUNTS value({}) with error: {}.'
                    .format(limited_storages, str(limited_storages_ex.message)),
                    task_name=self.task_name)
        # NFS is only set up when at least one NFS storage in this region exists.
        nfs_count = len(filter((lambda ds: ds.storage_type == 'NFS' and ds.region_name == aws_region),
                               available_storages))
        nfs_available = nfs_count > 0 and self.check_or_install_nfs()
        if not fuse_available and not nfs_available:
            Logger.success('Mounting of remote storages is not available for this image',
                           task_name=self.task_name)
            return
        for storage in available_storages:
            # Skip storages the run has no read permission for.
            if not PermissionHelper.is_storage_readable(storage):
                continue
            mounter = self.get_mount_manager(storage, nfs_available, fuse_available, fuse_tmp)
            if mounter is not None:
                self.mount(mounter, mount_root)
            elif storage.storage_type != NFS_TYPE and storage.storage_type != S3_TYPE:
                # Known types without a mounter were simply unavailable above;
                # anything else is an unsupported storage type.
                Logger.warn('Unsupported storage type {}.'.format(storage.storage_type),
                            task_name=self.task_name)
        Logger.success('Finished data storage mounting', task_name=self.task_name)
    except Exception as e:
        Logger.fail('Unhandled error during mount task: {}.'.format(str(e.message)),
                    task_name=self.task_name)
# Fan the per-column uploads out to the worker pool, wait for completion, and
# exit with a status reflecting how many transfers succeeded.
# NOTE(review): this fragment relies on names defined earlier in the file
# (pool, metadata_columns, metadata_columns_values, destination, api, ...) —
# presumably the body of an upload entry point; confirm against full file.
for column in metadata_columns:
    for (external_id, internal_id, url, column_type, file_name_format) in metadata_columns_values[column]:
        upload_result = pool.apply_async(
            upload_data,
            (url, destination, file_name_format, column, column_type,
             create_folders_for_columns, internal_id, metadata_id, api, update_paths))
        pool_results.append(upload_result)
pool.close()
pool.join()
# upload_data returns 0 on success; count the successful transfers.
successes_count = sum([1 for x in pool_results if x.get() == 0])
if successes_count == len(pool_results):
    Logger.success("Upload done. All transfers completed successfully",
                   task_name=UPLOAD_TASK_NAME)
    exit(0)
elif successes_count == 0:
    # Every transfer failed — only this case exits non-zero.
    Logger.fail(
        "Upload completed with errors. ALL transfers FAILED\nPlease review errors above",
        task_name=UPLOAD_TASK_NAME)
    exit(1)
else:
    # Partial success is still treated as exit code 0.
    Logger.warn(
        "Upload completed with errors. SOME of the transfers failed to complete\nPlease review errors above",
        task_name=UPLOAD_TASK_NAME)
    exit(0)
def launch_pipeline(self, params, param_names, instance_size, instance_disk,
                    docker_image, cmd, sample=None):
    """Launch sample processing via the pipe CLI command.

    When no child run is active, delegates to launch_child_run() instead.
    Pattern parameters are passed as 'input' params; remaining pipe_params are
    typed by their 'i_'/'c_'/'o_' prefix.
    """
    if not self.child_run_active():
        self.launch_child_run(params, param_names, cmd, instance_size,
                              instance_disk, docker_image, sample=sample)
        return
    pieces = [self.CMD_TEMPLATE.format(pipe_id=self.pipe_id,
                                       version=self.version,
                                       instance_disk=instance_disk,
                                       instance_type=instance_size,
                                       docker_image=docker_image,
                                       cmd=cmd,
                                       parent=self.run_id)]
    if sample:
        pieces.append(self.SAMPLE_TEMPLATE.format(
            sample_name=sample[self.SAMPLE_NAME],
            sample_id=sample[self.SAMPLE_ID]))
    # Pattern params: by-name when a sample is given, positional otherwise.
    for position, name in enumerate(param_names):
        value = ','.join(params[name]) if sample else params[position]
        pieces.append(" --{} input?{}".format(name, value))
    # Remaining pipe params: the 2-char prefix selects the param type.
    for param, value in self.pipe_params.iteritems():
        for prefix, template in (('i_', " --{} input?{}"),
                                 ('c_', " --{} common?{}"),
                                 ('o_', " --{} output?{}")):
            if param.startswith(prefix):
                pieces.append(template.format(self.change_parameter_name(param), value))
                break
        else:
            pieces.append(" --{} {}".format(param, value))
    command = ''.join(pieces)
    Logger.info('Starting pipeline with command: "{}".'.format(command),
                task_name=self.TASK_NAME)
    try:
        LoggedCommand(command, None, self.TASK_NAME).execute()
    except Exception as e:
        Logger.warn(
            "Failed to launch sample processing with command: '{}'. Error: '{}'."
            .format(command, e.message), task_name=self.TASK_NAME)
def launch_child_run(self, params, param_names, cmd, instance_size,
                     instance_disk, docker_image, sample=None):
    """Launch a child pipeline run via the API and remember its id.

    Builds the run parameter map (pattern params as 'input', other params
    typed by their 'i_'/'c_'/'o_' prefix or resolved from the environment) and
    stores the new run id in self.child_id, or None on failure.
    """
    run_params = {'parent-id': self.run_id}
    if sample:
        run_params['sample_name'] = sample[self.SAMPLE_NAME]
        run_params['sample_id'] = sample[self.SAMPLE_ID]
    # Pattern params: by-name when a sample is given, positional otherwise.
    for position, name in enumerate(param_names):
        value = ','.join(params[name]) if sample else params[position]
        run_params[name] = {'value': value, 'type': 'input'}
    # Remaining pipe params: the 2-char prefix selects the param type; params
    # without a known prefix fall back to a type derived from the environment.
    prefix_types = {'i_': 'input', 'c_': 'common', 'o_': 'output'}
    for raw_name, raw_value in self.pipe_params.iteritems():
        target_name = raw_name
        matched_type = None
        for prefix in ('i_', 'c_', 'o_'):
            if raw_name.startswith(prefix):
                matched_type = prefix_types[prefix]
                target_name = self.change_parameter_name(raw_name)
                break
        entry = {'value': self.normalize_value(raw_value)}
        if matched_type is not None:
            entry['type'] = matched_type
        else:
            entry['type'] = self.get_type_from_env(target_name)
        run_params[target_name] = entry
    Logger.info(
        "Starting child pipeline run on a parent node with parameters: '{}'."
        .format(str(run_params)), task_name=self.TASK_NAME)
    try:
        run = self.api.launch_pipeline(self.pipe_id, self.version, run_params,
                                       parent_node_id=self.run_id, cmd=cmd,
                                       instance=instance_size,
                                       disk=instance_disk,
                                       docker=docker_image)
        self.child_id = run['id']
    except Exception as e:
        Logger.warn(
            "Failed to launch sample processing with parameters: '{}'. Error: '{}'."
            .format(str(run_params), e.message), task_name=self.TASK_NAME)
        self.child_id = None
def warn(message, crucial=False, *args, **kwargs):
    """Log a warning locally and optionally forward it to the Cloud Pipeline log.

    Forwarding happens only outside CLI mode (Logger.cmd unset) and only for
    crucial messages or when verbose logging is enabled.
    """
    logging.warn(message, *args, **kwargs)
    should_forward = (crucial or Logger.verbose) and not Logger.cmd
    if should_forward:
        CloudPipelineLogger.warn(message, task_name=Logger.task)