# Launches the configured analytical pipeline for every sample in the sample sheet
# and waits for all launched runs to complete.
class RunAnalyticalPipelinesTask(object):

    def __init__(self, task, pipeline, version, instance_type, instance_disk):
        self.api = PipelineAPI(os.environ['API'], 'logs')
        self.task = task
        self.pipeline = self.api.find_pipeline(pipeline)
        self.version = version
        self.instance_type = instance_type
        self.instance_disk = instance_disk

    def run(self):
        analysis_folder = os.environ['ANALYSIS_FOLDER']
        machine_run_folder = os.environ['MACHINE_RUN_FOLDER']
        sample_sheet = os.environ['SAMPLE_SHEET']
        Logger.info('Starting analytical processing for sample sheet %s' % sample_sheet, task_name=self.task)
        samples = SampleSheetParser(sample_sheet, [SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT]).parse_sample_sheet()
        launched_runs = {}
        for sample in samples:
            Logger.info('Starting "%s" sample processing.' % sample[SAMPLE_NAME], task_name=self.task)
            launched_runs[sample[SAMPLE_NAME]] = self.__run_sample(sample[SAMPLE_NAME],
                                                                   analysis_folder, machine_run_folder)
        failed_runs = self.__wait_runs_completion(launched_runs)
        if failed_runs:
            for sample, run_id in failed_runs.iteritems():
                Logger.fail('Processing failed for sample "%s". Check run %d logs for more information.'
                            % (sample, run_id), task_name=self.task)
            sys.exit(1)
        Logger.success("All samples processed successfully.", task_name=self.task)

    def __run_sample(self, sample, analysis_folder, machine_run_folder):
        Logger.info('Launching analytical pipeline "%s" with version "%s" for sample %s.'
                    % (self.pipeline['name'], self.version, sample), task_name=self.task)
        read1, read2 = self.__fetch_reads(sample, analysis_folder, machine_run_folder)
        pipeline_params = {
            'SAMPLE': {'value': sample},
            'READ1': {'value': read1, 'type': 'input'},
            'READ2': {'value': read2, 'type': 'input'},
            'OUTPUT_FOLDER': {'value': analysis_folder, 'type': 'output'}
        }
        run = self.api.launch_pipeline(self.pipeline['id'], self.version, pipeline_params,
                                       instance=self.instance_type, disk=self.instance_disk,
                                       parent_run_id=os.environ['RUN_ID'])
        return run['id']

    def __fetch_reads(self, sample, analysis_folder, machine_run_folder):
        run_folder_name = urlparse.urlparse(machine_run_folder).path
        read_folder = self.__get_path_without_trailing_slash(analysis_folder) + \
                      self.__get_path_without_trailing_slash(run_folder_name) + \
                      '/PipelineInputData/FASTQ/'
        return os.path.join(read_folder, sample + '_R1.fastq.gz'), \
               os.path.join(read_folder, sample + '_R2.fastq.gz')

    def __get_path_without_trailing_slash(self, path):
        return path[:-1] if path.endswith('/') else path

    def __wait_runs_completion(self, launched_runs):
        finished = {}
        failed = {}
        while True:
            for sample, run_id in launched_runs.iteritems():
                current_status = self.api.load_run(run_id)['status']
                Logger.info('Processing sample: %s. Run %d status is %s.' % (sample, run_id, current_status),
                            task_name=self.task)
                if current_status != 'RUNNING':
                    finished[sample] = run_id
                    if current_status != 'SUCCESS':
                        failed[sample] = run_id
            if len(finished) == len(launched_runs):
                Logger.info("Processing for all samples completed.", task_name=self.task)
                return failed
            time.sleep(60)
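# Hypothetical usage sketch (not part of the original module): it assumes the
# ANALYSIS_FOLDER, MACHINE_RUN_FOLDER, SAMPLE_SHEET, API and RUN_ID environment
# variables are provided by the platform; the pipeline name, version and
# instance settings below are illustrative values only.
if __name__ == '__main__':
    RunAnalyticalPipelinesTask(task='AnalyticalProcessing',
                               pipeline='my-analytical-pipeline',  # assumed pipeline name
                               version='v1',                       # assumed version
                               instance_type='m5.large',           # assumed instance type
                               instance_disk=100).run()            # assumed disk size, GB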
class WaitForNode(Task):

    def __init__(self):
        Task.__init__(self)
        self.task_name = 'WaitForNode'
        self.pipe_api = PipelineAPI(os.environ['API'], 'logs')

    def await_node_start(self, parameters, task_name, run_id):
        try:
            Logger.info('Waiting for node with parameters = {}, task: {}'.format(','.join(parameters), task_name),
                        task_name=self.task_name)
            # approximately 10 minutes
            attempts = 60
            master = self.get_node_info(parameters, task_name, run_id)
            while not master and attempts > 0:
                master = self.get_node_info(parameters, task_name, run_id)
                attempts -= 1
                Logger.info('Waiting for node ...', task_name=self.task_name)
                time.sleep(10)
            if not master:
                raise RuntimeError('Failed to attach to master node')
            Logger.success('Attached to node (run id {})'.format(master.name), task_name=self.task_name)
            return master
        except Exception as e:
            self.fail_task(e.message)

    def get_node_info(self, parameters, task_name, run_id):
        params = self.parse_parameters(parameters)
        runs = self.pipe_api.search_runs(params, status='RUNNING', run_id=run_id)
        if len(runs) == 0:
            params.append(('parent-id', str(run_id)))
            runs = self.pipe_api.search_runs(params, status='RUNNING')
        for run in runs:
            if self.check_run(run, params):
                node = Node(run)
                task_logs = self.pipe_api.load_task(node.run_id, task_name)
                if not task_logs:
                    return None
                task_status = task_logs[-1]['status']
                if task_status == 'SUCCESS':
                    return node
                elif task_status != 'RUNNING':
                    raise RuntimeError('Node failed to start as it cannot attach to a node (run id {})'
                                       .format(node.run_id))
        return None

    def parse_parameters(self, parameters):
        result = []
        for param in parameters:
            if '=' not in param:
                raise RuntimeError("Illegal parameter format. Key=Value is expected.")
            result.append(param.split("=", 1))
        return result

    def check_run(self, run, params):
        run_params = {}
        for run_param in run['pipelineRunParameters']:
            value = run_param['value'] if 'value' in run_param else None
            run_params[run_param['name']] = value
        for param in params:
            if param[0] not in run_params or run_params[param[0]] != param[1]:
                return False
        return True
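# Hypothetical usage sketch (not in the original source): wait for a run started
# with the given Key=Value parameters to report a finished node-attach task.
# The parameter value, task name and run id below are illustrative only.
node = WaitForNode().await_node_start(parameters=['cluster_role=master'],
                                      task_name='InitializeNode',
                                      run_id=12345)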
def get_image_name_and_tag(image_name_with_tag):
    image_name, image_tag = parse_image(image_name_with_tag)
    if image_tag is None:
        image_tag = 'latest'
    return image_name, image_tag


def add_settings(new_tool_id, new_version, initial_tool_id, initial_version):
    settings = get_tool_version_settings(initial_tool_id, initial_version)
    create_settings_for_tool_version(new_tool_id, new_version, settings)


if __name__ == '__main__':
    api = PipelineAPI(os.environ['API'], 'logs')
    command = sys.argv[1]
    run_id = sys.argv[2]
    if command == "ups":
        status_to_update = None
        new_status = sys.argv[3]
        if new_status == "FAILURE":
            status_to_update = pipeline.api.CommmitStatus.FAILURE
        elif new_status == "SUCCESS":
            status_to_update = pipeline.api.CommmitStatus.SUCCESS
        else:
            raise RuntimeError("Wrong argument for update_commit_status: {}".format(new_status))
        update_commit_status(api, run_id, status_to_update)
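# Hypothetical invocation sketch (the script name is illustrative, not from the source):
#   python update_commit_status.py ups <run_id> SUCCESS
# argv[1] selects the "ups" command, argv[2] is the run id, and argv[3] is the
# commit status (SUCCESS or FAILURE) to report through update_commit_status.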
# Launches per-sample processing as child pipeline runs: through the API when no
# child run is active yet, otherwise by invoking the `pipe run` CLI.
class AbstractPipelineLauncher(AbstractTask):
    TASK_NAME = "LaunchSampleProcessing"
    # important! cmd template should be single-quoted to prevent parameter expansion
    CMD_TEMPLATE = "pipe run --yes --quiet --pipeline {pipe_id}@{version} --instance-disk {instance_disk} " \
                   "--instance-type {instance_type} --docker-image {docker_image} --cmd-template '{cmd}' " \
                   "--parent-id {parent}"
    SAMPLE_TEMPLATE = " --sample_name {sample_name} --sample_id {sample_id}"
    POLL_TIMEOUT = 30
    RETRY_COUNT = 10
    SAMPLE_ID = "Sample_ID"
    SAMPLE_NAME = "Sample_Name"

    def __init__(self, run_files, param_names, run_id, pipe_id, version, pipe_params, param_types):
        AbstractTask.__init__(self, self.TASK_NAME)
        self.samples_number = len(run_files)
        self.run_id = run_id
        self.run_files = run_files
        self.param_names = param_names
        self.pipe_id = pipe_id
        self.version = version
        self.api = PipelineAPI(os.environ['API'], 'logs')
        self.pipe_params = pipe_params
        self.child_id = None
        self.param_types = param_types

    def launch_pipeline(self, params, param_names, instance_size, instance_disk, docker_image, cmd, sample=None):
        if not self.child_run_active():
            self.launch_child_run(params, param_names, cmd, instance_size, instance_disk, docker_image,
                                  sample=sample)
            return
        command = self.CMD_TEMPLATE.format(pipe_id=self.pipe_id, version=self.version,
                                           instance_disk=instance_disk, instance_type=instance_size,
                                           docker_image=docker_image, cmd=cmd, parent=self.run_id)
        if sample:
            command = command + self.SAMPLE_TEMPLATE.format(sample_name=sample[self.SAMPLE_NAME],
                                                            sample_id=sample[self.SAMPLE_ID])
        # add all pattern params
        index = 0
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            command += " --{} input?{}".format(name, value)
            index = index + 1
        # add all other params
        for param, value in self.pipe_params.iteritems():
            if param.startswith('i_'):
                command += " --{} input?{}".format(self.change_parameter_name(param), value)
            elif param.startswith('c_'):
                command += " --{} common?{}".format(self.change_parameter_name(param), value)
            elif param.startswith('o_'):
                command += " --{} output?{}".format(self.change_parameter_name(param), value)
            else:
                command += " --{} {}".format(param, value)
        Logger.info('Starting pipeline with command: "{}".'.format(command), task_name=self.TASK_NAME)
        try:
            LoggedCommand(command, None, self.TASK_NAME).execute()
        except Exception as e:
            Logger.warn("Failed to launch sample processing with command: '{}'. Error: '{}'."
                        .format(command, e.message), task_name=self.TASK_NAME)

    def launch_child_run(self, params, param_names, cmd, instance_size, instance_disk, docker_image, sample=None):
        run_params = {'parent-id': self.run_id}
        if sample:
            run_params['sample_name'] = sample[self.SAMPLE_NAME]
            run_params['sample_id'] = sample[self.SAMPLE_ID]
        index = 0
        # add all pattern params
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            run_params[name] = {'value': value, 'type': 'input'}
            index = index + 1
        # add all other params
        for param, value in self.pipe_params.iteritems():
            param_type = None
            param_name = param
            real_value = self.normalize_value(value)
            if param.startswith('i_'):
                param_type = 'input'
                param_name = self.change_parameter_name(param)
            elif param.startswith('c_'):
                param_type = 'common'
                param_name = self.change_parameter_name(param)
            elif param.startswith('o_'):
                param_type = 'output'
                param_name = self.change_parameter_name(param)
            run_params[param_name] = {'value': real_value}
            if param_type is not None:
                run_params[param_name]['type'] = param_type
            else:
                run_params[param_name]['type'] = self.get_type_from_env(param_name)
        Logger.info("Starting child pipeline run on a parent node with parameters: '{}'."
                    .format(str(run_params)), task_name=self.TASK_NAME)
        try:
            run = self.api.launch_pipeline(self.pipe_id, self.version, run_params, parent_node_id=self.run_id,
                                           cmd=cmd, instance=instance_size, disk=instance_disk,
                                           docker=docker_image)
            self.child_id = run['id']
        except Exception as e:
            Logger.warn("Failed to launch sample processing with parameters: '{}'. Error: '{}'."
                        .format(str(run_params), e.message), task_name=self.TASK_NAME)
            self.child_id = None

    # allows changing the way batched pipeline parameter names are rewritten for child runs
    @staticmethod
    def change_parameter_name(param):
        return param[2:]

    def get_running_samples(self):
        attempts = 0
        while attempts < self.RETRY_COUNT:
            try:
                child_runs = self.api.load_child_pipelines(self.run_id)
                count = 0
                for run in child_runs:
                    if run['status'] == 'RUNNING':
                        count = count + 1
                return count
            except Exception as e:
                Logger.warn("Failed to fetch running samples: {}.".format(e.message), task_name=self.TASK_NAME)
                attempts = attempts + 1
                time.sleep(self.POLL_TIMEOUT)
        Logger.fail("Exceeded maximum attempts to fetch running samples.")
        raise RuntimeError("Exceeded maximum attempts to fetch running samples.")

    def child_run_active(self):
        if self.child_id is None:
            return False
        attempts = 0
        while attempts < self.RETRY_COUNT:
            try:
                run = self.api.load_run(self.child_id)
                return run['status'] == 'RUNNING'
            except Exception as e:
                Logger.warn("Failed to fetch child run ID '{}' status: {}.".format(str(self.child_id), e.message),
                            task_name=self.TASK_NAME)
                attempts = attempts + 1
                time.sleep(self.POLL_TIMEOUT)
        Logger.fail("Exceeded maximum attempts to fetch child run status.")
        raise RuntimeError("Exceeded maximum attempts to fetch child run status.")

    def wait_all_samples_finish(self):
        running = self.get_running_samples()
        while running != 0:
            time.sleep(self.POLL_TIMEOUT)
            running = self.get_running_samples()

    def get_type_from_env(self, param_name):
        if param_name not in self.param_types or not self.param_types[param_name]:
            return 'string'
        else:
            return self.param_types[param_name]

    # remove escaped ENV values
    def normalize_value(self, value):
        return value.replace("\\$", "$")
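# Hypothetical sketch (not from the original source) of how a concrete subclass might
# drive the launcher. It assumes each entry of run_files is a dict that carries the
# Sample_Name and Sample_ID keys plus a 'files' mapping of parameter name -> list of
# input files; the command string is illustrative only.
class SampleProcessingLauncher(AbstractPipelineLauncher):

    def process(self, instance_size, instance_disk, docker_image):
        for sample in self.run_files:
            # launch one child run per sample
            self.launch_pipeline(sample['files'], self.param_names,
                                 instance_size, instance_disk, docker_image,
                                 cmd='analyse.sh', sample=sample)
        # block until every child run has left the RUNNING state
        self.wait_all_samples_finish()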
max_additional_hosts = int(os.environ['CP_CAP_SGE_AUTOSCALE_WORKERS']) \
    if 'CP_CAP_SGE_AUTOSCALE_WORKERS' in os.environ else 3
log_verbose = os.environ['CP_CAP_SGE_AUTOSCALE_VERBOSE'].strip().lower() == "true" \
    if 'CP_CAP_SGE_AUTOSCALE_VERBOSE' in os.environ else False

Logger.init(cmd=args.debug, log_file='/common/workdir/.autoscaler.log',
            task='GridEngineAutoscaling', verbose=log_verbose)

cmd_executor = CmdExecutor()
grid_engine = GridEngine(cmd_executor=cmd_executor)
host_storage = FileSystemHostStorage(cmd_executor=cmd_executor,
                                     storage_file='/common/workdir/.autoscaler.storage')
pipe = PipelineAPI(api_url=pipeline_api, log_dir='/common/workdir/.pipe.log')
scale_up_timeout = int(_retrieve_preference(pipe, 'ge.autoscaling.scale.up.timeout', default_value=30))
scale_down_timeout = int(_retrieve_preference(pipe, 'ge.autoscaling.scale.down.timeout', default_value=30))
scale_up_handler = GridEngineScaleUpHandler(cmd_executor=cmd_executor, pipe=pipe, grid_engine=grid_engine,
                                            host_storage=host_storage, parent_run_id=master_run_id,
                                            default_hostfile=default_hostfile,
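# A minimal sketch (not from the original source) of the _retrieve_preference helper
# referenced above. It assumes the PipelineAPI client exposes a get_preference(name)
# call returning a dict with a 'value' key; if the actual client differs, adjust the
# lookup accordingly. Any failure falls back to the supplied default.
def _retrieve_preference(pipe, preference, default_value):
    try:
        return pipe.get_preference(preference)['value']
    except Exception:
        Logger.warn('Failed to retrieve preference %s, using default value %s.'
                    % (preference, str(default_value)))
        return default_value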
# Localizes remote input data (DTS, S3/CP, HTTP/FTP) to local folders before
# processing and transfers outputs back afterwards.
class InputDataTask:

    def __init__(self, input_dir, common_dir, analysis_dir, task_name, bucket, report_file, rules):
        self.input_dir = input_dir
        self.common_dir = common_dir
        self.analysis_dir = get_path_with_trailing_delimiter(analysis_dir)
        self.task_name = task_name
        self.bucket = bucket
        self.report_file = report_file
        self.rules = rules
        api_url = os.environ['API']
        if 'API_EXTERNAL' in os.environ and os.environ['API_EXTERNAL']:
            api_url = os.environ['API_EXTERNAL']
        self.api_url = api_url
        self.token = os.environ['API_TOKEN']
        self.api = PipelineAPI(os.environ['API'], 'logs')

    def run(self, upload):
        Logger.info('Starting localization of remote data...', task_name=self.task_name)
        try:
            dts_registry = self.fetch_dts_registry()
            parameter_types = {ParameterType.INPUT_PARAMETER, ParameterType.COMMON_PARAMETER} if upload else \
                {ParameterType.OUTPUT_PARAMETER}
            remote_locations = self.find_remote_locations(dts_registry, parameter_types)
            if len(remote_locations) == 0:
                Logger.info('No remote sources found', task_name=self.task_name)
            else:
                dts_locations = [path for location in remote_locations for path in location.paths
                                 if path.type == PathType.DTS]
                if upload:
                    self.transfer_dts(dts_locations, dts_registry, upload)
                    self.localize_data(remote_locations, upload)
                    if self.report_file:
                        with open(self.report_file, 'w') as report:
                            for location in remote_locations:
                                env_name = location.env_name
                                original_value = location.original_value
                                localized_value = location.delimiter.join(
                                    [path.local_path for path in location.paths])
                                report.write('export {}="{}"\n'.format(env_name, localized_value))
                                report.write('export {}="{}"\n'.format(env_name + '_ORIGINAL', original_value))
                else:
                    rule_patterns = DataStorageRule.read_from_file(self.rules)
                    rules = []
                    for rule in rule_patterns:
                        if rule.move_to_sts:
                            rules.append(rule.file_mask)
                    self.localize_data(remote_locations, upload, rules=rules)
                    self.transfer_dts(dts_locations, dts_registry, upload, rules=rules)
            Logger.success('Finished localization of remote data', task_name=self.task_name)
        except BaseException as e:
            Logger.fail('Localization of remote data failed due to exception: %s' % e.message,
                        task_name=self.task_name)
            exit(1)

    def fetch_dts_registry(self):
        result = {}
        try:
            dts_data = self.api.load_dts_registry()
        except BaseException as e:
            Logger.info("DTS is not available: %s" % e.message, task_name=self.task_name)
            return result
        for registry in dts_data:
            for prefix in registry['prefixes']:
                result[prefix] = registry['url']
        return result

    def find_remote_locations(self, dts_registry, parameter_types):
        remote_locations = []
        for env in os.environ:
            param_type_name = env + '_PARAM_TYPE'
            if os.environ[env] and param_type_name in os.environ:
                param_type = os.environ[param_type_name]
                if param_type in parameter_types:
                    value = os.environ[env].strip()
                    Logger.info('Found remote parameter %s with type %s' % (value, param_type),
                                task_name=self.task_name)
                    original_paths = [value]
                    delimiter = ''
                    for supported_delimiter in VALUE_DELIMITERS:
                        if value.find(supported_delimiter) != -1:
                            original_paths = re.split(supported_delimiter, value)
                            delimiter = supported_delimiter
                            break
                    paths = []
                    for path in original_paths:
                        resolved_path = replace_all_system_variables_in_path(path).strip()
                        if self.match_dts_path(resolved_path, dts_registry):
                            paths.append(self.build_dts_path(resolved_path, dts_registry, param_type))
                        elif self.match_s3_path(resolved_path):
                            paths.append(self.build_s3_path(resolved_path, param_type))
                        elif self.match_ftp_or_http_path(resolved_path):
                            paths.append(self.build_ftp_or_http_path(resolved_path, param_type))
                    if len(paths) != 0:
                        remote_locations.append(RemoteLocation(env, value, param_type, paths, delimiter))
        return remote_locations

    @staticmethod
    def match_ftp_or_http_path(path):
        return any(path.startswith(scheme) for scheme in HTTP_FTP_SCHEMES)

    @staticmethod
    def match_s3_path(path):
        return path.startswith('s3://') or path.startswith('cp://')

    @staticmethod
    def match_dts_path(path, dts_registry):
        for prefix in dts_registry:
            if path.startswith(prefix):
                return True
        return False

    def build_dts_path(self, path, dts_registry, input_type):
        for prefix in dts_registry:
            if path.startswith(prefix):
                if not self.bucket:
                    raise RuntimeError('Transfer bucket shall be set for DTS locations')
                relative_path = path.replace(prefix, '')
                s3_path = self.join_paths(self.bucket, relative_path)
                if input_type == ParameterType.OUTPUT_PARAMETER:
                    local_path = self.analysis_dir
                else:
                    local_dir = self.get_local_dir(input_type)
                    local_path = self.join_paths(local_dir, relative_path)
                Logger.info('Found remote {} path {} matching DTS prefix {}. '
                            'It will be uploaded to bucket path {} and localized {} {}.'
                            .format(input_type, path, prefix, s3_path,
                                    'from' if input_type == ParameterType.OUTPUT_PARAMETER else 'to',
                                    local_path),
                            task_name=self.task_name)
                return LocalizedPath(path, s3_path, local_path, PathType.DTS, prefix=prefix)
        raise RuntimeError('Remote path %s does not match any of DTS prefixes.' % path)

    def build_s3_path(self, path, input_type):
        return self._build_remote_path(path, input_type, PathType.S3)

    def build_ftp_or_http_path(self, path, input_type):
        return self._build_remote_path(path, input_type, PathType.HTTP_OR_FTP)

    def _build_remote_path(self, path, input_type, path_type):
        if input_type == ParameterType.OUTPUT_PARAMETER:
            local_path = self.analysis_dir
        else:
            remote = urlparse.urlparse(path)
            relative_path = path.replace('%s://%s' % (remote.scheme, remote.netloc), '')
            local_dir = self.get_local_dir(input_type)
            local_path = self.join_paths(local_dir, relative_path)
        Logger.info('Found %s %s path %s. It will be localized to %s.'
                    % (path_type.lower(), input_type, path, local_path),
                    task_name=self.task_name)
        return LocalizedPath(path, path, local_path, path_type)

    def get_local_dir(self, type):
        return self.input_dir if type == ParameterType.INPUT_PARAMETER else self.common_dir

    def join_paths(self, prefix, suffix):
        trimmed_prefix = get_path_with_trailing_delimiter(prefix)
        trimmed_suffix = suffix[1:] if suffix.startswith('/') else suffix
        return trimmed_prefix + trimmed_suffix

    def transfer_dts(self, dts_locations, dts_registry, upload, rules=None):
        grouped_paths = {}
        for path in dts_locations:
            if path.prefix not in grouped_paths:
                grouped_paths[path.prefix] = [path]
            else:
                grouped_paths[path.prefix].append(path)
        for prefix, paths in grouped_paths.iteritems():
            dts_url = dts_registry[prefix]
            Logger.info('Uploading {} paths using DTS service {}'.format(len(paths), dts_url), self.task_name)
            dts_client = DataTransferServiceClient(dts_url, self.token, self.api_url, self.token, 10)
            dts_client.transfer_data([self.create_dts_path(path, upload, rules) for path in paths],
                                     self.task_name)

    def create_dts_path(self, path, upload, rules):
        return LocalToS3(path.path, path.s3_path, rules) if upload else \
            S3ToLocal(path.s3_path, path.path, rules)

    def localize_data(self, remote_locations, upload, rules=None):
        cluster = Cluster.build_cluster()
        for location in remote_locations:
            for path in location.paths:
                source, destination = self.get_local_paths(path, upload)
                self.perform_transfer(path, source, destination, cluster, upload, rules=rules)

    def perform_transfer(self, path, source, destination, cluster, upload, rules=None):
        Logger.info('Uploading files from {} to {}'.format(source, destination), self.task_name)
        if path.type == PathType.HTTP_OR_FTP or cluster is None or self.is_file(source):
            if upload or self.rules is None:
                S3Bucket().pipe_copy(source, destination, TRANSFER_ATTEMPTS)
            else:
                S3Bucket().pipe_copy_with_rules(source, destination, TRANSFER_ATTEMPTS, self.rules)
        else:
            common_folder = os.path.join(os.environ['SHARED_WORK_FOLDER'], 'transfer')
            applied_rules = None if upload else rules
            chunks = self.split_source_into_chunks(cluster, source, destination, common_folder, applied_rules)
            transfer_pool = Pool(len(chunks))
            transfer_pool.map(transfer_async, chunks)
            shutil.rmtree(common_folder, ignore_errors=True)

    def is_file(self, source):
        if source.endswith('/'):
            return False
        if self.match_s3_path(source):
            source_path = urlparse.urlparse(source)
            # case when whole bucket is selected
            if not source_path.path or source_path.path == '/':
                return True
            # urlparse returns path as /folder/inner
            # convert it to s3 listing representation folder/inner/
            folder = get_path_with_trailing_delimiter(get_path_without_first_delimiter(source_path.path))
            s3_paths = S3Bucket().pipe_ls(get_path_without_trailing_delimiter(source), TRANSFER_ATTEMPTS,
                                          recursive=False, all=False, show_info=True)
            for path in s3_paths:
                if path[0] == 'Folder' and path[1] == folder:
                    return False
            return True
        else:
            return os.path.isfile(source)

    def split_source_into_chunks(self, cluster, source, destination, common_folder, rules):
        if not os.path.exists(common_folder):
            os.makedirs(common_folder)
        source_files = self.fetch_source_files(source)
        chunks = []
        for node in cluster.nodes:
            for slot in range(0, cluster.slots_per_node):
                chunks.append(TransferChunk(node.hostname, [], source, destination,
                                            common_folder, self.task_name, rules))
        for i in range(0, len(source_files)):
            file = source_files[i]
            chunk_index = i % len(chunks)
            chunks[chunk_index].files.append(file)
        return chunks

    def fetch_source_files(self, source):
        """
        :return: list of files sorted by size DESC
        """
        if self.match_s3_path(source):
            s3_paths = S3Bucket().pipe_ls(get_path_with_trailing_delimiter(source), TRANSFER_ATTEMPTS,
                                          recursive=True, all=True, show_info=True)
            s3_paths = filter(lambda x: x[0] == 'File', s3_paths)
            files = [File(self.get_path_without_folder(source, path[1]), int(path[2])) for path in s3_paths]
        else:
            files = []
            for root, d_names, f_names in os.walk(source):
                for f in f_names:
                    path = os.path.join(root, f)
                    files.append(File(os.path.relpath(path, start=source), os.path.getsize(path)))
        return sorted(files, key=lambda x: x.size, reverse=True)

    def get_path_without_folder(self, source, path):
        prefix = urlparse.urlparse(source).path
        if prefix.startswith('/'):
            prefix = prefix[1:]
        if not prefix.endswith('/'):
            prefix += '/'
        if len(prefix) == 0 or prefix == '/':
            return path
        return path.replace(prefix, '', 1)

    @staticmethod
    def get_local_paths(path, upload):
        if upload:
            source = path.s3_path if path.type == PathType.DTS else path.path
            destination = path.local_path
        else:
            source = path.local_path
            destination = path.path if path.type == PathType.HTTP_OR_FTP else path.s3_path
        return source, destination
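# Hypothetical usage sketch (not part of the original module): localize remote inputs
# before an analysis step. The directories, bucket and report file are illustrative
# assumptions only.
if __name__ == '__main__':
    InputDataTask(input_dir='/input',
                  common_dir='/common',
                  analysis_dir='/analysis',
                  task_name='InputData',
                  bucket='s3://transfer-bucket',           # assumed transfer bucket for DTS paths
                  report_file='/analysis/localization.sh',  # assumed report location
                  rules=None).run(upload=True)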
class CloudPipelineApiProvider(object):

    def __init__(self):
        self.api = PipelineAPI(os.environ.get('API'), "logs")

    def search(self, query, type):
        return self.api.search(query, [type])

    def create_pipeline(self, name, description):
        data = {
            "name": name,
            "description": description,
        }
        return self.api.create_pipeline(data)

    def delete_pipeline(self, id):
        self.api.delete_pipeline(id)

    def create_folder(self, name, parent=None):
        return self.api.create_folder(name, parent)

    def delete_folder(self, id):
        self.api.delete_folder(id)

    def create_s3_data_storage(self, name, description, parent_folder_id=None, region_id=2, storage_policy=None):
        if not storage_policy:
            storage_policy = {"versioningEnabled": True}
        data = {
            "name": name,
            "path": name,
            "description": description,
            "type": 'S3',
            "shared": False,
            "parentFolderId": parent_folder_id,
            "regionId": region_id,
            "storagePolicy": storage_policy
        }
        return self.api.datastorage_create(data)

    def delete_data_storage(self, id):
        self.api.delete_datastorage(id)

    def create_issue(self, name, text, entity_id, entity_class):
        return self.api.create_issue(name, text, entity_id, entity_class)

    def delete_issue(self, id):
        self.api.delete_folder(id)

    def create_comment(self, issue_id, text):
        return self.api.create_comment(issue_id, text)
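# Hypothetical usage sketch (not part of the original module): create a folder with an
# S3 storage attached to it. The names and the 'FOLDER' search type are illustrative
# assumptions about the platform's entity types.
if __name__ == '__main__':
    provider = CloudPipelineApiProvider()
    folder = provider.create_folder('demo-project')
    storage = provider.create_s3_data_storage('demo-project-storage',
                                              'Storage for the demo project',
                                              parent_folder_id=folder['id'])
    print(provider.search('demo-project', 'FOLDER'))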
class WaitForMasterNode(Task):

    def __init__(self):
        Task.__init__(self)
        self.task_name = 'WaitForMasterNode'
        self.kube = Kubernetes()
        self.pipe_api = PipelineAPI(os.environ['API'], 'logs')