def build_dts_path(self, path, dts_registry, input_type):
    for prefix in dts_registry:
        if path.startswith(prefix):
            if not self.bucket:
                raise RuntimeError('Transfer bucket shall be set for DTS locations')
            relative_path = path.replace(prefix, '')
            s3_path = self.join_paths(self.bucket, relative_path)
            if input_type == ParameterType.OUTPUT_PARAMETER:
                local_path = self.analysis_dir
            else:
                local_dir = self.get_local_dir(input_type)
                local_path = self.join_paths(local_dir, relative_path)
            Logger.info('Found remote {} path {} matching DTS prefix {}. '
                        'It will be uploaded to bucket path {} and localized {} {}.'
                        .format(input_type, path, prefix, s3_path,
                                'from' if input_type == ParameterType.OUTPUT_PARAMETER else 'to',
                                local_path),
                        task_name=self.task_name)
            return LocalizedPath(path, s3_path, local_path, PathType.DTS, prefix=prefix)
    raise RuntimeError('Remote path %s does not match any of DTS prefixes.' % path)
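# A minimal, self-contained sketch of the prefix-to-bucket mapping performed
# above. The registry, bucket, and path values are hypothetical, and only the
# string handling is mirrored, not LocalizedPath construction.
def _demo_dts_mapping():
    dts_registry = {'dts://ngs-archive/': 'https://dts.example/api'}  # hypothetical registry
    bucket = 's3://transfer-bucket'                                   # hypothetical bucket
    path = 'dts://ngs-archive/run1/sample_R1.fastq.gz'
    for prefix in dts_registry:
        if path.startswith(prefix):
            relative_path = path.replace(prefix, '')
            # -> s3://transfer-bucket/run1/sample_R1.fastq.gz
            print('%s/%s' % (bucket, relative_path))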
def find_files(self, recursive=True):
    Logger.info("Starting parsing input directory: {}.".format(self.folder),
                task_name=self.TASK_NAME)
    all_files = bucket.ls_s3(self.folder, self.MAX_ATTEMPTS, recursive=recursive)
    patterns_files = {}
    if recursive:
        all_folders = self.get_folders(all_files)
        for folder in all_folders:
            self.check_file_match(self.samples, folder, patterns_files)
    for file in all_files:
        # The recursive version of s3 ls returns paths from the bucket root,
        # while the non-recursive one returns paths relative to the requested folder.
        if recursive:
            file_name = file[len(self.get_path_without_bucket()) - 1:]
        else:
            file_name = file
        self.check_file_match(self.samples, file_name, patterns_files)
    Logger.info('Collected batch files: {}.'.format(str(patterns_files)),
                task_name=self.TASK_NAME)
    if len(patterns_files) != len(self.samples):
        self.fail_task("Failed to find all parameters for all samples.")
    Logger.success('Successfully collected batch files: {}.'.format(str(patterns_files)),
                   task_name=self.TASK_NAME)
    return patterns_files
def run(self):
    analysis_folder = os.environ['ANALYSIS_FOLDER']
    machine_run_folder = os.environ['MACHINE_RUN_FOLDER']
    sample_sheet = os.environ['SAMPLE_SHEET']
    Logger.info('Starting analytical processing for sample sheet %s' % sample_sheet,
                task_name=self.task)
    samples = SampleSheetParser(
        sample_sheet, [SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT]).parse_sample_sheet()
    launched_runs = {}
    for sample in samples:
        Logger.info('Starting "%s" sample processing.' % sample[SAMPLE_NAME],
                    task_name=self.task)
        launched_runs[sample[SAMPLE_NAME]] = self.__run_sample(
            sample[SAMPLE_NAME], analysis_folder, machine_run_folder)
    failed_runs = self.__wait_runs_completion(launched_runs)
    if failed_runs:
        for sample, run_id in failed_runs.iteritems():
            Logger.fail('Processing failed for sample "%s". '
                        'Check run %d logs for more information.' % (sample, run_id),
                        task_name=self.task)
        sys.exit(1)
    Logger.success("All samples processed successfully.", task_name=self.task)
def __run_sample(self, sample, analysis_folder, machine_run_folder):
    Logger.info('Launching analytical pipeline "%s" with version "%s" for sample %s.'
                % (self.pipeline['name'], self.version, sample),
                task_name=self.task)
    read1, read2 = self.__fetch_reads(sample, analysis_folder, machine_run_folder)
    pipeline_params = {
        'SAMPLE': {'value': sample},
        'READ1': {'value': read1, 'type': 'input'},
        'READ2': {'value': read2, 'type': 'input'},
        'OUTPUT_FOLDER': {'value': analysis_folder, 'type': 'output'}
    }
    run = self.api.launch_pipeline(self.pipeline['id'], self.version, pipeline_params,
                                   instance=self.instance_type, disk=self.instance_disk,
                                   parent_run_id=os.environ['RUN_ID'])
    return run['id']
def await_workers_start(self, nodes_number, parent_id):
    if nodes_number == 0:
        Logger.success('No workers requested. Processing will run on the master node.',
                       task_name=self.task_name)
        return []
    try:
        Logger.info('Waiting for {} worker node(s)'.format(nodes_number),
                    task_name=self.task_name)
        # TODO: we should probably check several times, as workers may not be submitted yet
        worker_ids = self.get_workers(parent_id)
        total_number = len(worker_ids)
        started = []
        # 60 attempts with a 10-second delay: approximately 10 minutes
        attempts = 60
        while len(started) != total_number and attempts != 0:
            started = self.get_started_workers(worker_ids)
            attempts -= 1
            Logger.info('Started {} worker(s) of {} total'.format(len(started), total_number),
                        task_name=self.task_name)
            time.sleep(10)
        if len(started) != total_number:
            raise RuntimeError('Failed to start all workers')
        Logger.success('All workers started', task_name=self.task_name)
        return started
    except Exception as e:
        self.fail_task(e.message)
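# The wait loop above (and the master-node wait below) is a bounded-polling
# pattern; a generic sketch of the same idea follows. The helper and its
# arguments are illustrative, not part of the pipeline API.
import time

def poll_until(condition, attempts, delay_seconds):
    # Call condition() up to 'attempts' times, sleeping between calls;
    # return the first truthy result, or None when attempts are exhausted.
    while attempts > 0:
        result = condition()
        if result:
            return result
        attempts -= 1
        time.sleep(delay_seconds)
    return None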
def perform_transfer(self, path, source, destination, cluster, upload, rules=None):
    Logger.info('Uploading files from {} to {}'.format(source, destination),
                self.task_name)
    if path.type == PathType.HTTP_OR_FTP or cluster is None or self.is_file(source):
        if upload or self.rules is None:
            S3Bucket().pipe_copy(source, destination, TRANSFER_ATTEMPTS)
        else:
            S3Bucket().pipe_copy_with_rules(source, destination,
                                            TRANSFER_ATTEMPTS, self.rules)
    else:
        common_folder = os.path.join(os.environ['SHARED_WORK_FOLDER'], 'transfer')
        applied_rules = None if upload else rules
        chunks = self.split_source_into_chunks(cluster, source, destination,
                                               common_folder, applied_rules)
        transfer_pool = Pool(len(chunks))
        transfer_pool.map(transfer_async, chunks)
        shutil.rmtree(common_folder, ignore_errors=True)
def await_master_start(self, master_id, task_name):
    try:
        Logger.info('Waiting for master node (run id: {}), task: {}'.format(
            master_id, task_name), task_name=self.task_name)
        # 8640 attempts with a 10-second delay: approximately 1 day. We do not
        # really need this timeout, since workers are killed automatically if
        # something goes wrong with the master, but it guarantees that this
        # task is killed eventually in any unpredictable case.
        attempts = 8640
        master = None
        Logger.info('Waiting for master node ...', task_name=self.task_name)
        while not master and attempts > 0:
            master = self.get_master_node_info(master_id, task_name)
            attempts -= 1
            time.sleep(10)
        if not master:
            raise RuntimeError('Failed to attach to master node')
        Logger.success('Attached to master node (run id {})'.format(master_id),
                       task_name=self.task_name)
        return master
    except Exception as e:
        self.fail_task(e.message)
def execute_mount(self, command, params):
    result = common.execute_cmd_command(command, silent=True)
    if result == 0:
        Logger.info('--> {path} mounted to {mount}'.format(**params),
                    task_name=self.task_name)
    else:
        Logger.warn('--> Failed mounting {path} to {mount}'.format(**params),
                    task_name=self.task_name)
def read(cls, report_file, task): Logger.info("Reading Flagstats report from file %s." % report_file, task_name=task) with open(report_file, 'r') as report: line_index = 0 for line in report.readlines(): if line_index < 2: line_index += 1 continue return int(line.split('+')[0].strip())
def __fill_trim_data(self, sample_metrics):
    Logger.info("Fetching data from FASTQC reports after trimming.", task_name=self.task)
    r1_total_reads, r1_poor_reads, r1_gc, r1_read_length = FastQCReader \
        .read(os.path.join(self.folder, "FastQC_Trimmed",
                           self.file_suffix + ".Trimmomatic.R1.trimmed_fastqc.zip"),
              self.task)
    r2_total_reads, r2_poor_reads, r2_gc, r2_read_length = FastQCReader \
        .read(os.path.join(self.folder, "FastQC_Trimmed",
                           self.file_suffix + ".Trimmomatic.R2.trimmed_fastqc.zip"),
              self.task)
    sample_metrics["ReadsAfterTrim"] = r1_total_reads + r2_total_reads
def __fill_starting_data(self, sample_metrics):
    Logger.info("Fetching data from FASTQC Initial reports.", task_name=self.task)
    r1_total_reads, r1_poor_reads, r1_gc, r1_read_length = FastQCReader \
        .read(os.path.join(self.folder, "FastQC_Initial",
                           self.sample + "_R1_fastqc.zip"), self.task)
    r2_total_reads, r2_poor_reads, r2_gc, r2_read_length = FastQCReader \
        .read(os.path.join(self.folder, "FastQC_Initial",
                           self.sample + "_R2_fastqc.zip"), self.task)
    sample_metrics["StartingReads"] = r1_total_reads + r2_total_reads
    sample_metrics["QCFailedReads"] = r1_poor_reads + r2_poor_reads
    sample_metrics["ReadLength"] = r1_read_length
    sample_metrics["GC"] = r1_gc
def read(cls, report_file, task): Logger.info("Reading Coverage report from file %s." % report_file, task_name=task) total_bases = 0 total_coverage = 0 with open(report_file, 'r') as report: for line in report.readlines(): if line: total_bases += 1 total_coverage += int(line.split("\t")[2]) return 0 if total_bases == 0 else total_coverage / total_bases
def __wait_run_completion(self, run_id):
    current_status = self.api.load_run(run_id)['status']
    while current_status == 'RUNNING':
        Logger.info('Run %d status is %s. Waiting for completion...'
                    % (run_id, current_status), task_name=self.task)
        time.sleep(60)
        current_status = self.api.load_run(run_id)['status']
    Logger.info('Run %d finished with status %s' % (run_id, current_status),
                task_name=self.task)
    return current_status
def fetch_dts_registry(self):
    result = {}
    try:
        dts_data = self.api.load_dts_registry()
    except BaseException as e:
        Logger.info("DTS is not available: %s" % e.message, task_name=self.task_name)
        return result
    for registry in dts_data:
        for prefix in registry['prefixes']:
            result[prefix] = registry['url']
    return result
def run(self): Logger.info("Reading %s file to collect variants metrics." % self.vcf_file, task_name=self.task) with open(self.output_file, 'w+') as output, open(self.vcf_file, 'r') as vcf: self.__write_header(output) lines_started = False for vcf_line in vcf.readlines(): if lines_started and vcf_line: self.__process_variant(output, vcf_line) elif vcf_line.startswith("#CHROM"): lines_started = True
def _build_remote_path(self, path, input_type, path_type):
    if input_type == ParameterType.OUTPUT_PARAMETER:
        local_path = self.analysis_dir
    else:
        remote = urlparse.urlparse(path)
        relative_path = path.replace('%s://%s' % (remote.scheme, remote.netloc), '')
        local_dir = self.get_local_dir(input_type)
        local_path = self.join_paths(local_dir, relative_path)
    Logger.info('Found %s %s path %s. It will be localized to %s.'
                % (path_type.lower(), input_type, path, local_path),
                task_name=self.task_name)
    return LocalizedPath(path, path, local_path, path_type)
def run(self, upload):
    Logger.info('Starting localization of remote data...', task_name=self.task_name)
    try:
        dts_registry = self.fetch_dts_registry()
        parameter_types = {ParameterType.INPUT_PARAMETER, ParameterType.COMMON_PARAMETER} \
            if upload else {ParameterType.OUTPUT_PARAMETER}
        remote_locations = self.find_remote_locations(dts_registry, parameter_types)
        if len(remote_locations) == 0:
            Logger.info('No remote sources found', task_name=self.task_name)
        else:
            dts_locations = [path for location in remote_locations
                             for path in location.paths if path.type == PathType.DTS]
            if upload:
                self.transfer_dts(dts_locations, dts_registry, upload)
                self.localize_data(remote_locations, upload)
                if self.report_file:
                    with open(self.report_file, 'w') as report:
                        for location in remote_locations:
                            env_name = location.env_name
                            original_value = location.original_value
                            localized_value = location.delimiter.join(
                                [path.local_path for path in location.paths])
                            report.write('export {}="{}"\n'.format(
                                env_name, localized_value))
                            report.write('export {}="{}"\n'.format(
                                env_name + '_ORIGINAL', original_value))
            else:
                rule_patterns = DataStorageRule.read_from_file(self.rules)
                rules = []
                for rule in rule_patterns:
                    if rule.move_to_sts:
                        rules.append(rule.file_mask)
                self.localize_data(remote_locations, upload, rules=rules)
                self.transfer_dts(dts_locations, dts_registry, upload, rules=rules)
        Logger.success('Finished localization of remote data', task_name=self.task_name)
    except BaseException as e:
        Logger.fail('Localization of remote data failed due to exception: %s' % e.message,
                    task_name=self.task_name)
        exit(1)
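# A sketch of the report the upload branch above writes: a shell snippet that
# maps each parameter to its localized path plus an '_ORIGINAL' companion
# (parameter name and paths hypothetical):
def _demo_localization_report():
    env_name = 'READ1'
    original_value = 's3://data-bucket/sample_R1.fastq.gz'
    localized_value = '/common/input/sample_R1.fastq.gz'
    print('export {}="{}"'.format(env_name, localized_value))
    print('export {}="{}"'.format(env_name + '_ORIGINAL', original_value))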
def run(self, worker_ids, status):
    try:
        Logger.info('Shutting down {} node(s)'.format(len(worker_ids)),
                    task_name=self.task_name)
        api = PipelineAPI(os.environ['API'], 'logs')
        for pod in worker_ids:
            Logger.info('Shutting down node {} with status {}.'.format(
                pod.run_id, status.status), task_name=self.task_name)
            api.update_status(pod.run_id, status)
        Logger.success('Successfully scaled cluster down', task_name=self.task_name)
    except Exception as e:
        self.fail_task(e.message)
def read(cls, report_file, task): Logger.info("Reading InsertSizeMetrics report from file %s." % report_file, task_name=task) with open(report_file, 'r') as report: data_started = False for line in report.readlines(): if data_started and line: chunks = line.split("\t") # MEDIAN_INSERT_SIZE return int(chunks[0]) elif line.startswith("MEDIAN_INSERT_SIZE"): data_started = True return 0
def launch(self, instance_size, instance_disk, docker_image, cmd, wait_finish=False):
    running = 0
    Logger.info('Starting {} sample(s) scheduling.'.format(len(self.run_dirs)),
                task_name=self.TASK_NAME)
    for folder in self.run_dirs:
        self.launch_pipeline(folder, self.param_names, instance_size, instance_disk,
                             docker_image, cmd)
        running = running + 1
        Logger.info('Processing {} sample(s).'.format(running), task_name=self.TASK_NAME)
    Logger.info('Successfully scheduled {} sample(s).'.format(running),
                task_name=self.TASK_NAME)
    if wait_finish:
        Logger.info('Waiting for all runs to finish.', task_name=self.TASK_NAME)
        self.wait_all_samples_finish()
        Logger.success('All child pipelines successfully finished.',
                       task_name=self.TASK_NAME)
def run(self, worker_pods, path, run_id):
    try:
        Logger.info('Creating hostfile {}'.format(path), task_name=self.task_name)
        with open(path, 'w') as file:
            master_pod = self.kube.get_pod(run_id)
            file.write('{}\n'.format(master_pod.name))
            for pod in worker_pods:
                file.write('{}\n'.format(pod.name))
                self.add_to_hosts(pod)
        Logger.success('Successfully created hostfile {}'.format(path),
                       task_name=self.task_name)
    except Exception as e:
        self.fail_task(e.message)
def read(cls, report_file, task): Logger.info("Reading MarkDuplicates report from file %s." % report_file, task_name=task) with open(report_file, 'r') as report: data_started = False for line in report.readlines(): if data_started and line: chunks = line.split("\t") # UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES return int( chunks[5]) + 2 * int(chunks[6]) + 2 * int(chunks[7]) elif line.startswith("LIBRARY"): data_started = True return 0
def find_files(self, recursive=False):
    Logger.info("Starting parsing input directory: {}.".format(self.folder),
                task_name=self.TASK_NAME)
    all_files = bucket.ls_s3(self.folder, self.MAX_ATTEMPTS, recursive=recursive)
    result = [[] for x in xrange(len(all_files))]
    index = 0
    for file in all_files:
        result[index].append(os.path.join(self.folder, file))
        index = index + 1
    Logger.success("Found {} directories to process.".format(len(result)),
                   task_name=self.TASK_NAME)
    return result
def __wait_runs_completion(self, launched_runs):
    finished = {}
    failed = {}
    while True:
        for sample, run_id in launched_runs.iteritems():
            current_status = self.api.load_run(run_id)['status']
            Logger.info('Processing sample: %s. Run %d status is %s.'
                        % (sample, run_id, current_status), task_name=self.task)
            if current_status != 'RUNNING':
                finished[sample] = run_id
                if current_status != 'SUCCESS':
                    failed[sample] = run_id
        if len(finished) == len(launched_runs):
            Logger.info("Processing for all samples completed.", task_name=self.task)
            return failed
        time.sleep(60)
def launch(self, nodes, instance_size, instance_disk, docker_image, cmd,
           wait_finish=False):
    running = 0
    scheduled = 0
    Logger.info('Starting {} sample(s) scheduling.'.format(len(self.samples)),
                task_name=self.TASK_NAME)
    while scheduled != len(self.samples):
        if running < nodes:
            sample = self.samples[scheduled]
            self.launch_pipeline(self.run_files[sample[SAMPLE_NAME]], self.param_names,
                                 instance_size, instance_disk, docker_image, cmd,
                                 sample=sample)
            scheduled = scheduled + 1
            running = running + 1
        else:
            Logger.info('Processing {} sample(s).'.format(running),
                        task_name=self.TASK_NAME)
            Logger.info('Total scheduled {} sample(s).'.format(scheduled),
                        task_name=self.TASK_NAME)
            time.sleep(self.POLL_TIMEOUT)
            running = self.get_running_samples()
    while self.child_run_active():
        Logger.info('Waiting for child run {} to finish.'.format(self.child_id),
                    task_name=self.TASK_NAME)
        time.sleep(self.POLL_TIMEOUT)
    if wait_finish:
        Logger.info('Waiting for all runs to finish.', task_name=self.TASK_NAME)
        self.wait_all_samples_finish()
    Logger.success('Successfully scheduled {} sample(s).'.format(scheduled),
                   task_name=self.TASK_NAME)
def transfer_dts(self, dts_locations, dts_registry, upload, rules=None):
    grouped_paths = {}
    for path in dts_locations:
        if path.prefix not in grouped_paths:
            grouped_paths[path.prefix] = [path]
        else:
            grouped_paths[path.prefix].append(path)
    for prefix, paths in grouped_paths.iteritems():
        dts_url = dts_registry[prefix]
        Logger.info('Uploading {} paths using DTS service {}'.format(len(paths), dts_url),
                    self.task_name)
        dts_client = DataTransferServiceClient(dts_url, self.token, self.api_url,
                                               self.token, 10)
        dts_client.transfer_data(
            [self.create_dts_path(path, upload, rules) for path in paths],
            self.task_name)
def launch(self, nodes, instance_size, instance_disk, docker_image, cmd,
           wait_finish=False):
    running = 0
    current_index = 0
    Logger.info('Starting {} sample(s) scheduling.'.format(self.samples_number),
                task_name=self.TASK_NAME)
    while current_index != self.samples_number:
        if running < nodes:
            self.launch_pipeline(self.run_files[current_index], self.param_names,
                                 instance_size, instance_disk, docker_image, cmd)
            current_index = current_index + 1
            running = running + 1
        else:
            Logger.info('Processing {} sample(s).'.format(running),
                        task_name=self.TASK_NAME)
            Logger.info('Total scheduled {} sample(s).'.format(current_index),
                        task_name=self.TASK_NAME)
            time.sleep(self.POLL_TIMEOUT)
            running = self.get_running_samples()
    while self.child_run_active():
        Logger.info('Waiting for child run {} to finish.'.format(self.child_id),
                    task_name=self.TASK_NAME)
        time.sleep(self.POLL_TIMEOUT)
    if wait_finish:
        Logger.info('Waiting for all runs to finish.', task_name=self.TASK_NAME)
        self.wait_all_samples_finish()
    Logger.success('Successfully scheduled {} sample(s).'.format(current_index),
                   task_name=self.TASK_NAME)
def check_file_match(self, samples, file_name, patterns_files):
    for sample in samples:
        sample_name = sample[SAMPLE_NAME]
        patterns = self.format_sample_patterns(sample, self.patterns)
        exclude = self.format_sample_patterns(sample, self.exclude_patterns)
        for pattern_name, glob in patterns.iteritems():
            if self.match_patterns(file_name, glob):
                if pattern_name in exclude:
                    # Use a separate name for the exclude globs so that the
                    # 'exclude' dict is not clobbered for later patterns.
                    exclude_glob = exclude[pattern_name]
                    if self.match_patterns(file_name, exclude_glob):
                        Logger.info("Skipping filename '{}' since it matches "
                                    "exclude patterns '{}'.".format(file_name,
                                                                    str(exclude_glob)))
                        continue
                if sample_name not in patterns_files:
                    patterns_files[sample_name] = {}
                if pattern_name not in patterns_files[sample_name]:
                    patterns_files[sample_name][pattern_name] = []
                patterns_files[sample_name][pattern_name].append(
                    os.path.join(self.folder, file_name))
def transfer_async(chunk):
    if not chunk.files:
        Logger.info('Skipping empty chunk', task_name=chunk.task_name)
        return
    file_list_name = ''.join(
        random.choice(string.ascii_lowercase) for _ in range(10)) + '.list'
    file_list_path = os.path.join(chunk.common_folder, file_list_name)
    with open(file_list_path, 'w') as file_list:
        for file in chunk.files:
            file_list.write('%s\t%d\n' % (file.filename, file.size))
    bucket = S3Bucket()
    cmd = bucket.build_pipe_cp_command(chunk.source, chunk.destination,
                                       file_list=file_list_path, include=chunk.rules)
    if chunk.hostname != 'localhost':
        cmd = '(ssh %s API=$API API_TOKEN=$API_TOKEN RUN_ID=$RUN_ID "%s") & _CHUNK_PID=$! && wait $_CHUNK_PID' % \
              (chunk.hostname, cmd)
    Logger.info('Executing chunk transfer with cmd: %s' % cmd, task_name=chunk.task_name)
    bucket.execute_command(cmd, TRANSFER_ATTEMPTS)
def find_remote_locations(self, dts_registry, parameter_types):
    remote_locations = []
    for env in os.environ:
        param_type_name = env + '_PARAM_TYPE'
        if os.environ[env] and param_type_name in os.environ:
            param_type = os.environ[param_type_name]
            if param_type in parameter_types:
                value = os.environ[env].strip()
                Logger.info('Found remote parameter %s with type %s' % (value, param_type),
                            task_name=self.task_name)
                original_paths = [value]
                delimiter = ''
                for supported_delimiter in VALUE_DELIMITERS:
                    if value.find(supported_delimiter) != -1:
                        original_paths = re.split(supported_delimiter, value)
                        delimiter = supported_delimiter
                        break
                paths = []
                for path in original_paths:
                    resolved_path = replace_all_system_variables_in_path(path).strip()
                    if self.match_dts_path(resolved_path, dts_registry):
                        paths.append(self.build_dts_path(resolved_path, dts_registry,
                                                         param_type))
                    elif self.match_s3_path(resolved_path):
                        paths.append(self.build_s3_path(resolved_path, param_type))
                    elif self.match_ftp_or_http_path(resolved_path):
                        paths.append(self.build_ftp_or_http_path(resolved_path,
                                                                 param_type))
                if len(paths) != 0:
                    remote_locations.append(
                        RemoteLocation(env, value, param_type, paths, delimiter))
    return remote_locations
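# A sketch of the environment convention the scan above relies on: each remote
# parameter arrives as a value variable plus a '<NAME>_PARAM_TYPE' companion
# describing its role (variable names and values here are hypothetical):
def _demo_env_convention():
    import os
    os.environ['READ1'] = 's3://data-bucket/sample_R1.fastq.gz'
    os.environ['READ1_PARAM_TYPE'] = 'input'
    for env in list(os.environ):
        param_type_name = env + '_PARAM_TYPE'
        if os.environ[env] and param_type_name in os.environ:
            # -> READ1 (input): s3://data-bucket/sample_R1.fastq.gz
            print('{} ({}): {}'.format(env, os.environ[param_type_name],
                                       os.environ[env]))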