Example #1
 def __run_sample(self, sample, analysis_folder, machine_run_folder):
     Logger.info(
         'Launching analytical pipeline "%s" with version "%s" for sample %s.'
         % (self.pipeline['name'], self.version, sample),
         task_name=self.task)
     read1, read2 = self.__fetch_reads(sample, analysis_folder,
                                       machine_run_folder)
     pipeline_params = {
         'SAMPLE': {
             'value': sample
         },
         'READ1': {
             'value': read1,
             'type': 'input'
         },
         'READ2': {
             'value': read2,
             'type': 'input'
         },
         'OUTPUT_FOLDER': {
             'value': analysis_folder,
             'type': 'output'
         }
     }
     run = self.api.launch_pipeline(self.pipeline['id'],
                                    self.version,
                                    pipeline_params,
                                    instance=self.instance_type,
                                    disk=self.instance_disk,
                                    parent_run_id=os.environ['RUN_ID'])
     return run['id']
Example #2
 def check_or_install_fuse(self):
     fuse_type = os.getenv('CP_S3_FUSE_TYPE', FUSE_GOOFYS_ID)
     if fuse_type == FUSE_GOOFYS_ID:
         fuse_installed = self.execute_and_check_command(
             'install_s3_fuse_goofys')
         return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
     elif fuse_type == FUSE_S3FS_ID:
         fuse_installed = self.execute_and_check_command(
             'install_s3_fuse_s3fs')
         if fuse_installed:
             return FUSE_S3FS_ID
         else:
             Logger.warn(
                 "FUSE {fuse_type} was preferred, but failed to install, will try to setup default goofys"
                 .format(fuse_type=fuse_type),
                 task_name=self.task_name)
             fuse_installed = self.execute_and_check_command(
                 'install_s3_fuse_goofys')
             return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
     else:
         Logger.warn(
             "FUSE {fuse_type} type is not defined for S3 fuse".format(
                 fuse_type=fuse_type),
             task_name=self.task_name)
         return FUSE_NA_ID
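The helper above resolves the preferred FUSE backend from the CP_S3_FUSE_TYPE environment variable and falls back to goofys when the preferred s3fs install fails. A condensed, self-contained sketch of the same selection logic; the constant values and the install callable are assumptions made for illustration:

    import os

    FUSE_GOOFYS_ID = 'goofys'   # assumed values; the original module defines these identifiers
    FUSE_S3FS_ID = 's3fs'
    FUSE_NA_ID = None

    def resolve_fuse_type(install):
        # 'install' is any callable returning True when the given FUSE flavour installs cleanly.
        preferred = os.getenv('CP_S3_FUSE_TYPE', FUSE_GOOFYS_ID)
        if preferred == FUSE_S3FS_ID and install(FUSE_S3FS_ID):
            return FUSE_S3FS_ID
        if preferred in (FUSE_GOOFYS_ID, FUSE_S3FS_ID):
            # Default choice, and the fallback when a preferred s3fs install fails.
            return FUSE_GOOFYS_ID if install(FUSE_GOOFYS_ID) else FUSE_NA_ID
        return FUSE_NA_ID  # an unknown FUSE type is reported as not available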
Example #3
    def launch(self,
               instance_size,
               instance_disk,
               docker_image,
               cmd,
               wait_finish=False):
        running = 0
        Logger.info('Starting {} sample(s) scheduling.'.format(
            len(self.run_dirs)),
                    task_name=self.TASK_NAME)
        for folder in self.run_dirs:
            self.launch_pipeline(folder, self.param_names, instance_size,
                                 instance_disk, docker_image, cmd)
            running = running + 1
            Logger.info('Processing {} sample(s).'.format(running),
                        task_name=self.TASK_NAME)

        Logger.info('Successfully scheduled {} sample(s).'.format(running),
                    task_name=self.TASK_NAME)
        if wait_finish:
            Logger.info('Waiting for all runs to finish.',
                        task_name=self.TASK_NAME)
            self.wait_all_samples_finish()
        Logger.success('All child pipeline successfully finished.',
                       task_name=self.TASK_NAME)
Example #4
 def perform_transfer(self,
                      path,
                      source,
                      destination,
                      cluster,
                      upload,
                      rules=None):
     Logger.info(
         'Uploading files from {} to {}'.format(source, destination),
         self.task_name)
     if path.type == PathType.HTTP_OR_FTP or cluster is None or self.is_file(
             source):
         if upload or self.rules is None:
             S3Bucket().pipe_copy(source, destination, TRANSFER_ATTEMPTS)
         else:
             S3Bucket().pipe_copy_with_rules(source, destination,
                                             TRANSFER_ATTEMPTS, self.rules)
     else:
         common_folder = os.path.join(os.environ['SHARED_WORK_FOLDER'],
                                      'transfer')
         applied_rules = None if upload else rules
         chunks = self.split_source_into_chunks(cluster, source,
                                                destination, common_folder,
                                                applied_rules)
         transfer_pool = Pool(len(chunks))
         transfer_pool.map(transfer_async, chunks)
         shutil.rmtree(common_folder, ignore_errors=True)
Example #5
    def build_dts_path(self, path, dts_registry, input_type):
        for prefix in dts_registry:
            if path.startswith(prefix):
                if not self.bucket:
                    raise RuntimeError(
                        'Transfer bucket shall be set for DTS locations')
                relative_path = path.replace(prefix, '')
                s3_path = self.join_paths(self.bucket, relative_path)

                if input_type == ParameterType.OUTPUT_PARAMETER:
                    local_path = self.analysis_dir
                else:
                    local_dir = self.get_local_dir(input_type)
                    local_path = self.join_paths(local_dir, relative_path)
                Logger.info(
                    'Found remote {} path {} matching DTS prefix {}. '
                    'It will be uploaded to bucket path {} and localized {} {}.'
                    .format(
                        input_type, path, prefix, s3_path, 'from' if input_type
                        == ParameterType.OUTPUT_PARAMETER else 'to',
                        local_path),
                    task_name=self.task_name)
                return LocalizedPath(path,
                                     s3_path,
                                     local_path,
                                     PathType.DTS,
                                     prefix=prefix)
        raise RuntimeError(
            'Remote path %s does not match any of DTS prefixes.' % path)
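build_dts_path strips the matched DTS prefix from the remote path and re-roots the remainder under both the transfer bucket and a local directory. A minimal sketch of that prefix rewrite with purely illustrative paths (the real join_paths helper is replaced by plain string concatenation):

    dts_prefix = 'dts://storage/'                          # illustrative DTS prefix
    remote_path = 'dts://storage/project/reads.fastq'      # illustrative remote path
    bucket = 's3://transfer-bucket'                        # hypothetical transfer bucket
    local_dir = '/common/input'                            # hypothetical localization root

    relative_path = remote_path.replace(dts_prefix, '')
    s3_path = bucket.rstrip('/') + '/' + relative_path         # s3://transfer-bucket/project/reads.fastq
    local_path = local_dir.rstrip('/') + '/' + relative_path   # /common/input/project/reads.fastq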
Example #6
 def create_directory(self, path):
     result = common.execute_cmd_command(
         'mkdir -p {path}'.format(path=path), silent=True)
     if result != 0:
         Logger.warn(
             'Failed to create mount directory: {path}'.format(path=path),
             task_name=self.task_name)
         return False
     return True
Example #7
 def execute_mount(self, command, params):
     result = common.execute_cmd_command(command, silent=True)
     if result == 0:
         Logger.info('-->{path} mounted to {mount}'.format(**params),
                     task_name=self.task_name)
     else:
         Logger.warn(
             '--> Failed mounting {path} to {mount}'.format(**params),
             task_name=self.task_name)
Example #8
 def read(cls, report_file, task):
     Logger.info("Reading Flagstats report from file %s." % report_file,
                 task_name=task)
     with open(report_file, 'r') as report:
         line_index = 0
         for line in report.readlines():
             if line_index < 2:
                 line_index += 1
                 continue
             return int(line.split('+')[0].strip())
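The reader above skips the first two lines of the report and returns the leading integer of the third one. The same parsing rule applied to illustrative flagstat-style lines:

    report_lines = [                                        # illustrative content only
        '100 + 0 in total (QC-passed reads + QC-failed reads)\n',
        '0 + 0 secondary\n',
        '98 + 0 mapped (98.00% : N/A)\n',
    ]
    value = int(report_lines[2].split('+')[0].strip())      # -> 98, same rule as the loop above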
Example #9
 def __fill_trim_data(self, sample_metrics):
     Logger.info("Fetching data from FASTQC reports after trimming.",
                 task_name=self.task)
     r1_total_reads, r1_poor_reads, r1_gc, r1_read_length = FastQCReader \
         .read(os.path.join(self.folder, "FastQC_Trimmed", self.file_suffix + ".Trimmomatic.R1.trimmed_fastqc.zip"),
               self.task)
     r2_total_reads, r2_poor_reads, r2_gc, r2_read_length = FastQCReader \
         .read(os.path.join(self.folder, "FastQC_Trimmed", self.file_suffix + ".Trimmomatic.R2.trimmed_fastqc.zip"),
               self.task)
     sample_metrics["ReadsAfterTrim"] = r1_total_reads + r2_total_reads
Example #10
 def __fill_starting_data(self, sample_metrics):
     Logger.info("Fetching data from FASTQC Initial reports.",
                 task_name=self.task)
     r1_total_reads, r1_poor_reads, r1_gc, r1_read_length = FastQCReader\
         .read(os.path.join(self.folder, "FastQC_Initial", self.sample + "_R1_fastqc.zip"), self.task)
     r2_total_reads, r2_poor_reads, r2_gc, r2_read_length = FastQCReader\
         .read(os.path.join(self.folder, "FastQC_Initial", self.sample + "_R2_fastqc.zip"), self.task)
     sample_metrics["StartingReads"] = r1_total_reads + r2_total_reads
     sample_metrics["QCFailedReads"] = r1_poor_reads + r2_poor_reads
     sample_metrics["ReadLength"] = r1_read_length
     sample_metrics["GC"] = r1_gc
Example #11
 def read(cls, report_file, task):
     Logger.info("Reading Coverage report from file %s." % report_file,
                 task_name=task)
     total_bases = 0
     total_coverage = 0
     with open(report_file, 'r') as report:
         for line in report.readlines():
             if line:
                 total_bases += 1
                 total_coverage += int(line.split("\t")[2])
     return 0 if total_bases == 0 else total_coverage / total_bases
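The Coverage reader averages the per-position depth stored in the third tab-separated column of the report. The same computation over a few illustrative lines:

    coverage_lines = ['chr1\t100\t10\n', 'chr1\t101\t20\n', 'chr1\t102\t30\n']   # illustrative
    total_bases = len(coverage_lines)
    total_coverage = sum(int(line.split('\t')[2]) for line in coverage_lines)
    mean_coverage = 0 if total_bases == 0 else total_coverage / total_bases      # 20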
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mount-root', required=True)
    parser.add_argument('--tmp-dir', required=True)
    parser.add_argument('--task', required=False, default=MOUNT_DATA_STORAGES)
    args = parser.parse_args()
    if EXEC_ENVIRONMENT in os.environ and os.environ[EXEC_ENVIRONMENT] == DTS:
        Logger.success(
            'Skipping cloud storage mount for execution environment %s' % DTS,
            task_name=args.task)
        return
    MountStorageTask(args.task).run(args.mount_root, args.tmp_dir)
Example #13
 def fetch_dts_registry(self):
     result = {}
     try:
         dts_data = self.api.load_dts_registry()
     except BaseException as e:
         Logger.info("DTS is not available: %s" % e.message,
                     task_name=self.task_name)
         return result
     for registry in dts_data:
         for prefix in registry['prefixes']:
             result[prefix] = registry['url']
     return result
Example #14
 def __wait_run_completion(self, run_id):
     current_status = self.api.load_run(run_id)['status']
     while current_status == 'RUNNING':
         Logger.info('Run %d status is %s. Waiting for completion...' %
                     (run_id, current_status),
                     task_name=self.task)
         time.sleep(60)
         current_status = self.api.load_run(run_id)['status']
     Logger.info('Run %d finished with status %s' %
                 (run_id, current_status),
                 task_name=self.task)
     return current_status
Example #15
 def run(self):
     Logger.info("Reading %s file to collect variants metrics." %
                 self.vcf_file,
                 task_name=self.task)
     with open(self.output_file, 'w+') as output, open(self.vcf_file,
                                                       'r') as vcf:
         self.__write_header(output)
         lines_started = False
         for vcf_line in vcf.readlines():
             if lines_started and vcf_line:
                 self.__process_variant(output, vcf_line)
             elif vcf_line.startswith("#CHROM"):
                 lines_started = True
Example #16
 def _build_remote_path(self, path, input_type, path_type):
     if input_type == ParameterType.OUTPUT_PARAMETER:
         local_path = self.analysis_dir
     else:
         remote = urlparse.urlparse(path)
         relative_path = path.replace(
             '%s://%s' % (remote.scheme, remote.netloc), '')
         local_dir = self.get_local_dir(input_type)
         local_path = self.join_paths(local_dir, relative_path)
     Logger.info('Found %s %s path %s. It will be localized to %s.' %
                 (path_type.lower(), input_type, path, local_path),
                 task_name=self.task_name)
     return LocalizedPath(path, path, local_path, path_type)
Example #17
def get_variable_value(variable_name):
    Logger.log_task_event(GENERATE_INPUTS_TASK, "Getting value of: {}".format(variable_name))

    if not os.environ.get(variable_name):
        return

    variable_value = os.environ.get(variable_name)
    if VARIABLE_DELIMITER in variable_value:
        variable_value = [x for x in variable_value.split(VARIABLE_DELIMITER) if x]

    Logger.log_task_event(GENERATE_INPUTS_TASK, "Value of {}:\n{}".format(variable_name, variable_value))

    return variable_value
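get_variable_value returns the raw environment value, or a list when the (externally defined) VARIABLE_DELIMITER occurs in it, dropping empty chunks. A minimal sketch of that splitting rule; the delimiter and the variable name are assumptions made for illustration:

    import os

    VARIABLE_DELIMITER = ','                                # assumed delimiter
    os.environ['SAMPLE_INPUTS'] = 'a.fastq,b.fastq,'        # hypothetical variable

    value = os.environ.get('SAMPLE_INPUTS')
    if value and VARIABLE_DELIMITER in value:
        value = [x for x in value.split(VARIABLE_DELIMITER) if x]
    # value == ['a.fastq', 'b.fastq'] -- the trailing empty chunk is filtered out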
Example #18
 def run(self, worker_pods, path, run_id):
     try:
         Logger.info('Creating hostfile {}'.format(path),
                     task_name=self.task_name)
         with open(path, 'w') as file:
             master_pod = self.kube.get_pod(run_id)
             file.write('{}\n'.format(master_pod.name))
             for pod in worker_pods:
                 file.write('{}\n'.format(pod.name))
                 self.add_to_hosts(pod)
         Logger.success('Successfully created hostfile {}'.format(path),
                        task_name=self.task_name)
     except Exception as e:
         self.fail_task(e.message)
Example #19
    def await_workers_start(self, nodes_number, parent_id):
        if nodes_number == 0:
            Logger.success(
                'No workers requested. Processing will run on a master node',
                task_name=self.task_name)
            return []
        try:
            Logger.info('Waiting for {} worker node(s)'.format(nodes_number),
                        task_name=self.task_name)

            # TODO: probably we shall check several times, as it is possible that workers are not yet submitted
            worker_ids = self.get_workers(parent_id)
            total_number = len(worker_ids)
            started = []
            # approximately 10 minutes
            attempts = 60
            while len(started) != total_number and attempts != 0:
                started = self.get_started_workers(worker_ids)
                attempts -= 1
                Logger.info('Started {} worker(s) of {} total'.format(
                    len(started), total_number),
                            task_name=self.task_name)
                time.sleep(10)
            if len(started) != total_number:
                raise RuntimeError('Failed to start all workers')

            Logger.success('All workers started', task_name=self.task_name)
            return started
        except Exception as e:
            self.fail_task(e.message)
Example #20
 def run(self):
     analysis_folder = os.environ['ANALYSIS_FOLDER']
     machine_run_folder = os.environ['MACHINE_RUN_FOLDER']
     sample_sheet = os.environ['SAMPLE_SHEET']
     Logger.info('Starting analytical processing for sample sheet %s' %
                 sample_sheet,
                 task_name=self.task)
     samples = SampleSheetParser(
         sample_sheet,
         [SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT]).parse_sample_sheet()
     launched_runs = {}
     for sample in samples:
         Logger.info('Starting "%s" sample processing.' %
                     sample[SAMPLE_NAME],
                     task_name=self.task)
         launched_runs[sample[SAMPLE_NAME]] = self.__run_sample(
             sample[SAMPLE_NAME], analysis_folder, machine_run_folder)
     failed_runs = self.__wait_runs_completion(launched_runs)
     if failed_runs:
         for sample, run_id in failed_runs.iteritems():
             Logger.fail(
                 'Processing failed for sample "%s". Check run %d logs for more information.'
                 % (sample, run_id),
                 task_name=self.task)
         sys.exit(1)
     Logger.success("All samples processed successfully.",
                    task_name=self.task)
Example #21
 def read(cls, report_file, task):
     Logger.info("Reading InsertSizeMetrics report from file %s." %
                 report_file,
                 task_name=task)
     with open(report_file, 'r') as report:
         data_started = False
         for line in report.readlines():
             if data_started and line:
                 chunks = line.split("\t")
                 # MEDIAN_INSERT_SIZE
                 return int(chunks[0])
             elif line.startswith("MEDIAN_INSERT_SIZE"):
                 data_started = True
     return 0
Example #22
 def find_files(self, recursive=False):
     Logger.info("Starting parsing input directory: {}.".format(
         self.folder),
                 task_name=self.TASK_NAME)
     all_files = bucket.ls_s3(self.folder,
                              self.MAX_ATTEMPTS,
                              recursive=recursive)
     result = [[] for x in xrange(len(all_files))]
     index = 0
     for file in all_files:
         result[index].append(os.path.join(self.folder, file))
         index = index + 1
     Logger.success("Found {} directories to process.".format(len(result)),
                    task_name=self.TASK_NAME)
     return result
Example #23
 def read(cls, report_file, task):
     Logger.info("Reading MarkDuplicates report from file %s." %
                 report_file,
                 task_name=task)
     with open(report_file, 'r') as report:
         data_started = False
         for line in report.readlines():
             if data_started and line:
                 chunks = line.split("\t")
                 # UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES
                 return int(
                     chunks[5]) + 2 * int(chunks[6]) + 2 * int(chunks[7])
             elif line.startswith("LIBRARY"):
                 data_started = True
     return 0
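The InsertSizeMetrics and MarkDuplicates readers above share one scan pattern: iterate until a known header token appears, then parse the first non-empty data row and stop. A generic sketch of that pattern; the header token and column index are placeholders, not values from the original readers:

    def read_first_data_row(lines, header_token, column):
        data_started = False
        for line in lines:
            if data_started and line.strip():
                return line.split('\t')[column]     # first data row after the header
            if line.startswith(header_token):
                data_started = True
        return None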
Example #24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--parameter', type=str, required=True, nargs='*')
    parser.add_argument('--task-name', required=True)
    parser.add_argument('--run-id', required=True, type=int)
    args = parser.parse_args()
    status = StatusEntry(TaskStatus.SUCCESS)
    try:
        node = WaitForNode().await_node_start(args.parameter, args.task_name,
                                              args.run_id)
        print(node.name + " " + node.ip)
        exit(0)
    except Exception as e:
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
Example #25
def pipe_log(message, status=TaskStatus.RUNNING):
    global api_token
    global api_url
    global script_path
    global current_run_id

    if api_url and api_token:
        Logger.log_task_event(NODEUP_TASK,
                              '[{}] {}'.format(current_run_id, message),
                              run_id=current_run_id,
                              instance=str(current_run_id),
                              log_dir=script_path,
                              api_url=api_url,
                              status=status,
                              omit_console=True)
    else:
        # Log as always
        logging.info(message)
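pipe_log sends messages to the pipeline API only when both api_url and api_token are configured, and falls back to the standard logging module otherwise. A stripped-down sketch of the same fallback; log_event and send_to_api are illustrative names standing in for the Logger.log_task_event call:

    import logging

    api_url = None          # module-level configuration, as in the original
    api_token = None

    def log_event(message, send_to_api=None):
        if api_url and api_token and send_to_api is not None:
            send_to_api(message)       # remote sink is preferred when fully configured
        else:
            logging.info(message)      # local fallback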
Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nodes_number', type=int, required=True)
    args = parser.parse_args()
    run_id = os.environ['RUN_ID']
    hostfile = os.environ['DEFAULT_HOSTFILE']
    status = StatusEntry(TaskStatus.SUCCESS)
    workers = []
    try:
        workers = CreateWorkerNodes().await_workers_start(
            args.nodes_number, run_id)
        BuildHostfile().run(workers, hostfile, run_id)
    except Exception as e:
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
        ShutDownCluster().run(workers, status)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
Example #27
 def __wait_runs_completion(self, launched_runs):
     finished = {}
     failed = {}
     while True:
         for sample, run_id in launched_runs.iteritems():
             current_status = self.api.load_run(run_id)['status']
             Logger.info('Processing sample: %s. Run %d status is %s.' %
                         (sample, run_id, current_status),
                         task_name=self.task)
             if current_status != 'RUNNING':
                 finished[sample] = run_id
                 if current_status != 'SUCCESS':
                     failed[sample] = run_id
         if len(finished) == len(launched_runs):
             Logger.info("Processing for all samples completed.",
                         task_name=self.task)
             return failed
         time.sleep(60)
Example #28
 def child_run_active(self):
     if self.child_id is None:
         return False
     attempts = 0
     while attempts < self.RETRY_COUNT:
         try:
             run = self.api.load_run(self.child_id)
             return run['status'] == 'RUNNING'
         except Exception as e:
             Logger.warn(
                 "Failed to fetch child run ID '' status: {}.".format(
                     str(self.child_id), e.message),
                 task_name=self.TASK_NAME)
             attempts = attempts + 1
             time.sleep(self.POLL_TIMEOUT)
     Logger.fail("Exceeded maximum attempts to fetch child run status.")
     raise RuntimeError(
         "Exceeded maximum attempts to fetch child run status.")
Example #29
 def get_running_samples(self):
     attempts = 0
     while attempts < self.RETRY_COUNT:
         try:
             child_runs = self.api.load_child_pipelines(self.run_id)
             count = 0
             for run in child_runs:
                 if run['status'] == 'RUNNING':
                     count = count + 1
             return count
         except Exception as e:
             Logger.warn("Failed to fetch running samples: {}.".format(
                 e.message),
                         task_name=self.TASK_NAME)
             attempts = attempts + 1
             time.sleep(self.POLL_TIMEOUT)
     Logger.fail("Exceeded maximum attempts to fetch running samples.")
     raise RuntimeError(
         "Exceeded maximum attempts to fetch running samples.")
Example #30
    def transfer_dts(self, dts_locations, dts_registry, upload, rules=None):
        grouped_paths = {}
        for path in dts_locations:
            if path.prefix not in grouped_paths:
                grouped_paths[path.prefix] = [path]
            else:
                grouped_paths[path.prefix].append(path)

        for prefix, paths in grouped_paths.iteritems():
            dts_url = dts_registry[prefix]
            Logger.info(
                'Uploading {} paths using DTS service {}'.format(
                    len(paths), dts_url), self.task_name)
            dts_client = DataTransferServiceClient(dts_url, self.token,
                                                   self.api_url, self.token,
                                                   10)
            dts_client.transfer_data(
                [self.create_dts_path(path, upload, rules) for path in paths],
                self.task_name)
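transfer_dts first groups the localized paths by their DTS prefix so that each group can be handed to the matching DTS endpoint in a single call. A standalone sketch of that grouping step, with plain (prefix, path) tuples standing in for the path objects:

    paths = [('dts://a/', 'dts://a/x'), ('dts://b/', 'dts://b/y'), ('dts://a/', 'dts://a/z')]
    grouped = {}
    for prefix, remote in paths:
        grouped.setdefault(prefix, []).append(remote)
    # grouped == {'dts://a/': ['dts://a/x', 'dts://a/z'], 'dts://b/': ['dts://b/y']}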