Example #1
 def check_or_install_fuse(self):
     fuse_type = os.getenv('CP_S3_FUSE_TYPE', FUSE_GOOFYS_ID)
     if fuse_type == FUSE_GOOFYS_ID:
         fuse_installed = self.execute_and_check_command(
             'install_s3_fuse_goofys')
         return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
     elif fuse_type == FUSE_S3FS_ID:
         fuse_installed = self.execute_and_check_command(
             'install_s3_fuse_s3fs')
         if fuse_installed:
             return FUSE_S3FS_ID
         else:
             Logger.warn(
                 "FUSE {fuse_type} was preferred, but failed to install; falling back to default goofys"
                 .format(fuse_type=fuse_type),
                 task_name=self.task_name)
             fuse_installed = self.execute_and_check_command(
                 'install_s3_fuse_goofys')
             return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
     else:
         Logger.warn(
             "Unsupported FUSE type {fuse_type} for S3".format(
                 fuse_type=fuse_type),
             task_name=self.task_name)
         return FUSE_NA_ID
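
The same preferred-then-fallback shape, stripped of the Cloud Pipeline specifics. A minimal sketch, with hypothetical install_preferred/install_default callables standing in for execute_and_check_command:

import os

def choose_fuse(install_preferred, install_default):
    # install_* are hypothetical callables returning True on success;
    # unknown types simply fall through to the default here.
    preferred = os.getenv('CP_S3_FUSE_TYPE', 'goofys')
    if preferred != 'goofys' and install_preferred():
        return preferred
    # Preferred choice unavailable or failed: fall back to the default.
    return 'goofys' if install_default() else None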
Example #2
 def execute_mount(self, command, params):
     result = common.execute_cmd_command(command, silent=True)
     if result == 0:
         Logger.info('--> {path} mounted to {mount}'.format(**params),
                     task_name=self.task_name)
     else:
         Logger.warn(
             '--> Failed mounting {path} to {mount}'.format(**params),
             task_name=self.task_name)
Example #3
 def create_directory(self, path):
     result = common.execute_cmd_command(
         'mkdir -p {path}'.format(path=path), silent=True)
     if result != 0:
         Logger.warn(
             'Failed to create mount directory: {path}'.format(path=path),
             task_name=self.task_name)
         return False
     return True
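
Shelling out to mkdir -p works, but a pure-Python equivalent avoids the subprocess round-trip. A sketch using only the standard library, keeping the same contract as create_directory above:

import errno
import os

def create_directory(path):
    # Same contract as above: True on success, False on failure.
    try:
        os.makedirs(path)
        return True
    except OSError as e:
        # An already-existing directory counts as success.
        return e.errno == errno.EEXIST and os.path.isdir(path)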
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--parameter', type=str, required=True, nargs='*')
    parser.add_argument('--task-name', required=True)
    parser.add_argument('--run-id', required=True, type=int)
    args = parser.parse_args()
    status = StatusEntry(TaskStatus.SUCCESS)
    try:
        node = WaitForNode().await_node_start(args.parameter, args.task_name,
                                              args.run_id)
        print(node.name + " " + node.ip)
        exit(0)
    except Exception as e:
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nodes_number', type=int, required=True)
    args = parser.parse_args()
    run_id = os.environ['RUN_ID']
    hostfile = os.environ['DEFAULT_HOSTFILE']
    status = StatusEntry(TaskStatus.SUCCESS)
    workers = []
    try:
        workers = CreateWorkerNodes().await_workers_start(
            args.nodes_number, run_id)
        BuildHostfile().run(workers, hostfile, run_id)
    except Exception as e:
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
        ShutDownCluster().run(workers, status)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
Example #6
 def child_run_active(self):
     if self.child_id is None:
         return False
     attempts = 0
     while attempts < self.RETRY_COUNT:
         try:
             run = self.api.load_run(self.child_id)
             return run['status'] == 'RUNNING'
         except Exception as e:
             Logger.warn(
                 "Failed to fetch child run ID '' status: {}.".format(
                     str(self.child_id), e.message),
                 task_name=self.TASK_NAME)
             attempts = attempts + 1
             time.sleep(self.POLL_TIMEOUT)
     Logger.fail("Exceeded maximum attempts to fetch child run status.")
     raise RuntimeError(
         "Exceeded maximum attempts to fetch child run status.")
Example #7
 def transfer_data(self, data_paths, log_task):
     if len(data_paths) > 0:
         Logger.info('Transferring %d path(s)' % len(data_paths),
                     task_name=log_task)
         transfers = map(self.__schedule_transfer_task, data_paths)
         for transfer in transfers:
             if transfer is None:
                 raise RuntimeError('Upload via DTS failed')
         remaining_ids = map(lambda transfer: transfer['id'], transfers)
         while remaining_ids:
             current_ids = list(remaining_ids)
             for id in current_ids:
                 transfer_task = self.__get_transfer_task(id)
                 source_path = transfer_task['source']['path']
                 destination_path = transfer_task['destination']['path']
                 if transfer_task['status'] == _TransferStatus.SUCCESS:
                     remaining_ids.remove(id)
                     Logger.info(
                         'Data transfer from source %s to destination %s has finished'
                         % (source_path, destination_path),
                         task_name=log_task)
                 elif transfer_task['status'] == _TransferStatus.FAILURE:
                     remaining_ids.remove(id)
                     reason = transfer_task.get('reason',
                                                'No reason available')
                     Logger.fail(
                         "Data transfer from source %s to destination %s failed: '%s'"
                         % (source_path, destination_path, reason),
                         task_name=log_task)
                     raise RuntimeError(
                         'Data transfer failed for source %s' %
                         source_path)
                 else:
                     time.sleep(self.pooling_delay)
             if len(remaining_ids) != len(current_ids) and remaining_ids:
                 Logger.info('%d data transfers are still being processed' %
                             len(remaining_ids),
                             task_name=log_task)
         Logger.info('All data transfers have finished successfully',
                     task_name=log_task)
     else:
         Logger.warn('No files for data transfer were found',
                     task_name=log_task)
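
The transfer-polling loop above reduces to: keep a set of pending ids, drop each one when it reaches a terminal status, and fail fast on the first failure. A minimal sketch, with get_status as a hypothetical stand-in for __get_transfer_task:

import time

def wait_for_transfers(ids, get_status, delay=1.0):
    # get_status(task_id) is assumed to return 'SUCCESS', 'FAILURE',
    # or a non-terminal status string.
    pending = set(ids)
    while pending:
        for task_id in list(pending):
            status = get_status(task_id)
            if status == 'SUCCESS':
                pending.discard(task_id)
            elif status == 'FAILURE':
                raise RuntimeError('Transfer %s failed' % task_id)
        if pending:
            time.sleep(delay)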
Example #8
 def get_running_samples(self):
     attempts = 0
     while attempts < self.RETRY_COUNT:
         try:
             child_runs = self.api.load_child_pipelines(self.run_id)
             count = 0
             for run in child_runs:
                 if run['status'] == 'RUNNING':
                     count = count + 1
             return count
         except Exception as e:
             Logger.warn("Failed to fetch running samples: {}.".format(
                 e.message),
                         task_name=self.TASK_NAME)
             attempts = attempts + 1
             time.sleep(self.POLL_TIMEOUT)
     Logger.fail("Exceeded maximum attempts to fetch running samples.")
     raise RuntimeError(
         "Exceeded maximum attempts to fetch running samples.")
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--master-id', type=int, required=True)
    parser.add_argument('--task-name', required=True)
    args = parser.parse_args()
    run_id = os.environ['RUN_ID']

    status = StatusEntry(TaskStatus.SUCCESS)
    workers = []
    try:
        master = MasterNode().await_master_start(args.master_id,
                                                 args.task_name)
        print(master.name + " " + master.ip)
        exit(0)
    except Exception as e:
        Logger.warn(e.message)
        status = StatusEntry(TaskStatus.FAILURE)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
Example #10
    def run(self, mount_root, tmp_dir):
        try:
            Logger.info('Starting mounting remote data storages.',
                        task_name=self.task_name)

            Logger.info('Fetching list of allowed storages...',
                        task_name=self.task_name)
            available_storages = self.api.load_available_storages()
            if not available_storages:
                Logger.success('No remote storages are available',
                               task_name=self.task_name)
                return
            Logger.info(
                'Found {} available storage(s). Checking mount options.'.
                format(len(available_storages)),
                task_name=self.task_name)

            fuse_tmp = os.path.join(tmp_dir, "s3fuse")
            if not self.create_directory(fuse_tmp):
                fuse_tmp = '/tmp'

            fuse_available = self.check_or_install_fuse()

            aws_default_region = os.getenv('AWS_DEFAULT_REGION', 'us-east-1')
            aws_region = os.getenv('AWS_REGION', aws_default_region)
            limited_storages = os.getenv('CP_CAP_LIMIT_MOUNTS')
            if limited_storages:
                try:
                    limited_storages_list = [
                        int(x.strip()) for x in limited_storages.split(',')
                    ]
                    available_storages = [
                        x for x in available_storages
                        if x.id in limited_storages_list
                    ]
                    Logger.info(
                        'Run is launched with mount limits ({}). Only {} storage(s) will be mounted'
                        .format(limited_storages, len(available_storages)),
                        task_name=self.task_name)
                except Exception as limited_storages_ex:
                    Logger.warn(
                        'Unable to parse CP_CAP_LIMIT_MOUNTS value ({}): {}.'
                        .format(limited_storages,
                                str(limited_storages_ex.message)),
                        task_name=self.task_name)

            nfs_count = len([ds for ds in available_storages
                             if ds.storage_type == 'NFS'
                             and ds.region_name == aws_region])
            nfs_available = nfs_count > 0 and self.check_or_install_nfs()
            if not fuse_available and not nfs_available:
                Logger.success(
                    'Mounting of remote storages is not available for this image',
                    task_name=self.task_name)
                return
            for storage in available_storages:
                if not PermissionHelper.is_storage_readable(storage):
                    continue
                mounter = self.get_mount_manager(storage, nfs_available,
                                                 fuse_available, fuse_tmp)
                if mounter is not None:
                    self.mount(mounter, mount_root)
                elif storage.storage_type != NFS_TYPE and storage.storage_type != S3_TYPE:
                    Logger.warn('Unsupported storage type {}.'.format(
                        storage.storage_type),
                                task_name=self.task_name)
            Logger.success('Finished data storage mounting',
                           task_name=self.task_name)
        except Exception as e:
            Logger.fail('Unhandled error during mount task: {}.'.format(
                str(e.message)),
                        task_name=self.task_name)
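
The CP_CAP_LIMIT_MOUNTS handling above, factored into a small helper so a parse failure becomes a return value instead of a warning deep inside run(). The name parse_id_list is hypothetical:

def parse_id_list(raw):
    # "12, 34,56" -> [12, 34, 56]; None when the value is malformed.
    try:
        return [int(part.strip()) for part in raw.split(',')]
    except (AttributeError, ValueError):
        return None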
Example #11
    for column in metadata_columns:
        for (external_id, internal_id, url, column_type,
             file_name_format) in metadata_columns_values[column]:
            upload_result = pool.apply_async(
                upload_data, (url, destination, file_name_format, column,
                              column_type, create_folders_for_columns,
                              internal_id, metadata_id, api, update_paths))
            pool_results.append(upload_result)

    pool.close()
    pool.join()

    successes_count = sum([1 for x in pool_results if x.get() == 0])

    if successes_count == len(pool_results):
        Logger.success("Upload done. All transfers completed successfully",
                       task_name=UPLOAD_TASK_NAME)
        exit(0)

    elif successes_count == 0:
        Logger.fail(
            "Upload completed with errors. ALL transfers FAILED\nPlease review errors above",
            task_name=UPLOAD_TASK_NAME)
        exit(1)

    else:
        Logger.warn(
            "Upload completed with errors. SOME of the transfers failed to complete\nPlease review errors above",
            task_name=UPLOAD_TASK_NAME)
        exit(0)
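
Example #11's success accounting in isolation: fan work out to a multiprocessing pool, join, and classify the batch as complete, partial, or total failure. A sketch assuming each job returns 0 on success (jobs must be top-level functions so the pool can pickle them):

from multiprocessing import Pool

def run_batch(job, args_list, workers=4):
    pool = Pool(workers)
    results = [pool.apply_async(job, args) for args in args_list]
    pool.close()
    pool.join()
    successes = sum(1 for r in results if r.get() == 0)
    if successes == len(results):
        return 0                            # all transfers completed
    return 1 if successes == 0 else 0       # mirror the exit codes above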
Example #12
    def launch_pipeline(self,
                        params,
                        param_names,
                        instance_size,
                        instance_disk,
                        docker_image,
                        cmd,
                        sample=None):
        if not self.child_run_active():
            self.launch_child_run(params,
                                  param_names,
                                  cmd,
                                  instance_size,
                                  instance_disk,
                                  docker_image,
                                  sample=sample)
            return

        command = self.CMD_TEMPLATE.format(pipe_id=self.pipe_id,
                                           version=self.version,
                                           instance_disk=instance_disk,
                                           instance_type=instance_size,
                                           docker_image=docker_image,
                                           cmd=cmd,
                                           parent=self.run_id)
        if sample:
            command = command + self.SAMPLE_TEMPLATE.format(
                sample_name=sample[self.SAMPLE_NAME],
                sample_id=sample[self.SAMPLE_ID])
        # add all pattern params
        index = 0
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            command += " --{} input?{}".format(name, value)
            index = index + 1
        # add all other params
        for param, value in self.pipe_params.iteritems():
            if param.startswith('i_'):
                command += " --{} input?{}".format(
                    self.change_parameter_name(param), value)
            elif param.startswith('c_'):
                command += " --{} common?{}".format(
                    self.change_parameter_name(param), value)
            elif param.startswith('o_'):
                command += " --{} output?{}".format(
                    self.change_parameter_name(param), value)
            else:
                command += " --{} {}".format(param, value)

        Logger.info('Starting pipeline with command: "{}".'.format(command),
                    task_name=self.TASK_NAME)
        try:
            LoggedCommand(command, None, self.TASK_NAME).execute()
        except Exception as e:
            Logger.warn(
                "Failed to launch sample processing with command: '{}'. Error: '{}'."
                .format(command, e.message),
                task_name=self.TASK_NAME)
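
Examples #12 and #13 (below) dispatch on the same parameter-name prefixes (i_ maps to input, c_ to common, o_ to output). A compact sketch of that mapping, assuming change_parameter_name simply strips the two-character prefix:

PREFIX_TYPES = {'i_': 'input', 'c_': 'common', 'o_': 'output'}

def classify_param(name):
    # Returns (clean_name, param_type); param_type is None when the
    # name carries no recognized prefix.
    for prefix, param_type in PREFIX_TYPES.items():
        if name.startswith(prefix):
            return name[len(prefix):], param_type
    return name, None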
Example #13
    def launch_child_run(self,
                         params,
                         param_names,
                         cmd,
                         instance_size,
                         instance_disk,
                         docker_image,
                         sample=None):
        run_params = {'parent-id': self.run_id}
        if sample:
            run_params['sample_name'] = sample[self.SAMPLE_NAME]
            run_params['sample_id'] = sample[self.SAMPLE_ID]
        index = 0
        # add all pattern params
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            run_params[name] = {'value': value, 'type': 'input'}
            index = index + 1

        # add all other params
        for param, value in self.pipe_params.iteritems():
            param_type = None
            param_name = param
            real_value = self.normalize_value(value)
            if param.startswith('i_'):
                param_type = 'input'
                param_name = self.change_parameter_name(param)
            elif param.startswith('c_'):
                param_type = 'common'
                param_name = self.change_parameter_name(param)
            elif param.startswith('o_'):
                param_type = 'output'
                param_name = self.change_parameter_name(param)
            run_params[param_name] = {'value': real_value}
            if param_type is not None:
                run_params[param_name]['type'] = param_type
            else:
                run_params[param_name]['type'] = self.get_type_from_env(
                    param_name)
        Logger.info(
            "Starting child pipeline run on a parent node with parameters: '{}'."
            .format(str(run_params)),
            task_name=self.TASK_NAME)
        try:
            run = self.api.launch_pipeline(self.pipe_id,
                                           self.version,
                                           run_params,
                                           parent_node_id=self.run_id,
                                           cmd=cmd,
                                           instance=instance_size,
                                           disk=instance_disk,
                                           docker=docker_image)
            self.child_id = run['id']
        except Exception as e:
            Logger.warn(
                "Failed to launch sample processing with parameters: '{}'. Error: '{}'."
                .format(str(run_params), e.message),
                task_name=self.TASK_NAME)
            self.child_id = None
Example #14
 def warn(message, crucial=False, *args, **kwargs):
     logging.warning(message, *args, **kwargs)
     if not Logger.cmd and (crucial or Logger.verbose):
         CloudPipelineLogger.warn(message, task_name=Logger.task)
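
The warn wrapper above fans one message out to two sinks: the local logging module always, and the Cloud Pipeline log only when the message is crucial or verbose mode is on (and command-line mode is off). The same shape as a self-contained class, with the remote sink left abstract rather than tied to CloudPipelineLogger:

import logging

class DualLogger(object):
    def __init__(self, remote, verbose=False, cmd=False):
        self.remote = remote    # any object with warn(message, task_name=...)
        self.verbose = verbose
        self.cmd = cmd

    def warn(self, message, crucial=False, task_name=None):
        logging.warning(message)                 # always log locally
        if not self.cmd and (crucial or self.verbose):
            self.remote.warn(message, task_name=task_name)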