Example #1
0
def login(username, password):
    global token
    ret = post(
            '/account/login',
            {"username": username, "password": password}
            )
    dlog.debug(f"debug: login ret:{ret}")
    token = ret['token']
Example #2
0
    def get_job_state(self):
        """get the jobs. Usually, this method will query the database of slurm or pbs job scheduler system and get the results.

        Notes
        -----
        This method will not submit or resubmit the job if it is unsubmitted.
        """
        dlog.debug(f"debug:query database; self.job_hash:{self.job_hash}; self.job_id:{self.job_id}")
        job_state = self.machine.check_status(self)
        self.job_state = job_state
Example #3
0
    def get_submission_state(self):
        """check whether all the jobs in the submission.

        Notes
        -----
        This method will not handle unexpected job states in the submission (for example, it will not resubmit terminated jobs).
        """
        for job in self.belonging_jobs:
            job.get_job_state()
            dlog.debug(f"debug:get_submission_state: job: {job.job_hash}, {job.job_id}, {repr(job.job_state)}")
Example #4
0
 def sub_script(self, job_dirs, cmd, args, res, outlog, errlog):
     if args is None:
         args = []
     multi_command = ""
     for job_dir in job_dirs:
         for idx, t in enumerate(zip_longest(cmd, args, fillvalue='')):
             c_str = f"cd {self.context.remote_root}/{job_dir} && ( test -f tag_{idx}_finished || ( ({t[0]} {t[1]} && touch tag_{idx}_finished  2>>{errlog} || exit 52 ) | tee -a {outlog}) ) || exit 51;"
             multi_command += c_str
     multi_command += "exit 0;"
     dlog.debug("10000, %s" % multi_command)
     return multi_command
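For illustration only, a call like the one below would produce one chained shell fragment per command, each guarded by a tag_<idx>_finished file so that already-completed steps are skipped on restart. The directory, command, and file names are made up, and batch stands in for an instance of the class defining sub_script:

# Hypothetical call; all values are placeholders.
script = batch.sub_script(
    job_dirs=["task_000"],
    cmd=["lmp_serial"],
    args=["-i in.lmp"],
    res=None,
    outlog="log",
    errlog="err",
)
# script is roughly:
# cd <remote_root>/task_000 && ( test -f tag_0_finished || ( (lmp_serial -i in.lmp
#   && touch tag_0_finished  2>>err || exit 52 ) | tee -a log) ) || exit 51;exit 0;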
Example #5
0
def _get_oss_bucket(endpoint, bucket_name):
    #  res = get("/tools/sts_token", {})
    res = get("/data/get_sts_token", {})
    # print('debug>>>>>>>>>>>>>', res)
    dlog.debug(f"debug: _get_oss_bucket: res:{res}")
    auth = oss2.StsAuth(
                res['AccessKeyId'],
                res['AccessKeySecret'],
                res['SecurityToken']
                )
    return oss2.Bucket(auth, endpoint, bucket_name)
Example #6
0
 def job_id(self, values):
     response, jobQueue = values
     self._job_id = response['jobId']
     self._job_name = response['jobName']
     self.__class__._jobQueue = jobQueue
     self.__class__._job_id_map_status[
         self._job_id] = self.map_aws_status_to_dpdisp_status(
             response.get('status', 'SUBMITTED'))
     self.context.write_file(self.job_id_name, self._job_id)
     dlog.debug(
         "15000, _job_id:%s, _job_name:%s, _map:%s, _Queue:%s" %
         (self._job_id, self._job_name, self.__class__._job_id_map_status,
          self.__class__._jobQueue))
Example #7
0
 def check_status(self, job):
     if job.job_id == '':
         return JobStatus.unsubmitted
     dlog.debug(
         f"debug: check_status; job.job_id:{job.job_id}; job.job_hash:{job.job_hash}"
     )
     try:
         dp_job_status = api.get_tasks(job.job_id)[0]["status"]
     except IndexError as e:
         raise RuntimeError(
             f"cannot find job information in dpcloudserver's database for job {job.job_id}"
         ) from e
     job_state = self.map_dp_job_state(dp_job_status)
     return job_state
Example #8
0
    def __init__(self, local_root, work_profile, job_uuid=None):
        """
        work_profile:
        local_root:
        """
        assert (type(local_root) == str)
        self.temp_local_root = os.path.abspath(local_root)
        self.temp_remote_root = os.path.abspath(work_profile.get_work_root())
        self.work_profile = work_profile
        self.job_uuid = job_uuid
        self.submission = None
        # if job_uuid:
        #    self.job_uuid = job_uuid
        # else:
        #    self.job_uuid = str(uuid.uuid4())

        # self.remote_root = os.path.join(work_profile.get_work_root(), self.job_uuid)
        dlog.debug("local_root is %s" % local_root)
Example #9
0
    def __init__(self, local_root, work_profile, job_uuid=None):
        """
        work_profile:
        local_root:
        """
        assert (type(local_root) == str)
        self.local_root = os.path.abspath(local_root)
        if job_uuid:
            self.job_uuid = job_uuid
        else:
            self.job_uuid = str(uuid.uuid4())

        self.remote_root = os.path.join(work_profile.get_work_root(),
                                        self.job_uuid)
        dlog.debug("local_root is %s" % local_root)
        dlog.debug("remote_root is %s" % self.remote_root)

        os.makedirs(self.remote_root, exist_ok=True)
Example #10
0
def upload(oss_task_zip, zip_task_file, endpoint, bucket_name):
    dlog.debug(f"debug: upload: oss_task_zip:{oss_task_zip}; zip_task_file:{zip_task_file}")
    bucket = _get_oss_bucket(endpoint, bucket_name)
    total_size = os.path.getsize(zip_task_file)
    part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
    upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
    parts = []
    with open(zip_task_file, 'rb') as fileobj:
        part_number = 1
        offset = 0
        while offset < total_size:
            num_to_upload = min(part_size, total_size - offset)
            result = bucket.upload_part(oss_task_zip, upload_id, part_number, SizedFileAdapter(fileobj, num_to_upload))
            parts.append(PartInfo(part_number, result.etag))
            offset += num_to_upload
            part_number += 1
    # result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
    result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
    # print('debug:upload_result:', result, dir())
    return result
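A hedged usage sketch of the multipart upload above; the endpoint, bucket name, and paths are placeholders rather than real resources:

# Hypothetical values for illustration only.
result = upload(
    oss_task_zip="dpcloudserver/tasks/abc123.zip",    # object key in the bucket
    zip_task_file="/tmp/abc123.zip",                   # local file to upload
    endpoint="https://oss-cn-hangzhou.aliyuncs.com",   # assumed OSS endpoint
    bucket_name="example-bucket",
)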
Example #11
0
 def job_id(self):
     try:
         self._job_id
     except AttributeError:
         if self.context.check_file_exists(self.job_id_name):
             self._job_id = self.context.read_file(self.job_id_name)
             response_list = batch_client.describe_jobs(
                 jobs=[self._job_id]).get('jobs')
             try:
                 response = response_list[0]
                 jobQueue = response['jobQueue']
             except IndexError:
                 pass
             else:
                 self.job_id = (response, jobQueue)
                 return self._job_id
         dlog.debug("50000, self._job_id:%s,_Queue:%s,_map:%s," %
                    (self._job_id, self.__class__._jobQueue,
                     self.__class__._job_id_map_status))
         return ""
     return self._job_id
Example #12
0
    def AWS_check_status(cls, job_id=""):
        """
        to aviod query jobStatus too often, set a time interval
        query_dict example:
        {job_id: JobStatus}

        {'40fb24b2-d0ca-4443-8e3a-c0906ea03622': <JobStatus.running: 3>,
         '41bda50c-0a23-4372-806c-87d16a680d85': <JobStatus.waiting: 2>}
           
        """
        query_dict = {}
        if datetime.now().timestamp() > cls._query_next_allow_time:
            cls._query_next_allow_time = datetime.now().timestamp(
            ) + cls._query_time_interval
            for status in [
                    'SUBMITTED', 'PENDING', 'RUNNABLE', 'STARTING', 'RUNNING',
                    'SUCCEEDED', 'FAILED'
            ]:
                nextToken = ''
                while nextToken is not None:
                    status_response = batch_client.list_jobs(
                        jobQueue=cls._jobQueue,
                        jobStatus=status,
                        maxResults=100,
                        nextToken=nextToken)
                    status_list = status_response.get('jobSummaryList')
                    nextToken = status_response.get('nextToken', None)
                    for job_dict in status_list:
                        cls._job_id_map_status.update({
                            job_dict['jobId']:
                            cls.map_aws_status_to_dpdisp_status(
                                job_dict['status'])
                        })
            dlog.debug('20000:_map: %s' % (cls._job_id_map_status))
        dlog.debug('62000:job_id:%s, _query: %s, _map: %s' %
                   (job_id, query_dict, cls._job_id_map_status))
        if job_id:
            return cls._job_id_map_status.get(job_id)

        return cls._job_id_map_status
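The timestamp guard above is a simple query-throttling pattern: results are cached, and the expensive list_jobs calls are only repeated after a cool-down interval. A self-contained sketch of the same idea (the class name and the 30-second interval are illustrative, not taken from the original code):

from datetime import datetime

class ThrottledStatusCache:
    """Serve cached job states unless the cool-down interval has elapsed."""
    _query_next_allow_time = 0.0
    _query_time_interval = 30  # seconds; illustrative value
    _cached = {}

    @classmethod
    def get_all(cls, fetch):
        # fetch is a callable returning {job_id: status}; it is only invoked
        # once the cool-down interval has passed.
        now = datetime.now().timestamp()
        if now > cls._query_next_allow_time:
            cls._query_next_allow_time = now + cls._query_time_interval
            cls._cached = fetch()
        return cls._cached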
Example #13
0
 def all_finished(self, job_handler, mark_failure, clean=True):
     task_chunks = job_handler['task_chunks']
     task_chunks_str = ['+'.join(ii) for ii in task_chunks]
     task_hashes = [
         sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str
     ]
     job_list = job_handler['job_list']
     job_record = job_handler['job_record']
     command = job_handler['command']
     tag_failure_list = [
         'tag_failure_%d' % ii for ii in range(len(command))
     ]
     resources = job_handler['resources']
     outlog = job_handler['outlog']
     errlog = job_handler['errlog']
     backward_task_files = job_handler['backward_task_files']
     dlog.debug('checking jobs')
     nchunks = len(task_chunks)
     for idx in range(nchunks):
         cur_hash = task_hashes[idx]
         rjob = job_list[idx]
         if not job_record.check_finished(cur_hash):
             # chunk not finished according to record
             status = rjob['batch'].check_status()
             job_uuid = rjob['context'].job_uuid
             dlog.debug('checked job %s' % job_uuid)
             if status == JobStatus.terminated:
                 job_record.increase_nfail(cur_hash)
                 if job_record.check_nfail(cur_hash) > 3:
                     raise RuntimeError(
                         'Job %s failed for more than 3 times' % job_uuid)
                 dlog.info('job %s terminated, submit again' % job_uuid)
                 dlog.debug('try %s times for %s' %
                            (job_record.check_nfail(cur_hash), job_uuid))
                 rjob['batch'].submit(task_chunks[idx],
                                      command,
                                      res=resources,
                                      outlog=outlog,
                                      errlog=errlog,
                                      restart=True)
             elif status == JobStatus.finished:
                 dlog.info('job %s finished' % job_uuid)
                 if mark_failure:
                     rjob['context'].download(task_chunks[idx],
                                              tag_failure_list,
                                              check_exists=True,
                                              mark_failure=False)
                     rjob['context'].download(task_chunks[idx],
                                              backward_task_files,
                                              check_exists=True)
                 else:
                     rjob['context'].download(task_chunks[idx],
                                              backward_task_files)
                 if clean:
                     rjob['context'].clean()
                 job_record.record_finish(cur_hash)
                 job_record.dump()
     job_record.dump()
     return job_record.check_all_finished()
Example #14
0
    def do_submit(self,
                  job_dirs,
                  cmd,
                  args=None,
                  res=None,
                  outlog='log',
                  errlog='err'):

        res = self.default_resources(res)
        dlog.debug("2000, params=(%s, %s, %s, %s, %s, %s, )" %
                   (job_dirs, cmd, args, res, outlog, errlog))
        dlog.debug(
            '2200, self.context.remote_root: %s , self.context.local_root: %s'
            % (self.context.remote_root, self.context.local_root))
        # concreate_command =
        script_str = self.sub_script(job_dirs,
                                     cmd,
                                     args=args,
                                     res=res,
                                     outlog=outlog,
                                     errlog=errlog)
        dlog.debug('2300, script_str: %s, self.sub_script_name: %s' %
                   (script_str, self.sub_script_name))
        """
        jobName example:
        home-ec2-user-Ag_init-run_gen-iter_000000-01_model_devi-task_000_000048
        """
        jobName = os.path.join(self.context.remote_root,
                               job_dirs.pop())[1:].replace('/', '-').replace(
                                   '.', '_')
        jobName += ("_" + str(self.context.job_uuid))
        response = batch_client.submit_job(
            jobName=jobName,
            jobQueue=res['jobQueue'],
            jobDefinition=res['jobDefinition'],
            parameters={'task_command': script_str},
            containerOverrides={
                'vcpus': res['cpu_num'],
                'memory': res['memory_size']
            })
        dlog.debug('4000, response:%s' % response)
        self.job_id = (response, res['jobQueue'])
Example #15
0
    def submit_jobs(self,
                    resources,
                    command,
                    work_path,
                    tasks,
                    group_size,
                    forward_common_files,
                    forward_task_files,
                    backward_task_files,
                    forward_task_deference=True,
                    outlog='log',
                    errlog='err'):
        self.backward_task_files = backward_task_files
        # task_chunks = [
        #     [os.path.basename(j) for j in tasks[i:i + group_size]] \
        #     for i in range(0, len(tasks), group_size)
        # ]
        task_chunks = _split_tasks(tasks, group_size)
        task_chunks_str = ['+'.join(ii) for ii in task_chunks]
        task_hashes = [
            sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str
        ]
        job_record = JobRecord(work_path, task_chunks, fname=self.jrname)
        job_record.dump()
        nchunks = len(task_chunks)

        job_list = []
        for ii in range(nchunks):
            cur_chunk = task_chunks[ii]
            cur_hash = task_hashes[ii]
            if not job_record.check_finished(cur_hash):
                # chunk is not finished
                # check if chunk is submitted
                submitted = job_record.check_submitted(cur_hash)
                if not submitted:
                    job_uuid = None
                else:
                    job_uuid = job_record.get_uuid(cur_hash)
                    dlog.debug("load uuid %s for chunk %s" %
                               (job_uuid, cur_hash))
                # communication context, batch system
                context = self.context(work_path, self.session, job_uuid)
                batch = self.batch(context, uuid_names=self.uuid_names)
                rjob = {'context': context, 'batch': batch}
                # upload files
                if not rjob['context'].check_file_exists(
                        rjob['batch'].upload_tag_name):
                    rjob['context'].upload('.', forward_common_files)
                    rjob['context'].upload(cur_chunk,
                                           forward_task_files,
                                           dereference=forward_task_deference)

                    rjob['context'].write_file(rjob['batch'].upload_tag_name,
                                               '')
                    dlog.debug('uploaded files for %s' % task_chunks_str[ii])
                # submit new or recover old submission
                if not submitted:
                    rjob['batch'].submit(cur_chunk,
                                         command,
                                         res=resources,
                                         outlog=outlog,
                                         errlog=errlog)
                    job_uuid = rjob['context'].job_uuid
                    dlog.debug('assigned uuid %s for %s ' %
                               (job_uuid, task_chunks_str[ii]))
                    dlog.info('new submission of %s for chunk %s' %
                              (job_uuid, cur_hash))
                else:
                    rjob['batch'].submit(cur_chunk,
                                         command,
                                         res=resources,
                                         outlog=outlog,
                                         errlog=errlog,
                                         restart=True)
                    dlog.info('restart from old submission %s for chunk %s' %
                              (job_uuid, cur_hash))
                # record job and its remote context
                job_list.append(rjob)
                ip = None
                instance_id = None
                if 'cloud_resources' in self.remote_profile:
                    ip = self.remote_profile['hostname']
                    instance_id = self.remote_profile['instance_id']
                job_record.record_remote_context(cur_hash, context.local_root,
                                                 context.remote_root, job_uuid,
                                                 ip, instance_id)
                job_record.dump()
            else:
                # finished job, append a None to list
                job_list.append(None)
        assert (len(job_list) == nchunks)
        job_handler = {
            'task_chunks': task_chunks,
            'job_list': job_list,
            'job_record': job_record,
            'command': command,
            'resources': resources,
            'outlog': outlog,
            'errlog': errlog,
            'backward_task_files': backward_task_files
        }
        return job_handler
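Together with all_finished from Example #13, submit_jobs is typically used in a driver loop of the following shape (a hypothetical sketch: the dispatcher object and the 60-second poll interval are assumptions, not part of the original code):

import time

def run_until_done(dispatcher, resources, command, work_path, tasks, group_size,
                   forward_common_files, forward_task_files, backward_task_files):
    # Submit (or recover) all task chunks, then poll until every chunk finishes.
    job_handler = dispatcher.submit_jobs(
        resources, command, work_path, tasks, group_size,
        forward_common_files, forward_task_files, backward_task_files)
    while not dispatcher.all_finished(job_handler, mark_failure=False):
        time.sleep(60)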
Example #16
0
 def submit(self,
            job_dirs,
            cmd,
            args = None,
            res = None,
            restart = False,
            outlog = 'log',
            errlog = 'err'):
     if restart:
         dlog.debug('restart task')
         status = self.check_status()
         if status in [  JobStatus.unsubmitted, JobStatus.unknown, JobStatus.terminated ]:
             dlog.debug('task restart point !!!')
             self.do_submit(job_dirs, cmd, args, res, outlog=outlog, errlog=errlog)
         elif status==JobStatus.waiting:
             dlog.debug('task is waiting')
         elif status==JobStatus.running:
             dlog.debug('task is running')
         elif status==JobStatus.finished:
             dlog.debug('task is finished')
         else:
              raise RuntimeError('unknown job status, something must be wrong')
     else:
         dlog.debug('new task')
         self.do_submit(job_dirs, cmd, args, res, outlog=outlog, errlog=errlog)
     if res is None:
         sleep = 0
     else:
         sleep = res.get('submit_wait_time', 0)
      time.sleep(sleep)  # to prevent the tasks from crashing while being submitted
Example #17
0
def download(oss_file, save_file, endpoint, bucket_name):
    bucket = _get_oss_bucket(endpoint, bucket_name)
    dlog.debug(f"debug: download: oss_file:{oss_file}; save_file:{save_file}")
    bucket.get_object_to_file(oss_file, save_file)
    return save_file