예제 #1
0
    def prepare_job_start_events(self):
        '''
        Register a start event for every job in self.job_list.

        Jobs are grouped by their 'submit_time': each entry of
        self.job_events is a dict with the keys 'time', 'start_jobs' and
        'end_jobs'. Only 'start_jobs' is filled here; 'end_jobs' is created
        empty (per the original note, end events are added once the jobs
        are starting). Every processed job gets its 'status' set to 'EVENT'.
        Finally the events are sorted by 'time' and printed.
        '''
        for job in self.job_list:
            submit_t = job['submit_time']

            # presumably returns the event dict whose 'time' equals submit_t,
            # or None when there is none -- confirm against util
            event = util.search_dict_list(self.job_events, 'time', submit_t)
            if event is None:
                # not found, add the time into job_events
                event = {'time': submit_t, 'start_jobs': [], 'end_jobs': []}
                self.job_events.append(event)
            event['start_jobs'].append(job)

            job['status'] = 'EVENT'  # job has been in EVENT status

        # sort events based on their time
        self.job_events.sort(key=lambda e: e['time'])
        util.print_fn('Init, add job start events')
        self.print_job_events()
예제 #2
0
def get_model(model_name):
    '''
    Look up model tensor information by model_name.

    Returns a dict with the keys 'name', 'ind', 'tensors' and 'mem_util',
    read from the module-level tables m_names, m_tensors and m_mem.
    An unrecognised model_name falls back to index 8 and logs the pick.
    '''
    # position in this tuple is the index into the m_* tables
    known_models = (
        'vgg19',
        'vgg16',
        'vgg11',
        'alexnet',
        'resnet152',
        'resnet101',
        'resnet50',
        'inception4',
        'inception3',
    )

    if model_name in known_models:
        m_idx = known_models.index(model_name)
    else:
        # m_idx = random.randint(0,8)
        m_idx = 8
        util.print_fn('No model match, pick %s' % m_names[m_idx])

    return {
        'name': m_names[m_idx],
        'ind': m_idx,
        'tensors': m_tensors[m_idx],
        'mem_util': m_mem[m_idx]
    }
예제 #3
0
    def get_network_load(self, job_dict):
        '''
        Fill job_dict with per-worker and per-ps network load lists.

        Requires the keys 'num_gpu' and 'model' in job_dict; if either is
        missing a message is printed and nothing is changed.
        With exactly one GPU: 'ps_network' is empty, 'w_network' is [0]
        and 'ps_ave' is 0.
        Otherwise 'w_network' holds the model's 'total_size' once per GPU
        and 'ps_network' holds num_gpu sums: tensor i is added to slot
        i % num_gpu, and each sum is rounded to one decimal.
        (presumably "w" = worker and "ps" = parameter server -- confirm)
        '''
        for required_key, complaint in (('num_gpu', 'No gpu information'),
                                        ('model', 'No model information')):
            if required_key not in job_dict:
                util.print_fn(complaint)
                return

        num_w = job_dict['num_gpu']
        num_ps = num_w

        if num_w == 1:
            job_dict['ps_network'] = []
            job_dict['w_network'] = [0]
            job_dict['ps_ave'] = 0
            return

        model = job_dict['model']
        job_dict['w_network'] = [model['total_size']] * num_w

        # the list is stored first and then filled in place
        ps_load = [0] * num_ps
        job_dict['ps_network'] = ps_load
        for idx, tensor_size in enumerate(model['tensors']):
            ps_load[int(idx % num_ps)] += tensor_size

        for idx, load in enumerate(ps_load):
            ps_load[idx] = round(load, 1)
        '''
예제 #4
0
 def get_job_model(self, job_dict):
     '''
     Store the scaled model description under job_dict['model'].

     Needs both 'model_name' and 'model_scale' in job_dict; when either
     is missing, only a message is printed and job_dict is left as is.
     '''
     if 'model_name' not in job_dict or 'model_scale' not in job_dict:
         util.print_fn('Not enough model information to get the details')
         return

     name = job_dict['model_name']
     scale = job_dict['model_scale']
     job_dict['model'] = models.get_model_with_scale(name, scale)
예제 #5
0
 def __init__(self, id, num_node=0, num_gpu_p_node=0, num_cpu_p_node=0, mem_p_node=0):
     '''
     Store the identifier and the node/per-node resource figures as given,
     start with an empty node_list, and print a one-line summary.
     (the summary text calls this object a "Switch")
     '''
     self.id = id
     self.node_list = []

     # sizing figures are stored unchanged
     self.num_node = num_node
     self.num_gpu_p_node = num_gpu_p_node
     self.num_cpu_p_node = num_cpu_p_node
     self.mem_p_node = mem_p_node

     summary = (id, num_node)
     util.print_fn('  Switch[%d] has %d nodes' % summary)
예제 #6
0
    def checkpoint_multi_dlas_gpu(self, event_time):
        '''
        Append one snapshot row to self.log_list and dump the logs.

        Only valid when FLAGS.schedule is 'multi-dlas-gpu'; otherwise an
        error is printed and the process exits.

        Row layout, in order:
        time, idle_node, busy_node, full_node, idle_gpu, busy_gpu,
        pending_job, running_job, completed_job

        idle_gpu is the sum of free_gpu over all groups in JOBS.gpu_job;
        busy_node is busy_gpu / gpus-per-node rounded up, and full_node is
        set to the same value.
        '''
        if FLAGS.schedule != 'multi-dlas-gpu':
            util.print_fn("Error, not multi-dlas-gpu in checkpoint")
            exit()

        # GPU occupancy
        idle_gpu = sum(gjob.free_gpu for gjob in JOBS.gpu_job.values())
        busy_gpu = CLUSTER.num_gpu - idle_gpu

        # node occupancy derived from the GPU numbers
        busy_node = int(math.ceil(busy_gpu / CLUSTER.num_gpu_p_node))
        full_node = busy_node
        idle_node = int(CLUSTER.num_node - busy_node)

        # job counts per status; other statuses are ignored
        tally = {'RUNNING': 0, 'PENDING': 0, 'END': 0}
        for job in JOBS.job_list:
            state = job['status']
            if state in tally:
                tally[state] += 1

        # add log
        snapshot = [
            event_time,
            idle_node,
            busy_node,
            full_node,
            int(idle_gpu),
            int(busy_gpu),
            tally['PENDING'],
            tally['RUNNING'],
            tally['END'],
        ]
        self.log_list.append(snapshot)
        if self.log_list:
            self.dump_all_logs()
예제 #7
0
    def end_job(self, e_job):
        '''
        Retire e_job from its GPU group (multi-dlas-gpu schedule only).

        The group is looked up in self.gpu_job by e_job['num_gpu']. One
        job's GPUs are released, the job is removed from the group's
        runnable_jobs and from the queue e_job['q_id'], and the group's
        end_job counter is incremented. With any other schedule a message
        is printed and the process exits.
        '''
        if FLAGS.schedule != 'multi-dlas-gpu':
            util.print_fn("Not multi-dlas-gpu")
            exit()

        group = self.gpu_job[e_job['num_gpu']]
        group.release_job_gpu(1)
        group.runnable_jobs.remove(e_job)
        # group.running_jobs.remove(e_job)
        job_queue = group.queues[e_job['q_id']]
        job_queue.remove(e_job)
        group.end_job += 1
예제 #8
0
    def sort_all_jobs(self, mode=None):
        '''
        Sort self.job_list in place by each job's 'submit_time'.

        The mode argument is accepted but not used. When the schedule is
        'multi-dlas-gpu' and the scheme is 'count', the total number of
        jobs of every GPU group is printed as well.
        '''
        def by_submit_time(job):
            return job['submit_time']

        self.job_list.sort(key=by_submit_time)
        util.print_fn('   Jobs are sorted with their start time')

        if FLAGS.schedule != 'multi-dlas-gpu' or FLAGS.scheme != 'count':
            return

        for num_gpu, gjob in self.gpu_job.items():
            group_summary = (num_gpu, gjob.total_job)
            util.print_fn('%d-GPU jobs have %d ' % group_summary)
예제 #9
0
    def __init__(self, id, num_gpu=0, num_cpu=0, mem=0):
        '''
        Record the identifier and capacities, mark every resource as fully
        free, zero the network and job counters, and print a summary line.
        (the summary text calls this object a "Node")
        '''
        self.id = id

        # capacity and currently free amount start out equal
        self.num_cpu = self.free_cpus = num_cpu
        self.num_gpu = self.free_gpus = num_gpu
        self.mem = self.free_mem = mem

        # network load: can be bw, or the amount of traffic;
        # in and out should be the same
        self.network_in = self.network_out = 0

        # node bookkeeping for gandiva
        self.job_gpu = self.num_jobs = 0

        summary = (id, num_gpu, num_cpu, mem)
        util.print_fn(
            '    Node[%d] has %d gpus, %d cpus, %d G memory' % summary)
예제 #10
0
    def print_all_job_size_info(self):
        '''
        Dump per-job network size figures into CSV files.

        Five files are (re)created in the current working directory, each
        starting with a one-column header row:
        ps_max_ave.csv, ps_max99_ave.csv, w.csv, ps.csv, ps_w.csv

        For every job in self.job_list whose 'ps_ave' is not 0, one row is
        written to the first three files: job['ps_max_ave'],
        job['ps_max99_ave'] and job['w_network'][0] respectively.
        ps.csv and ps_w.csv currently receive only their header.

        All files are opened in a single with-statement so they are closed
        even if a job record is missing one of the expected keys.
        '''
        with open('ps_max_ave.csv', 'w+') as ps_max_ave_fd, \
                open('ps_max99_ave.csv', 'w+') as ps_max99_ave_fd, \
                open('w.csv', 'w+') as w_fd, \
                open('ps.csv', 'w+') as ps_fd, \
                open('ps_w.csv', 'w+') as ps_w_fd:
            ps_max_ave_writer = csv.writer(ps_max_ave_fd)
            ps_max_ave_writer.writerow(['ps_max_ave'])

            ps_max99_ave_writer = csv.writer(ps_max99_ave_fd)
            ps_max99_ave_writer.writerow(['ps_max99_ave'])

            w_writer = csv.writer(w_fd)
            w_writer.writerow(['w'])

            # header-only files: no per-job rows are written to these
            csv.writer(ps_fd).writerow(['ps'])
            csv.writer(ps_w_fd).writerow(['ps_w'])

            util.print_fn("Start to dump job information")
            for job in self.job_list:
                if job['ps_ave'] == 0:
                    continue
                ps_max_ave_writer.writerow([job['ps_max_ave']])
                ps_max99_ave_writer.writerow([job['ps_max99_ave']])
                w_writer.writerow([job['w_network'][0]])
예제 #11
0
    def print_job_events(self):
        '''
        Print one line per entry of self.job_events showing its time and
        the number of start and end jobs, followed by a blank separator.
        '''
        line_fmt = '      event.time[%d], with %d start_jobs, and %d end_jobs'

        util.print_fn('    Print all job events ')
        for event in self.job_events:
            when = event['time']
            n_start = len(event['start_jobs'])
            n_end = len(event['end_jobs'])
            util.print_fn(line_fmt % (when, n_start, n_end))

        util.print_fn(' ')
예제 #12
0
    def init_log(self):
        '''
        Create the log directory and the CSV log files with their headers.

        self.log_path is taken from FLAGS.log_path with one trailing '/'
        removed. The directory is created if it does not exist yet, and
        these files are (re)created inside it, each holding only a header:

        cluster.csv  -- cluster state; extra 'len_g*' columns and 'fra_gpu'
                        instead of 'idle_gpu' when the scheme is 'gandiva'
        cpu.csv, gpu.csv, network.csv, memory.csv
                     -- only when the scheme is not 'count'; the first
                        three get one column (two for network) per node
        job.csv      -- per-GPU-size columns for the 'gpu-demands'
                        schedule, otherwise per-job columns (with an extra
                        'resume' column for the 'count' scheme)

        The paths are stored on self as log_file, log_job and, when
        created, log_cpu, log_gpu, log_network and log_mem.

        Raises OSError if the directory cannot be created.
        '''
        import os  # local import: only needed to create the log directory

        def _write_header(path, header):
            # (re)create the file at path holding a single header row
            with open(path, 'w+') as fd:
                csv.writer(fd).writerow(header)

        self.log_path = FLAGS.log_path
        # endswith() also copes with an empty path, unlike indexing [-1]
        if self.log_path.endswith('/'):
            self.log_path = self.log_path[:-1]
        util.print_fn(self.log_path)
        util.print_fn(' ')

        # prepare folder without going through a shell, so paths containing
        # spaces or shell metacharacters are handled literally
        # (try/except instead of exist_ok keeps python 2.7 & 3 working)
        try:
            os.makedirs(self.log_path)
        except OSError:
            if not os.path.isdir(self.log_path):
                raise

        scheme = FLAGS.scheme
        per_node_logs = scheme != 'count'

        self.log_file = self.log_path + '/cluster.csv'
        self.log_job = self.log_path + '/job.csv'
        if per_node_logs:
            self.log_cpu = self.log_path + '/cpu.csv'
            self.log_gpu = self.log_path + '/gpu.csv'
            self.log_network = self.log_path + '/network.csv'
            self.log_mem = self.log_path + '/memory.csv'

        # cluster log
        if scheme == 'gandiva':
            cluster_header = [
                'time', 'idle_node', 'busy_node', 'full_node', 'fra_gpu',
                'busy_gpu', 'pending_job', 'running_job', 'completed_job',
                'len_g1', 'len_g2', 'len_g4', 'len_g8', 'len_g16', 'len_g32',
                'len_g64'
            ]
        else:
            cluster_header = [
                'time', 'idle_node', 'busy_node', 'full_node', 'idle_gpu',
                'busy_gpu', 'pending_job', 'running_job', 'completed_job'
            ]
        _write_header(self.log_file, cluster_header)

        # per-node resource logs
        if per_node_logs:
            node_ids = [str(i) for i in range(CLUSTER.num_node)]

            _write_header(self.log_cpu,
                          ['time'] + ['cpu' + n for n in node_ids])
            _write_header(self.log_gpu,
                          ['time'] + ['gpu' + n for n in node_ids])

            network_header = ['time']
            for n in node_ids:
                network_header.append('in' + n)
                network_header.append('out' + n)
            _write_header(self.log_network, network_header)

            _write_header(self.log_mem,
                          ['time', 'max', '99th', '95th', 'med'])

        # job log
        if FLAGS.schedule == 'gpu-demands':
            job_header = [
                'time', '1-GPU', '2-GPU', '4-GPU', '8-GPU', '12-GPU', '16-GPU',
                '24-GPU', '32-GPU'
            ]
        elif scheme == 'count':
            job_header = [
                'time', 'job_id', 'num_gpu', 'submit_time', 'start_time',
                'end_time', 'executed_time', 'JCT', 'duration',
                'pending_time', 'preempt', 'resume', 'promote'
            ]
        else:
            job_header = [
                'time', 'job_id', 'num_gpu', 'submit_time', 'start_time',
                'end_time', 'executed_time', 'JCT', 'duration',
                'pending_time', 'preempt', 'promote'
            ]
        _write_header(self.log_job, job_header)
예제 #13
0
 def completion_check(self):
     '''
     Print a warning for every group in self.gpu_job whose end_job count
     differs from its total_job count.
     '''
     warning = ('!!!! Miss-match %d completed jobs with %d total jobs'
                ' in %d-GPU jobs')
     for num_gpu, gjob in self.gpu_job.items():
         if gjob.end_job == gjob.total_job:
             continue
         util.print_fn(warning % (gjob.end_job, gjob.total_job, num_gpu))
예제 #14
0
    def reserve_gpus(self, total_num):
        '''
        Split total_num GPUs among the job groups in self.gpu_job.

        self.gpu_job maps a per-job GPU count (num_gpu) to a group object;
        this method reads the group's total_gpu and free_gpu, asks it for
        get_gpu_demands(), and finally hands the result to
        get_gpu_reservation().

        Steps:
        1. Snapshot every group (used, demanded, current and free GPUs).
           Return without reserving anything when total demand is 0.
        2. Demand-based pass, groups in ascending demand order: each group
           with non-zero demand gets a share of total_num proportional to
           its demand (ratio rounded to 2 decimals), rounded down to a
           multiple of its num_gpu and capped at its demand. GPUs already
           in use are always kept; only the part above that is taken from
           the free pool, limited to what is still free.
        3. Leftover pass, groups in ascending num_gpu order: remaining free
           GPUs are handed out num_gpu at a time, round after round, to
           groups whose demand exceeds their reservation, until nothing is
           left or no group can take more.
        4. Apply each reservation and print a one-line summary per group;
           exit the process if any group ends up with negative free_gpu.
        '''
        num_group = len(self.gpu_job)
        # NOTE(review): ave_gpu is not referenced again in this method; the
        # division raises ZeroDivisionError when self.gpu_job is empty
        ave_gpu = math.floor(total_num / num_group)

        # one working record per group
        job_list = list()
        for num_gpu, gjob in self.gpu_job.items():
            tmp_dict = dict()
            tmp_dict['num_gpu'] = num_gpu
            tmp_dict['used_gpu'] = gjob.total_gpu - gjob.free_gpu
            tmp_dict['demands'] = gjob.get_gpu_demands()
            tmp_dict['cur_gpu'] = gjob.total_gpu
            tmp_dict['cur_free_gpu'] = gjob.free_gpu
            tmp_dict['reserve'] = 0
            job_list.append(tmp_dict)

        # GPUs not in use by any group, and the demand summed over groups
        total_free_gpu = total_num - sum(k['used_gpu'] for k in job_list)
        total_demands = sum(k['demands'] for k in job_list)
        # print('total_free %d, total_demands %d' % (total_free_gpu, total_demands))
        if total_demands == 0:
            # nothing is demanded: reservations are left untouched
            return
        '''demand-based, keep current used_gpu'''
        remain_free_gpu = total_free_gpu
        # smallest demand first
        job_list.sort(key=lambda e: e.__getitem__('demands'))
        for job_dict in job_list:
            if job_dict['demands'] == 0:
                # reserve stays 0 for groups without demand
                continue

            # proportional share of total_num, in whole jobs of num_gpu GPUs
            ratio = round((job_dict['demands'] * 1.0) / total_demands, 2)
            cal_gpu = int(
                math.floor((ratio * total_num) / job_dict['num_gpu']) *
                job_dict['num_gpu'])
            # never target more than the group demands
            cal_gpu = job_dict[
                'demands'] if job_dict['demands'] <= cal_gpu else cal_gpu
            # GPUs needed on top of those already in use
            extra_gpu = cal_gpu - job_dict['used_gpu']
            if extra_gpu <= 0:
                extra_gpu = 0
            elif extra_gpu > remain_free_gpu:
                # not enough free GPUs: take as many whole jobs as still fit
                extra_gpu = int(
                    math.floor(remain_free_gpu / job_dict['num_gpu']) *
                    job_dict['num_gpu'])

            # print('%d-GPU, u%d, cal_gpu %d, extra_g %d' %(job_dict['num_gpu'], job_dict['used_gpu'], cal_gpu, extra_gpu))
            job_dict['reserve'] = job_dict['used_gpu'] + extra_gpu
            remain_free_gpu -= extra_gpu
            # if remain_free_gpu <= 0:
            #     break
        ''' still remaining, give to the right job group'''
        # smallest per-job GPU count first
        job_list.sort(key=lambda e: e.__getitem__('num_gpu'))
        # groups that could not take a GPU in the latest round
        num_full = 0
        while remain_free_gpu > 0:
            # if all are satisfied
            if num_full >= len(job_list):
                break
            else:
                # start counting afresh for this round
                num_full = 0

            for job_dict in job_list:
                if job_dict['demands'] <= job_dict['reserve']:
                    # demand already covered
                    num_full += 1
                    continue
                if remain_free_gpu >= job_dict['num_gpu']:
                    # grant room for one more job of this group
                    remain_free_gpu -= job_dict['num_gpu']
                    job_dict['reserve'] += job_dict['num_gpu']
                else:
                    # too few GPUs left for a job of this size
                    num_full += 1

                if remain_free_gpu <= 0:
                    break

        #execute reservation
        for job_dict in job_list:
            num_gpu = job_dict['num_gpu']
            self.gpu_job[num_gpu].get_gpu_reservation(job_dict['reserve'])
            # summary fields: T = cur_gpu, F = cur_free_gpu, U = used_gpu,
            # N = demands, R = reserve
            print("%d-j, T%d, F%d, U%d, N%d, R%d; " %
                  (job_dict['num_gpu'], job_dict['cur_gpu'],
                   job_dict['cur_free_gpu'], job_dict['used_gpu'],
                   job_dict['demands'], job_dict['reserve']),
                  end=' ')

        # sanity check after the reservations were applied
        for num_gpu, gjob in self.gpu_job.items():
            if gjob.free_gpu < 0:
                print("Error free gpu, %d" % num_gpu)
                exit()

        # logs this method's own name
        util.print_fn(' %s is done' % sys._getframe().f_code.co_name)
예제 #15
0
 def release_job_gpu(self, num_job=1):
     '''
     Give back the GPUs of num_job jobs: self.free_gpu grows by
     int(self.num_gpu * num_job). A negative num_job prints an error
     and exits the process.
     '''
     if num_job < 0:
         util.print_fn("Error: num_job < 0")
         exit()

     released = int(self.num_gpu * num_job)
     self.free_gpu += released