Example #1
def parse_logs(args):
    """
    Parse log files and insert the results into the DB.
    :param args:
    :return:
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1': 'ONE_GPU',
        '4': 'FOUR_GPU',
        '8': 'MULTI_GPU',
        '8mp': 'MULTI_GPU_MULTI_PROCESS'
    }
    report_index_dict = {'speed': 1, 'mem': 2, 'maxbs': 6}
    html_results = []
    for job_file in file_list:
        cluster_job_id = uuid.uuid1()
        result = ""
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
            try:
                job_info = json.loads(file_lines[-1])
            except Exception as exc:
                print("file {} parse error".format(job_file))
                continue
            # save_job
            if str(job_info["gpu_num"]
                   ) == "8" and job_info["run_mode"] == "mp":
                run_machine_type = dict_run_machine_type['8mp']
            else:
                run_machine_type = dict_run_machine_type[str(
                    job_info["gpu_num"])]
            report_index = report_index_dict[job_info["index"]]
            pj = bm.Job()
            pj.job_name = "pb_{}_{}".format(args.paddle_version,
                                            job_info["model_name"])
            pj.cluster_job_id = cluster_job_id
            pj.cluster_type_id = "LocalJob"
            pj.model_name = job_info["model_name"]
            pj.report_index = report_index
            pj.code_branch = "master"
            pj.code_commit_id = args.code_commit_id
            pj.job_type = args.job_type
            pj.run_machine_type = run_machine_type
            pj.frame_id = 0
            pj.image_id = image_id
            pj.cuda_version = args.cuda_version
            pj.cudnn_version = args.cudnn_version
            pj.device_type = args.device_type
            pj.model_implement_type = args.implement_type
            pj.log_extracted = "yes"
            pj.save()
            job_id = pj.job_id

            log_server = socket.gethostname()
            # todo config the log_server port
            log_server = "http://" + log_server + ":8777/"
            log_file = job_info["log_file"].split("/")[-1]
            profiler_log = job_info["log_with_profiler"].split("/")[-1]
            profiler_path = job_info["profiler_path"].split("/")[-1]
            train_log_path = log_server + os.path.join(
                os.path.basename(args.log_path), "train_log", log_file)
            profiler_log_path = log_server + os.path.join(
                os.path.basename(args.log_path), "profiler_log", profiler_log)
            profiler_path = log_server + os.path.join(
                os.path.basename(args.log_path), "profiler_log", profiler_path)

            cpu_utilization_result = 0
            gpu_utilization_result = 0
            try:
                if report_index == 2:
                    for line in file_lines:
                        if "MAX_GPU_MEMORY_USE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break
                elif report_index == 1:
                    for line in file_lines:
                        if "FINAL_RESULT" in line:
                            result = line.strip().split("=")[1]
                        if 'AVG_CPU_USE' in line:
                            cpu_utilization_result = line.strip().split('=')[1]
                        if 'AVG_GPU_USE' in line:
                            gpu_utilization_result = line.strip().split('=')[1]
                else:
                    for line in file_lines:
                        if "MAX_BATCH_SIZE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break

                # save_result
                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = job_info["model_name"]
                pjr.report_index_id = report_index
                pjr.report_result = result
                pjr.train_log_path = 1
                pjr.save()

                # save log path
                pjrl = bm.JobResultsLog()
                pjrl.result_id = pjr.result_id
                cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
                if commands.getoutput(cmd) != '200':
                    pjrl.log_path = json.dumps(
                        {"train_log_path": train_log_path})
                else:
                    pjrl.log_path = json.dumps({
                        "train_log_path": train_log_path,
                        "profiler_log_path": profiler_log_path,
                        "profiler_path": profiler_path
                    })
                pjrl.save()
                # save cpu & gpu result
                if report_index == 1:
                    pjr_cpu = bm.JobResults()
                    pjr_cpu.job_id = job_id
                    pjr_cpu.model_name = job_info["model_name"]
                    pjr_cpu.report_index_id = 7
                    pjr_cpu.report_result = cpu_utilization_result
                    pjr_cpu.save()

                    pjr_gpu = bm.JobResults()
                    pjr_gpu.job_id = job_id
                    pjr_gpu.model_name = job_info["model_name"]
                    pjr_gpu.report_index_id = 8
                    pjr_gpu.report_result = gpu_utilization_result
                    pjr_gpu.save()

            except Exception as pfe:
                print(pfe)
            else:
                print(
                    "models: {}, run_machine_type: {}, index: {}, result: {}".
                    format(job_info["model_name"], run_machine_type,
                           report_index, result))

                # if the current result is empty or '-inf' (can happen for speed runs)
                if not result or result == '-inf':
                    result = 0

                value = check_results(job_info["model_name"], report_index,
                                      run_machine_type, result)

                if value:
                    current_html_result = [
                        job_info["model_name"], run_machine_type,
                        job_info["index"], value[0], result, value[1]
                    ]
                    html_results.append(current_html_result)

    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
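
The snippet above only reads attributes off the args namespace it is given. The sketch below is not part of the original code; it shows one plausible way to build such a namespace with argparse and hand it to parse_logs. The attribute names match what Example #1 actually accesses; the flag spellings and default values are assumptions.

# Hypothetical argument parser for Example #1: attribute names come from the code
# above, flag names and defaults are assumptions.
import argparse


def build_args():
    parser = argparse.ArgumentParser(
        description="parse benchmark log files and insert results into the DB")
    parser.add_argument("--log_path", required=True,
                        help="directory containing the 'index' sub-folder of job files")
    parser.add_argument("--paddle_version", default="0.0.0")
    parser.add_argument("--code_commit_id", default="")
    parser.add_argument("--job_type", default="benchmark")
    parser.add_argument("--cuda_version", default="10.1")
    parser.add_argument("--cudnn_version", default="7.6")
    parser.add_argument("--device_type", default="V100")
    parser.add_argument("--implement_type", default="static_graph")
    return parser.parse_args()


if __name__ == "__main__":
    parse_logs(build_args())
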
Example #2
def parse_logs(args):
    """
    Parse log files and insert the results into the DB.
    :param args:
    :return:
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    cv_models = ['DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50', 'yolov3']
    # nlp_models = ['bert', 'paddingrnn_large', 'paddingrnn_small', 'transformer']
    # rl_models = ['ddpg_deep_explore']
    multi_process_models = ['mask_rcnn', 'yolov3', 'transformer_base', 'transformer_big', 'bert', 'SE-ResNeXt50']
    html_results = []
    for file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = file.split('/')[-1]
        model_name = '_'.join(file_name.split('_')[:-2])
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = file_name.split('_')[-2]
        if task_index == 'speed':
            report_index = 1
        elif task_index == 'mem':
            report_index = 2
        else:
            report_index = 6

        run_machine_type = dict_run_machine_type[file_name.split('_')[-1]]
        run_mode = "mp" if file_name.split('_')[-1] == "8gpus8p" else "sp"
        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.gpu_type = args.gpu_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        #log_server = log_server_cuda9 if args.cuda_version == '9.0' else log_server_cuda10
        log_server = socket.gethostname()
        #todo config the log_server port
        log_server = "http://" + log_server + ":8777/"
        train_log_name = "{}_{}_{}_{}".format(model_name, "train",
                                               task_index,
                                               file_name.split('_')[-1][0])
        if model_name in multi_process_models:
            train_log_name += "_{}".format(run_mode)
        train_log_path = os.path.join(os.path.basename(args.log_path),
                                      "train_log", train_log_name)
        train_log_path = log_server + train_log_path

        job_id = get_job_id(cluster_job_id)

        result = ""
        with open(file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
            try:
                if report_index == 2:
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                elif report_index == 1:
                    lines = file_lines[-10:-1]
                    for line in lines:
                        if key_word in line:
                            result = line.split(':')[1].split(' ')[1]
                else:
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0

                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = model_name
                pjr.report_index_id = report_index
                pjr.report_result = result
                pjr.train_log_path = train_log_path
                pjr.save()
            except Exception as pfe:
                print(pfe)
            else:
                print("models: {}, run_machine_type: {}, index: {}, result: {}".format(
                    model_name, run_machine_type, task_index, result))

                # if the current result is empty or '-inf' (can happen for speed runs)
                if not result or result == '-inf':
                    result = 0

                value = check_results(model_name, report_index, run_machine_type, result)

                if value:
                    current_html_result = [model_name, run_machine_type,
                                           task_index, value[0], result, value[1]]
                    html_results.append(current_html_result)

    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
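
Example #2 derives everything from the index file name rather than from a JSON record. The illustration below uses hypothetical file names to replay the splitting convention the code above relies on: <model_name>_<task>_<gpu-key>, where the model name itself may contain underscores.

# Hypothetical file names; the splitting mirrors Example #2 above.
for file_name in ("CycleGAN_mem_1gpus", "ddpg_deep_explore_speed_8gpus8p"):
    model_name = '_'.join(file_name.split('_')[:-2])  # "CycleGAN", "ddpg_deep_explore"
    task_index = file_name.split('_')[-2]             # "mem", "speed"
    machine_key = file_name.split('_')[-1]            # "1gpus", "8gpus8p"
    run_mode = "mp" if machine_key == "8gpus8p" else "sp"
    print(model_name, task_index, machine_key, run_mode)
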
Example #3
def parse_logs(args):
    """
    Parse log files and insert the results into the DB.
    :param args:
    :return:
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    html_results = []
    for job_file in file_list:
        result = 0
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
            try:
                job_info = json.loads(file_lines[-1])
            except Exception as exc:
                print("file {} parse error".format(job_file))
                continue

            # check model if exist in db
            get_or_insert_model(job_info["model_name"], job_info["mission_name"], job_info["direction_id"])

            # save job
            if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
                run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
            else:
                run_machine_type = DICT_RUN_MACHINE_TYPE[str(job_info["gpu_num"])]
            job_id = insert_job(image_id, run_machine_type, job_info, args).job_id

            # parse job results
            cpu_utilization_result = 0
            gpu_utilization_result = 0
            unit = ''
            mem_result = 0
            try:
                if job_info["index"] == 1:
                    result = job_info['FINAL_RESULT']
                    unit = job_info['UNIT']
                    for line in file_lines:
                        if 'AVG_CPU_USE' in line:
                            cpu_utilization_result = line.strip().split('=')[1]
                        if 'AVG_GPU_USE' in line:
                            gpu_utilization_result = line.strip().split('=')[1]
                        # TODO: for dynamic graph, throughput and GPU memory usage are collected in one task;
                        # once static graph is switched to a single task as well, remove this check
                        # and the `elif job_info["index"] == 2:` branch
                        if "MAX_GPU_MEMORY_USE" in line and args.implement_type != 'static_graph':
                            value = line.strip().split("=")[1].strip()
                            mem_result = int(value) if str.isdigit(value) else 0
                            
                elif job_info["index"] == 2:
                    for line in file_lines:
                        if "MAX_GPU_MEMORY_USE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            unit = 'MiB'
                            break
                elif job_info["index"] == 3:
                    result = json.dumps(job_info['FINAL_RESULT'])
                else:
                    for line in file_lines:
                        if "MAX_BATCH_SIZE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break

                # save job results
                pjr = insert_results(job_id, job_info["model_name"], job_info["index"], result, unit, 1)
                log_file = job_info["log_file"].split("/")[-1]
                log_base = args.paddle_version + "/" + args.implement_type
                train_log_path = LOG_SERVER + os.path.join(log_base, "train_log", log_file)
                log_save_dict = {"train_log_path": train_log_path}
                if job_info["index"] == 1:
                    insert_results(job_id, job_info["model_name"], 7, cpu_utilization_result, '%')
                    insert_results(job_id, job_info["model_name"], 8, gpu_utilization_result, '%')
                    if int(job_info["gpu_num"]) == 1:
                        profiler_log = job_info["log_with_profiler"].split("/")[-1]
                        profiler_path = job_info["profiler_path"].split("/")[-1]
                        profiler_log_path = LOG_SERVER + os.path.join(log_base, "profiler_log", profiler_log)
                        profiler_path = LOG_SERVER + os.path.join(log_base, "profiler_log", profiler_path)
                        log_save_dict["profiler_log_path"] = profiler_log_path
                        log_save_dict["profiler_path"] = profiler_path

                pjrl = bm.JobResultsLog()
                pjrl.result_id = pjr.result_id
                pjrl.log_path = json.dumps(log_save_dict)
                pjrl.save()
                # TODO: for dynamic graph, throughput and GPU memory usage are collected in one task;
                # once static graph is switched to a single task as well, this check can be removed
                if args.implement_type != 'static_graph':
                    pjr = insert_results(job_id, job_info["model_name"], 2, mem_result, 'MiB', 0)

            except Exception as pfe:
                print(pfe)
            else:
                print("models: {}, run_machine_type: {}, index: {}, result: {}".format(
                    job_info["model_name"], run_machine_type, job_info["index"], result))

                if job_info["index"] != 3:
                    check_results(job_info["model_name"], job_info["index"],
                                        run_machine_type, result, html_results)
                    # TODO: for dynamic graph, throughput and GPU memory usage are collected in one task;
                    # once static graph is switched to a single task as well, this check can be removed
                    if args.implement_type != 'static_graph':
                        check_results(job_info["model_name"], 2, run_machine_type,
                                      mem_result, html_results)
                else:
                    check_results(job_info["model_name"], job_info["index"], run_machine_type,
                                    result, html_results, "Framework_Total")
                    check_results(job_info["model_name"], job_info["index"],
                                    run_machine_type, result, html_results, "GpuMemcpy_Total")

    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
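
Examples #3 and #4 assume the last line of every job file is a JSON record. The sketch below reconstructs that record only from the keys the code above actually reads; every value in it is a made-up placeholder.

import json

# Key names come from Examples #3/#4; all values are fabricated for illustration.
sample_index_line = json.dumps({
    "model_name": "yolov3",
    "mission_name": "detection",
    "direction_id": 0,
    "gpu_num": 8,
    "run_mode": "mp",
    "index": 1,
    "FINAL_RESULT": 12.3,
    "UNIT": "images/s",
    "log_file": "train_log/yolov3_speed_8_mp",
    "log_with_profiler": "profiler_log/yolov3_speed_profiler",
    "profiler_path": "profiler_log/yolov3_profiler.json",
})

job_info = json.loads(sample_index_line)  # what json.loads(file_lines[-1]) yields
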
Example #4
def parse_logs(args):
    """
    Parse log files and insert the results into the DB.
    :param args:
    :return:
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    html_results = []
    for job_file in file_list:
        result = 0
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
            try:
                job_info = json.loads(file_lines[-1])
            except Exception as exc:
                print("file {} parse error".format(job_file))
                continue

            # save job
            if str(job_info["gpu_num"]
                   ) == "8" and job_info["run_mode"] == "mp":
                run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
            else:
                run_machine_type = DICT_RUN_MACHINE_TYPE[str(
                    job_info["gpu_num"])]
            job_id = insert_job(image_id, run_machine_type, job_info,
                                args).job_id

            # parse job results
            cpu_utilization_result = 0
            gpu_utilization_result = 0
            try:
                if job_info["index"] == 1:
                    result = job_info['FINAL_RESULT']
                    for line in file_lines:
                        if 'AVG_CPU_USE' in line:
                            cpu_utilization_result = line.strip().split('=')[1]
                        if 'AVG_GPU_USE' in line:
                            gpu_utilization_result = line.strip().split('=')[1]
                elif job_info["index"] == 2:
                    for line in file_lines:
                        if "MAX_GPU_MEMORY_USE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break
                elif job_info["index"] == 3:
                    result = json.dumps(job_info['FINAL_RESULT'])
                else:
                    for line in file_lines:
                        if "MAX_BATCH_SIZE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break

                # save job results
                pjr = insert_results(job_id, job_info["model_name"],
                                     job_info["index"], result, 1)
                log_file = job_info["log_file"].split("/")[-1]
                train_log_path = LOG_SERVER + os.path.join(
                    os.path.basename(args.log_path), "train_log", log_file)
                log_save_dict = {"train_log_path": train_log_path}
                if job_info["index"] == 1:
                    insert_results(job_id, job_info["model_name"], 7,
                                   cpu_utilization_result)
                    insert_results(job_id, job_info["model_name"], 8,
                                   gpu_utilization_result)
                    if int(job_info["gpu_num"]) == 1:
                        profiler_log = job_info["log_with_profiler"].split(
                            "/")[-1]
                        profiler_path = job_info["profiler_path"].split(
                            "/")[-1]
                        profiler_log_path = LOG_SERVER + os.path.join(
                            os.path.basename(args.log_path), "profiler_log",
                            profiler_log)
                        profiler_path = LOG_SERVER + os.path.join(
                            os.path.basename(args.log_path), "profiler_log",
                            profiler_path)
                        log_save_dict["profiler_log_path"] = profiler_log_path
                        log_save_dict["profiler_path"] = profiler_path

                pjrl = bm.JobResultsLog()
                pjrl.result_id = pjr.result_id
                pjrl.log_path = json.dumps(log_save_dict)
                pjrl.save()
                # cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
                #if commands.getoutput(cmd) != '200':

            except Exception as pfe:
                print(pfe)
            else:
                print(
                    "models: {}, run_machine_type: {}, index: {}, result: {}".
                    format(job_info["model_name"], run_machine_type,
                           job_info["index"], result))

                if job_info["index"] != 3:
                    check_results(job_info, run_machine_type, result,
                                  html_results)
                else:
                    check_results(job_info, run_machine_type, result,
                                  html_results, "Framework_Total")
                    check_results(job_info, run_machine_type, result,
                                  html_results, "GpuMemcpy_Total")

    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
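
All four examples grep KEY=VALUE lines out of the raw training log. The lines below are fabricated, but they show the format that the line.strip().split("=") parsing above expects.

# Made-up log lines in the KEY=VALUE format the parsers above rely on.
sample_lines = [
    "AVG_CPU_USE=123.4\n",
    "AVG_GPU_USE=87.5\n",
    "MAX_GPU_MEMORY_USE=10240\n",
]
for line in sample_lines:
    key, value = line.strip().split("=")
    print(key, value)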