Example #1
def model_experiments_finished(job: signac.Project.Job, key="succeeded"):
    """Return True only if every selected experiment for this job has a model run whose document marks `key` as True."""
    if not model_experiments_needed(job):
        return False

    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        if re.search(get_exp_regex(job), feature_graph_name) is None:
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(featureJob).find_jobs(
                task_args.split_filter, task_args.split_doc_filter):
            continue
        md5_str = "_".join(map(lambda x: calculate_md5(
            splitJob.fn(x)), feature_graph_files))

        exp_args_list = task_args.model_args or splitJob.doc.get(
            expCode, default=[])
        if exp_args_list == [] and is_tuning():
            exp_args_list = [""]
        for args in exp_args_list:
            if task_args.arg_regex is not None and re.search(task_args.arg_regex, args) is None:
                continue
            dataset_dir = splitJob.workspace()
            datasetDirObj = Path(dataset_dir)

            # Workspace path
            workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
            if not (workspaceDirObj.exists() and workspaceDirObj.is_dir()):
                return False
            modelProject = signac.init_project(
                name=expProjectName, root=str(workspaceDirObj))
            run_id = "{}@{}".format(args, md5_str)
            if is_tuning():
                run_id += "[tuning]"
            if not any(map(lambda job_i: job_i.doc.get(key, False), modelProject.find_jobs(filter={"run_id": run_id}))):
                return False
    return True
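
All six examples appear to come from the same module and rely on imports and project-local helpers that the excerpts omit: re, shutil, sys, json, csv, logging, subprocess, io.StringIO, pathlib.Path, signac, plus utilities such as feature_split_iter, get_exp_regex, calculate_md5 and a task_args option namespace. One helper that is easy to reconstruct is calculate_md5; a minimal sketch, assuming it simply hashes a file's contents with hashlib:

import hashlib

def calculate_md5(path, chunk_size=65536):
    # Hash the file in chunks so large graph/feature files need not fit in memory.
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()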
Example #2
def export_dataset(job: signac.Project.Job):
    """Export the dataset for each selected feature/split combination."""
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(featureJob).find_jobs(
                task_args.split_filter, task_args.split_doc_filter):
            continue
        elif is_tuning() and (splitJob.sp.get("split_index", None) not in {None, 0}):
            continue
        dataset_dir = Path(splitJob.workspace())
Example #3
def clear_workspace(job: signac.Project.Job):
    """Remove the model runs matching task_args.model_args, or the whole model workspace when no model_args are given."""
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(
            job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".
                  format(job.get_id(), exp_regex, feature_graph_name))
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(
                featureJob).find_jobs(task_args.split_filter,
                                      task_args.split_doc_filter):
            print("[run_model@{}] Filter {} not matching; skip on dataset {}".
                  format(job.get_id(),
                         (task_args.split_filter, task_args.split_doc_filter),
                         feature_graph_name))
            continue
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)
        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
        if workspaceDirObj.exists():
            assert workspaceDirObj.is_dir()
            if task_args.model_args:
                try:
                    modelProject = signac.get_project(
                        root=str(workspaceDirObj), search=False)
                    md5_str = "_".join(
                        map(lambda x: calculate_md5(splitJob.fn(x)),
                            feature_graph_files))
                    for args in task_args.model_args:
                        if task_args.arg_regex is not None and re.search(
                                task_args.arg_regex, args) is None:
                            print(
                                "[run_model@{}] Regex {} not matching; skip on args {}"
                                .format(job.get_id(), task_args.arg_regex,
                                        args))
                            continue
                        run_id = "{}@{}".format(args, md5_str)
                        for model_job in modelProject.find_jobs(
                                filter={"run_id": run_id}):
                            print("Removing folder {}".format(
                                model_job.workspace()))
                            shutil.rmtree(model_job.workspace())
                except LookupError:
                    pass
            else:
                print("Removing folder {}".format(workspaceDirObj))
                shutil.rmtree(str(workspaceDirObj))
Example #4
def clean_workspace(job: signac.Project.Job):
    """Remove model runs that did not succeed, or whose run_id no longer matches the MD5 of the current split data files."""
    for _, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(
            job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".
                  format(job.get_id(), exp_regex, feature_graph_name))
            continue
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)
        if all(map(splitJob.isfile, feature_graph_files)):
            md5_str = "_".join(
                map(lambda x: calculate_md5(splitJob.fn(x)),
                    feature_graph_files))
        else:
            md5_str = None
            print(
                f"[clean_workspace@{job.get_id()}] Missing files for split {feature_graph_name}"
            )

        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
        if workspaceDirObj.exists():
            assert workspaceDirObj.is_dir()
            try:
                modelProject = signac.get_project(root=str(workspaceDirObj),
                                                  search=False)
                for model_job in modelProject:
                    if not model_job.doc.get("succeeded", False):
                        target_dir = model_job.workspace()
                        print(
                            f"[clean_workspace@{job.get_id()}] Experiment not succeeded: removing folder {target_dir}"
                        )
                        shutil.rmtree(target_dir)
                    elif (md5_str is not None) and (
                            not model_job.sp.run_id.endswith(md5_str)):
                        target_dir = model_job.workspace()
                        print(
                            f"[clean_workspace@{job.get_id()}] Experiment not matching current data: removing folder {target_dir}"
                        )
                        shutil.rmtree(target_dir)
            except LookupError:
                pass
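
Both cleanup helpers depend on the run_id convention built in run_model below: "<model args>@<MD5 of the split's data files>". When the underlying files change, the MD5 suffix changes, so an existing run no longer ends with the current md5_str and clean_workspace treats it as stale. A tiny illustration with made-up values:

md5_str = "9b74c9897bac770ffc029102a200c5de"                 # hypothetical digest of the current data files
run_id = "{}@{}".format("--lr 0.01 --epochs 200", md5_str)   # how run_model composes the id
assert run_id.endswith(md5_str)                              # the staleness test used by clean_workspace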
Example #5
def generate_csv(job: signac.Project.Job, args):
    """Collect the results of succeeded experiments into CSV rows, written to args.output or to stdout when args.path_only is set."""
    textBuffer = StringIO()
    textList = []
    args.csv_data_dict["numClass"] = job.sp.numClass
    try:
        args.csv_data_dict["h"] = "{:.2f}".format(job.sp.h)
    except AttributeError:
        args.csv_data_dict["h"] = job.sp.HName
    args.csv_data_dict["Graph ID"] = job.get_id()
    args.csv_data_dict["Clustering Coefficient"] = job.doc.get(
        "avgClusteringCoeff")
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(
            job):
        feature_file = featureJob.doc.get("feature_file")
        if featureJob.doc.get("feature_name"):
            args.csv_data_dict["Feature"] = featureJob.doc["feature_name"]
        else:
            args.csv_data_dict["Feature"] = Path(
                feature_file.replace(job.sp.graphName + "-", "")).stem
        args.csv_data_dict["Graph Name"] = feature_graph_name
        args.csv_data_dict["Split Config"] = splitJob.sp.split_config
        md5_str = "_".join(
            map(lambda x: calculate_md5(splitJob.fn(x)), feature_graph_files))
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)

        # Workspace path
        workspaceDirObj = datasetDirObj / args.workspaceRoot
        try:
            gcnProject = signac.get_project(root=str(workspaceDirObj),
                                            search=False)
        except LookupError as e:
            print(e, file=sys.stderr)
            continue

        if args.exp_args is not None:
            exp_arg_list = args.exp_args
        elif args.add_args:
            exp_arg_list = list(
                set(splitJob.doc.get(args.exp_type, default=[]))
                | set(args.add_args))
        else:
            exp_arg_list = splitJob.doc.get(args.exp_type, default=[])

        for exp_args in exp_arg_list:
            args.csv_data_dict["Model Args"] = '"{}"'.format(exp_args)
            run_id = "{}@{}".format(exp_args, md5_str)
            job_iter = gcnProject.find_jobs(filter={"run_id": run_id})
            if any(
                    map(lambda job_i: job_i.doc.get("succeeded", False),
                        job_iter)):
                assert len(job_iter) == 1, (args.csv_data_dict, run_id)
                # Parse experiment results
                for job_m in job_iter:
                    args.csv_data_dict["Experiment ID"] = job_m.get_id()
                    args.result_parser(job_m, args)
                    if args.path_only:
                        path = [
                            job.get_id(),
                            featureJob.get_id(),
                            splitJob.get_id(), "/", args.workspaceRoot,
                            job_m.get_id()
                        ]
                        args.csv_data_dict["Job Path"] = json.dumps(path)
                assert len(args.csv_data_dict) == len(args.csv_header_list)

                # Write to text buffer
                textBuffer.write(
                    ",".join(map(str, args.csv_data_dict.values())) + "\n")
                textList.append(list(map(str, args.csv_data_dict.values())))

    if not args.path_only:
        # Write to the result file
        if not args.csv_file_generated:
            print(f"CSV will be saved to {args.output}")
            with open(args.output, "w") as csv_out:
                csv_out.write(",".join(args.csv_header_list) + "\n")
                csv_out.write(textBuffer.getvalue())
                args.csv_file_generated = True
        else:
            with open(args.output, "a") as csv_out:
                csv_out.write(textBuffer.getvalue())
    else:
        # Write to the result file
        csv_writer = csv.writer(sys.stdout)
        if not args.csv_file_generated:
            csv_writer.writerow(args.csv_header_list)
            csv_writer.writerows(textList)
            args.csv_file_generated = True
        else:
            csv_writer.writerows(textList)
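
generate_csv writes args.csv_data_dict.values() in insertion order and asserts the dict holds exactly as many entries as args.csv_header_list, so the caller presumably pre-populates the dict with the header columns in the right order. A minimal sketch of that setup; the column names are the ones assigned above, but their order (and the optional "Job Path" column used when args.path_only is set) is an assumption:

csv_header_list = ["Graph ID", "Graph Name", "numClass", "h", "Clustering Coefficient",
                   "Feature", "Split Config", "Model Args", "Experiment ID"]
csv_data_dict = dict.fromkeys(csv_header_list)   # fixes the column order before any row is filled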
Example #6
def run_model(job: signac.Project.Job):
    """Run the model script for every selected feature/split combination and argument string, logging output and tagging succeeded runs."""
    logger = logging.getLogger('run_model@{}'.format(job.get_id()))
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    chFormatter = logging.Formatter(
        '[{asctime} {name} {levelname:>8}] {message}', '%m-%d %H:%M:%S', '{')
    ch.setFormatter(chFormatter)
    logger.addHandler(ch)

    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".format(
                job.get_id(), exp_regex, feature_graph_name))
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(featureJob).find_jobs(
                task_args.split_filter, task_args.split_doc_filter):
            print("[run_model@{}] Filter {} not matching; skip on dataset {}".format(
                job.get_id(),
                (task_args.split_filter, task_args.split_doc_filter),
                feature_graph_name))
            continue
        elif is_tuning() and (splitJob.sp.get("split_index", None) not in {None, 0}):
            print("[run_model@{}] Split index is not 0 for tuning; skip on dataset {}".format(
                job.get_id(), feature_graph_name))
            continue
        md5_str = "_".join(map(lambda x: calculate_md5(
            splitJob.fn(x)), feature_graph_files))
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)

        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot
        workspaceDirObj.mkdir(exist_ok=True, parents=True)
        modelProject = signac.init_project(
            name=expProjectName, root=str(workspaceDirObj))

        fh = logging.FileHandler(
            str(workspaceDirObj / "terminal_output.log"), "a")
        fh.setLevel(logging.DEBUG)
        fhFormatter = logging.Formatter(
            '[{asctime} {levelname:>8}] {message}', '%m-%d %H:%M:%S', '{')
        fh.setFormatter(fhFormatter)
        logger.addHandler(fh)

        exp_args_list = task_args.model_args or splitJob.doc.get(
            expCode, default=[])
        if exp_args_list == [] and is_tuning():
            exp_args_list = [""]
        for args in exp_args_list:
            if task_args.arg_regex is not None and re.search(task_args.arg_regex, args) is None:
                print("[run_model@{}] Regex {} not matching; skip on args {}".format(
                    job.get_id(), task_args.arg_regex, args))
                continue
            run_id = "{}@{}".format(args, md5_str)
            if is_tuning():
                run_id += "[tuning]"
                logger.removeHandler(fh)
            if any(map(lambda job_i: job_i.doc.get("succeeded", False), modelProject.find_jobs(filter={"run_id": run_id}))):
                print("[run_model@{}] Already run; skip on dataset {} for parameter {}".format(
                    job.get_id(), feature_graph_name, args))
            else:
                # Construct arguments
                args_split = args.split()
                dataset_args = dataset_args_func(
                    dataset_dir=dataset_dir, feature_graph_name=feature_graph_name,
                    run_id=run_id, workspaceDirObj=workspaceDirObj, task_args=task_args,
                    featureJob=featureJob, args=args, args_split=args_split, splitJob=splitJob
                )
                if dataset_args is None:
                    raise ValueError(
                        "dataset_args_func is not properly configured.")
                elif dataset_args is False:
                    print("[run_model@{}] Skip on dataset {} for parameter {}".format(
                        job.get_id(), feature_graph_name, args))
                    continue
                arg_list = [get_python_path(), "-u", modelScript] + \
                    dataset_args + args_split

                # Run model code
                print("[run_model@{}] run on dataset {} for parameter {}".format(
                    job.get_id(), feature_graph_name, args))
                try:
                    logger.info(
                        "===============\n>>>>Executing command {}\n===============".format(arg_list))
                    if not (job.doc.get("exp_terminal", False) or flags.log_to_terminal):
                        ch.setLevel(logging.WARNING)
                        ch.setFormatter(chFormatter)
                    if task_args.interactive:
                        proc = subprocess.Popen(
                            arg_list, cwd=str(modelPathObj))
                    else:
                        proc = subprocess.Popen(arg_list, cwd=str(modelPathObj),
                                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='utf-8')
                    if proc.stdout is not None:
                        msgcount = 0
                        for line in iter(proc.stdout.readline, ''):
                            msgcount += 1
                            logger.info(line.strip())
                            if msgcount % 100 == 0:
                                logger.debug("running on dataset {} for parameter {}".format(
                                    feature_graph_name, args))
                                msgcount = 0
                    returncode = proc.wait()
                    if returncode != 0:
                        raise subprocess.CalledProcessError(
                            returncode, arg_list)
                    else:
                        logger.debug("Completed on dataset {} for parameter {}".format(
                            feature_graph_name, args))
                except subprocess.CalledProcessError:
                    logger.error("Check log at {}".format(
                        workspaceDirObj / "terminal_output.log"))
                    raise
                logger.info("===============")
                ch.setLevel(logging.INFO)

                # Tag job as succeeded (except when tuning)
                assert len(modelProject.find_jobs(
                    filter={"run_id": run_id})) == 1
                if not task_args.tuning:
                    for job_m in modelProject.find_jobs(filter={"run_id": run_id}):
                        job_m.doc["succeeded"] = True
                else:
                    print("[run_model@{}]Job will not be tagged successful in tuning mode.".format(
                        job.get_id()))
        logger.removeHandler(fh)
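
run_model also assumes module-level configuration that the excerpt does not show. A sketch of what it might look like; every value below is a placeholder assumption, not taken from the source:

import sys
from pathlib import Path
from types import SimpleNamespace

workspaceRoot = "model_workspace"        # subdirectory of each split that holds the model project
expProjectName = "ModelExperiments"      # name passed to signac.init_project
expCode = "model_args"                   # splitJob.doc key holding default argument strings
modelScript = "train.py"                 # script executed for every run
modelPathObj = Path("models")            # working directory for the subprocess

def get_python_path():
    # Default to the current interpreter unless a project-specific one is configured.
    return sys.executable

def is_tuning():
    # run_model also checks task_args.tuning directly; assume the same flag drives this helper.
    return task_args.tuning

# Options normally produced by argparse; represented here as plain namespaces.
task_args = SimpleNamespace(model_args=None, arg_regex=None, split_filter=None,
                            split_doc_filter=None, interactive=False, tuning=False)
flags = SimpleNamespace(log_to_terminal=False)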