Example #1
def test_json2tl(shared_data_dir):
    src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True)
    tl_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.tl", to_str=True)
    json_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.json", to_str=True)
    json2tl(src, tl_tar)
    tl2json(tl_tar, json_tar)
    assert True  # smoke test: passes as long as the round-trip conversion raises no exception
Example #2
def test_build_interactions(shared_data_dir):
    question_csv_path = path_append(shared_data_dir, "tests", "EdNet",
                                    "contents", "questions.csv")
    users_dir = path_append(shared_data_dir, "tests", "EdNet", "KT1")

    user_csv = path_append(users_dir, "u1.csv")

    judgement = Judgement(question_csv_path)

    interactions = csv2interactions(user_csv, judgement)

    assert interactions[0] == [5011, 0]

    tar = path_append(shared_data_dir, "tests", "EdNet", "KT1", "data",
                      "kt.json")
    build_interactions(users_dir, question_csv_path, tar)

    with open(tar) as f:
        assert json.loads(f.readline())[0] == [5011, 0]
        assert len(json.loads(f.readline())) == 16

    tar2 = path_append(tar, to_str=True) + ".1"  # suffix mirrors the n=1 passed below
    select_n_most_active(tar, tar2, 1)

    with open(tar2) as f:
        assert json.loads(f.readline())[0] == [5011, 0]
Example #3
def test_loading(tmpdir):
    csv_src = path_append(tmpdir, "test.csv")
    json_src = path_append(tmpdir, "test.json")

    text_to_csv(csv_src)
    csv2jsonl(csv_src, json_src)
    jsonl2csv(json_src, csv_src)

    for src in [csv_src, json_src, load_jsonl(json_src)]:
        for i, line in enumerate(loading(src)):
            assert int(line["id"]) == i, line
            if i == 0:
                assert line["name"] == "Tom", line
            elif i == 1:
                assert line["name"] == "Jerry", line

    src = path_append(tmpdir, "test")
    with as_out_io(src) as wf:
        print(DEMO_TEXT.strip(), file=wf)

    assert [line.strip()
            for line in loading(src)] == DEMO_TEXT.strip().split("\n")
    with as_io(src) as f:
        assert [line.strip()
                for line in loading(f)] == DEMO_TEXT.strip().split("\n")
    assert "hello world" == loading(lambda: "hello world")
Example #4
def download_data(url, data_dir, override, bloom_filter: set = None):
    bloom_filter = set() if bloom_filter is None else bloom_filter

    if url in bloom_filter:  # pragma: no cover
        return

    if url.endswith("/"):  # a URL ending with "/" is a directory; anything else is a file
        _data_dir = path_append(data_dir, url.split('/')[-2], to_str=True)

        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "lxml")
        al = soup.find_all('a')
        for a in al:
            # extract each anchor's link target, skipping relative links such as "../"
            h = a.get('href')
            if h[0] != '.':
                url_h = url + h
                if url_h not in bloom_filter:
                    download_data(url_h, _data_dir, override, bloom_filter)
        bloom_filter.add(url)

    else:
        os.makedirs(data_dir, exist_ok=True)
        save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
        download_file(url, save_path, override)
        bloom_filter.add(url)

    return data_dir
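
Note: a minimal usage sketch for download_data above (the URL and target directory are hypothetical):

# Hypothetical call: a URL ending in "/" triggers the recursive directory
# branch above, while a plain file URL is fetched directly into data_dir.
download_data("http://example.com/data/junyi/", "./data", override=False)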
Example #5
    def _update(self, **kwargs):
        params = kwargs
        params["logger"] = params.pop(
            "logger",
            config_logging(logger=params.get("model_name", self.model_name),
                           console_log_level="info"))

        for key in params:
            if key.endswith("_params") and key + "_update" in params:
                params[key].update(params[key + "_update"])

        self.deep_update(**params)

        _vars = ["ctx"]
        for _var in _vars:
            if _var in kwargs:
                try:
                    setattr(self, _var, eval_var(kwargs[_var]))
                except TypeError:
                    pass

        self.validation_result_file = path_append(self.model_dir,
                                                  RESULT_JSON,
                                                  to_str=True)
        self.cfg_path = path_append(self.model_dir, CFG_JSON, to_str=True)
Example #6
def test_copy(tmpdir):
    src_dir = path_append(tmpdir, "src")
    tar_dir = path_append(tmpdir, "tar")

    src = path_append(src_dir, "src.txt")
    tar = path_append(tar_dir, "tar.txt")

    with as_out_io(src) as wf:
        print("hello world", file=wf)

    config.OVERRIDE = False  # existing targets are never overwritten
    copytree(src_dir, tar_dir)
    copytree(src_dir, tar_dir)
    copyfile(src, tar)
    template_copy(src, tar)

    config.OVERRIDE = True  # existing targets are always overwritten
    copytree(src_dir, tar_dir)
    copyfile(src, tar)

    config.OVERRIDE = None  # ask interactively whether to overwrite
    with simulate_stdin("y", "y"):
        copytree(src_dir, tar_dir)
        copyfile(src, tar)

    with simulate_stdin("n", "n"):
        copytree(src_dir, tar_dir)
        copyfile(src, tar)

    with simulate_stdin("unk", "y"):
        default_legal_input("", __legal_input={"y"})
Example #7
def build_json_sequence(src_root: str = "../raw_data/junyi/", tar_root: str = "../data/junyi/data/",
                        ku_dict_path: str = "../data/junyi/data/graph_vertex.json", n: int = 1000):
    select_n_most_frequent_students(
        path_append(src_root, "junyi_ProblemLog_for_PSLC.txt", to_str=True),
        path_append(tar_root, "student_log_kt_", to_str=True),
        ku_dict_path,
        n,
    )
Example #8
def transfer_synthetic_dataset(src_dir, tar_dir):
    for root, dirs, files in os.walk(src_dir):
        for filename in files:
            src = PurePath(path_append(root, filename))
            if src.suffix != ".csv":  # pragma: no cover
                continue
            tar = path_append(tar_dir, src.with_suffix(".json").name)
            synthetic2json(src, tar)
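
Note: the suffix swap above relies on standard pathlib behavior; a quick standalone check:

from pathlib import PurePath

# with_suffix replaces the extension and .name drops the directory part,
# so a source like "logs/u1.csv" yields the target filename "u1.json".
assert PurePath("logs/u1.csv").with_suffix(".json").name == "u1.json"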
Example #9
def test_analysis(shared_data_dir):
    src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True)
    analysis_records(src)

    graph_src = path_append(shared_data_dir, "dense_graph", to_str=True)
    analysis_edges(graph_src)

    graph_src = path_append(shared_data_dir, "transition_graph", to_str=True)
    analysis_edges(graph_src, threshold=0.5)
    analysis_edges(graph_src, threshold=None)
Example #10
def test_path(tmp_path):
    path_append(tmp_path, "../data", "../dataset1/", "train", to_str=True)

    tmp_file = path_append(tmp_path, "test_path.txt")
    with wf_open(tmp_file) as wf:
        print("hello world", file=wf)

    assert file_exist(tmp_file)
    _dir = abs_current_dir(tmp_file)
    assert parent_dir(_dir, 2) == path_append(_dir, "..", "..")
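
Note: path_append joins path segments in the style of pathlib / os.path.join and returns a plain str when to_str=True; a rough standalone sketch of that behavior (an approximation, not the longling implementation):

from pathlib import PurePath

def path_append_sketch(path, *addition, to_str=False):
    # join every extra segment onto the base path with pathlib
    result = PurePath(path)
    for segment in addition:
        result = result / str(segment)
    return str(result) if to_str else result

assert path_append_sketch("data", "junyi", to_str=True) == str(PurePath("data", "junyi"))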
Example #11
def test_graph(shared_data_dir):
    json_src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.json", to_str=True)

    dense_graph(835, path_append(shared_data_dir, "dense_graph", to_str=True))
    trans_graph = path_append(shared_data_dir, "transition_graph", to_str=True)
    transition_graph(835, json_src, tar=trans_graph)
    ctrans_graph = path_append(shared_data_dir, "correct_transition_graph", to_str=True)
    correct_transition_graph(835, json_src, tar=ctrans_graph)

    ctrans_sim = path_append(shared_data_dir, "correct_transition_sim_graph", to_str=True)
    similarity_graph(835, ctrans_graph, ctrans_sim)
Example #12
    def _update(self, **kwargs):
        params = kwargs

        params["logger"] = params.pop(
            "logger",
            config_logging(logger=params.get("model_name", self.model_name),
                           console_log_level="info"))

        for key in params:
            if key.endswith("_params") and key + "_update" in params:
                params[key].update(params[key + "_update"])

        # check which path-related parameters are actually being overridden
        path_check_list = [
            "dataset", "root_data_dir", "workspace", "root_model_dir",
            "model_dir"
        ]
        _overridden = {}
        for path_check in path_check_list:
            if kwargs.get(path_check) is None or kwargs[path_check] == getattr(self, path_check):
                _overridden[path_check] = False
            else:
                _overridden[path_check] = True

        for param, value in params.items():
            setattr(self, param, value)

        def is_overridden(varname):
            return _overridden[varname]

        # set dataset
        if is_overridden("dataset") and not is_overridden("root_data_dir"):
            kwargs["root_data_dir"] = path_append("$root", "data", "$dataset")
        # set workspace
        if (is_overridden("workspace") or is_overridden("root_model_dir")
            ) and not is_overridden("model_dir"):
            kwargs["model_dir"] = path_append("$root_model_dir", "$workspace")

        # rebuild relevant directory or file path according to the kwargs
        _dirs = [
            "workspace", "root_data_dir", "data_dir", "root_model_dir",
            "model_dir"
        ]
        for _dir in _dirs:
            exp = var2exp(kwargs.get(_dir, getattr(self, _dir)),
                          env_wrap=lambda x: "self.%s" % x)
            setattr(self, _dir, eval(exp))

        self.validation_result_file = path_append(self.model_dir,
                                                  RESULT_JSON,
                                                  to_str=True)
        self.cfg_path = path_append(self.model_dir, CFG_JSON, to_str=True)
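
Note: the "$name" placeholders above are expanded by var2exp into expressions over self and then eval'ed; a rough standalone analogue using string.Template (illustration only, not the longling API):

from string import Template

attrs = {"root_model_dir": "model", "workspace": "run1"}
# substitute the $-placeholders from already-set attribute values,
# mimicking the var2exp + eval rebuild step above
assert Template("$root_model_dir/$workspace").substitute(attrs) == "model/run1"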
Example #13
def test_load_json(tmpdir):
    csv_src = path_append(tmpdir, "test.csv")
    src = path_append(tmpdir, "test.jsonl")
    text_to_csv(csv_src)

    csv2jsonl(csv_src, src)

    for i, line in enumerate(load_jsonl(src)):
        assert int(line["id"]) == i
        if i == 0:
            assert line["name"] == "Tom"
        elif i == 1:
            assert line["name"] == "Jerry"
Example #14
def test_io_type(tmp_path):
    with pytest.raises(TypeError):
        wf_open(12345)

    with pytest.raises(TypeError):
        rf_open(12345)

    with as_out_io(path_append(tmp_path, "test_out")) as wf:
        with as_out_io(wf):
            pass

    with as_io(path_append(tmp_path, "test_out")) as f:
        with rf_open(f):
            pass
Example #15
def load_environment_parameters(directory=None):
    if directory is None:
        directory = path_append(abs_current_dir(__file__), "meta_data")
    return {
        "configuration":
        load_configuration(path_append(directory, "configuration.json")),
        "knowledge_structure":
        load_knowledge_structure(
            path_append(directory, "knowledge_structure.csv")),
        "learning_order":
        load_learning_order(path_append(directory, "learning_order.json")),
        "items":
        load_items(path_append(directory, "items.json"))
    }
Example #16
def test_encode(tmpdir):
    demo_text = "测试用中文\nhello world\n如果再重来"

    src = path_append(tmpdir, "gbk.txt")
    tar = path_append(tmpdir, "utf8.txt")

    with wf_open(src, encoding="gbk") as wf:
        print(demo_text, end='', file=wf)

    encode(src, "gbk", tar, "utf-8")

    with rf_open(tar) as f:
        for line in f:
            print(line)
Example #17
def load_environment_parameters(directory):
    return {
        "transition_matrix":
        load_transition_matrix(path_append(directory,
                                           "transition_matrix.json")),
        "configuration":
        load_configuration(path_append(directory, "configuration.json")),
        "knowledge_structure":
        load_knowledge_structure(
            path_append(directory, "knowledge_structure.csv")),
        "state2vector":
        load_state_to_vector(path_append(directory, "state2vector.json")),
        "initial_states":
        load_initial_states(path_append(directory, "initial_states.json")),
    }
Example #18
def result_file(tmp_path_factory):
    tmp_path = tmp_path_factory.mktemp("result")
    tmp_file = path_append(tmp_path, "result.json", to_str=True)
    with wf_open(tmp_file) as wf:
        for r in result_demo:
            print(json.dumps(r), file=wf)
    return tmp_file
Example #19
File: viz.py  Project: tswsxk/XKT
def net_viz(_net, _cfg, view_tag=False, **kwargs):  # pragma: no cover
    """visualization check, only support pure static network"""
    batch_size = _cfg.batch_size
    model_dir = _cfg.model_dir
    logger = kwargs.get(
        'logger',
        _cfg.logger if hasattr(_cfg, 'logger') else logging
    )

    try:
        viz_dir = path_append(model_dir, "plot/network")
        logger.info("visualization: file in %s" % viz_dir)
        from copy import deepcopy

        viz_net = deepcopy(_net)
        viz_net.length = 2
        viz_shape = {'data': (batch_size,) + (2,)}
        x = mx.sym.var("data")
        sym = viz_net(x)[1][-1]
        plot_network(
            nn_symbol=sym,
            save_path=viz_dir,
            shape=viz_shape,
            node_attrs={"fixedsize": "false"},
            view=view_tag
        )
    except VizError as e:
        logger.error("error happen in visualization, aborted")
        logger.error(e)
Example #20
File: DeepMF.py  Project: tswsxk/xrec
    def toolbox_init(
        self,
        evaluation_formatter_parameters=None,
        validation_logger_mode="w",
        silent=False,
    ):

        from longling import path_append
        from longling.lib.clock import Clock
        from longling.lib.utilog import config_logging
        from longling.ML.toolkit import EvalFormatter as Formatter
        from longling.ML.toolkit import MovingLoss, ConsoleProgressMonitor as ProgressMonitor

        self.toolbox = {
            "monitor": dict(),
            "timer": None,
            "formatter": dict(),
        }

        mod = self.mod
        cfg = self.mod.cfg

        # 4.1 todo: define the loss functions
        # bp_loss_f defines the loss function used for back propagation;
        # there must be exactly one, and its name must not match the *_\d+ pattern

        assert self.loss_function is not None

        loss_monitor = MovingLoss(self.loss_function)

        # 4.1 todo: initialize the interactive information reported during training
        timer = Clock()

        progress_monitor = ProgressMonitor(
            indexes={"Loss": [name for name in self.loss_function]},
            values={"Loss": loss_monitor.losses},
            end_epoch=cfg.end_epoch - 1,
            silent=silent)

        validation_logger = config_logging(
            filename=path_append(cfg.model_dir, "result.log"),
            logger="%s-validation" % cfg.model_name,
            mode=validation_logger_mode,
            log_format="%(message)s",
        )

        # set evaluation formatter
        evaluation_formatter_parameters = {} \
            if evaluation_formatter_parameters is None \
            else evaluation_formatter_parameters

        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=mod.cfg.validation_result_file,
            **evaluation_formatter_parameters)

        self.toolbox["monitor"]["loss"] = loss_monitor
        self.toolbox["monitor"]["progress"] = progress_monitor
        self.toolbox["timer"] = timer
        self.toolbox["formatter"]["evaluation"] = evaluation_formatter
Example #21
def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
    try:
        return download_data(url_dict[dataset], data_dir, override)
    except FileExistsError:
        return path_append(data_dir,
                           url_dict[dataset].split('/')[-1],
                           to_str=True)
Example #22
def test_configuration(tmpdir, file_format):
    _config = DemoConfiguration()

    assert _config.class_var == DemoConfiguration.vars()
    assert _config.parsable_var == DemoConfiguration.pvars()

    filename = path_append(tmpdir, "test_config.%s" % file_format, to_str=True)

    _config.b = 4
    if file_format == ".err":
        with pytest.raises(TypeError):
            _config.dump(filename, override=True, file_format=file_format)
        return
    else:
        _config.dump(filename, override=True, file_format=file_format)
        _config.dump(filename, override=False, file_format=file_format)

    _config = DemoConfiguration.load(filename, file_format=file_format)

    assert "a" in _config
    assert _config["a"] == 1 and _config.b == 4

    print(_config)

    assert len(_config.items()) == 2
Example #23
def get_data(dataset,
             data_dir=DEFAULT_DATADIR,
             override=False,
             url_dict: dict = None):
    """
    Parameters
    ----------
    dataset: str
        dataset name
    data_dir: str
        directory where the data is stored
    override: bool
        whether to overwrite existing files
    url_dict:
        mapping from dataset names to download URLs

    Returns
    -------

    """
    url_dict = URL_DICT if not url_dict else url_dict
    if dataset in url_dict:
        url = url_dict[dataset]
    elif re.match("http(s?)://.*", dataset):
        url = dataset
    else:
        raise ValueError("%s is neither a valid dataset name nor an url" %
                         dataset)

    try:
        return download_data(url, data_dir, override)
    except FileExistsError:
        return path_append(data_dir, url.split('/')[-1], to_str=True)
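
Note: a usage sketch for this variant of get_data (the dataset name and URL below are hypothetical):

# a known key resolves through url_dict, while anything matching
# "http(s)://..." is treated as a direct download URL
get_data("junyi", data_dir="./data")
get_data("http://example.com/data/junyi.tar.gz", data_dir="./data")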
Example #24
File: run.py  Project: bigdata-ustc/XKT
def numerical_check(_net, _cfg: Configuration, train_data, test_data, dump_result=False):  # pragma: no cover
    ctx = _cfg.ctx
    batch_size = _cfg.batch_size

    _net.initialize(ctx=ctx)

    bp_loss_f = get_bp_loss(**_cfg.loss_params)
    loss_function = {}
    loss_function.update(bp_loss_f)

    from longling.ML.MxnetHelper.glue import module
    from longling.ML.toolkit import EvalFormatter as Formatter
    from longling.ML.toolkit import MovingLoss
    from tqdm import tqdm

    loss_monitor = MovingLoss(loss_function)
    progress_monitor = tqdm
    if dump_result:
        from longling import config_logging
        validation_logger = config_logging(
            filename=path_append(_cfg.model_dir, "result.log"),
            logger="%s-validation" % _cfg.model_name,
            mode="w",
            log_format="%(message)s",
        )
        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=_cfg.validation_result_file,
        )
    else:
        evaluation_formatter = Formatter()

    # train check
    trainer = module.Module.get_trainer(
        _net, optimizer=_cfg.optimizer,
        optimizer_params=_cfg.optimizer_params,
        select=_cfg.train_select
    )

    for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
        for batch_data in progress_monitor(train_data, "Epoch: %s" % epoch):
            fit_f(
                net=_net, batch_size=batch_size, batch_data=batch_data,
                trainer=trainer, bp_loss_f=bp_loss_f,
                loss_function=loss_function,
                loss_monitor=loss_monitor,
                ctx=ctx,
            )

        if epoch % 1 == 0:
            print(
                evaluation_formatter(
                    epoch=epoch,
                    loss_name_value=dict(loss_monitor.items()),
                    eval_name_value=eval_f(_net, test_data, ctx=ctx),
                    extra_info=None,
                    dump=True,
                )[0]
            )
Example #25
def get_data(dataset,
             data_dir=DEFAULT_DATADIR,
             override=False,
             url_dict: dict = None):
    """
    Parameters
    ----------
    dataset: str
        dataset name
    data_dir: str
        directory where the data is stored
    override: bool
        whether to overwrite existing files
    url_dict:
        mapping from dataset names to download URLs

    Returns
    -------

    """
    url_dict = URL_DICT if not url_dict else url_dict
    try:
        return download_data(url_dict[dataset], data_dir, override)
    except FileExistsError:  # pragma: no cover
        return path_append(data_dir,
                           url_dict[dataset].split('/')[-1],
                           to_str=True)
Example #26
    def _update(self, **kwargs):
        params = kwargs
        params["logger"] = params.pop(
            "logger",
            config_logging(logger=params.get("model_name", self.model_name),
                           console_log_level="info"))

        for key in params:
            if key.endswith("_params") and key + "_update" in params:
                params[key].update(params[key + "_update"])

        self.deep_update(**params)

        self.validation_result_file = path_append(self.model_dir,
                                                  RESULT_JSON,
                                                  to_str=True)
        self.cfg_path = path_append(self.model_dir, CFG_JSON, to_str=True)
Example #27
def get_epoch_params_filepath(model_name: str, epoch: int, model_dir: str = "./"):
    """
    Examples
    --------
    >>> get_epoch_params_filepath("CNN", 10)
    'CNN-0010.params'
    """
    return path_append(model_dir, epoch_params_filename(model_name, epoch), to_str=True)
Example #28
def get_params_filepath(model_name: str, model_dir: str = "./"):
    """
    Examples
    --------
    >>> get_params_filepath("CNN")
    'CNN.params'
    """
    return path_append(model_dir, params_filename(model_name), to_str=True)
Example #29
def build_knowledge_graph(src_root: str, tar_root: (str, None) = None,
                          ku_dict_path: str = None,
                          prerequisite_path: (str, None) = None,
                          similarity_path: (str, None) = None,
                          difficulty_path: (str, None) = None):
    tar_root = tar_root if tar_root is not None else src_root
    exercise_src = path_append(src_root, "junyi_Exercise_table.csv")

    assert ku_dict_path is not None

    relation_src = merge_relationship_annotation(
        [path_append(src_root, "relationship_annotation_{}.csv".format(name)) for name in ["testing", "training"]],
        path_append(src_root, "relationship_annotation.csv")
    )
    ku_dict_path = path_append(tar_root, ku_dict_path)
    build_ku_dict(exercise_src, ku_dict_path)

    if prerequisite_path is not None:
        prerequisite_path = path_append(tar_root, prerequisite_path)
        extract_prerequisite(exercise_src, prerequisite_path, ku_dict_path)

    if similarity_path is not None:
        # mirror the prerequisite branch: resolve the given filename under tar_root
        similarity_path = path_append(tar_root, similarity_path)
        extract_similarity(relation_src, similarity_path, ku_dict_path)

    if difficulty_path is not None:
        difficulty_path = path_append(tar_root, difficulty_path)
        extract_difficulty(relation_src, difficulty_path, ku_dict_path)
Example #30
def test_load_csv(tmpdir):
    src = path_append(tmpdir, "test.csv")
    text_to_csv(src)

    for i, line in enumerate(load_csv(src)):
        assert int(line["id"]) == i
        if i == 0:
            assert line["name"] == "Tom"
        elif i == 1:
            assert line["name"] == "Jerry"