예제 #1
0
파일: format.py 프로젝트: tswsxk/CangJie
def gensim2json(src, tar):
    """Dump every word of a gensim model with its vector to *tar* as json lines."""
    model = _load_gensim(src)
    desc = "gensim2json: %s --> %s" % (src, tar)

    with wf_open(tar) as wf:
        for word in tqdm(model.wv.vocab, desc):
            entry = [word, model.wv[word].tolist()]
            print(json.dumps(entry), file=wf)
예제 #2
0
파일: Env.py 프로젝트: wsgan001/EduSim
    def dump_kt(self, learner_num, filename, step=50):
        """Warm up *learner_num* generated learners for *step* steps and write
        each learner's exercise history to *filename* as one json line."""
        learner_pool = self.generate_learners(learner_num)

        with wf_open(filename) as wf:
            progress = tqdm(learner_pool, "kss for kt")
            for learner in progress:
                self._learner_warm_up(learner, step)
                record = json.dumps(learner.exercise_history)
                print(record, file=wf)
예제 #3
0
def seq2idx(src, tar, vec_json, src_encoding="utf-8", tar_encoding="utf-8"):
    """Convert token sequences (json lines) into index sequences (json lines).

    The token -> index mapping is loaded from *vec_json*.
    """
    mapping = WVDict.from_file(vec_json)
    desc = "converting %s -> %s" % (src, tar)
    with rf_open(src, encoding=src_encoding) as fin, \
            wf_open(tar, encoding=tar_encoding) as fout:
        for raw in tqdm(fin, desc=desc):
            tokens = json.loads(raw)
            print(json.dumps(mapping.token2idx(tokens)), file=fout)
예제 #4
0
def extract_students_log(source, target, ku_dict):
    """require big memory to run this function"""

    # raw outcome label -> binary correctness (hints count as incorrect)
    outcome = {
        "INCORRECT": 0,
        "CORRECT": 1,
        "HINT": 0,
    }

    students = {}

    with open(ku_dict) as f:
        ku_dict = json.load(f)

    with open(source) as f:
        f.readline()  # drop the header row
        for record in tqdm(csv.reader(f, delimiter='\t'), "reading data"):
            student = record[0]
            session = record[1]
            exercise = ku_dict[record[-5]]
            correct = outcome[record[10]]
            timestamp = record[8]
            sessions = students.setdefault(student, {})
            sessions.setdefault(session, []).append(
                [int(timestamp), exercise, correct])

    with wf_open(target) as wf:
        for student_id, sessions in tqdm(students.items(), "sorting"):
            for session_id, exercises in sessions.items():
                exercises.sort(key=lambda item: item[0])
                pairs = [(item[1], item[2]) for item in exercises]
                print(json.dumps(pairs), file=wf)
예제 #5
0
def result_file(tmp_path_factory):
    """Fixture: write each record of ``result_demo`` to a temp json-lines file."""
    out_dir = tmp_path_factory.mktemp("result")
    out_file = path_append(out_dir, "result.json", to_str=True)
    with wf_open(out_file) as wf:
        for record in result_demo:
            print(json.dumps(record), file=wf)
    return out_file
예제 #6
0
파일: parser.py 프로젝트: tswsxk/longling
    def dump(self, cfg_path: str, override=True, file_format=None):
        """
        Write the configuration parameters to a file.

        Updated in version 1.3.16

        Parameters
        ----------
        cfg_path: str
            target path for the configuration file
        override: bool
            when False, an existing file aborts the dump
        file_format: str
            one of ``json``, ``toml``, ``yaml``; defaults to
            ``self.default_file_format()``
        """
        if os.path.isfile(cfg_path) and not override:
            self.logger.warning(
                "file %s existed, dump aborted" % os.path.abspath(cfg_path)
            )
            return
        self.logger.info(
            "writing configuration parameters to %s" % os.path.abspath(cfg_path)
        )
        if file_format is None:
            file_format = self.default_file_format()

        # format name -> serializer taking (obj, file_object)
        dumpers = {
            "json": lambda var, fp: json.dump(var, fp, indent=2),
            "toml": toml.dump,
            "yaml": yaml.dump,
        }
        dumper = dumpers.get(file_format)
        with wf_open(cfg_path) as wf:
            if dumper is None:
                raise TypeError(
                    "Unsupported file format: %s, only `json`, `toml` and `yaml` are supported" % file_format
                )
            dumper(self.parsable_var, wf)
예제 #7
0
def extract_prerequisite(source, target, ku_dict):
    """in target: (A, B) means predecessor --> successor"""
    with codecs.open(
            source,
            encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
        ku_dict = json.load(kf)

        edges = []
        f.readline()  # skip header
        for row in tqdm(csv.reader(f)):
            if not row[2]:
                continue
            successor = ku_dict[row[0]]
            for name in row[2].split(','):
                predecessor = ku_dict[name]
                if predecessor == successor:
                    continue
                # there is a loop 498 -> 510 -> 61 -> 498 in original data
                if predecessor == 61 and successor == 498:
                    continue
                edges.append((predecessor, successor))

        logger.info("prerequisite edges: %s" % len(edges))

        # ensure the cleaned prerequisite graph is acyclic

        graph = nx.DiGraph()
        graph.add_edges_from(edges)
        assert not list(nx.algorithms.simple_cycles(graph)), "loop in DiGraph"

        json.dump(edges, wf, indent=2)
예제 #8
0
def template_copy(src: PATH_TYPE,
                  tar: PATH_TYPE,
                  default_value: (str, dict, None) = "",
                  quotation="\'",
                  key_lower=True,
                  **variables):
    """
    Render the template *src* into *tar*, substituting variables.

    Variables are usually written like `$PROJECT` in the template file.

    Parameters
    ----------
    src: template file
    tar: target location
    default_value: value used for variables not supplied in **variables
    quotation: the quotation to wrap the variable value
    key_lower: forwarded to the replacement helper
    variables: concrete values substituted into the template
    """

    if not override_check(tar):
        return

    with open(src) as template, wf_open(tar) as out:
        for raw_line in template:
            rendered = default_variable_replace(raw_line,
                                                default_value=default_value,
                                                quotation=quotation,
                                                key_lower=key_lower,
                                                **variables)
            print(rendered, end='', file=out)
예제 #9
0
파일: format.py 프로젝트: tswsxk/CangJie
def json2csv(src, tar, delimiter=' '):
    """Convert json lines of ``[token, vector]`` into csv rows; return *tar*."""
    with rf_open(src) as fin, wf_open(tar) as fout:
        writer = csv.writer(fout, delimiter=delimiter)
        for raw in fin:
            token, vec = json.loads(raw)
            row = [token]
            row.extend(str(v) for v in vec)
            writer.writerow(row)
    return tar
예제 #10
0
파일: format.py 프로젝트: tswsxk/CangJie
def gensim2csv(src, tar, delimiter=" "):
    """Dump every word of a gensim model with its vector to *tar* as csv rows.

    Parameters
    ----------
    src: path of the gensim model to load
    tar: target csv file
    delimiter: csv field delimiter
    """
    model = _load_gensim(src)

    with wf_open(tar) as wf:
        writer = csv.writer(wf, delimiter=delimiter)
        # fix: the progress description previously said "gensim2json"
        # (copy-paste from the sibling function); it now names this function
        for word in tqdm(model.wv.vocab,
                         "gensim2csv: %s --> %s" % (src, tar)):
            writer.writerow([word] + model.wv[word].tolist())
예제 #11
0
def synthetic2json(src, tar):
    """Turn comma-separated answer rows into json lines of [index, answer] pairs."""
    with open(src) as fin, wf_open(tar) as fout:
        desc = "%s -> %s" % (src, tar)
        for raw in tqdm(fin, desc=desc):
            stripped = raw.strip()
            if not stripped:  # pragma: no cover
                continue
            answers = stripped.split(",")
            pairs = [[idx, int(ans)] for idx, ans in enumerate(answers)]
            print(json.dumps(pairs), file=fout)
예제 #12
0
파일: iterator.py 프로젝트: tswsxk/longling
    def cached(self):
        """Drain the cache queue into the cache file, one json line per item,
        stopping when a StopIteration sentinel instance is received."""
        queue = self.cache_queue
        assert queue is not None

        with wf_open(self.cache_file, mode="w") as wf:
            item = queue.get()
            while not isinstance(item, StopIteration):
                print(json.dumps(item), file=wf)
                item = queue.get()
예제 #13
0
def merge_relationship_annotation(sources, target):
    """Concatenate two annotation files into *target*, dropping the second
    file's first line (its header is already present from the first file)."""
    with wf_open(target) as wf:
        with codecs.open(sources[0]) as first:
            wf.writelines(first)
        with codecs.open(sources[1]) as second:
            second.readline()  # skip duplicated header line
            wf.writelines(second)
예제 #14
0
def _write(students, target):
    """Write each session's (exercise, correct) pairs, time-sorted, as json lines."""
    with wf_open(target) as wf:
        desc = "writing -> %s" % target
        for _student, sessions in tqdm(students.items(), desc):
            for _session, records in sessions.items():
                records.sort(key=lambda rec: rec[0])
                pairs = [(rec[1], rec[2]) for rec in records]
                print(json.dumps(pairs), file=wf)
예제 #15
0
파일: format.py 프로젝트: tswsxk/CangJie
def csv2json(src, tar, delimiter=' ', skip_first_line=False):
    """Convert csv word-vector rows into json lines of [token, vec]; return *tar*."""
    with rf_open(src) as fin, wf_open(tar) as fout:
        if skip_first_line:  # pragma: no cover
            fin.readline()
        desc = "csv2json: %s --> %s" % (src, tar)
        for row in tqdm(csv.reader(fin, delimiter=delimiter), desc):
            token = row[0]
            vec = [float(v) for v in row[1:]]
            print(json.dumps([token, vec]), file=fout)
    return tar
예제 #16
0
def build_interactions(users_dir, questions_csv, tar):
    """Walk *users_dir*, convert every per-user log file matching ``u*.csv``
    into an interaction sequence and dump each as one json line into *tar*.

    Parameters
    ----------
    users_dir: directory tree containing per-user csv logs
    questions_csv: csv file used to construct the Judgement
    tar: target json-lines file
    """
    judgement = Judgement(questions_csv)
    # fix: the pattern was a plain string "u.*\.csv", whose "\." is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+); use a raw string and
    # compile once outside the loop
    user_file_pattern = re.compile(r"u.*\.csv")

    with wf_open(tar) as wf:
        for root, dirs, files in os.walk(users_dir):
            for filename in tqdm(files, "building interactions"):
                if user_file_pattern.match(filename):
                    interactions_seq = csv2interactions(
                        path_append(root, filename, to_str=True), judgement)
                    print(json.dumps(interactions_seq), file=wf)
예제 #17
0
def dense_graph(ku_num, tar=None):
    """
    Build the dense directed graph on *ku_num* vertices (no self loops).

    Generalized: *tar* is now optional and the edge list is returned, matching
    the richer ``dense_graph`` variant elsewhere in the codebase; existing
    callers passing *tar* positionally are unaffected.

    Parameters
    ----------
    ku_num: int
        number of vertices
    tar
        optional target json file; when given, the edge list is dumped there

    Returns
    -------
    list
        all [i, j] pairs with i != j

    >>> dense_graph(3)
    [[0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]]
    """
    _graph = [[i, j] for i in range(ku_num) for j in range(ku_num) if i != j]

    if tar is not None:
        with wf_open(tar) as wf:
            json.dump(_graph, wf, indent=2)
    return _graph
예제 #18
0
def test_path(tmp_path):
    """Exercise path helpers: join, write, existence check, parent navigation."""
    path_append(tmp_path, "../data", "../dataset1/", "train", to_str=True)

    out_file = path_append(tmp_path, "test_path.txt")
    with wf_open(out_file) as wf:
        print("hello world", file=wf)

    assert file_exist(out_file)
    base_dir = abs_current_dir(out_file)
    assert parent_dir(base_dir, 2) == path_append(base_dir, "..", "..")
예제 #19
0
def build_ku_dict(source, target):
    """Assign a dense integer id to each distinct vertex name in *source*
    (first csv column) and dump the name -> id mapping to *target* as json."""
    with codecs.open(source, encoding="utf-8") as f, wf_open(target) as wf:
        f.readline()  # skip header
        vertex_dict = {}
        for row in tqdm(csv.reader(f)):
            name = row[0]
            if name not in vertex_dict:
                # next free id equals the current mapping size
                vertex_dict[name] = len(vertex_dict)
        logger.info("vertex num: %s" % len(vertex_dict))
        json.dump(vertex_dict, wf, indent=2)
예제 #20
0
def dense_graph(ku_num: int, tar=None, undirected: bool = False):
    """
    Build a dense graph in which every pair of distinct vertices is linked.

    No self loop is created.

    Parameters
    ----------
    ku_num: int
        number of vertices
    tar
        optional target json file (readable back with ``json.load``)
    undirected
        when True, each unordered pair appears once; otherwise both directions

    Examples
    --------
    Demo of target file with undirected tag is False:
    [
        [0, 1],
        [0, 2],
        [1, 0],
        ...
        [2, 0],
        [2, 1]
    ]

    Demo of target file with undirected tag is True:
    [
        [0, 1],
        [1, 2],
        [0, 2]
    ]

    >>> dense_graph(3)
    [[0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]]
    >>> dense_graph(3, undirected=True)
    [[0, 1], [0, 2], [1, 2]]
    """
    if undirected:
        # each unordered pair once, j > i
        _graph = [[i, j] for i in range(ku_num) for j in range(i + 1, ku_num)]
    else:
        # both directions for every distinct pair
        _graph = [[i, j] for i in range(ku_num) for j in range(ku_num) if i != j]

    if tar is not None:
        with wf_open(tar) as wf:
            json.dump(_graph, wf, indent=2)
    return _graph
예제 #21
0
def _output_graph(graph, tar):
    """Dump the positively-weighted off-diagonal entries of a square adjacency
    matrix *graph* to *tar* as json triples [i, j, weight]."""
    n = len(graph)

    edges = [[i, j, graph[i][j]]
             for i in range(n)
             for j in range(n)
             if i != j and graph[i][j] > 0]

    with wf_open(tar) as wf:
        json.dump(edges, wf, indent=2)
예제 #22
0
def extract_difficulty(source, target, ku_dict):
    """
    In target: (A, B, v) means A is similar with B in v degree.
    If v is small, A and B should be considered as not similar.
    """
    triples = []
    with codecs.open(source, encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
        f.readline()  # skip header
        mapping = json.load(kf)
        for row in csv.reader(f):
            triples.append((mapping[row[0]], mapping[row[1]], float(row[4])))

        logger.info("edges: %s" % len(triples))

        # log a summary of the similarity-degree distribution
        logger.info(pandas.Series([t[-1] for t in triples]).describe())
        json.dump(triples, wf, indent=2)
예제 #23
0
def select_n_most_active(src, tar, n):
    """Copy to *tar* only the *n* rows of *src* whose json sequences are longest.

    Parameters
    ----------
    src: json-lines source file, one sequence per row
    tar: target file receiving the selected rows verbatim
    n: number of most-active rows to keep

    Notes
    -----
    Fix: the previous ``set(list(zip(*heapq.nlargest(...)))[0])`` raised
    IndexError whenever the selection was empty (``n <= 0`` or an empty
    source file); the set comprehension handles that case gracefully.
    """
    lengths = []
    with open(src) as f:
        for i, line in tqdm(enumerate(f), "evaluating length of each row"):
            lengths.append([i, len(json.loads(line))])

    selected_idx = {
        idx for idx, _ in heapq.nlargest(n, lengths, key=lambda x: x[1])
    }

    with open(src) as f, wf_open(tar) as wf:
        for i, line in tqdm(
                enumerate(f),
                "selecting %s most active students from %s to %s" %
            (n, src, tar)):
            if i not in selected_idx:
                continue
            print(line, end='', file=wf)
예제 #24
0
def test_template_copy(tmpdir):
    """template_copy should substitute the $-variables found in the template."""
    config.OVERRIDE = True

    pseudo_template = """
    project=$PROJECT
    author=$AUTHOR
    """.lstrip()
    src = path_append(tmpdir, "src.template")
    tar = path_append(tmpdir, "tar")
    with wf_open(src) as wf:
        print(pseudo_template, file=wf)

    template_copy(src,
                  tar,
                  quotation='',
                  project="longling",
                  author="sherlock")

    with open(tar) as f:
        rendered = [f.readline().strip(), f.readline().strip()]
    assert rendered[0] == "project=longling"
    assert rendered[1] == "author=sherlock"
예제 #25
0
def test_load_configuration_json(tmpdir, file_format):
    """Round-trip a small configuration through each supported file format,
    and check that an unsupported format ("err") raises TypeError on load."""
    configuration = {"id": "12345", "name": "test_config"}

    filename = path_append(tmpdir, "test_config.%s" % file_format)

    # format name -> dump callable; anything else falls back to plain print
    dumpers = {"json": json.dump, "toml": toml.dump, "yaml": yaml.dump}
    with wf_open(filename) as wf:
        dumper = dumpers.get(file_format)
        if dumper is not None:
            dumper(configuration, wf)
        else:
            print(configuration, file=wf)

    if file_format == "err":
        with pytest.raises(TypeError):
            with open(filename) as f:
                load_configuration(f, file_format=file_format)
    else:
        with open(filename) as f:
            loaded = load_configuration(f, file_format=file_format)
            assert loaded["id"] == "12345"
            assert loaded["name"] == "test_config"
예제 #26
0
    with wf_open(target) as wf:
        for student_id, sessions in tqdm(students.items(), "sorting"):
            for session_id, exercises in sessions.items():
                exercises.sort(key=lambda x: x[0])
                exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
                print(json.dumps(exercise_response), file=wf)


if __name__ == '__main__':
    root = "../../"
    student_log_raw_file = root + "raw_data/junyi/junyi_ProblemLog_for_PSLC.txt"
    student_log_file = root + "data/junyi/student_log_kt.json"
    ku_dict_file = root + "data/junyi/graph_vertex.json"
    # expensive full extraction, run once then keep disabled:
    # extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)

    student_log_file_small = student_log_file + ".small"

    # keep only the first 50001 lines of the log for quick experiments
    with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
        for line_no, row in tqdm(enumerate(f)):
            if line_no > 50000:
                break
            print(row, end="", file=wf)

    print(train_valid_test(
        student_log_file_small,
        valid_ratio=0.,
        test_ratio=0.2,
        root_dir=root + "data/junyi/",
        silent=False,
    ))
예제 #27
0
def gitlab_ci(private,
              stages: dict,
              atype: str = "",
              tar_dir: PATH_TYPE = "./",
              version_in_path=True):
    """
    cli alias: ``arch gitlab_ci``

    Generate a ``.gitlab-ci.yml`` in *tar_dir* by merging the base template
    with the archetype-specific template ``<atype>.gitlab-ci.yml``, then
    emitting one configuration section per requested stage.

    Parameters
    ----------
    private
        truthy for a private project; after writing, a (Chinese) reminder is
        printed about adding a read_registry deploy token
    stages
        mapping of stage name -> parameter dict forwarded to ``_gitlab_ci``;
        stages absent from the merged template are skipped with a warning
    atype
        archetype name selecting which ``*.gitlab-ci.yml`` template to merge
        on top of the base template
    tar_dir
        directory where ``.gitlab-ci.yml`` is written
    version_in_path
        forwarded to ``_gitlab_ci``

    Returns
    -------

    """
    base_src = path_append(META, "gitlab-ci", ".gitlab-ci.yml")
    src = path_append(META, "gitlab-ci", "%s.gitlab-ci.yml" % atype)
    tar = path_append(tar_dir, ".gitlab-ci.yml")

    # merge order matters: archetype-specific entries override base entries
    config_template = OrderedDict()

    with open(base_src) as f:
        config_template.update(ordered_yaml_load(f))

    with open(src) as f:
        config_template.update(ordered_yaml_load(f))

    logger.info("generate %s" % tar)

    with wf_open(tar) as wf:
        # global sections first, only if the templates define them
        for _c in ["variables", "cache"]:
            if _c in config_template:
                print(dump_folded_yaml({_c: config_template[_c]}), file=wf)

        # declare only the stages that both the caller requested and the
        # merged template knows about, preserving the caller's order
        print(dump_folded_yaml({
            "stages":
            [stage for stage in stages.keys() if stage in config_template]
        }),
              file=wf)

        for stage, params in stages.items():
            # stage-specific parameter tweaks (mutates the caller's dict)
            if stage == "docs":
                params["registry_suffix"] = "/docs"
            elif stage in {"test", "build"}:
                params["deployment"] = False

            if stage not in config_template:
                logger.warning("%s is not listed in %s, skipped" %
                               (stage, src))
                continue
            commands = {stage: config_template[stage]}
            # _gitlab_ci fills in the stage commands in place
            _gitlab_ci(commands,
                       stage,
                       private=private,
                       version_in_path=version_in_path,
                       **params)
            print(dump_folded_yaml(commands), file=wf)

    if private:
        # user-facing reminder (in Chinese): private projects need a deploy
        # token named gitlab-deploy-token with read_registry permission
        print("*" * 30)
        print("私有项目注意")
        print(
            "在项目的settings->repository->Deploy Tokens 添加一个name为gitlab-deploy-token、"
            "其他两项留空的具有read_registry权限的token")
        print("*" * 30)