def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """docstring_tokens => filtered docstring_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                # drop separator runs (e.g. "----", "****") and HTML/XML tags
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                # discard docstrings containing non-ASCII tokens
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                # keep docstrings of 4..50 tokens only
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
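# ---------------------------------------------------------------------------
# A minimal driver sketch (an assumption, not part of this module): the *_fn
# workers in this file share one convention -- each worker re-opens the input
# file, seeks to byte offset `start`, and stops once `reader.tell()` passes
# `end`, so one file can be split across processes. Keyword options travel as
# `[kwargs]` because dicts cannot be fed directly through the worker signature,
# hence the `kwargs = args[0][0]` unpacking above. Both helpers below
# (`find_offsets`, `run_workers`) are hypothetical names for illustration.
def find_offsets(filename, num_chunks):
    # split `filename` into num_chunks byte ranges aligned on line boundaries
    size = os.path.getsize(filename)
    offsets = [0]
    with open(filename, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()  # advance to the next full line
            offsets.append(f.tell())
    offsets.append(-1)  # -1 => the last worker reads to EOF
    return offsets


def run_workers(worker_fn, filename, dest_filename, num_workers, **kwargs):
    from multiprocessing import Pool
    offsets = find_offsets(filename, num_workers)
    with Pool(num_workers) as pool:
        results = [
            pool.apply_async(
                worker_fn,
                (filename, dest_filename, i, offsets[i], offsets[i + 1], [kwargs]),
            )
            for i in range(num_workers)
        ]
        for res in results:
            res.get()  # re-raise worker errors, if any
# e.g. run_workers(docstring_tokens_fn, in_file, out_prefix, num_workers=4)
# ---------------------------------------------------------------------------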
def tokenization(in_file, out_file, lang, attr, start=0, end=-1):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()
            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line, remove_eol=True, remove_url=True)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
def type_tokenize(line, **kwargs):
    paths = json_io.json_loads(line)
    subtokens = []
    for p in paths:
        _, body, _ = p.split(constants.PATH_SEP)
        subtokens.extend(body.split(constants.PATH_LINK))
    return subtokens
def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code => raw_ast"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    lang = kwargs.get('lang')
    so_dir = kwargs.get('so_dir')
    so_filename = os.path.join(os.path.expanduser(so_dir), '{}.so'.format(lang))
    parser = TreeSitterASTParser(so_filename, lang)
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            if code:
                raw_ast = parser.parse_raw_ast(code)
            else:
                raw_ast = None
            print(json_io.json_dumps(raw_ast), file=writer)
            line = safe_readline(reader)
def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                try:
                    ast = util_ast.value2children(ast)
                    ast = util_ast.remove_root_with_uni_child(ast)
                    root_idx = util_ast.get_root_idx(ast)
                    ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                    bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                except RecursionError:
                    LOGGER.error('RecursionError, ignore this tree')
                    bin_ast = None
                except Exception as err:
                    LOGGER.error(err)
                    bin_ast = None
            else:
                bin_ast = None
            print(json_io.json_dumps(bin_ast), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename_terminals, dest_filename = \
        dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, \
            file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                if paths is None:
                    paths = [[None] * 3] * PATH_NUM
                else:
                    # pad the path list up to PATH_NUM by resampling existing paths
                    if len(paths) < PATH_NUM:
                        supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                     + random.sample(range(len(paths)), (PATH_NUM - len(paths)) % len(paths))
                        paths.extend([paths[i] for i in supply_ids])
                    random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                head, body, tail = zip(*paths)
            else:
                head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
            # terminals
            for terminal in itertools.chain(*zip(head, tail)):
                print(json_io.json_dumps(terminal), file=writer_terminals)
            # path
            for b in body:
                print(json_io.json_dumps(b), file=writer)
            line = safe_readline(reader)
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code_tokens => filtered code_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments out of code_tokens, e.g. //***\n, /* */\n
                code_tokens = [
                    token for token in code_tokens
                    if not (str.startswith(token, '//') or str.startswith(token, '#') or
                            (str.startswith(token, '/*') and str.endswith(token, '*/')))
                ]
                # discard snippets containing non-ASCII tokens
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                if code_tokens is None or len(code_tokens) < 1:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
def subtoken_tokenize(line, **kwargs):
    paths = json_io.json_loads(line)
    subtokens = []
    for p in paths:
        head, _, tail = p.split(constants.PATH_SEP)
        subtokens.extend(head.split(constants.PATH_LINK))
        subtokens.extend(tail.split(constants.PATH_LINK))
    return subtokens
def type_tokenize(line, **kwargs):
    line = json_io.json_loads(line)
    paths = line.split(' ')[1:]
    subtokens = []
    for p in paths:
        _, body, _ = p.split(',')
        subtokens.extend(body.split('|'))
    return subtokens
def tokenize_func(line):
    dp = []
    for node in json_io.json_loads(line):
        if "value" in node:
            dp.append(node["value"])
        else:
            dp.append(node["type"])
    return dp
def subtoken_tokenize(line, **kwargs):
    line = json_io.json_loads(line)
    paths = line.split(' ')[1:]
    subtokens = []
    for p in paths:
        head, _, tail = p.split(',')
        subtokens.extend(head.split('|'))
        subtokens.extend(tail.split('|'))
    return subtokens
def _func(line):
    line = py150_util.separate_dps(
        json_io.json_loads(line.strip()), args['preprocess']['n_ctx'])
    line = [
        py150_util.get_dfs(ast) + [ext]
        for ast, ext in line if len(ast) > 1
    ]
    # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
    return line
def path_tokenize(line, **kwargs):
    paths = json_io.json_loads(line)
    paths = paths[:kwargs['max_path_num']]
    # do not sample paths randomly, to keep our generated datasets the same
    # if len(paths) > kwargs['max_path_num']:
    #     paths = np.random.choice(paths, kwargs['max_path_num'], replace=False).tolist()
    heads, bodies, tails = [], [], []
    for p in paths:
        head, body, tail = p.split(constants.PATH_SEP)
        heads.append(head.split(constants.PATH_LINK))
        bodies.append(body.split(constants.PATH_LINK))
        tails.append(tail.split(constants.PATH_LINK))
    return heads, bodies, tails
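# A worked example of the serialized path format consumed above, assuming
# constants.PATH_SEP == ',' and constants.PATH_LINK == '|' (consistent with
# the hard-coded separators in the type_tokenize/subtoken_tokenize variants
# above, but still an assumption about `constants`):
def _path_format_demo():
    path = "my|name,NameExpr|Call|NameExpr,get|name"
    head, body, tail = path.split(',')
    assert head.split('|') == ['my', 'name']                    # head terminal subtokens
    assert body.split('|') == ['NameExpr', 'Call', 'NameExpr']  # internal node types on the path
    assert tail.split('|') == ['get', 'name']                   # tail terminal subtokens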
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
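# A hypothetical sanity check for the rows built above (a sketch of the length
# invariants of parse_source_input/parse_target_input, assuming config.* are
# set; `_check_parsed_lengths` is an illustrative name, not part of the repo):
def _check_parsed_lengths(src_line, tgt_line):
    source_ids, source_mask, source_size = src_line
    assert len(source_ids) == len(source_mask) == config.MAX_SOURCE_LENGTH
    assert sum(source_mask) == source_size  # `source_size` ones, then padding zeros
    target_ids, target_mask, target_size = tgt_line
    assert len(target_ids) == len(target_mask) == config.MAX_TARGET_LENGTH
    assert sum(target_mask) == target_size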
def binarize(args, in_file: str, out_file: str, vocab, token_dict, offset: int, end: int):
    ds = indexed_dataset.make_builder(f"{out_file}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(in_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            code_tokens = vocab.encode(line, out_type=str)
            code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
            ds.add_item(code_tokens)
            line = reader.readline()
    ds.finalize(f'{out_file}.idx')
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, int(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap", impl='mmap', vocab_size=len(vocab))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor([dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
def type_tokenize_func(line):
    ast = json_io.json_loads(line)
    code_types = []
    idx = 0
    while idx < len(ast):
        if ast[idx].get('type', None) in {"attr", "Num", "NameLoad", "NameStore", "NameParam"}:
            code_types.extend([constants.PAD, ast[idx]['type']])
            idx += 2
        else:
            code_types.append(constants.PAD)
            idx += 1
    return code_types
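# A minimal input/output sketch for type_tokenize_func, assuming the py150
# flattened-AST layout where a typed name/number node is immediately followed
# by its value node (and that json_io.json_loads behaves like json.loads):
def _type_tokenize_demo():
    import json
    ast = [
        {"type": "NameLoad"}, {"value": "x"},  # consumed as a (type, value) pair
        {"type": "BinOpAdd"},                  # any other node contributes one PAD
        {"type": "Num"}, {"value": "1"},
    ]
    code_types = type_tokenize_func(json.dumps(ast))
    # expected: [PAD, 'NameLoad', PAD, PAD, 'Num'] with PAD == constants.PAD
    return code_types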
def ast_fn(filename, dest_filename, idx, start=0, end=-1):
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r", encoding="UTF-8") as reader, open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = convert(line)
            print(json_io.json_dumps(ast), file=writer)
            line = file_io.safe_readline(reader)
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """copy docstrings through to the chunked output file"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def code_wo_func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    func_filename = filename[:str.rfind(filename, '.')] + '.func'
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, open(func_filename, 'r') as func_reader, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        func_line = safe_readline(func_reader)
        while line and func_line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            func_name = json_io.json_loads(func_line)
            # strip the first occurrence of the function name out of the code
            start_idx = str.find(code, func_name)
            if start_idx != -1:
                code_wo_func = code[:start_idx] + code[start_idx + len(func_name):]
            else:
                code_wo_func = None
            print(json_io.json_dumps(code_wo_func), file=writer)
            line = safe_readline(reader)
            func_line = safe_readline(func_reader)
def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"),
                        LANGUAGE=kwargs['lang'])
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
            print(json_io.json_dumps(ast), file=writer)
            line = safe_readline(reader)
def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            func_name = json_io.json_loads(line)
            # keep only the last component of a dotted function name
            func = func_name.split('.')[-1]
            print(json_io.json_dumps(func), file=writer)
            line = safe_readline(reader)
def code_tokenization(src_file):
    from clgen._atomizer import GreedyAtomizer
    from clgen._langs import Language

    with open(src_file, 'r') as reader:
        src_codes = reader.readlines()
    opencl_lang = Language.from_str('opencl')
    atomizer = GreedyAtomizer.from_text(opencl_lang, text='\n'.join(src_codes))

    dst_file = f"{src_file}_tokens"
    with open(dst_file, 'w') as writer:
        for code in src_codes:
            code = json_io.json_loads(code)
            code_tokens = atomizer.atomize(code)
            code_tokens = [atomizer.atoms[idx] for idx in code_tokens]
            print(json_io.json_dumps(code_tokens), file=writer)
def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast_traversal = util_traversal.get_dfs(ast)
            else:
                ast_traversal = None
            print(json_io.json_dumps(ast_traversal), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            # write one output line per input line so files stay aligned
            if ast:
                paths = util_path.ast_to_path(ast)
            else:
                paths = None
            print(json_io.json_dumps(paths), file=writer)
            line = safe_readline(reader)
def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast is not None:
                dfs, _ = ast_to_dfs(ast)
            else:
                dfs = None
            print(json_io.json_dumps(dfs), file=writer)
            line = safe_readline(reader)
def add_from_file(self, f):
    """
    Loads a pre-existing dictionary from a text file and adds its symbols
    to this instance.
    """
    if isinstance(f, str):
        try:
            with file_io.open(f, "r") as fd:
                self.add_from_file(fd)
        except FileNotFoundError as fnfe:
            raise fnfe
        except UnicodeError:
            raise Exception("Incorrect encoding detected in {}, please "
                            "rebuild the dataset".format(f))
        return

    lines = f.readlines()
    indices_start_line = self._load_meta(lines)

    for line in lines[indices_start_line:]:
        try:
            raw_line = json_io.json_loads(line.rstrip())
            line, field = raw_line[:-1], raw_line[-1]
            if field == "#fairseq:overwrite":
                overwrite = True
                line, field = line[:-1], line[-1]
            else:
                overwrite = False
            count = int(field)
            # `line` is now a single-element list; unwrap the token itself
            word = line[0]
            if word in self and not overwrite:
                raise RuntimeError(
                    "Duplicate word found when loading Dictionary: '{}'. "
                    "Duplicate words can overwrite earlier ones by adding the "
                    "#fairseq:overwrite flag at the end of the corresponding row "
                    "in the dictionary file. If using the Camembert model, please "
                    "download an updated copy of the model file.".format(word)
                )
            self.add_symbol(word, n=count, overwrite=overwrite)
        except ValueError:
            raise ValueError(
                "Incorrect dictionary format, expected '<token> <cnt> [flags]'"
            )
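# Expected row format in the jsonl dictionary consumed by add_from_file, as
# implied by the parsing above (token and count values are illustrative):
#   ["def", 1504]                         -> add_symbol("def", n=1504)
#   ["def", 1504, "#fairseq:overwrite"]   -> overwrite an earlier "def" entry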
def binarize_dfs(args, filename: str, dict, in_file: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))
    with file_io.open(filename, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(in_file))
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
    # close attribute writers so buffered lines are flushed to disk
    for writer in attr_writers.values():
        writer.close()
def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast = util_ast.value2children(ast)
                padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                root_idx = util_ast.get_root_idx(padded_ast)
                sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
            else:
                sbt = None
            print(json_io.json_dumps(sbt), file=writer)
            line = safe_readline(reader)