Example #1
    def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename_terminals, dest_filename = dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                    if paths is None:
                        paths = [[None] * 3] * PATH_NUM
                    else:
                        # duplicate existing paths so that len(paths) == PATH_NUM
                        if len(paths) < PATH_NUM:
                            supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                         + random.sample(range(len(paths)), ((PATH_NUM - len(paths)) % len(paths)))
                            paths.extend([paths[idx] for idx in supply_ids])
                    random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                    head, body, tail = zip(*paths)
                else:
                    head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
                # terminals
                for terminal in itertools.chain(*zip(head, tail)):
                    print(json_io.json_dumps(terminal), file=writer_terminals)
                # path
                for b in body:
                    print(json_io.json_dumps(b), file=writer)
                line = safe_readline(reader)
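Note: most workers in this collection share the signature (filename, dest_filename, idx, start=0, end=-1, *args), where start/end are byte offsets bounding this worker's chunk and the keyword arguments arrive wrapped inside the first extra positional argument (hence kwargs = args[0][0]). The sketch below shows how such a worker might be driven over a file split into line-aligned chunks; it is a minimal illustration using hypothetical names (find_line_offsets, chunk_worker, input.jsonl, output.jsonl) that mirrors the offset/Pool pattern visible in Examples #9 and #29, not the project's actual driver.

import os
from multiprocessing import Pool

def find_line_offsets(filename, num_chunks):
    # split the file into num_chunks byte ranges aligned to line boundaries
    # (same idea as find_offsets in Example #9)
    with open(filename, "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0] * (num_chunks + 1)
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # skip the partial line at the seek position
            offsets[i] = f.tell()
    return offsets

def chunk_worker(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # same argument packing the workers above expect
    with open(filename, "r", encoding="utf-8") as reader, \
            open(dest_filename + str(idx), "w", encoding="utf-8") as writer:
        reader.seek(start)
        line = reader.readline()
        while line:
            if end > 0 and reader.tell() > end:
                break
            writer.write(line.upper())  # placeholder for the real per-line transform
            line = reader.readline()

if __name__ == "__main__":
    num_workers = 4
    offsets = find_line_offsets("input.jsonl", num_workers)
    with Pool(num_workers) as pool:
        for worker_id in range(num_workers):
            pool.apply_async(
                chunk_worker,
                ("input.jsonl", "output.jsonl", worker_id,
                 offsets[worker_id], offsets[worker_id + 1],
                 [{"lang": "python"}]),  # kwargs wrapped in a list -> args[0][0]
            )
        pool.close()
        pool.join()

Each worker writes to dest_filename + str(idx), so a later step is expected to concatenate or merge the per-worker shards in index order (Example #29, for instance, merges its per-worker shards with ds.merge_file_).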
Example #2
def flatten_attrs(raw_file, flatten_dir, mode, attrs, start=0, end=-1):
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            filename = os.path.join(os.path.dirname(raw_file), line.strip())
            # tokens, types = parse_file(filename)
            try:
                tokens, types = parse_file(filename)
                # replace None with [PAD] for type dictionary build
                types = [PAD if t is None else t for t in types]
            except Exception as err:
                # print(err)
                # print(f'parsing {filename} error')
                line = file_io.safe_readline(reader)
                continue
            print(json_io.json_dumps(tokens), file=attr_writers['code_tokens'])
            print(json_io.json_dumps(types), file=attr_writers['code_types'])
            line = file_io.safe_readline(reader)
Example #3
    def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code_tokens => filtered code_tokens"""
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code_tokens = json_io.json_loads(line)
                if code_tokens:
                    # filter out comment tokens, e.g. "//...", "#...", "/* ... */"
                    code_tokens = [token for token in code_tokens
                                   if not (str.startswith(token, '//') or str.startswith(token, '#') or \
                                           (str.startswith(token, '/*') and str.endswith(token, '*/')))
                                   ]

                    if not all(str.isascii(token) for token in code_tokens):
                        code_tokens = None
                    if code_tokens is None or len(code_tokens) < 1:
                        code_tokens = None
                else:
                    code_tokens = None

                print(json_io.json_dumps(code_tokens), file=writer)
                line = safe_readline(reader)
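The list comprehension above drops tokens that are themselves comments. A small standalone illustration of the same predicates, with made-up token values:

code_tokens = ['def', 'add', '(', 'a', ',', 'b', ')', ':',
               '// inline note', '# todo', '/* block */', 'return', 'a', '+', 'b']
kept = [token for token in code_tokens
        if not (token.startswith('//') or token.startswith('#')
                or (token.startswith('/*') and token.endswith('*/')))]
print(kept)  # ['def', 'add', '(', 'a', ',', 'b', ')', ':', 'return', 'a', '+', 'b']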
Example #4
def tokenization(
    in_file,
    out_file,
    lang,
    attr,
    start=0,
    end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file,
                                                            'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()

            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()

            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line,
                                           remove_eol=True,
                                           remove_url=True)

            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
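tokenizer here is presumably a SentencePiece processor (Example #29 constructs one the same way with spm.SentencePieceProcessor()). A minimal standalone usage of encode_as_pieces, assuming a trained model file named spm.model is available:

import sentencepiece as spm

tokenizer = spm.SentencePieceProcessor()
tokenizer.load('spm.model')  # hypothetical path to a trained SentencePiece model
print(tokenizer.encode_as_pieces('def add ( a , b ) : return a + b'))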
Example #5
    def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
        lang = kwargs.get('lang')
        so_dir = kwargs.get('so_dir')

        so_filename = os.path.join(os.path.expanduser(so_dir),
                                   '{}.so'.format(lang))
        parser = TreeSitterASTParser(so_filename, lang)
        dest_filename = dest_filename + str(idx)
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code = json_io.json_loads(line)
                if code:
                    raw_ast = parser.parse_raw_ast(code)
                else:
                    raw_ast = None
                print(json_io.json_dumps(raw_ast), file=writer)
                line = safe_readline(reader)
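The '{lang}.so' files loaded by TreeSitterASTParser are compiled tree-sitter grammars. Assuming the classic py-tree-sitter API, such a shared library could be built roughly as follows (the grammar checkout path is a placeholder):

from tree_sitter import Language

# hypothetical local checkout of the tree-sitter grammar for the target language
Language.build_library(
    'so/python.so',                 # output matches the '{lang}.so' naming above
    ['vendor/tree-sitter-python'],
)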
Example #6
    def docstring_tokens_fn(filename,
                            dest_filename,
                            idx,
                            start=0,
                            end=-1,
                            *args):
        """docstring_tokens => filtered docstring_tokens"""
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring_tokens = json_io.json_loads(line)
                if docstring_tokens:
                    docstring_tokens = [
                        token for token in docstring_tokens \
                        if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                    ]
                    if not all(
                            str.isascii(token) for token in docstring_tokens):
                        docstring_tokens = None
                    if (docstring_tokens is
                            None) or not (3 < len(docstring_tokens) <= 50):
                        docstring_tokens = None
                else:
                    docstring_tokens = None
                print(json_io.json_dumps(docstring_tokens), file=writer)
                line = safe_readline(reader)
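The two regular expressions strip decorative separators (runs of -, *, =, ~) and HTML-like tags from the docstring tokens; a non-ASCII token or a length outside (3, 50] then nulls the whole sample. A small standalone illustration of the token filter, with made-up tokens:

import re

docstring_tokens = ['Returns', 'the', 'sum', '----', '<p>', 'of', 'two', 'numbers', '====']
kept = [token for token in docstring_tokens
        if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))]
print(kept)  # ['Returns', 'the', 'sum', 'of', 'two', 'numbers']

The six surviving tokens would also pass the 3 < len(docstring_tokens) <= 50 check.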
Example #7
    def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    try:
                        ast = util_ast.value2children(ast)
                        ast = util_ast.remove_root_with_uni_child(ast)
                        root_idx = util_ast.get_root_idx(ast)
                        ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                        root_idx = util_ast.get_root_idx(ast)
                        bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                        root_idx = util_ast.get_root_idx(ast)
                        bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                        bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                    except RecursionError:
                        LOGGER.error('RecursionError, ignore this tree')
                        bin_ast = None
                    except Exception as err:
                        LOGGER.error(err)
                        bin_ast = None
                else:
                    bin_ast = None
                print(json_io.json_dumps(bin_ast), file=writer)
                line = safe_readline(reader)
Example #8
def _add_tok_to_dictionary_single_worker(
    filename: str,
    tokenize: Any,
    eos_word: Optional[str],
    worker_id: int = 0,
    num_workers: int = 1,
) -> Counter:
    counter = Counter()
    with file_io.open(filename, "r") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            tokens = tokenize(line)
            counter.update(tokens)
            if eos_word is not None:
                counter.update([eos_word])
            if f.tell() > end:
                break
            line = f.readline()
    return counter
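Each call above counts tokens over roughly size // num_workers bytes of the file and returns its own Counter; combining the per-worker counters is not shown in these examples. A minimal, self-contained sketch of that fan-out/merge step, using a simplified whitespace-tokenizing stand-in for _add_tok_to_dictionary_single_worker and a hypothetical corpus.txt:

from collections import Counter
from multiprocessing import Pool
import os

def count_chunk(filename, worker_id, num_workers):
    # stand-in for _add_tok_to_dictionary_single_worker with whitespace tokenization
    counter = Counter()
    with open(filename, "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            f.readline()  # drop the first (possibly incomplete) line
        line = f.readline()
        while line:
            counter.update(line.split())
            if f.tell() > end:
                break
            line = f.readline()
    return counter

if __name__ == "__main__":
    num_workers = 4
    with Pool(num_workers) as pool:
        results = [pool.apply_async(count_chunk, ("corpus.txt", w, num_workers))
                   for w in range(num_workers)]
        merged = Counter()
        for r in results:
            merged.update(r.get())  # Counter.update adds counts rather than replacing them
    print(merged.most_common(10))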
Example #9
def find_offsets(filename, num_chunks):
    with open(filename, "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0 for _ in range(num_chunks + 1)]
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            safe_readline(f)
            offsets[i] = f.tell()
        return offsets
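safe_readline is used throughout these examples but never defined here. In fairseq-style binarizers it is typically a readline that backs up and retries when the seek position lands inside a multi-byte UTF-8 character; a sketch under that assumption:

def safe_readline(f):
    pos = f.tell()
    while True:
        try:
            return f.readline()
        except UnicodeDecodeError:
            pos -= 1
            f.seek(pos)  # the seek landed mid-character; back up one byte and retry

Consecutive entries of the offsets list returned above form non-overlapping (start, end) byte ranges, e.g. zip(offsets[:-1], offsets[1:]); Example #29 hands offsets[worker_id] and offsets[worker_id + 1] to each worker in exactly this way.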
Example #10
def ast_fn(filename, dest_filename, idx, start=0, end=-1):
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r",
                      encoding="UTF-8") as reader, open(dest_filename,
                                                        'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = convert(line)
            print(json_io.json_dumps(ast), file=writer)
            line = file_io.safe_readline(reader)
Example #11
    def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
        parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"), LANGUAGE=kwargs['lang'])

        dest_filename = f"{dest_filename}{idx}"
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
                print(json_io.json_dumps(ast), file=writer)
                line = safe_readline(reader)
Example #12
    def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """pass docstring through unchanged"""
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring = json_io.json_loads(line)
                print(json_io.json_dumps(docstring), file=writer)
                line = safe_readline(reader)
Example #13
    def binarize_bpe(
        filename,
        dict,
        consumer,
        reverse_order=False,
        offset=0,
        end=-1,
    ):
        nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
        replaced = Counter()  # un-recorded tokens

        with open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                line = ujson.loads(line)
                line = ' '.join(line) if isinstance(line, list) else line
                ids = dict.encode_ids(line)
                if reverse_order:
                    ids = list(reversed(ids))
                ids = torch.IntTensor(ids)

                nseq += 1
                ntok += len(ids)
                consumer(ids)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
Example #14
def build_dgl_graph(vocab, input_file, output_file, start=0, end=-1):
    graph_batch = []
    with open(input_file, 'r') as reader:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = ujson.loads(line)
            if ast is None:
                graph = dgl.DGLGraph()
            else:
                graph = tree2dgl(ast, vocab)
            graph_batch.append(graph)
            line = safe_readline(reader)
    save_graphs(output_file, graph_batch)
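save_graphs presumably comes from dgl.data.utils; if so, the batch serialized by build_dgl_graph can be read back with its load_graphs counterpart (the file name below is a placeholder):

from dgl.data.utils import load_graphs

graph_batch, _ = load_graphs('train.dgl')  # hypothetical output_file written above
print(len(graph_batch), graph_batch[0].number_of_nodes())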
Example #15
    def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                func_name = json_io.json_loads(line)
                func = func_name.split('.')[-1]
                print(json_io.json_dumps(func), file=writer)
                line = safe_readline(reader)
Example #16
    def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = f"{dest_filename}{idx}"
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast is not None:
                    dfs, _ = ast_to_dfs(ast)
                else:
                    dfs = None
                print(json_io.json_dumps(dfs), file=writer)
                line = safe_readline(reader)
Example #17
    def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    paths = util_path.ast_to_path(ast)
                    print(json_io.json_dumps(paths), file=writer)
                line = safe_readline(reader)
Example #18
    def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    ast_traversal = util_traversal.get_dfs(ast)
                else:
                    ast_traversal = None
                print(json_io.json_dumps(ast_traversal), file=writer)
                line = safe_readline(reader)
Example #19
    def binarize(
        filename,
        dict,  # Dictionary
        consumer,
        tokenize=tokenize_string,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
        **kwargs,
    ):
        nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
        replaced = Counter()  # un-recorded tokens

        def replaced_consumer(word, idx):
            """save un-recorded token"""
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                        **kwargs,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
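When already_numberized is true, each input line is expected to hold whitespace-separated integer token ids rather than raw text. A tiny standalone illustration of that branch, with a hypothetical eos index of 2 in place of dict.eos():

import torch

line = "15 7 42 7"
id_list = [int(id_string) for id_string in line.strip().split()]
id_list.append(2)  # append_eos, using the hypothetical eos index
ids = torch.IntTensor(id_list)
print(ids)  # tensor([15,  7, 42,  7,  2], dtype=torch.int32)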
Example #20
    def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    ast = util_ast.value2children(ast)
                    padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                    root_idx = util_ast.get_root_idx(padded_ast)
                    sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
                else:
                    sbt = None
                print(json_io.json_dumps(sbt), file=writer)
                line = safe_readline(reader)
Example #21
    def path_binarizer(filename,
                       subtoken_dict,
                       consumer,
                       tokenize=None,
                       append_eos=True,
                       reverse_order=False,
                       offset=0,
                       end=-1,
                       type_dict=None,
                       **kwargs):
        nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
        replaced = Counter()  # un-recorded tokens

        def binarization(parts, dict):
            part_sizes = [len(p) for p in parts]
            parts = list(itertools.chain(*parts))
            parts = torch.Tensor([dict.index(token) for token in parts]).long()
            parts = parts.split(part_sizes, dim=0)
            return parts

        def encode_path(line):
            heads, bodies, tails = tokenize(
                line, max_path_num=kwargs['max_path_num'])
            heads = binarization(heads, subtoken_dict)
            bodies = binarization(bodies, type_dict)
            tails = binarization(tails, subtoken_dict)
            paths, path_sizes = [], []
            for head, body, tail in zip(heads, bodies, tails):
                paths.extend([head, body, tail])
                path_sizes.extend([len(head), len(body), len(tail)])
            paths = torch.cat(paths, dim=0)
            path_sizes = torch.Tensor(path_sizes).long()
            assert len(paths) == path_sizes.sum().item()
            return paths, path_sizes

        with file_io.open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = file_io.safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                paths, path_sizes = encode_path(line)
                ntok += len(paths)
                consumer(paths, path_sizes)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
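The binarization helper flattens a list of variable-length parts into one long index tensor and then splits it back using the recorded sizes. The same round trip with plain integers standing in for the dict.index lookups:

import itertools
import torch

parts = [[4, 9], [7], [3, 3, 8]]        # e.g. index lists for three path heads
part_sizes = [len(p) for p in parts]    # [2, 1, 3]
flat = torch.Tensor(list(itertools.chain(*parts))).long()
pieces = flat.split(part_sizes, dim=0)
print(pieces)  # (tensor([4, 9]), tensor([7]), tensor([3, 3, 8]))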
Example #22
def binarize(args, in_file: str, out_file: str, vocab, token_dict, offset: int, end: int):
    ds = indexed_dataset.make_builder(f"{out_file}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(in_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            code_tokens = vocab.encode(line, out_type=str)
            code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
            ds.add_item(code_tokens)
            line = reader.readline()
    ds.finalize(f'{out_file}.idx')
Example #23
    def binarize_trav_trans(
        filename,
        dicts,  # (token_dict, mask_dict)
        consumer,  # (data, ext, ids, )
        tokenize=tokenize_string,
        offset=0,
        end=-1,
    ):
        nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
        token_dict, mask_dict = dicts
        replaced = Counter()  # un-recorded tokens

        def replaced_consumer(word, idx):
            """save un-recorded token"""
            if idx == token_dict.unk_index and word != token_dict.unk_word:
                replaced.update([word])

        with open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                for data, ext, ids, mask in tokenize(line):
                    data = token_dict.encode_list(data,
                                                  add_if_not_exist=False,
                                                  consumer=replaced_consumer)
                    ext = torch.IntTensor([ext])
                    if ids:
                        for key, value in ids.items():
                            if len(value) == 0:
                                ids[key] = torch.IntTensor([-1])
                            else:
                                ids[key] = torch.IntTensor(value)
                    if mask:
                        mask = mask_dict.encode_list(mask,
                                                     add_if_not_exist=False)

                    consumer(data, ext, ids, mask)
                    nseq += 1
                    ntok += len(data)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
Example #24
    def code_wo_func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        func_filename = filename[:str.rfind(filename, '.')] + '.func'
        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, open(func_filename, 'r') as func_reader, \
            file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            func_line = safe_readline(func_reader)
            while line and func_line:
                if end > 0 and reader.tell() > end:
                    break
                code = json_io.json_loads(line)
                func_name = json_io.json_loads(func_line)
                start_idx = str.find(code, func_name)
                if start_idx != -1:
                    code_wo_func = code[:start_idx] + code[start_idx +
                                                           len(func_name):]
                else:
                    code_wo_func = None
                print(json_io.json_dumps(code_wo_func), file=writer)
                line = safe_readline(reader)
                func_line = safe_readline(func_reader)
Example #25
def tokenization(
    in_file,
    out_file,
    lang,
    attr,
    start=0,
    end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file,
                                                            'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()

            if lang == 'python' and attr == 'code':
                line = re.sub(r'\s+', ' ', line)

            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
Example #26
def binarize_alignments(filename,
                        alignment_parser,
                        consumer,
                        offset=0,
                        end=-1):
    nseq = 0
    with file_io.open(filename, "r") as f:
        f.seek(offset)
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            ids = alignment_parser(line)
            nseq += 1
            consumer(ids)
            line = f.readline()
    return {"nseq": nseq}
Example #27
    def binarize_seperate(
        filename,
        dict,
        consumer,
        tokenize=None,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
    ):
        nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
        replaced = Counter()  # un-recorded tokens

        def replaced_consumer(word, idx):
            """save un-recorded token"""
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with file_io.open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = file_io.safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                ids_ext = dict.encode_line(
                    line=line,
                    line_tokenizer=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                )
                if len(ids_ext) > 0:
                    nseq += 1
                    for ids, ext in ids_ext:
                        ntok += len(ids)
                        consumer(ids, ext)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
Example #28
def binarize_dfs(args, filename: str, dict, in_file: str, offset: int,
                 end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    with file_io.open(filename, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(in_file))
Example #29
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        src_file,
                        prefix,
                        vocab,
                        token_dict,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]

        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
Example #30
    def binarize(
        filename,
        dict,
        consumer,
        tokenize=None,
        use_func=False,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        func_offset=0,
        already_numberized=False,
        **kwargs,
    ):
        nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
        replaced = Counter()  # un-recorded tokens

        def replaced_consumer(word, idx):
            """save un-recorded token"""
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with file_io.open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            if use_func:
                func_reader = file_io.open(
                    filename[:str.rfind(filename, '.')] + '.func_name', 'r')
                func_reader.seek(func_offset)
            line = safe_readline(f)
            func_name = safe_readline(func_reader) if use_func else None
            while line:
                if end > 0 and f.tell() > end:
                    break
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        func_name=func_name,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                        **kwargs,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
                line = f.readline()
                func_name = safe_readline(func_reader) if use_func else None
        if use_func:
            func_reader.close()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }