def binarize(args: Dict, filename: str, dict: Dictionary, in_file: str, offset: int, end: int, append_eos: bool = False): """binarize function for multi-processing""" ds_file = f'{in_file}.mmap' ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict)) ext_ds = indexed_dataset.make_builder(f"{in_file}.ext", impl='seq') def consumer(data, start_idx): ds.add_item(data) ext_ds.add_item(start_idx) def seperate_tokenize(line): line = json_io.json_loads(line) tokens = separate_list(line, args['preprocess']['max_len']) return tokens res = Binarizer.binarize_seperate(filename, dict, consumer, tokenize=seperate_tokenize, append_eos=append_eos, offset=offset, end=end) ds.finalize('{}.idx'.format(in_file)) ext_ds.finalize() return res
def binarize(args, filename: str, vocab, aux_dict, in_file: str, lang, tokenize,
             max_path_num: int, offset: int, end: int, append_eos: bool = False):
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if lang == 'path':
        sz_ds_file = '{}.sz.mmap'.format(in_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    if sz_ds is None:
        res = Binarizer.binarize(
            filename, vocab, consumer, tokenize=tokenize,
            append_eos=append_eos, offset=offset, end=end,
        )
        ds.finalize('{}.idx'.format(in_file))
    else:
        res = PathSummarizationBinarizer.path_binarizer(
            filename, vocab, consumer, tokenize=tokenize,
            append_eos=append_eos, offset=offset, end=end,
            type_dict=aux_dict, max_path_num=max_path_num,
        )
        ds.finalize('{}.idx'.format(in_file))
        sz_ds.finalize('{}.sz.idx'.format(in_file))
    return res
def binarize(args: Dict, filename: str, dict: Dictionary, in_file: str, offset: int, end: int, append_eos: bool = False): """binarize function for multi-processing""" ds_file = '{}.mmap'.format(in_file) ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict)) def consumer(tensor): ds.add_item(tensor) res = Binarizer.binarize(filename, dict, consumer, tokenize=tokenizers.sub_tokenizer, append_eos=append_eos, offset=offset, end=end) ds.finalize('{}.idx'.format(in_file)) return res
def make_binary_dataset(vocab: Dictionary, input_file, output_file, attr: str, num_workers: int):
    """make binary dataset"""
    LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()  # save unrecorded (replaced) tokens

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # Split the file into chunks, one per worker. With multi-processing, workers
    # 1..N-1 handle the 2nd..last chunks, e.g. 1.txt with 10 workers:
    # p0 gets bytes [0, 100), p1 gets [100, 200), ...
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # workers 1..N-1 each write their own temporary (bin, idx) file pair
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(binarize,
                             (args, input_file, vocab, prefix, attr, offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    # the main process handles the first chunk; without multi-processing it
    # handles the whole file (offsets[1] is then the end of file)
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    merge_result(
        Binarizer.binarize_bpe(input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, no token replaced by BPE".format(
            attr, input_file, n_seq_tok[0], n_seq_tok[1],
        ))
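# A minimal sketch of the chunking idea behind find_offsets (the real
# Binarizer/file_io implementations may differ in detail): split a text file
# into num_chunks byte ranges aligned to line boundaries, so each worker can
# seek to offsets[i] and stop once reader.tell() passes offsets[i + 1].
def _demo_find_offsets(path, num_chunks):
    import os
    size = os.path.getsize(path)
    offsets = [0]
    with open(path, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()  # advance to the next full line so no line is split
            offsets.append(f.tell())
    offsets.append(size)  # num_chunks + 1 entries; chunk i is [offsets[i], offsets[i+1])
    return offsets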
def binarize(args, filename: str, dict, in_file: str, lang,
             offset: int, end: int, append_eos: bool = False):
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))
    ext_ds = indexed_dataset.make_builder(f'{in_file}.ext', impl='seq')

    def consumer(tensor, start_idx):
        ds.add_item(tensor)
        ext_ds.add_item(start_idx)

    def string2dfs(line):
        line = json_io.json_loads(line)
        asts = py150_util.separate_dps(line, args['preprocess']['max_len'])
        ast_dfs = [[py150_util.get_dfs(ast), ext] for ast, ext in asts if len(ast) > 1]
        return ast_dfs

    def string2type_dfs(line):
        type_dfs = type_tokenize_func(line)
        type_dfs = py150_util.separate_dps(type_dfs, args['preprocess']['max_len'])
        type_dfs = [[dfs, ext] for dfs, ext in type_dfs if len(dfs) > 1]
        return type_dfs

    tokenize = string2dfs if lang == 'ast' else string2type_dfs
    res = Binarizer.binarize_seperate(filename, dict, consumer, tokenize=tokenize,
                                      append_eos=append_eos, offset=offset, end=end)
    ds.finalize('{}.idx'.format(in_file))
    ext_ds.finalize()
    return res
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        dictionary = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                dictionary.add_symbol(token, int(num))
        dictionary.save(tgt_file)
        return dictionary

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into a pkl file; transform a language's code into src and tgt formats simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap", impl='mmap', vocab_size=len(vocab))
        data = {'code': []}
        with file_io.open(file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor([dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
        dataset.finalize(f"{dst_file}_tokens.idx")
        # proj indices (cp id)
        data['proj_indices'] = [1] * len(data['code'])
        file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
def binarize(args, in_file: str, out_file: str, vocab, token_dict, offset: int, end: int):
    ds = indexed_dataset.make_builder(f"{out_file}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(in_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            code_tokens = vocab.encode(line, out_type=str)
            code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
            ds.add_item(code_tokens)
            line = reader.readline()
    ds.finalize(f'{out_file}.idx')
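# Hedged sketch of the two-step encoding used above: SentencePiece splits a raw
# string into subword pieces, then an ncc Dictionary maps each piece to its
# integer id. The .model path below is hypothetical, and token_dict is passed
# in rather than loaded, to keep the sketch self-contained.
def _demo_spm_to_indices(token_dict):
    import torch
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load('/path/to/spm.model')  # hypothetical model path
    pieces = sp.encode('def add(a, b): return a + b', out_type=str)
    # token_dict.index(piece) falls back to unk for out-of-dictionary pieces,
    # which is why the dictionary is rebuilt from the SPM vocab beforehand.
    return torch.IntTensor([token_dict.index(piece) for piece in pieces])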
def binarize(args: Dict, filename: str, dict: Dictionary, out_file_prefix: str,
             attr: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(out_file_prefix)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_bpe(filename, dict, consumer, offset=offset, end=end)
    ds.finalize('{}.idx'.format(out_file_prefix))
    return res
def binarize(args, filename, dict, in_file, offset, end, append_eos=False):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))

    def consumer(data, _):
        ds.add_item(data)

    res = Binarizer.binarize_seperate(filename, dict, consumer, tokenize=string2tokens,
                                      append_eos=append_eos, offset=offset, end=end)
    ds.finalize('{}.idx'.format(in_file))
    return res
def binarize_dfs(args, filename: str, dict, in_file: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))
    with file_io.open(filename, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(in_file))
def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
    """make binary dataset"""
    # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()  # save unrecorded (replaced) tokens

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # Split the file into chunks, one per worker. With multi-processing, workers
    # 1..N-1 handle the 2nd..last chunks, e.g. 1.txt with 10 workers:
    # p0 gets bytes [0, 100), p1 gets [100, 200), ...
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # workers 1..N-1 each write their own temporary (bin, idx) file pair
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result,
            )
        pool.close()

    # the main process handles the first chunk (or the whole file if single-process)
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if 'code_tokens_wo_func' in os.path.basename(output_file):
        bin_out = Binarizer.binarize_wo_func(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.string_sub_tokenizer,
            offset=0, end=offsets[1], append_eos=False,
        )
    elif 'code_tokens' in os.path.basename(input_file):
        bin_out = Binarizer.binarize(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.sub_tokenizer,
            offset=0, end=offsets[1], append_eos=False,
        )
    elif 'docstring_tokens' in os.path.basename(input_file):
        bin_out = Binarizer.binarize_bpe(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.lower_tokenizer,
            offset=0, end=offsets[1], append_eos=False,
        )
    elif 'func_name' in os.path.basename(input_file):
        bin_out = Binarizer.binarize(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.func_name_tokenizer,
            offset=0, end=offsets[1], append_eos=False,
        )
    else:
        raise NotImplementedError(f"unsupported modality: {input_file}")
    merge_result(bin_out)

    if num_workers > 1:
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        dictionary = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                dictionary.add_symbol(token, int(num))
        dictionary.save(tgt_file)
        return dictionary

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # transform a language's code into src and tgt formats simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 each write their own temporary (bin, idx) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, src_file, prefix, vocab, token_dict, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()
        if num_workers > 1:
            pool.join()
            # merge sub-processes' index and data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def tokenization(tokens):
        for idx, tok in enumerate(tokens):
            if len(tok) != 0:
                tokens[idx] = vocab.encode(tok, out_type=str)
        return tokens

    def ast_to_graph(ast):
        nodes, tokens, adjacence = [], [], [[] for _ in range(len(ast))]
        for idx, node in enumerate(ast):
            nodes.append(node['type'])
            if 'children' in node:
                # interior node: no token of its own; undirected edges to its children
                tokens.append([])
                for child in node['children']:
                    adjacence[idx].append(child)
                    adjacence[child].append(idx)
            elif 'value' in node:
                # leaf node: carries a token value
                tokens.append(node['value'])
            else:
                raise NotImplementedError
        tokens = tokenization(tokens)
        depth = {0: 1}  # 0 for pad
        for idx, node in enumerate(ast[1:], start=1):
            depth[idx] = depth[node['parent']] + 1
        depth = list(depth.values())
        assert len(nodes) == len(tokens) == len(adjacence) == len(depth)
        return nodes, tokens, adjacence, depth

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        dictionary = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                dictionary.add_symbol(token, int(num))
        dictionary.save(tgt_file)
        return dictionary

    token_dict = save_token_dict()

    def save_node_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/node.jsonl")
        dictionary = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'node.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        dictionary.save(tgt_file)
        return dictionary

    node_dict = save_node_dict()

    def save_lang_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
        dictionary = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        dictionary.save(tgt_file)
        return dictionary

    lang_dict = save_lang_dict()
    # 2. ***************build dataset********************
    # transform a language's code into src and tgt formats simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        src_file = f"{args['preprocess'][f'{mode}pref']}.ast"
        node_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.node")
        PathManager.mkdir(os.path.dirname(node_file))
        node_dataset = indexed_dataset.make_builder(f"{node_file}.mmap", impl='mmap', vocab_size=len(node_dict))
        depth_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.depth")
        depth_dataset = indexed_dataset.make_builder(f"{depth_file}.mmap", impl='mmap')
        code_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        code_dataset = indexed_dataset.make_builder(f"{code_file}.bin", impl='bin', dtype=str)
        adjacence_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.adjacence")
        adjacence_dataset = indexed_dataset.make_builder(f"{adjacence_file}.bin", impl='bin')
        code_tokens_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        code_tokens_dataset = indexed_dataset.make_builder(f"{code_tokens_file}.bin", impl='bin')

        with file_io.open(src_file, 'r') as reader:
            for idx, line in enumerate(reader):
                line = json_io.json_loads(line)
                ast = bfs_to_dfs(line)
                nodes, tokens, adjacence, depth = ast_to_graph(ast)
                # save nodes into the mmap dataset
                nodes = torch.IntTensor([node_dict.index(tok) for tok in nodes])
                node_dataset.add_item(nodes)
                # save depths into the mmap dataset
                depth = torch.IntTensor(depth)
                depth_dataset.add_item(depth)
                # recover raw code from the subword pieces
                code = ''.join(itertools.chain(*tokens)).replace(constants.SPM_SPACE, ' ').strip()
                code_dataset.add_item(code)
                # tokens -> dictionary ids (empty lists are kept for interior nodes)
                tokens = [[token_dict.index(tok) for tok in toks] if len(toks) > 0 else [] for toks in tokens]
                code_tokens_dataset.add_item(tokens)
                # adjacency lists must already be sorted
                for adj in adjacence:
                    assert adj == sorted(adj)
                adjacence_dataset.add_item(adjacence)
        node_dataset.finalize(f"{node_file}.idx")
        depth_dataset.finalize(f"{depth_file}.idx")
        code_dataset.finalize(f"{code_file}.idx")
        code_tokens_dataset.finalize(f"{code_tokens_file}.idx")
        adjacence_dataset.finalize(f"{adjacence_file}.idx")

        # proj indices
        with file_io.open(f"{args['preprocess'][f'{mode}pref']}.proj", 'r') as reader:
            projs = [json_io.json_loads(line) for line in reader]
        proj_indices = Counter(projs)
        proj_indices = [proj_num for idx, proj_num in proj_indices.items()]
        proj_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.proj")
        proj_dataset = indexed_dataset.make_builder(f"{proj_file}.seq", impl='seq')
        proj_dataset.add_item(torch.IntTensor(proj_indices))
        proj_dataset.finalize(f"{proj_file}.idx")
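# Hedged worked example for ast_to_graph above, on a toy 3-node AST. The field
# names ('type', 'children', 'value', 'parent') follow the dicts ast_to_graph
# reads; the node types and values are made up for illustration.
def _demo_ast_to_graph_input():
    #   0: Module    (children: 1, 2)
    #   1: NameLoad  (value: "x", parent: 0)
    #   2: NameStore (value: "y", parent: 0)
    # Expected outputs:
    #   nodes     = ['Module', 'NameLoad', 'NameStore']
    #   tokens    = [[], <SPM pieces of 'x'>, <SPM pieces of 'y'>]
    #   adjacence = [[1, 2], [0], [0]]  (undirected parent<->child edges)
    #   depth     = [1, 2, 2]           (root has depth 1)
    toy_ast = [
        {'type': 'Module', 'children': [1, 2]},
        {'type': 'NameLoad', 'value': 'x', 'parent': 0},
        {'type': 'NameStore', 'value': 'y', 'parent': 0},
    ]
    return toy_ast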
def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    offsets = file_io.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(binarize, (
                args, input_file, vocab, prefix, lang,
                offsets[worker_id], offsets[worker_id + 1],
            ), callback=merge_result)
        pool.close()

    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

    def consumer(data, start_idx):
        ds.add_item(data)
        ext_ds.add_item(start_idx)

    tokenize = string2dfs if lang == 'ast' else string2type_dfs
    merge_result(
        Binarizer.binarize_seperate(
            input_file, vocab, consumer, tokenize=tokenize,
            offset=0, end=offsets[1], append_eos=False,
        ))
    if num_workers > 1:
        pool.join()
        # merge sub-processors' index and data files into final files and delete them.
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            ext_ds.merge_file_(f"{temp_file_path}.ext")
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
    ds.finalize('{}.idx'.format(output_file))
    ext_ds.finalize()
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
    """make binary dataset"""
    # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()  # save unrecorded (replaced) tokens

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # Split the file into chunks, one per worker. With multi-processing, workers
    # 1..N-1 handle the 2nd..last chunks, e.g. 1.txt with 10 workers:
    # p0 gets bytes [0, 100), p1 gets [100, 200), ...
    offsets = file_io.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # workers 1..N-1 each write their own temporary (bin, idx) file pair
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(binarize,
                             (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    # the main process handles the first chunk (or the whole file if single-process)
    ds_file = f'{output_file}.mmap'
    ds = indexed_dataset.make_builder(
        ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

    def consumer(data, start_idx):
        ds.add_item(data)
        ext_ds.add_item(start_idx)

    merge_result(
        Binarizer.binarize_seperate(
            input_file, vocab, consumer, tokenize=seperate_tokenize,
            offset=0, end=offsets[1], append_eos=False,
        ))
    if num_workers > 1:
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            ext_ds.merge_file_(f"{temp_file_path}.ext")
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
    ds.finalize('{}.idx'.format(output_file))
    ext_ds.finalize()
    LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def make_binary_dataset(vocab: Dictionary, input_file, output_file, use_func, num_workers: int):
    """make binary dataset"""
    # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()  # save unrecorded (replaced) tokens

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    offsets = find_offsets(input_file, num_chunks=num_workers)
    func_offsets = None
    modality = input_file.split('.')[-1]
    if modality == 'code_tokens':
        tokenizer = tokenizers.list_tokenizer
        if use_func:
            func_offsets = Binarizer.find_func_offsets(input_file, offsets=offsets)
    elif modality == 'func_name':
        tokenizer = tokenizers.func_name_tokenizer
    elif modality == 'docstring_tokens':
        tokenizer = tokenizers.lower_tokenizer
    else:
        raise NotImplementedError(modality)

    pool = None
    if num_workers > 1:
        # workers 1..N-1 each write their own temporary (bin, idx) file pair
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(binarize, (
                args, input_file, vocab, prefix, tokenizer,
                use_func and (modality == 'code_tokens'),
                offsets[worker_id], offsets[worker_id + 1],
                func_offsets[worker_id] if func_offsets else 0,
            ), callback=merge_result)
        pool.close()

    # the main process handles the first chunk (or the whole file if single-process)
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    merge_result(
        Binarizer.binarize(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizer,
            use_func=use_func and (modality == 'code_tokens'),
            offset=offsets[0], end=offsets[1],
            func_offset=func_offsets[0] if func_offsets else 0,
            append_eos=False,
            min_func_len=args['preprocess']['min_func_len'],
        ))
    if num_workers > 1:
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
    offsets = file_io.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (
                    args, input_file, vocab, aux_dict, prefix, lang, tokenize, max_path_num,
                    offsets[worker_id], offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if lang == 'path':
        # a parallel dataset records how many tokens each individual path contributes
        sz_ds_file = '{}.sz.mmap'.format(output_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    if sz_ds is None:
        merge_result(
            Binarizer.binarize(
                input_file, vocab, consumer, tokenize=tokenize,
                offset=0, end=offsets[1], append_eos=False,
                max_path_num=max_path_num,
            )
        )
    else:
        merge_result(
            PathSummarizationBinarizer.path_binarizer(
                input_file, vocab, consumer, tokenize=tokenize,
                offset=0, end=offsets[1], append_eos=False,
                type_dict=aux_dict, max_path_num=max_path_num,
            )
        )
    if num_workers > 1:
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            if sz_ds is not None:
                sz_ds.merge_file_(f"{temp_file_path}.sz")
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
            if sz_ds is not None:
                os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
    ds.finalize('{}.idx'.format(output_file))
    if sz_ds is not None:
        sz_ds.finalize('{}.sz.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        )
    )
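# Hedged sketch of why the parallel .sz dataset above exists: for 'path' data,
# each sample stores a variable number of paths flattened into one tensor, and
# the companion sizes tensor lets a reader split them back apart. Names and
# shapes here are illustrative, not the library's API.
def _demo_unflatten_paths(flat, sizes):
    import torch
    # flat:  1-D tensor of all path tokens of one sample, concatenated
    # sizes: 1-D tensor with the length of each individual path
    return list(torch.split(flat, sizes.tolist()))

# e.g. _demo_unflatten_paths(torch.tensor([1, 2, 3, 4, 5, 6]), torch.tensor([2, 4]))
# -> [tensor([1, 2]), tensor([3, 4, 5, 6])]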
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # transform a language's code into src and tgt formats simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 each write their own temporary (bin, idx) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="mmap", vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()
        if num_workers > 1:
            pool.join()
            # merge sub-processes' index and data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="bin", vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.dfs")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 each write their own temporary (bin, idx) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="mmap", vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()
        if num_workers > 1:
            pool.join()
            # merge sub-processes' index and data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.file_ops import file_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser('~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')
    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)

    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap: concatenate the per-language datasets into one multilingual dataset
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap', impl='mmap', vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
            ds.merge_file_(src_file)
        ds.finalize(f'{dst_file}.idx')

        # # raw
        # with file_io.open(dst_file, 'w') as writer:
        #     for lang in LANGUAGES:
        #         src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
        #         with open(src_file, 'r') as reader:
        #             shutil.copyfileobj(reader, writer)
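# Hedged sketch of reading back a merged dataset for a sanity check, assuming
# the fairseq-style reader API that pairs with make_builder (the exact ncc
# entry point may differ): open the mmap dataset and inspect a sample.
def _demo_load_merged(dst_file):
    from ncc.data import indexed_dataset  # import path assumed
    dataset = indexed_dataset.make_dataset(dst_file, impl='mmap')
    print(len(dataset), dataset[0])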