def make_binary_dataset(vocab: Dictionary, input_file, output_file, attr: str, num_workers: int):
    """Binarize `input_file` into an indexed dataset at `output_file` (.mmap/.idx).

    The input is split into `num_workers` byte ranges: ranges 1..N-1 are
    binarized by a worker pool into per-worker shard files which are merged
    back into the main builder afterwards; range 0 is handled in-process.
    `attr` is only used for logging and as an argument to the worker task.
    """
    LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]  # [sentence count, token count]
    replaced = Counter()  # tokens that had no dictionary entry

    def aggregate(worker_result):
        # Fold one worker's statistics into the shared totals.
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # Byte offsets partitioning input_file into num_workers chunks.
    offsets = Binarizer.find_offsets(input_file, num_workers)

    pool = None
    if num_workers > 1:
        # Chunks 1..N-1 are binarized asynchronously into shard files
        # named "<output_file><worker_id>".
        pool = Pool(processes=num_workers - 1)
        for wid in range(1, num_workers):
            shard_prefix = "{}{}".format(output_file, wid)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, shard_prefix, attr, offsets[wid], offsets[wid + 1]),
                callback=aggregate,
            )
        pool.close()

    # Chunk 0 is binarized in the current process while the workers run.
    ds = indexed_dataset.make_builder(
        '{}.mmap'.format(output_file),
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab),
    )
    aggregate(Binarizer.binarize_bpe(input_file, vocab, ds.add_item, offset=0, end=offsets[1]))

    if num_workers > 1:
        pool.join()
        # Absorb each worker shard into the main builder, then delete it.
        for wid in range(1, num_workers):
            shard_path = "{}{}".format(output_file, wid)
            ds.merge_file_(shard_path)
            os.remove(indexed_dataset.data_file_path(shard_path))
            os.remove(indexed_dataset.index_file_path(shard_path))

    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
            attr, input_file, n_seq_tok[0], n_seq_tok[1],
        ))
def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
    """Binarize `input_file` into a .mmap/.idx indexed dataset at `output_file`.

    Multi-processing was disabled in this variant (the worker-pool code was
    commented out), so the entire file is processed in this process
    regardless of `num_workers`.

    The tokenizer is picked from the file name:
      * 'code_tokens_wo_func' (in the *output* name) -> string_sub_tokenizer
      * 'code_tokens'       -> sub_tokenizer
      * 'docstring_tokens'  -> lower_tokenizer
      * 'func_name'         -> func_name_tokenizer
    """
    n_seq_tok = [0, 0]  # [sentence count, token count]
    replaced = Counter()  # tokens replaced by the unk symbol

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # Offsets are still computed, but because the worker pool is disabled we
    # must binarize the WHOLE file here. The original used end=offsets[1],
    # which silently dropped everything past the first chunk whenever
    # num_workers > 1; offsets[-1] is end-of-file.
    offsets = Binarizer.find_offsets(input_file, num_workers)
    end = offsets[-1]

    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file,
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))

    basename = os.path.basename(input_file)
    # NOTE(review): the original tested `output_file` for
    # 'code_tokens_wo_func' but `input_file` for the other modalities;
    # kept as-is — confirm the asymmetry is intentional.
    if 'code_tokens_wo_func' in os.path.basename(output_file):
        bin_out = Binarizer.binarize_wo_func(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.string_sub_tokenizer,
            offset=0, end=end, append_eos=False,
        )
    elif 'code_tokens' in basename:
        bin_out = Binarizer.binarize(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.sub_tokenizer,
            offset=0, end=end, append_eos=False,
        )
    elif 'docstring_tokens' in basename:
        bin_out = Binarizer.binarize_bpe(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.lower_tokenizer,
            offset=0, end=end, append_eos=False,
        )
    elif 'func_name' in basename:
        bin_out = Binarizer.binarize(
            input_file, vocab, lambda t: ds.add_item(t),
            tokenize=tokenizers.func_name_tokenizer,
            offset=0, end=end, append_eos=False,
        )
    else:
        # Previously `bin_out` was simply left unbound here, producing a
        # confusing NameError on the merge_result call below.
        raise NotImplementedError('no tokenizer for {}'.format(input_file))
    merge_result(bin_out)

    # The original still called `pool.join()` and merged worker shards when
    # num_workers > 1, but `pool` was always None (its creation was
    # commented out), raising AttributeError; that dead path is removed.
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def main(args):
    """Build a token dictionary from the sentencepiece vocab, then binarize
    the `<mode>.code` files of both source and target languages into
    mmap indexed datasets under `args['preprocess']['destdir']`.
    """
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        # Convert the sentencepiece `dict.txt` (one "<token> <count>" per
        # line) into a Dictionary and persist it as jsonl in destdir.
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Local renamed from `vocab` to stop shadowing the spm processor.
        token_dict = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                # int(num) instead of eval(num): counts are plain integers
                # and eval() executes arbitrary file content.
                token_dict.add_symbol(token, int(num))
        token_dict.save(tgt_file)
        return token_dict

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # chunks 1..N-1 -> per-worker shard files, merged below
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, src_file, prefix, vocab, token_dict, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()
        # chunk 0 is encoded in the current process
        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()
        if num_workers > 1:
            pool.join()
            # merge worker shards into the final dataset and delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
    """Binarize `input_file` into `output_file`.{mmap,idx} plus a companion
    `.ext` dataset that records, per item, the start index produced by
    `Binarizer.binarize_seperate` with `seperate_tokenize`.
    """
    # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]  # [sentence count, token count]
    replaced = Counter()  # save un-recorded tokens

    def merge_result(worker_result):
        # fold one worker's statistics into the shared totals
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # split a file into different parts
    # if use multi-processing, we first process 2nd to last file
    # 1.txt -> 10 processor, 0(p0)(0-99), 100(p1)(100-199), ...
    offsets = file_io.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()
    # process 1th file, if multi-processing available. If not, process all file
    # p0 -> 0,end
    ds_file = f'{output_file}.mmap'
    ds = indexed_dataset.make_builder(
        ds_file,
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    # side-car dataset holding each item's start index ('seq' impl)
    ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

    def consumer(data, start_idx):
        # main dataset gets the tensor, .ext dataset its start index
        ds.add_item(data)
        ext_ds.add_item(start_idx)

    merge_result(
        Binarizer.binarize_seperate(
            input_file,
            vocab,
            consumer,
            tokenize=seperate_tokenize,
            offset=0,
            end=offsets[1],
            append_eos=False,
        ))
    if num_workers > 1:
        # p1-pN
        pool.join()
        # merge sub-processors' index and data files into final files and delete them.
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            ext_ds.merge_file_(f"{temp_file_path}.ext")
            # idx, txt
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
            # NOTE(review): only the .ext *index* file is removed here; if
            # the 'seq' builder also writes a data file, the shard's .ext
            # data file is left behind — verify against the builder impl.
            os.remove(
                indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
    ds.finalize('{}.idx'.format(output_file))
    ext_ds.finalize()
    LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        # attr,
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def make_binary_dataset(vocab: Dictionary, input_file, output_file, use_func, num_workers: int):
    """Binarize `input_file` into `output_file`.{mmap,idx}.

    The tokenizer is chosen from the file extension (last '.'-component):
    'code_tokens' -> list_tokenizer, 'func_name' -> func_name_tokenizer,
    'docstring_tokens' -> lower_tokenizer. When `use_func` is true and the
    modality is 'code_tokens', per-chunk function offsets are also computed
    and forwarded to the binarizer.
    """
    # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]  # [sentence count, token count]
    replaced = Counter()  # save un-recorded tokens

    def merge_result(worker_result):
        # fold one worker's statistics into the shared totals
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    offsets = find_offsets(input_file, num_chunks=num_workers)
    func_offsets = None
    # modality = the file's last extension component, e.g. "x.code_tokens"
    modality = input_file.split('.')[-1]
    if modality == 'code_tokens':
        tokenizer = tokenizers.list_tokenizer
        if use_func:
            # function offsets aligned with the chunk offsets above
            func_offsets = Binarizer.find_func_offsets(input_file, offsets=offsets)
    elif modality == 'func_name':
        tokenizer = tokenizers.func_name_tokenizer
    elif modality == 'docstring_tokens':
        tokenizer = tokenizers.lower_tokenizer
    else:
        raise NotImplementedError(modality)
    pool = None
    if num_workers > 1:
        # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(binarize, (
                args,
                input_file,
                vocab,
                prefix,
                tokenizer,
                use_func and (modality == 'code_tokens'),
                offsets[worker_id],
                offsets[worker_id + 1],
                func_offsets[worker_id] if func_offsets else 0,
            ), callback=merge_result)
        pool.close()
    # process 1th file, if multi-processing available. If not, process all file
    # p0 -> 0,end
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file,
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    merge_result(
        Binarizer.binarize(
            input_file,
            vocab,
            lambda t: ds.add_item(t),
            tokenize=tokenizer,
            use_func=use_func and (modality == 'code_tokens'),
            offset=offsets[0],  # offsets[0] is 0: chunk 0 is handled here
            end=offsets[1],
            func_offset=func_offsets[0] if func_offsets else 0,
            append_eos=False,
            min_func_len=args['preprocess']['min_func_len'],
        ))
    if num_workers > 1:
        # p1-pN
        pool.join()
        # merge sub-processors' index and data files into final files and delete them.
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            # idx, txt
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        # attr,
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
    """Binarize `input_file` into `output_file`.{mmap,idx}.

    For `lang == 'path'` a second size dataset (`.sz.mmap`/`.sz.idx`) is
    built via PathSummarizationBinarizer with `aux_dict` as the type
    dictionary; otherwise a plain Binarizer.binarize pass is used.
    """
    n_seq_tok = [0, 0]  # [sentence count, token count]
    replaced = Counter()  # tokens mapped to the unk symbol

    def merge_result(worker_result):
        # fold one worker's statistics into the shared totals
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
    offsets = file_io.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # chunks 1..N-1 -> per-worker shard files, merged below
        pool = Pool(num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    aux_dict,
                    prefix,
                    lang,
                    tokenize,
                    max_path_num,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result
            )
        pool.close()
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(vocab))
    if lang == 'path':
        # companion dataset recording per-item sizes for path inputs
        sz_ds_file = '{}.sz.mmap'.format(output_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file,
                                             impl=args['preprocess']['dataset_impl'],
                                             vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        # main dataset always; size dataset only when a size is supplied
        # (only the path binarizer passes one)
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    # chunk 0 is binarized in the current process
    if sz_ds is None:
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
                max_path_num=max_path_num,
            )
        )
    else:
        merge_result(
            PathSummarizationBinarizer.path_binarizer(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
                type_dict=aux_dict,
                max_path_num=max_path_num,
            )
        )
    if num_workers > 1:
        # p1-pN
        pool.join()
        # merge sub-processors' index and data files into final files and delete them.
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            if sz_ds is not None:
                sz_ds.merge_file_(f"{temp_file_path}.sz")
            # idx, txt
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
            if sz_ds is not None:
                os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
    ds.finalize('{}.idx'.format(output_file))
    if sz_ds is not None:
        sz_ds.finalize('{}.sz.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        )
    )
def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
    """Binarize `input_file` (dfs sequences) into `output_file`.{mmap,idx}
    plus a companion `.ext` dataset of per-item start indices.

    `lang == 'ast'` selects the `string2dfs` tokenizer, any other value
    selects `string2type_dfs`.
    """
    stats = [0, 0]  # [sentence count, token count]
    replaced = Counter()  # tokens mapped to the unk symbol

    def accumulate(worker_result):
        # Fold one worker's statistics into the shared totals.
        replaced.update(worker_result["replaced"])
        stats[0] += worker_result["nseq"]
        stats[1] += worker_result["ntok"]

    offsets = file_io.find_offsets(input_file, num_workers)

    pool = None
    if num_workers > 1:
        # Chunks 1..N-1 are binarized asynchronously into shard files.
        pool = Pool(num_workers - 1)
        for wid in range(1, num_workers):
            shard_prefix = "{}{}".format(output_file, wid)
            worker_args = (args, input_file, vocab, shard_prefix, lang, offsets[wid], offsets[wid + 1])
            pool.apply_async(binarize, worker_args, callback=accumulate)
        pool.close()

    ds = indexed_dataset.make_builder(
        '{}.mmap'.format(output_file),
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    # Side-car dataset holding each item's start index.
    ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

    def consumer(data, start_idx):
        # Main dataset gets the tensor, .ext dataset its start index.
        ds.add_item(data)
        ext_ds.add_item(start_idx)

    if lang == 'ast':
        tokenize = string2dfs
    else:
        tokenize = string2type_dfs

    # Chunk 0 is binarized in the current process.
    accumulate(
        Binarizer.binarize_seperate(
            input_file,
            vocab,
            consumer,
            tokenize=tokenize,
            offset=0,
            end=offsets[1],
            append_eos=False,
        ))

    if num_workers > 1:
        pool.join()
        # Absorb each worker shard into the builders, then delete it.
        for wid in range(1, num_workers):
            shard = "{}{}".format(output_file, wid)
            ds.merge_file_(shard)
            ext_ds.merge_file_(f"{shard}.ext")
            os.remove(indexed_dataset.data_file_path(shard))
            os.remove(indexed_dataset.index_file_path(shard))
            os.remove(indexed_dataset.index_file_path(f"{shard}.ext"))

    ds.finalize('{}.idx'.format(output_file))
    ext_ds.finalize()
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            stats[0],
            stats[1],
            100 * sum(replaced.values()) / stats[1],
            vocab.unk_word,
        ))
def main(args):
    """Binarize code_tokens / code / dfs files for both languages and all
    modes into indexed datasets under `args['preprocess']['destdir']`,
    using the GraphCodeBERT tokenizer dictionary and a pre-built node
    dictionary (`dfs.jsonl`).
    """
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained(
        'microsoft/graphcodebert-base')

    # node dictionary built beforehand into destdir/dfs.jsonl
    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess'][
        'tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code_tokens")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()
        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        # chunk 0 is sub-tokenized and indexed in the current process
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(
                    vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code (raw code strings stored via the 'bin' builder, single process)
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")
        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="bin",
                                          vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs (node sequences indexed with node_dict)
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.dfs")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()
        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        # chunk 0 is indexed in the current process
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))