def make_all(lang, vocab):
    for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
        # copy the shared dict into each language's directory
        out_dir = os.path.join(args['preprocess']['destdir'], l)
        PathManager.mkdir(out_dir)
        dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
        PathManager.copy(dict_path(lang), dst_dict)
        if args['preprocess']['trainpref']:
            out_file = os.path.join(out_dir, f"train.{lang}")
            make_dataset(vocab, args['preprocess']['trainpref'].replace('*', l), "train", lang,
                         out_file=out_file, num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            out_file = os.path.join(out_dir, f"valid.{lang}")
            make_dataset(vocab, args['preprocess']['validpref'].replace('*', l), 'valid', lang,
                         out_file=out_file, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            out_file = os.path.join(out_dir, f"test.{lang}")
            make_dataset(vocab, args['preprocess']['testpref'].replace('*', l), 'test', lang,
                         out_file=out_file, num_workers=args['preprocess']['workers'])
def save_lang_dict():
    src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
    lang_dict = Dictionary.load(src_file)
    tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
    PathManager.mkdir(os.path.dirname(tgt_file))
    lang_dict.save(tgt_file)
    return lang_dict
def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """filter docstring tokens"""
    kwargs = args[0][0]  # cannot pass dict parameters to multi-processing workers
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                # drop separator runs (e.g. "----", "****") and HTML-like tags
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                # keep only docstrings with 4-50 ASCII tokens
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
def _save(self, f, kv_iterator):
    if isinstance(f, str):
        PathManager.mkdir(os.path.dirname(f))
        with file_io.open(f, "w") as fd:
            return self.save(fd)
    for k, v in kv_iterator:
        print(json_io.json_dumps([k, v]), file=f)
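# A minimal, hypothetical round-trip of the JSONL format that `_save` above writes
# (one `[key, value]` pair per line); the file name and counts are made-up examples.
import json

entries = [("def", 120), ("return", 95), ("<unk>", 0)]
with open("dict.jsonl", "w") as f:  # mirrors the open-file-handle branch of `_save`
    for k, v in entries:
        print(json.dumps([k, v]), file=f)

with open("dict.jsonl", "r") as f:  # read it back into a plain dict
    vocab = {k: v for k, v in (json.loads(line) for line in f)}
assert vocab["def"] == 120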
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """filter code tokens"""
    kwargs = args[0][0]  # cannot pass dict parameters to multi-processing workers
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments in code_tokens, e.g. "//...", "#...", "/* ... */"
                code_tokens = [
                    token for token in code_tokens
                    if not (str.startswith(token, '//') or str.startswith(token, '#') or
                            (str.startswith(token, '/*') and str.endswith(token, '*/')))
                ]
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                if code_tokens is None or len(code_tokens) < 1:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
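# Sketch of how offset-based workers such as `code_tokens_fn` / `docstring_tokens_fn`
# are typically fanned out over one input file and their shards stitched back together,
# mirroring the `process`/`_concate` pattern used elsewhere in these scripts.
# The offset helper and the pool size here are illustrative assumptions, not ncc APIs.
import os
import shutil
from multiprocessing import Pool, cpu_count


def find_line_offsets(filename, num_chunks):
    """Split `filename` into roughly equal, line-aligned byte ranges."""
    size = os.path.getsize(filename)
    offsets = [0]
    with open(filename, "rb") as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()  # advance to the next line boundary
            offsets.append(f.tell())
    offsets.append(size)
    return offsets


def run_workers(src_file, dst_file, worker_fn, num_workers=cpu_count()):
    offsets = find_line_offsets(src_file, num_workers)
    with Pool(num_workers) as pool:
        jobs = [
            pool.apply_async(worker_fn, (src_file, dst_file, idx, offsets[idx], offsets[idx + 1], [{}]))
            for idx in range(num_workers)
        ]
        [job.get() for job in jobs]
    # each worker wrote `dst_file + str(idx)`; concatenate the shards in order
    with open(dst_file, "w") as writer:
        for idx in range(num_workers):
            shard = dst_file + str(idx)
            with open(shard, "r") as reader:
                shutil.copyfileobj(reader, writer)
            os.remove(shard)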
def main():
    from dataset.py150 import (RAW_DIR, ATTRIBUTES_DIR)
    from ncc.utils.path_manager import PathManager

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default=RAW_DIR)
    parser.add_argument('--output_dir', type=str, default=ATTRIBUTES_DIR)
    parser.add_argument('--valid_p', type=float, default=0.2)
    parser.add_argument('--max_path_length', type=int, default=8)
    parser.add_argument('--max_path_width', type=int, default=2)
    parser.add_argument('--use_method_name', type=bool, default=True)
    parser.add_argument('--use_nums', type=bool, default=True)
    parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
    parser.add_argument('--seed', type=int, default=239)
    args = parser.parse_args()

    np.random.seed(args.seed)

    data_dir = Path(args.data_dir)
    trains = __collect_asts(data_dir / 'python100k_train.json')
    evals = __collect_asts(data_dir / 'python50k_eval.json')

    train, valid = sklearn_model_selection.train_test_split(trains, test_size=args.valid_p)
    test = evals

    output_dir = Path(args.output_dir)
    PathManager.mkdir(output_dir)
    for split_name, split in zip(('train', 'valid', 'test'), (train, valid, test)):
        output_file = output_dir / f'{split_name}.method_path'
        __collect_all_and_save(split, args, output_file)
def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        in_file = file_name(input_prefix, lang)
        out_dir = args['preprocess']['destdir']
        PathManager.mkdir(out_dir)
        LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
        shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, in_file, out_file, num_workers)
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        # TODO: parse the json into a txt file, one traversal per line, and parallelize it.
        """
        Because only one thread is allowed to write the output file, we use
        multi-processing to transform the data, merge the per-CPU results into
        a block, and then dump that block.
        """

        def _func(line):
            line = py150_util.separate_dps(json_io.json_loads(line.strip()), args['preprocess']['n_ctx'])
            line = [py150_util.get_dfs(ast) + [ext] for ast, ext in line if len(ast) > 1]
            # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
            return line

        with PPool() as thread_pool:
            with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                def _write(result):
                    for res in itertools.chain(*result):
                        print(json_io.json_dumps(res), file=fout)

                batch_data = []
                for line in f:
                    batch_data.append(line)
                    if len(batch_data) >= MAX_BATCH_SIZE:
                        result = thread_pool.feed(_func, batch_data, one_params=True)
                        _write(result)
                        del batch_data
                        batch_data = []
                if len(batch_data) > 0:
                    result = thread_pool.feed(_func, batch_data, one_params=True)
                    _write(result)
                    del batch_data
    else:
        if lang == 'code_types':
            in_file = file_name(input_prefix, 'ast')
        else:
            in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, in_file, out_file, lang, num_workers)
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
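# A toy walk-through of the padding arithmetic in `parse_source_input` above,
# with a made-up MAX_SOURCE_LENGTH and stand-in special tokens / ids; the real
# values come from `config` and the pretrained vocab.
MAX_SOURCE_LENGTH = 8
CLS, SEP, PAD_ID = "<s>", "</s>", 1

code_tokens = ["def", "foo", "(", ")", ":"]                            # 5 sub-tokens after truncation
source_tokens = [CLS] + code_tokens[:MAX_SOURCE_LENGTH - 2] + [SEP]    # 7 tokens
source_ids = list(range(len(source_tokens)))                           # stand-in for convert_tokens_to_ids
source_mask = [1] * len(source_tokens)                                 # 1 marks a real token

padding_length = MAX_SOURCE_LENGTH - len(source_ids)                   # 8 - 7 = 1
source_ids += [PAD_ID] * padding_length
source_mask += [0] * padding_length                                    # 0 marks padding

assert len(source_ids) == len(source_mask) == MAX_SOURCE_LENGTH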
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        languages = [os.path.basename(d) for d in PathManager.ls(os.path.dirname(input_prefix))]
        for l in languages:
            in_file = file_name(input_prefix, lang)
            in_file = str.replace(in_file, '*', l)
            out_file = dest_path(os.path.join(l, output_prefix), lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """copy docstrings into per-worker shards"""
    kwargs = args[0][0]  # cannot pass dict parameters to multi-processing workers
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def cast_code(raw_code_file, refined_code_file, dst_file):
    with file_io.open(raw_code_file, 'r') as raw_reader:
        raw_codes = {}
        for line in raw_reader:
            raw_code = line
            raw_code = raw_code[raw_code.find('def '):]
            func_name = raw_code[:raw_code.find('(')][4:].strip()
            raw_codes[func_name] = line.rstrip('\n')

    PathManager.mkdir(os.path.dirname(dst_file))
    with file_io.open(refined_code_file, 'r') as refined_reader, file_io.open(dst_file, 'w') as writer:
        for line in refined_reader:
            func_name = line[line.find('def '):].split()[1]
            raw_code = raw_codes[func_name]
            print(raw_code, file=writer)
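# A quick, made-up illustration of the raw-side function-name extraction in
# `cast_code` above: take everything from "def " onward, then the text before
# the first "(" minus the "def " prefix.
line = "@decorator\ndef add(a, b): return a + b"
raw_code = line[line.find('def '):]                     # "def add(a, b): return a + b"
func_name = raw_code[:raw_code.find('(')][4:].strip()   # "add"
assert func_name == "add"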
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode (train/valid/test) from the file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
def flatten_attrs(raw_file, flatten_dir, lang, mode, attrs):
    def _get_file_info(filename):
        """get the file index from the file name"""
        filename = os.path.split(filename)[-1]
        filename = filename[:str.rfind(filename, '.jsonl.gz')]
        _, _, idx = filename.split('_')
        return idx

    idx = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_dir = os.path.join(flatten_dir, lang, mode, attr)
        PathManager.mkdir(attr_dir)
        attr_file = os.path.join(attr_dir, '{}.jsonl'.format(idx))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
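# A quick check of `_get_file_info` above on a typical CodeSearchNet-style shard
# name (the file name is illustrative):
filename = "python_train_12.jsonl.gz"
stem = filename[:filename.rfind('.jsonl.gz')]  # "python_train_12"
_, _, idx = stem.split('_')
assert idx == "12"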
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap", impl='mmap', vocab_size=len(vocab))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor([dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
)
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.file_ops import file_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser('~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')
    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)

    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap', impl='mmap', vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
            ds.merge_file_(src_file)
        ds.finalize(f'{dst_file}.idx')
        # # raw
        # with file_io.open(ds, 'w') as writer:
        #     for lang in LANGUAGES:
        #         src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
        #         with open(src_file, 'r') as reader:
        #             shutil.copyfileobj(reader, writer)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang'])
            ]
            if not args['preprocess']['only_train']:
                filenames.extend(
                    [valid_path(args['preprocess']['source_lang']), valid_path(args['preprocess']['target_lang'])])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'], args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict
    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = PathManager.ls(train_path(args['preprocess']['source_lang']))
            if not args['preprocess']['only_train']:
                filenames.extend(PathManager.ls(valid_path(args['preprocess']['source_lang'])))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['thresholdsrc'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = PathManager.ls(train_path(args['preprocess']['target_lang']))
                if not args['preprocess']['only_train']:
                    filenames.extend(PathManager.ls(valid_path(args['preprocess']['target_lang'])))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=tokenization.dpu_sub_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['thresholdtgt'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing, workers 1..N-1 handle the
        # 2nd through last chunks, e.g. 1.txt -> 10 processes: 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # process the first chunk on the main process; without multi-processing it covers the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenization.dpu_sub_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, out_file=None, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            if out_file is None:
                out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy the shared dict into each language's directory
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)
            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab, args['preprocess']['trainpref'].replace('*', l), "train", lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab, args['preprocess']['validpref'].replace('*', l), 'valid', lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab, args['preprocess']['testpref'].replace('*', l), 'test', lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code, lang):
        code_tokens, dfg = extract_dataflow(code, parsers[lang], lang)
        code_tokens = vocab.subtokenize(code_tokens)

        ori2cur_pos = {}
        ori2cur_pos[-1] = (0, 0)
        for i in range(len(code_tokens)):
            ori2cur_pos[i] = (ori2cur_pos[i - 1][1], ori2cur_pos[i - 1][1] + len(code_tokens[i]))

        # truncating
        code_tokens = code_tokens[
                      :config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - 3 - min(len(dfg), config.MAX_DATA_FLOW_LEN)] \
            [:512 - 3]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        position_idx = [i + vocab.pad() + 1 for i in range(len(source_tokens))]
        dfg = dfg[:config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_tokens)]
        source_tokens += [x[0] for x in dfg]
        position_idx += [0 for _ in dfg]
        source_ids += [vocab.unk() for _ in dfg]
        padding_length = config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_ids)
        position_idx += [vocab.pad()] * padding_length
        source_ids += [vocab.pad()] * padding_length

        # reindex
        reverse_index = {}
        for idx, x in enumerate(dfg):
            reverse_index[x[1]] = idx
        for idx, x in enumerate(dfg):
            dfg[idx] = x[:-1] + ([reverse_index[i] for i in x[-1] if i in reverse_index],)
        dfg_to_dfg = [x[-1] for x in dfg]
        dfg_to_code = [ori2cur_pos[x[1]] for x in dfg]
        length = len([vocab.cls()])
        dfg_to_code = [(x[0] + length, x[1] + length) for x in dfg_to_code]
        return [source_ids, position_idx, dfg_to_code, dfg_to_dfg]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_positions', 'dfg2code', 'dfg2dfg']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code, lang)
                for key, src in zip(keys, [src_code] + src_line):
                    data[key].append(src)
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(dst_file, mode='wb', data=data)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    from dataset.codexglue.code_to_text import BPE_DIR

    source_dict_file = os.path.join(BPE_DIR, 'csn/csn.spm.vocab')
    target_dict_file = os.path.join(os.path.dirname(args['preprocess']['destdir']), 'dict.jsonl')
    with open(source_dict_file, 'r') as reader, open(target_dict_file, 'w') as writer:
        for line in reader:
            print(json_io.json_dumps([line.split('\t')[0], 100]), file=writer)
    src_dict = tgt_dict = task.load_dictionary(target_dict_file)

    # shared dicts for all languages
    src_dict.save(
        os.path.join(os.path.dirname(args['preprocess']['destdir']), f"{args['preprocess']['source_lang']}.jsonl")
    )
    tgt_dict.save(
        os.path.join(os.path.dirname(args['preprocess']['destdir']), f"{args['preprocess']['target_lang']}.jsonl")
    )

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing, workers 1..N-1 handle the
        # 2nd through last chunks, e.g. 1.txt -> 10 processes: 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result
                )
            pool.close()
        # process the first chunk on the main process; without multi-processing it covers the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            )
        )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info(
            "{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                # attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            PathManager.mkdir(out_dir)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
            mpool.apply_async(func, (_src_filename, _tgt_filename, idx, offsets[idx], offsets[idx + 1]))
            for idx in range(num_workers)
        ]
        result = [res.get() for res in result]

    def _concate(_tgt_filename, num_workers, tgt_filename):
        src_filenames = [_tgt_filename + str(idx) for idx in range(num_workers)]
        with file_io.open(tgt_filename, 'w') as writer:
            for _src_fl in src_filenames:
                with file_io.open(_src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)
                PathManager.rm(_src_fl)

    _concate(_tgt_filename, num_workers, tgt_filename)


if __name__ == '__main__':
    # old ast => new ast
    for file, mode in zip(['python100k_train.json', 'python50k_eval.json'], MODES):
        file = os.path.join(RAW_DIR, file)
        PathManager.mkdir(ATTRIBUTES_DIR)
        tgt_file = os.path.join(ATTRIBUTES_DIR, f'{mode}.ast')
        process(src_filename=file, tgt_filename=tgt_file, num_workers=cpu_count(), func=ast_fn)
)
parser.add_argument(
    "--raw_dataset_dir", "-r", default=RAW_DIR, type=str, help="raw dataset download directory",
)
parser.add_argument(
    "--attributes_dir", "-d", default=ATTRIBUTES_DIR, type=str, help="data directory of attributes directory",
)
args = parser.parse_args()
# print(args)

for lang, mode in itertools.product(args.languages, MODES):
    raw_file = os.path.join(args.raw_dataset_dir, f'{lang}.csv')
    dst_dir = os.path.join(args.attributes_dir, lang)
    PathManager.mkdir(dst_dir)
    flatten(raw_file, dst_dir, mode)
    code_tokenization(src_file=os.path.join(dst_dir, f'{mode}.src'))

# xfg -> inst2vec
xfg(src_dir=args.raw_dataset_dir, languages=args.languages, dst_dir=args.attributes_dir)
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def string2dfs(line):
        line = json_io.json_loads(line)
        asts = py150_util.separate_dps(line, args['preprocess']['max_len'])
        ast_dfs = [[py150_util.get_dfs(ast), ext] for ast, ext in asts if len(ast) > 1]
        return ast_dfs

    def string2type_dfs(line):
        type_dfs = type_tokenize_func(line)
        type_dfs = py150_util.separate_dps(type_dfs, args['preprocess']['max_len'])
        type_dfs = [[dfs, ext] for dfs, ext in type_dfs if len(dfs) > 1]
        return type_dfs

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    target = not args['preprocess']['only_source']

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
            # code_types are from ast
            filenames = [train_path(args['preprocess']['source_lang'])]
            if not args['preprocess']['only_train']:
                filenames.append(valid_path(args['preprocess']['source_lang']))
            tgt_dict = task.build_dictionary(
                filenames,
                tokenize_func=type_tokenize_func,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
            )
    else:
        tgt_dict = None

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args, input_file, vocab, prefix, lang,
                    offsets[worker_id], offsets[worker_id + 1],
                ), callback=merge_result)
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        tokenize = string2dfs if lang == 'ast' else string2type_dfs
        merge_result(
            Binarizer.binarize_seperate(
                input_file, vocab, consumer,
                tokenize=tokenize,
                offset=0, end=offsets[1], append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse the json into a txt file, one traversal per line, and parallelize it.
            """
            Because only one thread is allowed to write the output file, we use
            multi-processing to transform the data, merge the per-CPU results into
            a block, and then dump that block.
""" def _func(line): line = py150_util.separate_dps( json_io.json_loads(line.strip()), args['preprocess']['n_ctx']) line = [ py150_util.get_dfs(ast) + [ext] for ast, ext in line if len(ast) > 1 ] # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1] return line with PPool() as thread_pool: with file_io.open(file_name(input_prefix, lang), 'r') as f, \ file_io.open(dest_path(output_prefix, lang), 'w') as fout: def _write(result): for res in itertools.chain(*result): print(json_io.json_dumps(res), file=fout) batch_data = [] for line in f: batch_data.append(line) if len(batch_data) >= MAX_BATCH_SIZE: result = thread_pool.feed(_func, batch_data, one_params=True) _write(result) del batch_data batch_data = [] if len(batch_data) > 0: result = thread_pool.feed(_func, batch_data, one_params=True) _write(result) del batch_data else: if lang == 'code_types': in_file = file_name(input_prefix, 'ast') else: in_file = file_name(input_prefix, lang) out_file = dest_path(output_prefix, lang) PathManager.mkdir(os.path.dirname(out_file)) make_binary_dataset(vocab, in_file, out_file, lang, num_workers) def make_all(lang, vocab): if args['preprocess']['trainpref']: make_dataset(vocab, args['preprocess']['trainpref'], "train", lang, num_workers=args['preprocess']['workers']) if args['preprocess']['validpref']: for k, validpref in enumerate( args['preprocess']['validpref'].split(",")): outprefix = "valid{}".format(k) if k > 0 else "valid" make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers']) if args['preprocess']['testpref']: for k, testpref in enumerate( args['preprocess']['testpref'].split(",")): outprefix = "test{}".format(k) if k > 0 else "test" make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers']) make_all(args['preprocess']['source_lang'], src_dict) if target: make_all(args['preprocess']['target_lang'], tgt_dict)
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, src_file, prefix, vocab, token_dict, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def tokenization(tokens):
        for idx, tok in enumerate(tokens):
            if len(tok) != 0:
                tokens[idx] = vocab.encode(tok, out_type=str)
        return tokens

    def ast_to_graph(ast):
        nodes, tokens, adjacence = [], [], [[] for _ in range(len(ast))]
        for idx, node in enumerate(ast):
            nodes.append(node['type'])
            if 'children' in node:
                tokens.append([])
                for child in node['children']:
                    adjacence[idx].append(child)
                    adjacence[child].append(idx)
            elif 'value' in node:
                tokens.append(node['value'])
            else:
                raise NotImplementedError
        tokens = tokenization(tokens)
        depth = {0: 1}  # 0 for pad
        for idx, node in enumerate(ast[1:], start=1):
            depth[idx] = depth[node['parent']] + 1
        depth = list(depth.values())
        assert len(nodes) == len(tokens) == len(adjacence) == len(depth)
        return nodes, tokens, adjacence, depth

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    def save_node_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/node.jsonl")
        node_dict = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'node.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        node_dict.save(tgt_file)
        return node_dict

    node_dict = save_node_dict()

    def save_lang_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
        lang_dict = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        lang_dict.save(tgt_file)
        return lang_dict

    lang_dict = save_lang_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        src_file = f"{args['preprocess'][f'{mode}pref']}.ast"

        node_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.node")
        PathManager.mkdir(os.path.dirname(node_file))
        node_dataset = indexed_dataset.make_builder(f"{node_file}.mmap", impl='mmap', vocab_size=len(node_dict))
        depth_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.depth")
        depth_dataset = indexed_dataset.make_builder(f"{depth_file}.mmap", impl='mmap')
        code_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        code_dataset = indexed_dataset.make_builder(f"{code_file}.bin", impl='bin', dtype=str)
        adjacence_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.adjacence")
        adjacence_dataset = indexed_dataset.make_builder(f"{adjacence_file}.bin", impl='bin')
        code_tokens_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        code_tokens_dataset = indexed_dataset.make_builder(f"{code_tokens_file}.bin", impl='bin')

        with file_io.open(src_file, 'r') as reader:
            for idx, line in enumerate(reader):
                line = json_io.json_loads(line)
                ast = bfs_to_dfs(line)
                nodes, tokens, adjacence, depth = ast_to_graph(ast)
                # save nodes into the mmap dataset
                nodes = torch.IntTensor([node_dict.index(tok) for tok in nodes])
                node_dataset.add_item(nodes)
                # save depth into the mmap dataset
                depth = torch.IntTensor(depth)
                depth_dataset.add_item(depth)
                # code
                code = ''.join(itertools.chain(*tokens)).replace(constants.SPM_SPACE, ' ').strip()
                code_dataset.add_item(code)
                # tokens
                tokens = [[token_dict.index(tok) for tok in toks] if len(toks) > 0 else [] for toks in tokens]
                code_tokens_dataset.add_item(tokens)
                # adjacence
                for adj in adjacence:
                    assert adj == sorted(adj)
                adjacence_dataset.add_item(adjacence)
        node_dataset.finalize(f"{node_file}.idx")
        depth_dataset.finalize(f"{depth_file}.idx")
        code_dataset.finalize(f"{code_file}.idx")
        code_tokens_dataset.finalize(f"{code_tokens_file}.idx")
        adjacence_dataset.finalize(f"{adjacence_file}.idx")

        # proj indices
        with file_io.open(f"{args['preprocess'][f'{mode}pref']}.proj", 'r') as reader:
            projs = [json_io.json_loads(line) for line in reader]
        proj_indices = Counter(projs)
        proj_indices = [proj_num for idx, proj_num in proj_indices.items()]
        proj_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.proj")
        proj_dataset = indexed_dataset.make_builder(f"{proj_file}.seq", impl='seq')
        proj_dataset.add_item(torch.IntTensor(proj_indices))
        proj_dataset.finalize(f"{proj_file}.idx")
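# A toy example of the `depth` computation in `ast_to_graph` above: depth starts
# at 1 for the root (0 is reserved for padding) and each node is one deeper than
# its parent. The three-node AST below is made up for illustration.
ast = [
    {"type": "Module", "children": [1, 2]},
    {"type": "NameLoad", "value": "x", "parent": 0},
    {"type": "Num", "value": "1", "parent": 0},
]
depth = {0: 1}  # 0 for pad
for idx, node in enumerate(ast[1:], start=1):
    depth[idx] = depth[node["parent"]] + 1
assert list(depth.values()) == [1, 2, 2]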
"--dataset_dir", "-d", default=RAW_DIR, type=str, help="raw dataset download directory", ) parser.add_argument( "--flatten_dir", "-f", default=ATTRIBUTES_DIR, type=str, help="data directory of flatten attribute", ) parser.add_argument( "--attrs", "-a", default=['code', 'code_tokens', 'code_types', 'ast'], type=str, nargs='+', ) parser.add_argument( "--cores", "-c", default=cpu_count(), type=int, help="cpu cores for flatten raw data attributes", ) args = parser.parse_args() # print(args) for mode in MODES: src_files = [os.path.join(args.dataset_dir, f"{mode}.{lang}") for lang in args.languages] src_readers = [file_io.open(file, 'r') for lang, file in zip(args.languages, src_files)] for lang in args.languages: PathManager.mkdir(os.path.join(args.flatten_dir, lang)) dst_files = [os.path.join(args.flatten_dir, lang, f"{mode}.code") for lang in args.languages] dst_writers = {lang: file_io.open(file, 'w') for lang, file in zip(args.languages, dst_files)} for lines in zip(*src_readers): lines = list(map(lambda line: SPACE_SPLITTER.sub(" ", line.strip()), lines)) for lang, line in zip(args.languages, lines): print(json_io.json_dumps(line.strip()), file=dst_writers[lang])
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['subtokendict']:
        subtoken_dict = task.load_dictionary(args['preprocess']['subtokendict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --subtokendict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        subtoken_dict = task.build_dictionary(
            filenames,
            tokenize_func=subtoken_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssubtoken'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    if args['preprocess']['typedict']:
        type_dict = task.load_dictionary(args['preprocess']['typedict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --typedict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        type_dict = task.build_dictionary(
            filenames,
            tokenize_func=type_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordstype'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    if args['preprocess']['docstringdict']:
        docstring_dict = task.load_dictionary(args['preprocess']['docstringdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --docstringdict is not specified"
        filenames = [train_path(args['preprocess']['target_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['target_lang']))
        docstring_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordsdocstring'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    subtoken_dict.save(dict_path('subtoken'))
    type_dict.save(dict_path('type'))
    docstring_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args, input_file, vocab, aux_dict, prefix, lang, tokenize, max_path_num,
                        offsets[worker_id], offsets[worker_id + 1],
                    ),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize,
                    offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize,
                    offset=0, end=offsets[1], append_eos=False,
                    type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)

    def make_all(lang, vocab, aux_dict=None):
        if args['preprocess']['trainpref']:
            max_path_num = args['preprocess']['train_path_num']
            make_dataset(vocab, aux_dict, args['preprocess']['trainpref'], "train", lang, max_path_num,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, aux_dict, validpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, aux_dict, testpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], subtoken_dict, type_dict)
    make_all(args['preprocess']['target_lang'], docstring_dict)
# -*- coding: utf-8 -*-

import argparse
import os

import gdown

from ncc import (
    __TREE_SITTER_LIBS_DIR__,
    LOGGER,
)
from ncc.utils.path_manager import PathManager

PathManager.mkdir(__TREE_SITTER_LIBS_DIR__)

TREE_SITTER_SO_FILE_ARCHIVE_MAP = {
    "c": "https://drive.google.com/uc?id=1Ce0Wp_IYw4a69dMAd4RbaOqRK-DD592G",
    "cpp": "https://drive.google.com/uc?id=1Ip-_lW95I7DU_wj96CR-j31VehLtJLz2",
    "csharp": "https://drive.google.com/uc?id=1fCnNd3WiU1aVqgYZ9ygydTgedHq09pzw",
    "go": "https://drive.google.com/uc?id=18nIHKBahzkK4Xgm5mHRCOY2npiTC2NLd",
    "java": "https://drive.google.com/uc?id=1lP-H7D0IpqijmaseigcyqkKBzxWdwmYH",
    "javascript": "https://drive.google.com/uc?id=1OxM0VFhDi2P8WsOuL0pKzZ8MD-CErzqP",
    "julia": "https://drive.google.com/uc?id=13_GehtPCUgD1Df6p1-CF0vcEfzMtBTEj",
    "nix": "https://drive.google.com/uc?id=13W5w4OgcmTEakOSOVGvtqmm97_Px6O5z",
    "php": "https://drive.google.com/uc?id=1lGzi98rQn4qRnidKpn0jchL8QyLS6gUT",
    "python": "https://drive.google.com/uc?id=1jhadgdOng1I95cwtmNJz2fqW-SUvhpch",
    "ruby": "https://drive.google.com/uc?id=1geDqNll4ewd8zqmvUPg9uNrCMZ1iHbQz",
from ncc.utils.file_ops import file_io
from ncc.utils.file_ops import json_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    from ncc.data.dictionary import TransformersDictionary

    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    for topk in [1, 3, 5]:
        attributes = ['code', 'ast', 'dfs']
        dst_dir = os.path.join(DATASET_DIR, 'codedisen', 'data')
        for lang in LANGUAGES:
            PathManager.mkdir(os.path.join(dst_dir, f"top{topk}", lang))
        for mode in MODES:
            readers = [
                file_io.open(os.path.join(ATTRIBUTES_DIR, f"top{topk}", lang, f"{mode}.{attr}"), 'r')
                for lang in LANGUAGES for attr in attributes
            ]
            writers = [
                file_io.open(os.path.join(dst_dir, f"top{topk}", lang, f"{mode}.{attr}"), 'w')
                for lang in LANGUAGES for attr in attributes
            ]
            writers += [
                file_io.open(
# tests: 1 X 1
for mode in MODES:
    code_num = 0
    TOPK = 1 if mode != "train" else args.topk
    file = os.path.join(args.dataset_dir, f"{mode}.jsonl")
    id_file = os.path.join(args.flatten_dir, f"top{args.topk}", f"{mode}.id")
    print(id_file)
    jv_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'java', f"{mode}.code")
    jv_raw_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'java', f"{mode}.raw_code")
    PathManager.mkdir(os.path.dirname(jv_code))
    py_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'python', f"{mode}.code")
    py_raw_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'python', f"{mode}.raw_code")
    PathManager.mkdir(os.path.dirname(py_code))
    with file_io.open(file, 'r') as reader, file_io.open(id_file, 'w') as id_writer, \
        file_io.open(jv_code, 'w') as jv_code_writer, file_io.open(jv_raw_code, 'w') as jv_raw_writer, \
        file_io.open(py_code, 'w') as py_code_writer, file_io.open(py_raw_code, 'w') as py_raw_writer:
        for line in reader:
            line = json_io.json_loads(line)
            id, jv_codes, py_codes = line['id'], line['java'][:TOPK], line['python'][:TOPK]