def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """docstring_tokens => filtered docstring_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                # drop separator runs (e.g. "----", "****") and HTML/XML tags
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                # discard docstrings containing non-ASCII tokens
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                # keep docstrings of 4..50 tokens only
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
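# ---------------------------------------------------------------------------
# A minimal driver sketch (an assumption, not part of this module): the *_fn
# workers in this file share one convention -- each worker re-opens the input
# file, seeks to byte offset `start`, and stops once `reader.tell()` passes
# `end`, so one file can be split across processes. Keyword options travel as
# `[kwargs]` because dicts cannot be fed directly through the worker signature,
# hence the `kwargs = args[0][0]` unpacking above. Both helpers below
# (`find_offsets`, `run_workers`) are hypothetical names for illustration.
def find_offsets(filename, num_chunks):
    # split `filename` into num_chunks byte ranges aligned on line boundaries
    size = os.path.getsize(filename)
    offsets = [0]
    with open(filename, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()  # advance to the next full line
            offsets.append(f.tell())
    offsets.append(-1)  # -1 => the last worker reads to EOF
    return offsets


def run_workers(worker_fn, filename, dest_filename, num_workers, **kwargs):
    from multiprocessing import Pool
    offsets = find_offsets(filename, num_workers)
    with Pool(num_workers) as pool:
        results = [
            pool.apply_async(
                worker_fn,
                (filename, dest_filename, i, offsets[i], offsets[i + 1], [kwargs]),
            )
            for i in range(num_workers)
        ]
        for res in results:
            res.get()  # re-raise worker errors, if any
# e.g. run_workers(docstring_tokens_fn, in_file, out_prefix, num_workers=4)
# ---------------------------------------------------------------------------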
def tokenization(in_file, out_file, lang, attr, start=0, end=-1):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()
            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line, remove_eol=True, remove_url=True)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
def type_tokenize(line, **kwargs):
    paths = json_io.json_loads(line)
    subtokens = []
    for p in paths:
        _, body, _ = p.split(constants.PATH_SEP)
        subtokens.extend(body.split(constants.PATH_LINK))
    return subtokens
def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code => raw_ast"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    lang = kwargs.get('lang')
    so_dir = kwargs.get('so_dir')
    so_filename = os.path.join(os.path.expanduser(so_dir), '{}.so'.format(lang))
    parser = TreeSitterASTParser(so_filename, lang)
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            if code:
                raw_ast = parser.parse_raw_ast(code)
            else:
                raw_ast = None
            print(json_io.json_dumps(raw_ast), file=writer)
            line = safe_readline(reader)
def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                try:
                    ast = util_ast.value2children(ast)
                    ast = util_ast.remove_root_with_uni_child(ast)
                    root_idx = util_ast.get_root_idx(ast)
                    ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                    bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                except RecursionError:
                    LOGGER.error('RecursionError, ignore this tree')
                    bin_ast = None
                except Exception as err:
                    LOGGER.error(err)
                    bin_ast = None
            else:
                bin_ast = None
            print(json_io.json_dumps(bin_ast), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename_terminals, dest_filename = \
        dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, \
            file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                if paths is None:
                    paths = [[None] * 3] * PATH_NUM
                else:
                    # pad the path list up to PATH_NUM by resampling existing paths
                    if len(paths) < PATH_NUM:
                        supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                     + random.sample(range(len(paths)), (PATH_NUM - len(paths)) % len(paths))
                        paths.extend([paths[i] for i in supply_ids])
                    random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                head, body, tail = zip(*paths)
            else:
                head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
            # terminals
            for terminal in itertools.chain(*zip(head, tail)):
                print(json_io.json_dumps(terminal), file=writer_terminals)
            # path
            for b in body:
                print(json_io.json_dumps(b), file=writer)
            line = safe_readline(reader)
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code_tokens => filtered code_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments out of code_tokens, e.g. //***\n, /* */\n
                code_tokens = [
                    token for token in code_tokens
                    if not (str.startswith(token, '//') or str.startswith(token, '#') or
                            (str.startswith(token, '/*') and str.endswith(token, '*/')))
                ]
                # discard snippets containing non-ASCII tokens
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                if code_tokens is None or len(code_tokens) < 1:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
def subtoken_tokenize(line, **kwargs):
    paths = json_io.json_loads(line)
    subtokens = []
    for p in paths:
        head, _, tail = p.split(constants.PATH_SEP)
        subtokens.extend(head.split(constants.PATH_LINK))
        subtokens.extend(tail.split(constants.PATH_LINK))
    return subtokens
def type_tokenize(line, **kwargs):
    line = json_io.json_loads(line)
    paths = line.split(' ')[1:]
    subtokens = []
    for p in paths:
        _, body, _ = p.split(',')
        subtokens.extend(body.split('|'))
    return subtokens
def tokenize_func(line):
    dp = []
    for node in json_io.json_loads(line):
        if "value" in node:
            dp.append(node["value"])
        else:
            dp.append(node["type"])
    return dp
def subtoken_tokenize(line, **kwargs):
    line = json_io.json_loads(line)
    paths = line.split(' ')[1:]
    subtokens = []
    for p in paths:
        head, _, tail = p.split(',')
        subtokens.extend(head.split('|'))
        subtokens.extend(tail.split('|'))
    return subtokens
def _func(line):
    line = py150_util.separate_dps(
        json_io.json_loads(line.strip()), args['preprocess']['n_ctx'])
    line = [
        py150_util.get_dfs(ast) + [ext]
        for ast, ext in line if len(ast) > 1
    ]
    # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
    return line
def path_tokenize(line, **kwargs):
    paths = json_io.json_loads(line)
    paths = paths[:kwargs['max_path_num']]
    # do not sample paths randomly, to keep our generated datasets the same
    # if len(paths) > kwargs['max_path_num']:
    #     paths = np.random.choice(paths, kwargs['max_path_num'], replace=False).tolist()
    heads, bodies, tails = [], [], []
    for p in paths:
        head, body, tail = p.split(constants.PATH_SEP)
        heads.append(head.split(constants.PATH_LINK))
        bodies.append(body.split(constants.PATH_LINK))
        tails.append(tail.split(constants.PATH_LINK))
    return heads, bodies, tails
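# A worked example of the serialized path format consumed above, assuming
# constants.PATH_SEP == ',' and constants.PATH_LINK == '|' (consistent with
# the hard-coded separators in the type_tokenize/subtoken_tokenize variants
# above, but still an assumption about `constants`):
def _path_format_demo():
    path = "my|name,NameExpr|Call|NameExpr,get|name"
    head, body, tail = path.split(',')
    assert head.split('|') == ['my', 'name']                    # head terminal subtokens
    assert body.split('|') == ['NameExpr', 'Call', 'NameExpr']  # internal node types on the path
    assert tail.split('|') == ['get', 'name']                   # tail terminal subtokens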
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
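# A hypothetical sanity check for the rows built above (a sketch of the length
# invariants of parse_source_input/parse_target_input, assuming config.* are
# set; `_check_parsed_lengths` is an illustrative name, not part of the repo):
def _check_parsed_lengths(src_line, tgt_line):
    source_ids, source_mask, source_size = src_line
    assert len(source_ids) == len(source_mask) == config.MAX_SOURCE_LENGTH
    assert sum(source_mask) == source_size  # `source_size` ones, then padding zeros
    target_ids, target_mask, target_size = tgt_line
    assert len(target_ids) == len(target_mask) == config.MAX_TARGET_LENGTH
    assert sum(target_mask) == target_size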
def binarize(args, in_file: str, out_file: str, vocab, token_dict, offset: int, end: int):
    ds = indexed_dataset.make_builder(f"{out_file}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(in_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            code_tokens = vocab.encode(line, out_type=str)
            code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
            ds.add_item(code_tokens)
            line = reader.readline()
    ds.finalize(f'{out_file}.idx')
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, int(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap", impl='mmap', vocab_size=len(vocab))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor([dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
def type_tokenize_func(line):
    ast = json_io.json_loads(line)
    code_types = []
    idx = 0
    while idx < len(ast):
        if ast[idx].get('type', None) in {"attr", "Num", "NameLoad", "NameStore", "NameParam"}:
            code_types.extend([constants.PAD, ast[idx]['type']])
            idx += 2
        else:
            code_types.append(constants.PAD)
            idx += 1
    return code_types
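# A minimal input/output sketch for type_tokenize_func, assuming the py150
# flattened-AST layout where a typed name/number node is immediately followed
# by its value node (and that json_io.json_loads behaves like json.loads):
def _type_tokenize_demo():
    import json
    ast = [
        {"type": "NameLoad"}, {"value": "x"},  # consumed as a (type, value) pair
        {"type": "BinOpAdd"},                  # any other node contributes one PAD
        {"type": "Num"}, {"value": "1"},
    ]
    code_types = type_tokenize_func(json.dumps(ast))
    # expected: [PAD, 'NameLoad', PAD, PAD, 'Num'] with PAD == constants.PAD
    return code_types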
def ast_fn(filename, dest_filename, idx, start=0, end=-1):
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r", encoding="UTF-8") as reader, open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = convert(line)
            print(json_io.json_dumps(ast), file=writer)
            line = file_io.safe_readline(reader)
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """copy docstrings through to the chunked output file"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def code_wo_func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    func_filename = filename[:str.rfind(filename, '.')] + '.func'
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, open(func_filename, 'r') as func_reader, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        func_line = safe_readline(func_reader)
        while line and func_line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            func_name = json_io.json_loads(func_line)
            # strip the first occurrence of the function name out of the code
            start_idx = str.find(code, func_name)
            if start_idx != -1:
                code_wo_func = code[:start_idx] + code[start_idx + len(func_name):]
            else:
                code_wo_func = None
            print(json_io.json_dumps(code_wo_func), file=writer)
            line = safe_readline(reader)
            func_line = safe_readline(func_reader)
def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"),
                        LANGUAGE=kwargs['lang'])
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
            print(json_io.json_dumps(ast), file=writer)
            line = safe_readline(reader)
def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            func_name = json_io.json_loads(line)
            # keep only the last component of a dotted function name
            func = func_name.split('.')[-1]
            print(json_io.json_dumps(func), file=writer)
            line = safe_readline(reader)
def code_tokenization(src_file):
    from clgen._atomizer import GreedyAtomizer
    from clgen._langs import Language

    with open(src_file, 'r') as reader:
        src_codes = reader.readlines()
    opencl_lang = Language.from_str('opencl')
    atomizer = GreedyAtomizer.from_text(opencl_lang, text='\n'.join(src_codes))

    dst_file = f"{src_file}_tokens"
    with open(dst_file, 'w') as writer:
        for code in src_codes:
            code = json_io.json_loads(code)
            code_tokens = atomizer.atomize(code)
            code_tokens = [atomizer.atoms[idx] for idx in code_tokens]
            print(json_io.json_dumps(code_tokens), file=writer)
def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast_traversal = util_traversal.get_dfs(ast)
            else:
                ast_traversal = None
            print(json_io.json_dumps(ast_traversal), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            # write one output line per input line so files stay aligned
            if ast:
                paths = util_path.ast_to_path(ast)
            else:
                paths = None
            print(json_io.json_dumps(paths), file=writer)
            line = safe_readline(reader)
def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast is not None:
                dfs, _ = ast_to_dfs(ast)
            else:
                dfs = None
            print(json_io.json_dumps(dfs), file=writer)
            line = safe_readline(reader)
def add_from_file(self, f):
    """
    Loads a pre-existing dictionary from a text file and adds its symbols
    to this instance.
    """
    if isinstance(f, str):
        try:
            with file_io.open(f, "r") as fd:
                self.add_from_file(fd)
        except FileNotFoundError as fnfe:
            raise fnfe
        except UnicodeError:
            raise Exception("Incorrect encoding detected in {}, please "
                            "rebuild the dataset".format(f))
        return

    lines = f.readlines()
    indices_start_line = self._load_meta(lines)

    for line in lines[indices_start_line:]:
        try:
            raw_line = json_io.json_loads(line.rstrip())
            line, field = raw_line[:-1], raw_line[-1]
            if field == "#fairseq:overwrite":
                overwrite = True
                line, field = line[:-1], line[-1]
            else:
                overwrite = False
            count = int(field)
            # `line` is now a single-element list; unwrap the token itself
            word = line[0]
            if word in self and not overwrite:
                raise RuntimeError(
                    "Duplicate word found when loading Dictionary: '{}'. "
                    "Duplicate words can overwrite earlier ones by adding the "
                    "#fairseq:overwrite flag at the end of the corresponding row "
                    "in the dictionary file. If using the Camembert model, please "
                    "download an updated copy of the model file.".format(word)
                )
            self.add_symbol(word, n=count, overwrite=overwrite)
        except ValueError:
            raise ValueError(
                "Incorrect dictionary format, expected '<token> <cnt> [flags]'"
            )
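# Expected row format in the jsonl dictionary consumed by add_from_file, as
# implied by the parsing above (token and count values are illustrative):
#   ["def", 1504]                         -> add_symbol("def", n=1504)
#   ["def", 1504, "#fairseq:overwrite"]   -> overwrite an earlier "def" entry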
def binarize_dfs(args, filename: str, dict, in_file: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))
    with file_io.open(filename, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(in_file))
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
    # close attribute writers so buffered lines are flushed to disk
    for writer in attr_writers.values():
        writer.close()
def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast = util_ast.value2children(ast)
                padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                root_idx = util_ast.get_root_idx(padded_ast)
                sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
            else:
                sbt = None
            print(json_io.json_dumps(sbt), file=writer)
            line = safe_readline(reader)