def main(args):
    """Dump the raw code of each dataset example and its preserved tokens to disk.

    Args:
        args: docopt-style dict with keys 'TAR_FILES' (input dataset shards)
            and 'OUTPUT_CODE_FILE' (path of the output code-lines file; the
            preserved-tokens file is written next to it with a
            '.preserved_tokens.txt' suffix).

    NOTE(review): the tokenization/write steps in the loop are commented out,
    so the code-lines file is currently created empty and the preserved-tokens
    file lists nothing — confirm whether this is intentional before relying on
    the outputs.
    """
    dataset = Dataset(args['TAR_FILES'])
    all_preserved_tokens = set()
    # `with` guarantees the file is closed even if iteration raises,
    # unlike the original open()/close() pair; it also matches the style
    # already used for the preserved-tokens file below.
    with open(args['OUTPUT_CODE_FILE'], 'w') as code_line_file:
        for example in dataset.get_iterator(num_workers=5):
            code = example.ast.code  # kept for the disabled tokenization below
            # code_tokens = tokenize_raw_code(code)
            # preserved_tokens = [token for token in code_tokens
            #                     if token.startswith('@@') and token.endswith('@@')]
            # all_preserved_tokens.update(preserved_tokens)
            # code_line_file.write(' '.join(code_tokens) + '\n')

    with open(args['OUTPUT_CODE_FILE'] + '.preserved_tokens.txt', 'w') as f:
        for token in all_preserved_tokens:
            f.write(token + '\n')
# Build vocabularies from the training set: collect AST node types, source
# (decompiler-assigned) variable names, target (human-renamed) variable names,
# and identifier names of 'obj'/'block' nodes.
args = docopt(__doc__)
vocab_size = int(args['--size'])
vocab_file = args['VOCAB_FILE']
train_set = Dataset(args['TRAIN_FILE'])

src_code_tokens_file = vocab_file + '.src_code_tokens.txt'
src_preserved_tokens = set()
# NOTE(review): f_src_token is opened here but not written or closed in this
# section — presumably later code (outside this view) uses it; verify it is
# closed there.
f_src_token = open(src_code_tokens_file, 'w')

# extract vocab and node types
node_types = set()
src_words = []          # original variable names (vocabulary source side)
tgt_words = []          # renamed variable names that differ from the original
identifier_names = []   # names of 'obj'/'block' nodes carrying a name
type_tokens = []
for example in train_set.get_iterator(progress=True, num_workers=5):
    for node in example.ast:
        node_types.add(node.node_type)
        if node.is_variable_node:
            old_var_name = node.old_name
            new_var_name = node.new_name
            src_words.append(old_var_name)
            # Only names the human annotator actually changed go to the target
            # vocabulary.
            if old_var_name != new_var_name:
                tgt_words.append(new_var_name)
        # BUG FIX: the original condition was
        #   node.node_type == 'obj' or node.node_type == 'block' and hasattr(node, 'name')
        # `and` binds tighter than `or`, so the hasattr() guard did NOT cover
        # 'obj' nodes and node.name could raise AttributeError for an 'obj'
        # node without a name. The membership test applies the guard to both.
        if node.node_type in ('obj', 'block') and hasattr(node, 'name'):
            identifier_names.append(node.name)