def process_heart_stone_dataset(data_file=None):
    """Parse a HearthStone code file and dump its grammar and parse trees.

    Every line of ``data_file`` is treated as one example; each ``§`` in the
    line is replaced by a newline before the code is handed to ``parse``.
    As a sanity check the parse tree is converted back to a Python AST and
    rendered with ``astor.to_source``; the result must equal the rendering
    of ``ast.parse`` applied to the same code.

    The grammar returned by ``get_grammar`` is written to ``hs.grammar.txt``
    and the parse trees to ``hs.parse_trees.txt`` (one ``repr`` per line,
    relative to the current working directory). The average number of
    productions per example is printed at the end; ``0.0`` is reported when
    the input file contains no lines.

    Args:
        data_file: Path of the input file. When omitted, the original
            hard-coded HearthStone path is used.

    Raises:
        RuntimeError: If the regenerated source differs from the reference.
    """
    if data_file is None:
        data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'

    parse_trees = []
    rule_num = 0.
    example_num = 0

    # A context manager guarantees the input handle is closed, even when the
    # sanity check below raises.
    with open(data_file) as examples:
        for line in examples:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check: tree -> AST -> source must round-trip
            pred_ast = parse_tree_to_python_ast(parse_tree)
            pred_code = astor.to_source(pred_ast)
            ref_ast = ast.parse(code)
            ref_code = astor.to_source(ref_ast)

            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            example_num += 1

            parse_trees.append(parse_tree)

    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            # Do not bind the text to the name ``str``; that would shadow
            # the builtin for the rest of the function.
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # Avoid ZeroDivisionError on an empty input file.
    avg_rules = rule_num / example_num if example_num else 0.
    print('avg. nums of rules: %f' % avg_rules)
def is_well_formed_python_code(_hyp):
    """Return whether a decoded hypothesis can be parsed by ``parse``.

    Before parsing, the placeholder ``#NEWLINE#`` is replaced by a newline,
    ``#INDENT#`` by a space and `` #MERGE# `` (with its surrounding spaces)
    by the empty string.

    Args:
        _hyp: The hypothesis text containing the placeholders above.

    Returns:
        ``True`` when placeholder expansion and ``parse`` both succeed,
        ``False`` when either step raises an ``Exception``.
    """
    try:
        # NOTE(review): '#INDENT#' expands to a single space as written;
        # confirm this is the intended indentation width.
        _hyp = _hyp.replace('#NEWLINE#', '\n').replace('#INDENT#', ' ').replace(' #MERGE# ', '')
        parse(_hyp)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return False
    return True
def extract_grammar(code_file, prefix='py'):
    """Parse every line of a code file and extract the grammar it uses.

    Each stripped line of ``code_file`` is parsed with ``parse``. As a
    sanity check the parse tree is converted back to a Python AST and
    rendered with ``astor.to_source``; the result must equal the rendering
    of the first statement of ``ast.parse(canonicalize_code(code))``.

    The grammar returned by ``get_grammar`` is written to
    ``<prefix>.grammar.txt`` and the parse trees to
    ``<prefix>.parse_trees.txt`` (one ``repr`` per line). The number of
    processed lines is printed.

    Args:
        code_file: Path of the file holding one code snippet per line.
        prefix: Prefix of the two output file names.

    Returns:
        A ``(grammar, parse_trees)`` tuple.

    Raises:
        AssertionError: If a parse tree does not round-trip to the same
            source text as the reference AST.
    """
    line_num = 0
    parse_trees = []

    # A context manager guarantees the input handle is closed, even when the
    # sanity check below fails.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            # sanity check: tree -> AST -> source must round-trip
            ast_tree = parse_tree_to_python_ast(parse_tree)
            ref_ast_tree = ast.parse(canonicalize_code(code)).body[0]
            source1 = astor.to_source(ast_tree)
            source2 = astor.to_source(ref_ast_tree)

            assert source1 == source2, \
                'round-trip mismatch on line %d of %s' % (line_num + 1, code_file)

            line_num += 1

    print('total line of code: %d' % line_num)

    grammar = get_grammar(parse_trees)

    with open(prefix + '.grammar.txt', 'w') as f:
        for rule in grammar:
            # Do not bind the text to the name ``str``; that would shadow
            # the builtin for the rest of the function.
            f.write(repr(rule) + '\n')

    with open(prefix + '.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    return grammar, parse_trees
def rule_vs_node_stat(code_file=None):
    """Print the average number of nodes and productions per parse tree.

    Every line of ``code_file`` is treated as one example; each ``§`` in the
    line is replaced by a newline before the code is handed to ``parse``.
    For each tree the function counts ``parse_tree.nodes`` and the rules
    returned by ``parse_tree.get_productions()``, then prints both averages.
    ``0.0`` is reported for both when the file contains no lines.

    Args:
        code_file: Path of the input file. When omitted, the original
            hard-coded HearthStone path is used.
    """
    if code_file is None:
        code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
        # alternative input used previously:
        # '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'

    line_num = 0
    node_nums = rule_nums = 0.

    # The trees themselves are not needed afterwards, so only the running
    # totals are kept. The context manager closes the input handle.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            node_nums += len(list(parse_tree.nodes))
            rules, _ = parse_tree.get_productions()
            rule_nums += len(rules)
            line_num += 1

    # Avoid ZeroDivisionError on an empty input file.
    avg_nodes = node_nums / line_num if line_num else 0.
    avg_rules = rule_nums / line_num if line_num else 0.

    print('avg. nums of nodes: %f' % avg_nodes)
    print('avg. nums of rules: %f' % avg_rules)
def rule_vs_node_stat(code_file=None):
    """Print the average number of nodes and productions per parse tree.

    Every line of ``code_file`` is treated as one example; each ``§`` in the
    line is replaced by a newline before the code is handed to ``parse``.
    For each tree the function counts ``parse_tree.nodes`` and the rules
    returned by ``parse_tree.get_productions()``, then prints both averages.
    ``0.0`` is reported for both when the file contains no lines.

    Args:
        code_file: Path of the input file. When omitted, the original
            hard-coded ``en-django/all.code`` path is used.
    """
    if code_file is None:
        code_file = '/home1/zjq/try3/en-django/all.code'

    example_count = 0
    node_nums = rule_nums = 0.

    # Only running totals are kept; the parsed trees are not used again.
    # The context manager closes the input handle.
    with open(code_file) as source:
        for raw_line in source:
            code = raw_line.replace('§', '\n').strip()
            parse_tree = parse(code)
            node_nums += len(list(parse_tree.nodes))
            rules, _ = parse_tree.get_productions()
            rule_nums += len(rules)
            example_count += 1

    # Avoid ZeroDivisionError on an empty input file.
    if example_count:
        avg_nodes = node_nums / example_count
        avg_rules = rule_nums / example_count
    else:
        avg_nodes = avg_rules = 0.

    print('avg. nums of nodes: %f' % avg_nodes)
    print('avg. nums of rules: %f' % avg_rules)
def get_unary_links(data_file=None):
    """Rank unary links by frequency and report statistics after applying them.

    Every line of ``data_file`` is treated as one example; each ``§`` in the
    line is replaced by a newline before the code is handed to ``parse``.
    The links returned by ``extract_unary_closure`` are counted over all
    trees and printed from most to least frequent as ``link ||| count``.

    The 20 most frequent links are turned into closures with
    ``unary_link_to_closure`` and applied to every tree via
    ``apply_unary_closure``; ``compressed_ast_to_normal`` is then called on
    the tree, which must compare equal to a copy taken beforehand. Finally
    the average numbers of nodes and productions per tree are printed
    (``0.0`` when the input file contains no lines).

    Args:
        data_file: Path of the input file. When omitted, the original
            hard-coded ``en-django/all.code`` path is used.

    Raises:
        AssertionError: If a tree differs from its original copy after the
            compress / restore round trip.
    """
    if data_file is None:
        # data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
        data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'

    parse_trees = []
    unary_links_counter = Counter()

    # The context manager closes the input handle once all lines are read.
    with open(data_file) as examples:
        for line in examples:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            example_unary_links = extract_unary_closure(parse_tree)
            for link in example_unary_links:
                unary_links_counter[link] += 1

    ranked_links = sorted(unary_links_counter, key=unary_links_counter.get, reverse=True)
    for link in ranked_links:
        print(str(link) + ' ||| ' + str(unary_links_counter[link]))

    unary_links = ranked_links[:20]
    closures = [unary_link_to_closure(link) for link in unary_links]

    # On Python 3 ``zip`` yields a one-shot iterator. It must be materialised
    # because the pairs are re-iterated for every parse tree; otherwise only
    # the first tree would have any closure applied.
    unary_closures = list(zip(unary_links, closures))

    node_nums = rule_nums = 0.
    for parse_tree in parse_trees:
        original_parse_tree = parse_tree.copy()
        for link, closure in unary_closures:
            apply_unary_closure(parse_tree, closure, link)

        # assert original_parse_tree != parse_tree
        compressed_ast_to_normal(parse_tree)
        assert original_parse_tree == parse_tree

        # NOTE(review): the statistics are gathered after the tree has been
        # restored by compressed_ast_to_normal; confirm that is intended.
        rules, _ = parse_tree.get_productions()
        rule_nums += len(rules)
        node_nums += len(list(parse_tree.nodes))

    # Avoid ZeroDivisionError on an empty input file.
    tree_count = len(parse_trees)
    avg_nodes = node_nums / tree_count if tree_count else 0.
    avg_rules = rule_nums / tree_count if tree_count else 0.

    print('**** after applying unary closures ****')
    print('avg. nums of nodes: %f' % avg_nodes)
    print('avg. nums of rules: %f' % avg_rules)
def get_unary_links(data_file=None):
    """Rank unary links by frequency and report statistics after applying them.

    Every line of ``data_file`` is treated as one example; each ``§`` in the
    line is replaced by a newline before the code is handed to ``parse``.
    The links returned by ``extract_unary_closure`` are counted over all
    trees and printed from most to least frequent as ``link ||| count``.

    The 20 most frequent links are turned into closures with
    ``unary_link_to_closure`` and applied to every tree via
    ``apply_unary_closure``; ``compressed_ast_to_normal`` is then called on
    the tree, which must compare equal to a copy taken beforehand. Finally
    the average numbers of nodes and productions per tree are printed
    (``0.0`` when the input file contains no lines).

    Args:
        data_file: Path of the input file. When omitted, the original
            hard-coded ``en-django/all.code`` path is used.

    Raises:
        AssertionError: If a tree differs from its original copy after the
            compress / restore round trip.
    """
    if data_file is None:
        # data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
        data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'

    parse_trees = []
    unary_links_counter = Counter()

    # The context manager closes the input handle once all lines are read.
    with open(data_file) as source:
        for raw_line in source:
            code = raw_line.replace('§', '\n').strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            example_unary_links = extract_unary_closure(parse_tree)
            for link in example_unary_links:
                unary_links_counter[link] += 1

    ranked_links = sorted(unary_links_counter, key=unary_links_counter.get, reverse=True)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    for link in ranked_links:
        print(str(link) + ' ||| ' + str(unary_links_counter[link]))

    unary_links = ranked_links[:20]
    unary_closures = []
    for link in unary_links:
        unary_closures.append(unary_link_to_closure(link))

    # Materialise the pairs: they are re-iterated for every parse tree, and
    # on Python 3 a bare ``zip`` would be exhausted after the first tree.
    unary_closures = list(zip(unary_links, unary_closures))

    node_nums = rule_nums = 0.
    for parse_tree in parse_trees:
        original_parse_tree = parse_tree.copy()
        for link, closure in unary_closures:
            apply_unary_closure(parse_tree, closure, link)

        # assert original_parse_tree != parse_tree
        compressed_ast_to_normal(parse_tree)
        assert original_parse_tree == parse_tree

        # NOTE(review): the statistics are gathered after the tree has been
        # restored by compressed_ast_to_normal; confirm that is intended.
        rules, _ = parse_tree.get_productions()
        rule_nums += len(rules)
        node_nums += len(list(parse_tree.nodes))

    # Avoid ZeroDivisionError on an empty input file.
    if parse_trees:
        avg_nodes = node_nums / len(parse_trees)
        avg_rules = rule_nums / len(parse_trees)
    else:
        avg_nodes = avg_rules = 0.

    print('**** after applying unary closures ****')
    print('avg. nums of nodes: %f' % avg_nodes)
    print('avg. nums of rules: %f' % avg_rules)
def process_heart_stone_dataset(data_file=None):
    """Parse a code file and dump the induced grammar and parse trees.

    Every line of ``data_file`` is treated as one example; each ``§`` in the
    line is replaced by a newline before the code is handed to ``parse``.
    As a sanity check the parse tree is converted back to a Python AST and
    rendered with ``astor.to_source``; the result must equal the rendering
    of ``ast.parse`` applied to the same code.

    The grammar returned by ``get_grammar`` is written to ``hs.grammar.txt``
    and the parse trees to ``hs.parse_trees.txt`` (one ``repr`` per line,
    relative to the current working directory). The average number of
    productions per example is printed at the end; ``0.0`` is reported when
    the input file contains no lines.

    Args:
        data_file: Path of the input file. When omitted, the original
            hard-coded path is used.

    Raises:
        RuntimeError: If the regenerated source differs from the reference.
    """
    if data_file is None:
        # NOTE(review): the default points at an en-django ``all.anno`` file
        # while the outputs are named ``hs.*``; confirm this is intended.
        data_file = '/home1/zjq/try3/en-django/all.anno'

    parse_trees = []
    rule_num = 0.
    example_num = 0

    # A context manager guarantees the input handle is closed, even when the
    # sanity check below raises.
    with open(data_file) as source:
        for raw_line in source:
            code = raw_line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check: tree -> AST -> source must round-trip
            pred_code = astor.to_source(parse_tree_to_python_ast(parse_tree))
            ref_code = astor.to_source(ast.parse(code))

            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            example_num += 1

            parse_trees.append(parse_tree)

    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            # Do not bind the text to the name ``str``; that would shadow
            # the builtin for the rest of the function.
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # Avoid ZeroDivisionError on an empty input file.
    if example_num:
        avg_rules = rule_num / example_num
    else:
        avg_rules = 0.
    print('avg. nums of rules: %f' % avg_rules)
f_test.close() f_train_rawid.close() f_dev_rawid.close() f_test_rawid.close() # print 'num. of decoding time steps distribution:' for k in sorted(decode_time_steps): print '%d\t%d' % (k, decode_time_steps[k]) if __name__ == '__main__': init_logging('py.log') # code = "return ( format_html_join ( '' , '_STR:0_' , sorted ( attrs . items ( ) ) ) + format_html_join ( '' , ' {0}' , sorted ( boolean_attrs ) ) )" code = "call('{0}')" parse_tree = parse(code) # parse_tree = ASTNode('root', children=[ # ASTNode('lambda'), # ASTNode('$0'), # ASTNode('e', children=[ # ASTNode('and', children=[ # ASTNode('>', children=[ASTNode('$0')]), # ASTNode('from', children=[ASTNode('$0'), ASTNode('ci0')]), # ]) # ]), # ]) original_parse_tree = parse_tree.copy() break_value_nodes(parse_tree)