proof_steps[split][-1]['goal_id'] = proof_data['goal_id'] proof_steps[split][-1]['length'] = proof_data['length'] else: proof_steps[split][-1]['is_synthetic'] = False if __name__ == '__main__': arg_parser = argparse.ArgumentParser(description='Extract the proof steps from CoqGym for trainig ASTactic via supervised learning') arg_parser.add_argument('--data_root', type=str, default='../data', help='The folder for CoqGym') arg_parser.add_argument('--output', type=str, default='./proof_steps/', help='The output file') arg_parser.add_argument('--filter', type=str, help='filter the proofs') args = arg_parser.parse_args() print(args) iter_proofs(args.data_root, process_proof, include_synthetic=False, show_progress=True) for split in ['train', 'valid']: print("Proof steps {}: {}".format(split, len(proof_steps[split]))) for i, step in enumerate(proof_steps[split]): dirname = os.path.join(args.output, split) if not os.path.exists(dirname): os.makedirs(dirname) if args.filter: pickle.dump(step, open(os.path.join(dirname, '%s-%08d.pickle' % (args.filter, i)), 'wb')) else: pickle.dump(step, open(os.path.join(dirname, '%08d.pickle' % i), 'wb')) print('output saved to ', args.output)
    # Tail of process_proof (its `def` line is outside this chunk): appends
    # per-proof statistics to the module-level `data` dict of lists -- one
    # entry per proof -- which is turned into a DataFrame below.
    data["num_goals"].append(len(proof_data["goals"]))
    data["num_env_constants"].append(len(proof_data["env"]["constants"]))
    data["num_env_inductives"].append(len(proof_data["env"]["inductives"]))
    # Total environment size = constants + inductives (the two values just
    # appended for this proof).
    data["num_env"].append(
        data["num_env_constants"][-1] + data["num_env_inductives"][-1]
    )
    # data['num_env_constants_same_file'].append(len([const for const in proof_data['env']['constants']
    #                                                 if const['qualid'].startswith('SerTop.')]))
    # Mean number of hypotheses across this proof's goals.
    data["avg_size_local_context"].append(
        np.mean([len(g["hypotheses"]) for g in proof_data["goals"].values()])
    )


# Walk every proof in the dataset, letting process_proof fill `data`.
# NOTE(review): assumes `args` (with a `synthetic` flag and an `output`
# path) and the `data` dict are initialized earlier in the file -- not
# visible in this chunk.
iter_proofs(
    common.data_root,
    process_proof,
    include_synthetic=args.synthetic,
    show_progress=True,
)

# Dump the collected per-proof statistics to CSV.
df = pd.DataFrame(data)
df.to_csv(args.output)
print("output saved to ", args.output)

# show some statistics
print(df.describe())
print(
    df.groupby("project").agg(
        {"name": "count", "num_steps": "mean", "num_goals": "mean", "num_env": "mean"}
    )
)
# Per-tactic AST statistics accumulated across the whole dataset.
ast_height = []    # height of each successfully parsed tactic AST
num_tokens = []    # token count of each AST
num_chars = []     # character length of each tactic string (sans trailing '.')
has_argument = []  # 1 if the tactic takes an argument, else 0


def process_proof(filename, proof_data):
    """Parse every tactic command in `proof_data` and record AST statistics.

    Callback for iter_proofs. Steps that are not VernacExtend commands, do
    not end with '.', or fail to parse are skipped silently.

    NOTE: the original declared `global` for three of the four accumulator
    lists (inconsistently omitting has_argument); none are needed -- the
    lists are only mutated via .append, never rebound.
    """
    for step in proof_data['steps']:
        if step['command'][1] != 'VernacExtend':
            continue
        if not step['command'][0].endswith('.'):
            continue
        tac_str = step['command'][0][:-1]  # drop the trailing '.'
        try:
            tree = tree_builder.transform(grammar.parser.parse(tac_str))
        except (UnexpectedCharacters, ParseError):
            # Unparsable tactic: skip it rather than abort the whole run.
            continue
        ast_height.append(tree.height())
        num_tokens.append(tree.num_tokens())
        num_chars.append(len(tac_str))
        has_argument.append(int(tree.has_argument()))


iter_proofs(common.data_root, process_proof, show_progress=True)

print(np.mean(ast_height), np.mean(num_tokens),
      np.mean(num_chars), np.mean(has_argument))
    # Body of the counting callback passed to iter_proofs below (its `def`
    # line -- presumably `def count_proof(filename, proof_data):` -- lies
    # outside this chunk). Records proofs whose goals contain hypotheses
    # with multi-part terms into the module-level `special_terms_proofs`.
    global abnormal_proofs
    global abnormal_env_files
    tempname = proof_data['name']
    # Only consider proofs whose name appears in the term_proofs set
    # (defined elsewhere in the file).
    if tempname not in term_proofs:
        return
    goal_dict = proof_data['goals']
    tempenv = proof_data['env']
    """
    for xx in tempenv['constants']:
        if xx['type'] == None:
            abnormal_env_files.add(filename)
            break
    """
    for tempgoalid in goal_dict:
        tempgoal = goal_dict[tempgoalid]
        #if tempgoal['type'] == None:
        #    abnormal_proofs.add((filename, tempname))
        for temphypo in tempgoal['hypotheses']:
            # if temphypo['type'] == None:
            #     abnormal_proofs.add((filename, tempname))
            # A hypothesis whose 'term' holds more than one element marks
            # this proof as containing "special" terms.
            if len(temphypo['term']) > 1:
                special_terms_proofs.add((filename, tempname))
    # Presumably the project name is the third path component of the proof
    # file -- NOTE(review): `proj` appears unused in this chunk; confirm.
    proj = filename.split(os.path.sep)[2]


iter_proofs(common.data_root, count_proof, include_synthetic=False,
            show_progress=True)
print(special_terms_proofs)
# get the statistics of the proofs
import argparse
import common
from utils import iter_proofs
import pandas as pd
import numpy as np
import re
import pdb

# Collects the (single-command) bodies of very short proofs into a text file.
oup = open('short_proofs.txt', 'wt')


def process_proof(filename, proof_data):
    """Log the first command of any proof with at most two steps.

    Callback for iter_proofs: echoes the command to stdout and appends it
    to short_proofs.txt via the module-level handle `oup`.
    """
    if 1 <= len(proof_data['steps']) <= 2:
        # Hoist the repeated subscript chain into a local.
        cmd = proof_data['steps'][0]['command'][0]
        print(cmd)
        oup.write(cmd + '\n')


iter_proofs(common.data_root, process_proof)
oup.close()  # fix: flush and release the handle; the original never closed it