def __call__(self, context: TacticContext) -> int:
    """Return the 1-based keyword index of the head token of the
    hypothesis most similar to the goal, or 0 when there are no
    hypotheses or the head token is not a known keyword."""
    if not context.hypotheses:
        return 0
    goal_str = limitNumTokens(context.goal, self.max_length)

    def relevance(hyp_type: str) -> float:
        # Similarity to the goal, weighted by the hypothesis's token length.
        return (SequenceMatcher(None, goal_str, hyp_type).ratio()
                * len(get_symbols(hyp_type)))

    truncated_types = [limitNumTokens(serapi_instance.get_hyp_type(h),
                                      self.max_length)
                       for h in context.hypotheses]
    best_type = max(truncated_types, key=relevance)
    head = get_symbols(best_type)[0]
    try:
        return self.headKeywords.index(head) + 1
    except ValueError:
        return 0
def __call__(self, context: TacticContext) -> List[float]:
    """Return two features about identifiers appearing in the goal:

    [log(1 + #locally-bound identifiers),
     fraction of identifiers that are globally bound]

    An identifier counts as locally bound when it is a hypothesis
    variable or is bound by a forall/fun/let binder in the goal itself.
    """
    identifiers = get_symbols(context.goal)
    locally_bound_in_hyps = serapi_instance.get_vars_in_hyps(
        context.hypotheses)
    # Raw strings: \s and \S are regex escapes, not string escapes
    # (non-raw versions are invalid escape sequences in modern Python).
    binders = [r"forall\s+(.*)(?::.*)?,",
               r"fun\s+(.*)(?::.*)?,",
               r"let\s+\S+\s+:="]
    punctuation = ["(", ")", ":", ",", "_", ":=", "=>", "{|", "|}"]
    locally_bound_in_term = [
        var
        for binder_pattern in binders
        for var_string in re.findall(binder_pattern, context.goal)
        for var in re.findall(r"\((\S+)\s+:", var_string)
        if var not in punctuation]
    # Set lookups instead of repeated list concatenation + O(n) membership.
    non_global = (set(locally_bound_in_hyps) | set(locally_bound_in_term)
                  | set(punctuation))
    globally_bound = [ident for ident in identifiers
                      if ident not in non_global]
    non_local = set(globally_bound) | set(punctuation)
    locally_bound = [ident for ident in identifiers
                     if ident not in non_local]
    # Each binder-bound variable must appear among the local identifiers;
    # remove one occurrence so it is not double counted.
    for var in locally_bound_in_term:
        assert var in locally_bound, \
            "{}, {}".format(globally_bound, locally_bound_in_term)
        locally_bound.remove(var)
    total = len(globally_bound) + len(locally_bound)
    if total == 0:
        # Guard: a goal with no identifiers would otherwise divide by zero.
        return [0.0, 0.0]
    return [math.log1p(float(len(locally_bound))),
            float(len(globally_bound)) / float(total)]
def encode_seq_structural_data(data : RawDataset,
                               context_tokenizer_type : \
                                   Callable[[List[str], int], Tokenizer],
                               num_keywords : int,
                               num_reserved_tokens: int) -> \
        Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    """Tokenize a scraped dataset into (hypothesis token lists, goal
    token list, (tactic-stem index, argument indices)) triples.

    The keyword tokenizer is trained on every hypothesis and goal
    string, each paired with its encoded tactic token.
    """
    embedding = SimpleEmbedding()
    # Flatten to (hyp-or-goal string, encoded tactic) pairs.
    hyps_and_goals = list(itertools.chain.from_iterable(
        zip(hyps + [goal], itertools.repeat(embedding.encode_token(tactic)))
        for prev_tactics, hyps, goal, tactic in data))
    context_tokenizer = make_keyword_tokenizer_relevance(
        hyps_and_goals, context_tokenizer_type,
        num_keywords, num_reserved_tokens)
    encoded = []
    for prev_tactics, hyps, goal, tactic in data:
        stem, rest = serapi_instance.split_tactic(tactic)
        hyp_token_lists = [context_tokenizer.toTokenList(hyp) for hyp in hyps]
        arg_indices = [hyp_index(hyps, arg) for arg in get_symbols(rest)]
        encoded.append((hyp_token_lists,
                        context_tokenizer.toTokenList(goal),
                        (embedding.encode_token(stem), arg_indices)))
    return encoded, context_tokenizer, embedding
def __call__(self, context: TacticContext) -> int:
    """Map the goal's head token to a feature index.

    Returns 0 for an empty goal, 1 for an unrecognized head token, and
    keyword_index + 2 otherwise.
    """
    if not context.goal.strip():
        return 0
    head = get_symbols(context.goal)[0]
    try:
        return self.headKeywords.index(head) + 2
    except ValueError:
        return 1
def numeric_args(in_data : TacticContext, tactic : str,
                 next_in_data : TacticContext,
                 arg_values : argparse.Namespace) -> bool:
    """Return True when every argument of `tactic` is a decimal numeral.

    Fixes: drops an unused `goal_words` local (its get_symbols call had no
    effect on the result), uses a raw string for the regex, and replaces
    the manual loop with all().
    """
    stem, rest = serapi_instance.split_tactic(tactic)
    args = get_subexprs(rest.strip("."))
    # fullmatch anchors the whole token, so "12a" is rejected.
    return all(re.fullmatch(r"\d+", arg) for arg in args)
def get_arg_idx(max_length: int, inter: ScrapedTactic) -> int:
    """Return the 1-based position of the tactic's first argument among
    the focused goal's symbols, or 0 when that position is >= max_length.

    Asserts that the argument actually occurs in the goal.
    """
    _, tactic_rest = serapi_instance.split_tactic(inter.tactic)
    goal_symbols = tokenizer.get_symbols(inter.context.focused_goal)
    arg = tactic_rest.split()[0].strip(".")
    assert arg in goal_symbols, \
        "tactic: {}, arg: {}, goal: {}, symbols: {}".format(
            inter.tactic, arg, inter.context.focused_goal, goal_symbols)
    position = goal_symbols.index(arg)
    return 0 if position >= max_length else position + 1
def args_token_in_goal(in_data: ContextData, tactic: str,
                       next_in_data: ContextData,
                       arg_values: argparse.Namespace) -> bool:
    """Return True when every argument of `tactic` appears verbatim among
    the first `arg_values.max_length` symbols of the goal.

    Fixes: unused `stem` local renamed to `_`, `not x in` replaced with
    the idiomatic form, and membership tests moved to a set so each
    argument check is O(1) instead of rescanning the word list.
    """
    goal = in_data["goal"]
    goal_words = get_symbols(cast(str, goal))[:arg_values.max_length]
    _, rest = serapi_instance.split_tactic(tactic)
    args = get_subexprs(rest.strip("."))
    goal_word_set = set(goal_words)
    return all(arg in goal_word_set for arg in args)
def __init__(self, init_dataset: List[TacticContext],
             args: argparse.Namespace) -> None:
    """Collect the `args.num_head_keywords` most common goal head tokens.

    Fix: skip empty goals — `get_symbols(goal)[0]` would raise IndexError
    on them (the parallel from_data constructor in this file guards the
    same case).
    """
    headTokenCounts: typing.Counter[str] = Counter()
    for prev_tactics, hyps, goal in init_dataset:
        if goal.strip() == "":
            continue
        headTokenCounts[get_symbols(goal)[0]] += 1
    self.headKeywords = [
        word for word, _ in
        headTokenCounts.most_common(args.num_head_keywords)]
    eprint("Goal head keywords are {}".format(self.headKeywords),
           guard=args.print_keywords)
def get_stem_and_arg_idx(max_length: int, embedding: Embedding,
                         inter: ScrapedTactic) -> Tuple[int, int]:
    """Encode a scraped tactic as (stem index, goal-symbol argument index).

    The argument index is 1-based; 0 means the argument fell outside the
    first max_length goal symbols. Asserts the argument occurs in the goal.
    """
    stem, rest = serapi_instance.split_tactic(inter.tactic)
    stem_idx = embedding.encode_token(stem)
    goal_symbols = tokenizer.get_symbols(inter.context.focused_goal)
    arg = rest.split()[0].strip(".")
    assert arg in goal_symbols, \
        "tactic: {}, arg: {}, goal: {}, symbols: {}".format(
            inter.tactic, arg, inter.context.focused_goal, goal_symbols)
    position = goal_symbols.index(arg)
    if position >= max_length:
        return stem_idx, 0
    return stem_idx, position + 1
def __init__(self, init_dataset: List[TacticContext],
             args: argparse.Namespace) -> None:
    """Collect the most common hypothesis-type head tokens as keywords.

    NOTE(review): assumes every hypothesis type tokenizes to at least one
    symbol — confirm against the scraper's output.
    """
    self.max_length = args.max_length
    counts: typing.Counter[str] = Counter()
    for prev_tactics, hyps, goal in init_dataset:
        for hyp in hyps:
            hyp_type = serapi_instance.get_hyp_type(hyp)
            counts[get_symbols(hyp_type)[0]] += 1
    self.headKeywords = [
        word for word, _ in counts.most_common(args.num_head_keywords)]
    eprint("Hypothesis head keywords are {}".format(self.headKeywords),
           guard=args.print_keywords)
def predictKTactics(self, in_data : TacticContext, k : int) \
        -> List[Prediction]:
    """Predict `apply <var>.` for the k hypotheses most similar to the
    goal; fall back to `eauto` when there are no hypotheses.

    Certainties decay geometrically (.5 ** rank).

    BUG FIX: the similarity previously compared a *token list* against
    the raw goal *string*; SequenceMatcher matches elements, so tokens
    were compared to single characters and the ratio was ~0, leaving the
    ranking dominated by raw hypothesis length. Both sides are now token
    lists (the goal is tokenized once, outside the sort key).
    """
    if len(in_data.hypotheses) == 0:
        return [Prediction("eauto", 0)]
    k = min(k, len(in_data.hypotheses))
    goal_symbols = tokenizer.get_symbols(in_data.goal)
    best_hyps = sorted(
        in_data.hypotheses, reverse=True,
        key=lambda hyp: SequenceMatcher(
            None,
            tokenizer.get_symbols(serapi_instance.get_hyp_type(hyp)),
            goal_symbols).ratio() * len(hyp))[:k]
    return [Prediction("apply " +
                       serapi_instance.get_first_var_in_hyp(hyp) + ".",
                       .5 ** idx)
            for idx, hyp in enumerate(best_hyps)]
def __init__(self, init_dataset : List[TacticContext],
             args : argparse.Namespace) -> None:
    """Load head keywords from disk when available; otherwise compute the
    most common goal head tokens (optionally saving them).

    Fixes: the dataset scan is skipped entirely when keywords are loaded
    from disk (the counts were computed and discarded), and empty goals
    are skipped — `get_symbols(goal)[0]` would raise IndexError on them.
    """
    if args.load_head_keywords and Path2(args.load_head_keywords).exists():
        self.headKeywords = torch.load(args.load_head_keywords)
    else:
        headTokenCounts : typing.Counter[str] = Counter()
        for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
            if goal.strip() == "":
                continue
            headTokenCounts[get_symbols(goal)[0]] += 1
        self.headKeywords = [
            word for word, count in
            headTokenCounts.most_common(args.num_head_keywords)]
        if args.save_head_keywords:
            torch.save(self.headKeywords, args.save_head_keywords)
    eprint("Head keywords are {}".format(self.headKeywords),
           guard=args.print_keywords)
def from_data(init_dataset: List[TacticContext],
              args: argparse.Namespace) -> 'TopLevelTokenInBestHyp':
    """Build a TopLevelTokenInBestHyp from counted hypothesis head
    tokens, or from keywords previously saved with torch.save.

    BUG FIX: the compute branch called the constructor with only the
    keyword list, while the load branch passed `(args, keywords)`; the
    feature's __call__ reads self.max_length, which comes from args, so
    both branches now pass args first. TODO(review): confirm against the
    class's __init__ signature.

    NOTE(review): unlike TopLevelTokenInGoal.from_data, this never honors
    args.save_head_keywords — confirm whether that asymmetry is intended.
    """
    headTokenCounts: typing.Counter[str] = Counter()
    for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
        for hyp in hyps:
            headTokenCounts[get_symbols(
                serapi_instance.get_hyp_type(hyp))[0]] += 1
    if args.load_head_keywords and Path2(args.load_head_keywords).exists():
        result = TopLevelTokenInBestHyp(
            args, torch.load(args.load_head_keywords))
    else:
        result = TopLevelTokenInBestHyp(
            args,
            [word for word, count in
             headTokenCounts.most_common(args.num_head_keywords)])
    eprint("Hypothesis head keywords are {}".format(result.headKeywords),
           guard=args.print_keywords)
    return result
def args_token_in_goal(in_data : TacticContext, tactic : str,
                       next_in_data : TacticContext,
                       arg_values : argparse.Namespace) -> bool:
    """Return True when every tactic argument matches some symbol among
    the first `arg_values.max_length` goal symbols.

    Fixes: `any([...])` no longer materializes a throwaway list, and the
    manual loop is replaced with all(); short-circuiting is preserved.
    """
    goal_words = get_symbols(in_data.goal)[:arg_values.max_length]
    stem, rest = serapi_instance.split_tactic(tactic)
    # Drop a single trailing period, if present.
    if rest.endswith('.'):
        rest = rest[:-1]
    args = get_subexprs(rest)
    # While the arguments to an intro(s) might *look* like goal
    # arguments, they are actually fresh variables.
    if stem in ("intros", "intro") and len(args) > 0:
        return False
    return all(
        any(serapi_instance.symbol_matches(goal_word, arg)
            for goal_word in goal_words)
        for arg in args)
def from_data(init_dataset: List[TacticContext],
              args: argparse.Namespace) -> 'TopLevelTokenInGoal':
    """Build a TopLevelTokenInGoal feature, either from keywords saved
    with torch.save or by counting goal head tokens in the dataset."""
    counts: typing.Counter[str] = Counter()
    for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
        # Empty goals have no head symbol; skip them.
        if goal.strip() == "":
            continue
        counts[get_symbols(goal)[0]] += 1
    if args.load_head_keywords and Path2(args.load_head_keywords).exists():
        result = TopLevelTokenInGoal(torch.load(args.load_head_keywords))
    else:
        keywords = [word for word, _ in
                    counts.most_common(args.num_head_keywords)]
        result = TopLevelTokenInGoal(keywords)
        if args.save_head_keywords:
            torch.save(result.headKeywords, args.save_head_keywords)
    eprint("Goal head keywords are {}".format(result.headKeywords),
           guard=args.print_keywords)
    return result
def _encode_action(self, context: TacticContext, action: str) \
        -> Tuple[int, int]:
    """Encode a tactic string as a (stem index, argument index) pair.

    The argument index encodes where the tactic's argument came from:
      0                   -- no argument
      1                   -- argument found neither in premises nor goal
      token_idx + 2       -- argument names a hypothesis/lemma variable;
                             encoded by the head word of its type
      token_idx + 128 + 2 -- argument is a symbol of the goal

    NOTE(review): emap_lookup appears to be a stateful, size-bounded map
    lookup (32 stems, 128 tokens) -- confirm; if it assigns indices on
    first use, call order matters and must not be changed.
    """
    stem, argument = serapi_instance.split_tactic(action)
    stem_idx = emap_lookup(self.tactic_map, 32, stem)
    # Premises the argument could refer to: hypotheses plus relevant lemmas.
    all_premises = context.hypotheses + context.relevant_lemmas
    stripped_arg = argument.strip(".").strip()
    if stripped_arg == "":
        arg_idx = 0
    else:
        index_hyp_vars = dict(
            serapi_instance.get_indexed_vars_in_hyps(all_premises))
        if stripped_arg in index_hyp_vars:
            # Argument names a premise variable; encode it by the first
            # word of that premise's type (the text after the ':').
            hyp_varw, _, rest = all_premises[index_hyp_vars[stripped_arg]]\
                .partition(":")
            arg_idx = emap_lookup(self.token_map, 128,
                                  tokenizer.get_words(rest)[0]) + 2
        else:
            goal_symbols = tokenizer.get_symbols(context.goal)
            if stripped_arg in goal_symbols:
                # Offset by 128 to keep goal-token indices disjoint from
                # premise-token indices.
                arg_idx = emap_lookup(self.token_map, 128,
                                      stripped_arg) + 128 + 2
            else:
                arg_idx = 1
    return stem_idx, arg_idx
def score_hyp_type(goal: str, hyp_type: str, max_length: int):
    """Score a hypothesis type against a goal: string similarity scaled
    by the hypothesis's token length relative to max_length."""
    similarity = SequenceMatcher(None, goal, hyp_type).ratio()
    length_weight = len(get_symbols(hyp_type)) / max_length
    return similarity * length_weight
def __call__(self, context: TacticContext) -> List[float]:
    """Return a one-hot vector over headKeywords marking the goal's head
    token; all zeros when the head token is not a keyword.

    NOTE(review): assumes the goal tokenizes to at least one symbol.
    """
    head = get_symbols(context.goal)[0]
    encoding = [0.] * len(self.headKeywords)
    try:
        encoding[self.headKeywords.index(head)] = 1.
    except ValueError:
        pass  # unknown head token: leave the vector all zeros
    return encoding
def get_arg_from_token_idx(goal: str, idx: int) -> str:
    """Return the idx-th symbol of the goal (after stripping surrounding
    periods), or "" when idx is past the end."""
    symbols = tokenizer.get_symbols(goal.strip("."))
    return symbols[idx] if idx < len(symbols) else ""
def __call__(self, context: TacticContext) -> int:
    """Return the 1-based keyword index of the goal's head token, or 0
    when the head token is not a known keyword.

    NOTE(review): assumes the goal tokenizes to at least one symbol.
    """
    head = get_symbols(context.goal)[0]
    if head not in self.headKeywords:
        return 0
    return self.headKeywords.index(head) + 1