def tactic_substitutions(substitutions: Dict[str, str],
                         sample: ScrapedTactic) -> ScrapedTactic:
    """Swap the tactic of ``sample`` when its stem has a substitution.

    ``sample`` is unpacked into its four fields.  If ``get_stem`` of the
    tactic is a key of ``substitutions``, the mapped string becomes the new
    tactic; otherwise the tactic is kept.  The other three fields are passed
    through unchanged into a new ``ScrapedTactic``.
    """
    relevant_lemmas, prev_tactics, context, tactic = sample
    stem = get_stem(tactic)
    if stem in substitutions:
        new_tactic = substitutions[stem]
    else:
        new_tactic = tactic
    return ScrapedTactic(relevant_lemmas, prev_tactics, context, new_tactic)
def add_tactic(self, predictions: List[PredictionResult],
               correct: str) -> None:
    """Fold the graded predictions for one tactic into the running counters.

    The top prediction decides ``num_correct`` / ``num_partial`` /
    ``num_failed``; the whole list decides ``num_topN`` (any fully correct
    grade) and ``num_topNPartial`` (any correct or "okaycommand" grade).
    The stem frequency tables are updated for the actual tactic and for the
    top prediction.
    """
    fully_correct = ("goodcommand", "mostlygoodcommand")

    self.num_tactics += 1
    top_grade = predictions[0].grade
    if top_grade in fully_correct:
        self.num_correct += 1
        self.num_partial += 1
        self.correctly_predicted_frequency[get_stem(correct)] += 1
    elif top_grade == "okaycommand":
        self.num_partial += 1
    else:
        self.num_failed += 1

    # Each prediction result is unpacked as (prediction, grade, certainty).
    if any(grade in fully_correct for _, grade, _ in predictions):
        self.num_topN += 1
    if any(grade in fully_correct or grade == "okaycommand"
           for _, grade, _ in predictions):
        self.num_topNPartial += 1

    self.actual_tactic_frequency[get_stem(correct)] += 1
    top_stem = get_stem(predictions[0].prediction)
    self.predicted_tactic_frequency[top_stem] += 1
def get_tokens(args: List[str]) -> None:
    """Pick keyword tokens from a scrape file and write them one per line.

    Parses ``args`` (see the argparse declarations below), reads the scraped
    data, samples up to ``--num-samples`` entries, pairs each sample's
    focused goal with the encoded stem of its tactic, and asks
    ``get_relevant_k_keywords2`` for ``--num-keywords`` keywords.  The
    keywords are written to ``dest``, or to stdout when ``dest`` is "-".
    """
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)

    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))
    embedding = SimpleEmbedding()
    # random.sample raises ValueError when asked for more items than the
    # population holds, so small scrape files need the size clamped.
    sample_size = min(arg_values.num_samples, len(raw_data))
    subset = data.RawDataset(random.sample(raw_data, sample_size))
    relevance_pairs = [
        (context.focused_goal,
         embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, context, tactic in subset
    ]
    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)

    # nullcontext keeps stdout open after the with-block when dest is "-".
    with (open(arg_values.dest, mode='w') if arg_values.dest != "-"
          else contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
def predictKTacticsWithLoss_batch(self,
                                  in_data : List[TacticContext],
                                  k : int, corrects : List[str]) -> \
                                  Tuple[List[List[Prediction]], float]:
    """Predict up to ``k`` tactics per context and compute the batch loss.

    The loss is the criterion applied to the predicted distributions and
    the encoded stems of ``corrects`` (stems missing from the embedding are
    encoded as index 0).  ``k`` is capped at the number of embedding tokens.
    For contexts without hypotheses, stems for which
    ``serapi_instance.tacticTakesHypArgs`` is true are filtered out of the
    top-k.  Each surviving stem is passed through ``self.add_arg`` and
    paired with ``exp`` of its certainty.
    """
    assert self._embedding
    assert self.training_args
    with self._lock:
        prediction_distributions = self._predictDistributions(in_data)

    target_idxs = []
    for correct in corrects:
        stem = serapi_instance.get_stem(correct)
        if self._embedding.has_token(stem):
            target_idxs.append(self._embedding.encode_token(stem))
        else:
            target_idxs.append(0)
    output_var = maybe_cuda(Variable(LongTensor(target_idxs)))
    loss = self._criterion(prediction_distributions, output_var).item()

    k = min(k, self._embedding.num_tokens())

    def takes_no_hyp_args(certainty, idx):
        decoded = cast(Embedding, self._embedding).decode_token(idx)
        return not serapi_instance.tacticTakesHypArgs(decoded)

    results = []
    for distribution, context in zip(prediction_distributions, in_data):
        flat = distribution.view(-1)
        if len(context.hypotheses) > 0:
            certainties, stem_idxs = flat.topk(k)
        else:
            certainties, stem_idxs = topk_with_filter(flat, k,
                                                      takes_no_hyp_args)
        context_predictions = []
        for certainty, stem_idx in zip(certainties, stem_idxs):
            tactic = self.add_arg(
                self._embedding.decode_token(stem_idx.item()),
                context.goal, context.hypotheses,
                self.training_args.max_length)
            context_predictions.append(
                Prediction(tactic, math.exp(certainty.item())))
        results.append(context_predictions)
    return results, loss
def __call__(self, context: TacticContext) -> int:
    """Return the 1-based keyword index of the previous tactic's stem.

    The previous tactic is the stem of the last entry of
    ``context.prev_tactics`` when there is more than one entry, and the
    literal "Proof" otherwise.  Returns 0 when that stem is not one of
    ``self.tacticKeywords``.
    """
    if len(context.prev_tactics) > 1:
        prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
    else:
        prev_tactic = "Proof"
    if prev_tactic not in self.tacticKeywords:
        return 0
    return self.tacticKeywords.index(prev_tactic) + 1
def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int,
        correct: str) -> Tuple[List[Prediction], float]:
    """Predict up to ``k`` tactic stems for ``in_data`` and score ``correct``.

    Returns the predictions (decoded stem plus ".", with ``exp`` of the
    certainty) and the criterion loss against the stem of ``correct``.  The
    loss is 0 when that stem is not in the embedding.

    ``self.lock`` is held for the whole computation through a ``with``
    block, so it is released even when prediction or decoding raises.  ``k``
    is capped at the number of embedding tokens so that ``topk`` is never
    asked for more entries than exist.
    """
    with self.lock:
        prediction_distribution = self.predictDistribution(in_data)
        k = min(k, self.embedding.num_tokens())
        correct_stem = get_stem(correct)
        if self.embedding.has_token(correct_stem):
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor(
                        [self.embedding.encode_token(correct_stem)])))
            loss = self.criterion(prediction_distribution,
                                  output_var).item()
        else:
            loss = 0

        certainties_and_idxs = prediction_distribution.view(-1).topk(k)
        results = [
            Prediction(
                self.embedding.decode_token(stem_idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ]
    return results, loss
def predictKTacticsWithLoss(self, in_data : TacticContext, k : int,
                            correct : str) -> \
                            Tuple[List[Prediction], float]:
    """Predict up to ``k`` tactics for one context and score ``correct``.

    ``k`` is capped at the number of embedding tokens.  The loss is the
    criterion value for the stem of ``correct``, or 0 when that stem is not
    in the embedding.  When the context has no hypotheses, stems for which
    ``serapi_instance.tacticTakesHypArgs`` is true are filtered out of the
    top-k.  Each stem is passed through ``self.add_arg`` and paired with
    ``exp`` of its certainty.
    """
    assert self.training_args
    assert self._embedding
    with self._lock:
        prediction_distribution = self._predictDistributions([in_data])[0]
    k = min(k, self._embedding.num_tokens())

    loss = 0
    correct_stem = serapi_instance.get_stem(correct)
    if self._embedding.has_token(correct_stem):
        target = LongTensor([self._embedding.encode_token(correct_stem)])
        output_var = maybe_cuda(Variable(target))
        loss = self._criterion(prediction_distribution.view(1, -1),
                               output_var).item()

    flat_distribution = prediction_distribution.view(-1)
    if len(in_data.hypotheses) == 0:
        def takes_no_hyp_args(certainty, idx):
            decoded = cast(Embedding, self._embedding).decode_token(idx)
            return not serapi_instance.tacticTakesHypArgs(decoded)

        certainties, idxs = topk_with_filter(flat_distribution, k,
                                             takes_no_hyp_args)
    else:
        certainties, idxs = flat_distribution.topk(k)

    results = []
    for certainty, stem_idx in zip(certainties, idxs):
        tactic = self.add_arg(self._embedding.decode_token(stem_idx.item()),
                              in_data.goal, in_data.hypotheses,
                              self.training_args.max_length)
        results.append(Prediction(tactic, math.exp(certainty.item())))
    return results, loss
def __call__(self, context: TacticContext) -> List[float]:
    """Return a one-hot vector over ``self.tacticKeywords``.

    The hot position is that of the previous tactic's stem: the stem of the
    last entry of ``context.prev_tactics`` when there is more than one
    entry, "Proof" otherwise.  The vector is all zeros when the stem is not
    a keyword.
    """
    if len(context.prev_tactics) > 1:
        prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
    else:
        prev_tactic = "Proof"
    one_hot = [0. for _ in range(len(self.tacticKeywords))]
    if prev_tactic in self.tacticKeywords:
        hot_position = self.tacticKeywords.index(prev_tactic)
        one_hot[hot_position] = 1.
    return one_hot
def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int,
        correct: str) -> Tuple[List[Prediction], float]:
    """Predict up to ``k`` tactic stems for ``in_data`` and score ``correct``.

    The loss is the criterion value for the stem of ``correct``, or 0 when
    the stem is not in the embedding.  ``k`` is capped at the number of
    embedding tokens.  Each prediction is the decoded stem followed by ".",
    paired with ``exp`` of its certainty.
    """
    with self._lock:
        distribution = self.predictDistribution(in_data)

    loss = 0
    stem = get_stem(correct)
    if self._embedding.has_token(stem):
        target = torch.LongTensor([self._embedding.encode_token(stem)])
        output_var = maybe_cuda(Variable(target))
        loss = self._criterion(distribution, output_var).item()

    k = min(k, self._embedding.num_tokens())
    certainties, idxs = distribution.squeeze().topk(k)
    predictions = []
    for certainty, idx in zip(certainties, idxs):
        tactic = self._embedding.decode_token(idx.item()) + "."
        predictions.append(Prediction(tactic, math.exp(certainty.item())))
    return predictions, loss
def predictKTacticsWithLoss_batch(self,
                                  in_data: List[TacticContext],
                                  k: int, corrects: List[str]) -> \
                                  Tuple[List[List[Prediction]], float]:
    """Predict up to ``k`` tactic stems per context and compute the loss.

    Each goal is encoded with ``encode_ngram_classify_input`` and run
    through the model and log-softmax.  The loss is the criterion applied
    to the distributions and the encoded stems of ``corrects`` (stems
    missing from the embedding are encoded as index 0).  ``k`` is capped at
    the number of embedding tokens.

    An empty ``in_data`` returns ``([], 0.0)`` without touching the model,
    so callers do not have to special-case empty batches.
    """
    assert self.training_args
    if len(in_data) == 0:
        return [], 0.0
    with self._lock:
        input_tensor = Variable(
            FloatTensor([
                encode_ngram_classify_input(in_data_point.goal,
                                            self.training_args.num_grams,
                                            self._tokenizer)
                for in_data_point in in_data
            ]))
        prediction_distributions = self._lsoftmax(
            self._model(input_tensor))
    correct_stems = [get_stem(correct) for correct in corrects]
    output_var = maybe_cuda(
        Variable(
            torch.LongTensor([
                self._embedding.encode_token(correct_stem)
                if self._embedding.has_token(correct_stem) else 0
                for correct_stem in correct_stems
            ])))
    loss = self._criterion(prediction_distributions, output_var).item()
    k = min(k, self._embedding.num_tokens())
    certainties_and_idxs_list = [
        single_distribution.view(-1).topk(k)
        for single_distribution in prediction_distributions
    ]
    results = [[
        Prediction(
            self._embedding.decode_token(stem_idx.item()) + ".",
            math.exp(certainty.item()))
        for certainty, stem_idx in zip(*certainties_and_idxs)
    ] for certainties_and_idxs in certainties_and_idxs_list]
    return results, loss
def predictKTacticsWithLoss_batch(self,
                                  in_data : List[TacticContext],
                                  k : int, corrects : List[str]) -> \
                                  Tuple[List[List[Prediction]], float]:
    """Predict up to ``k`` tactic stems per context and compute the loss.

    Returns ``([], 0)`` for an empty batch.  Otherwise each goal is
    tokenized, converted with ``inputFromSentence`` and run through the
    model.  The loss is the criterion applied to the distributions and the
    encoded stems of ``corrects`` (stems missing from the embedding are
    encoded as index 0).  ``k`` is capped at the number of embedding tokens.
    """
    assert self.training_args
    if len(in_data) == 0:
        return [], 0
    with self._lock:
        # Each context is unpacked as a 4-tuple whose last field is the goal.
        sentences = []
        for _lemmas, _prev_tactics, _hypotheses, goal in in_data:
            tokens = self._tokenizer.toTokenList(goal)
            sentences.append(tokens)
        input_tensor = LongTensor([
            inputFromSentence(sentence, self.training_args.max_length)
            for sentence in sentences
        ])
        prediction_distributions = self._model.run(
            input_tensor, batch_size=len(in_data))

    target_idxs = []
    for correct in corrects:
        stem = get_stem(correct)
        if self._embedding.has_token(stem):
            target_idxs.append(self._embedding.encode_token(stem))
        else:
            target_idxs.append(0)
    output_var = maybe_cuda(Variable(torch.LongTensor(target_idxs)))
    loss = self._criterion(prediction_distributions, output_var).item()

    k = min(k, self._embedding.num_tokens())
    results = []
    for distribution in list(prediction_distributions):
        certainties, stem_idxs = distribution.view(-1).topk(k)
        results.append([
            Prediction(self._embedding.decode_token(stem_idx.item()) + ".",
                       math.exp(certainty.item()))
            for certainty, stem_idx in zip(certainties, stem_idxs)
        ])
    return results, loss
def encode_hyparg_data(data : RawDataset,
                       tokenizer_type : Callable[[List[str], int], Tokenizer],
                       num_keywords : int,
                       num_reserved_tokens : int,
                       max_args : int,
                       max_hyps : int,
                       encoded_length : int,
                       entropy_data_size : int,
                       num_threads : Optional[int] = None) -> \
                       Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    """Encode raw samples into (hyps, context, tactic-structure) triples.

    A keyword tokenizer is built from at most ``entropy_data_size`` randomly
    chosen samples, then every sample is encoded in a process pool of
    ``num_threads`` workers.  Returns the encoded dataset, the frozen
    tokenizer and the stem embedding that was filled while encoding.
    """
    stem_embedding = SimpleEmbedding()
    data_list = list(data)
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context,
          stem_embedding.encode_token(serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic
         in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
    termEncoder = functools.partial(getNGramTokenbagVector, 1,
                                    tokenizer.numTokens())
    with multiprocessing.Pool(num_threads) as pool:
        # Samples are 5-tuples (see the unpacking above), so transposing
        # yields five columns; only the last three are needed here.
        # Unpacking the transposed data into three names raised ValueError.
        if data_list:
            _, _, hyps, contexts, tactics = zip(*data_list)
        else:
            hyps, contexts, tactics = (), (), ()
        encoded_contexts = pool.imap(functools.partial(
            _encode, tokenizer, termEncoder), contexts)
        # NOTE(review): the hypothesis encoder is mapped over `contexts`,
        # not `hyps`, exactly as in the original code - confirm this is
        # what `_encode_hyps` expects.
        encoded_hyps = pool.imap(functools.partial(
            _encode_hyps, tokenizer, termEncoder, max_hyps,
            encoded_length), contexts)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding,
                              max_args),
            zip(hyps, tactics))
        # imap is lazy: the results must be drained while the pool is alive.
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding
def _get_prev(self, in_data: TacticContext) -> int:
    """Return the embedding index for the previous tactic's stem.

    The stem comes from the last entry of ``in_data.prev_tactics`` when
    there is more than one entry, and is "Proof" otherwise.  Stems missing
    from the embedding fall back to the index of "eauto".
    """
    if len(in_data.prev_tactics) > 1:
        stem = get_stem(in_data.prev_tactics[-1])
    else:
        stem = "Proof"
    if not self._embedding.has_token(stem):
        return self._embedding.encode_token("eauto")
    return self._embedding.encode_token(stem)
def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int,
        correct: str) -> Tuple[List[Prediction], float]:
    """Predict up to ``k`` tactic stems for ``in_data`` and score ``correct``.

    The loss is the criterion value for the stem of ``correct``, or 0 when
    the stem is not in the embedding.  ``k`` is capped at the number of
    embedding tokens.  Each prediction is the decoded stem followed by ".",
    paired with ``exp`` of its certainty.
    """
    with self._lock:
        prediction_distribution = self._predictDistributions([in_data])[0]

    loss = 0
    correct_stem = get_stem(correct)
    if self._embedding.has_token(correct_stem):
        target = torch.LongTensor(
            [self._embedding.encode_token(correct_stem)])
        output_var = maybe_cuda(Variable(target))
        loss = self._criterion(prediction_distribution.view(1, -1),
                               output_var).item()

    k = min(k, self._embedding.num_tokens())
    certainties, stem_idxs = prediction_distribution.view(-1).topk(k)
    results = []
    for certainty, stem_idx in zip(certainties, stem_idxs):
        tactic = self._embedding.decode_token(stem_idx.item()) + "."
        results.append(Prediction(tactic, math.exp(certainty.item())))
    return results, loss
def grade_command_result(self, initial_context: str, predicted: str,
                         predicted_context: str, actual: str,
                         actual_context: str,
                         exception: Optional[Exception]) -> str:
    """Grade a predicted command against the command actually used.

    Checks are applied in order and the first match wins:

    * "goodcommand": the commands are equal after stripping whitespace.
    * "okaycommand": the commands have the same stem.
    * "superfailedcommand": the prediction raised a ParseError or LexError.
    * "failedcommand": the prediction raised any other exception.
    * "mostlygoodcommand": the prediction produced the same context as the
      actual command.
    * "uselesscommand": the prediction left the initial context unchanged.
    * "badcommand": anything else.
    """
    if actual.strip() == predicted.strip():
        return "goodcommand"
    if get_stem(actual) == get_stem(predicted):
        return "okaycommand"
    # isinstance also covers subclasses of the two parser errors, and
    # `is not None` avoids relying on the exception's __eq__.
    if isinstance(exception, (ParseError, LexError)):
        return "superfailedcommand"
    if exception is not None:
        return "failedcommand"
    if predicted_context == actual_context:
        return "mostlygoodcommand"
    if predicted_context == initial_context:
        return "uselesscommand"
    return "badcommand"
def _encode_tokenized_data(self, data : TokenizedDataset,
                           arg_values : Namespace,
                           tokenizer : Tokenizer,
                           embedding : Embedding) \
    -> PECDataset:
    """Turn tokenized samples into ``PECSample``s keyed by previous tactic.

    For each ``(prev_tactics, goal, tactic)`` triple the previous tactic is
    the stem of the last entry of ``prev_tactics`` when there is more than
    one entry, and "Proof" otherwise; it is encoded with ``embedding``.
    ``arg_values`` and ``tokenizer`` are not used here.
    """
    samples = []
    for prev_tactics, goal, tactic in data:
        if len(prev_tactics) > 1:
            prev_stem = get_stem(prev_tactics[-1])
        else:
            prev_stem = "Proof"
        samples.append(
            PECSample(embedding.encode_token(prev_stem), goal, tactic))
    return PECDataset(samples)
def grade_prediction(correct_inter: ScrapedTactic, prediction: str):
    """Grade ``prediction`` against the tactic recorded in ``correct_inter``.

    Returns "goodcommand" when the tactics match after stripping, either
    verbatim or after ``serapi_instance.normalizeNumericArgs``;
    "okaycommand" when only the stems match; "mostlygoodcommand" when
    ``proper_subs`` maps the correct tactic to the prediction; and
    "badcommand" otherwise.
    """
    correct_tactic = correct_inter.tactic
    correct_tactic_normalized = \
        serapi_instance.normalizeNumericArgs(correct_inter).tactic
    # Normalizing works on whole samples, so wrap the prediction in a copy
    # of the correct sample before normalizing it.
    prediction_sample = ScrapedTactic(correct_inter.relevant_lemmas,
                                      correct_inter.prev_tactics,
                                      correct_inter.context,
                                      prediction)
    prediction_normalized = \
        serapi_instance.normalizeNumericArgs(prediction_sample).tactic

    same_verbatim = correct_tactic.strip() == prediction.strip()
    if same_verbatim or \
       correct_tactic_normalized.strip() == prediction_normalized.strip():
        return "goodcommand"
    if get_stem(correct_tactic).strip() == get_stem(prediction).strip():
        return "okaycommand"
    if (correct_tactic.strip() in proper_subs and
            proper_subs[correct_tactic.strip()] == prediction.strip()):
        return "mostlygoodcommand"
    return "badcommand"
def embed_data(data : RawDataset, embedding : Optional[Embedding] = None) \
    -> Tuple[Embedding, StrictEmbeddedDataset]:
    """Replace each sample's tactic with the embedding index of its stem.

    A fresh ``SimpleEmbedding`` is created when ``embedding`` is falsy.
    Progress and elapsed time are printed to stdout.  Returns the embedding
    that was used together with the embedded dataset.
    """
    if not embedding:
        embedding = SimpleEmbedding()
    started_at = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    samples = []
    for relevant_lemmas, prev_tactics, hypotheses, goal, tactic in data:
        tactic_idx = embedding.encode_token(get_stem(tactic))
        samples.append(EmbeddedSample(relevant_lemmas, prev_tactics,
                                      hypotheses, goal, tactic_idx))
    dataset = StrictEmbeddedDataset(samples)
    elapsed = time.time() - started_at
    print("{:.2f}s".format(elapsed))
    return embedding, dataset
def add_command_result(self, predictions: List[str], grades: List[str],
                       actual: str, loss: float) -> None:
    """Fold the graded predictions for one command into the file statistics.

    The top prediction drives ``num_correct`` / ``num_partial`` /
    ``num_failed``.  The first correct-or-okay grade in the list drives
    ``num_topN`` / ``num_topNPartial``.  ``num_searched`` is bumped when the
    first grade that is not a failure or "uselesscommand" is a correct one.
    """
    correct_grades = ("goodcommand", "mostlygoodcommand")
    failed_grades = ("failedcommand", "superfailedcommand")

    add_to_freq_table(self.actual_tactic_frequency, get_stem(actual))
    add_to_freq_table(self.predicted_tactic_frequency,
                      get_stem(predictions[0]))

    self.total_loss += loss
    self.num_tactics += 1

    top_grade = grades[0]
    if top_grade in correct_grades:
        add_to_freq_table(self.correctly_predicted_frequency,
                          get_stem(predictions[0]))
        self.num_correct += 1
        self.num_partial += 1
    elif top_grade == "okaycommand":
        self.num_partial += 1
    elif top_grade in failed_grades:
        self.num_failed += 1

    # Only the first correct-or-okay grade matters for the top-N counters.
    first_hit = next((grade for grade in grades
                      if grade in correct_grades or grade == "okaycommand"),
                     None)
    if first_hit in correct_grades:
        self.num_topN += 1
        self.num_topNPartial += 1
    elif first_hit == "okaycommand":
        self.num_topNPartial += 1

    # Skip over predictions that failed or did nothing; the first remaining
    # one decides whether this command counts as "searched".
    skipped = failed_grades + ("uselesscommand",)
    first_effective = next((grade for grade in grades
                            if grade not in skipped), None)
    if first_effective in correct_grades:
        self.num_searched += 1
def __init__(self, init_dataset: List[TacticContext],
             args: argparse.Namespace) -> None:
    """Choose the tactic keywords for this feature.

    Stems of the last previous tactic are counted over all contexts with
    more than two previous tactics.  When ``args.load_tactic_keywords``
    names an existing path the keywords are loaded from it with
    ``torch.load``; otherwise they are "Proof" followed by the
    ``args.num_tactic_keywords`` most common stems.
    """
    prevTacticsCounts: typing.Counter[str] = Counter(
        serapi_instance.get_stem(prev_tactics[-1])
        for relevant_lemmas, prev_tactics, hyps, goal in init_dataset
        if len(prev_tactics) > 2)
    keywords_path = args.load_tactic_keywords
    if keywords_path and Path2(keywords_path).exists():
        self.tacticKeywords = torch.load(keywords_path)
    else:
        most_common = prevTacticsCounts.most_common(
            args.num_tactic_keywords)
        self.tacticKeywords = ["Proof"] + \
            [word for word, count in most_common]
    eprint("Tactic keywords are {}".format(self.tacticKeywords),
           guard=args.print_keywords)
def _features(self, context: TacticContext, certainty: float) \
        -> Tuple[List[int], List[float]]:
    """Build the categorical and numeric features for one context.

    Returns ``([prev_tactic_index, goal_head_index], [goal_length,
    num_hyps, certainty])``.  Both indices default to 0: the first when
    there is at most one previous tactic, the second when the goal is the
    empty string.  Goal length is capped at 100 words and the hypothesis
    count at 30, each scaled to at most 1.
    """
    prev_tactic_index = 0
    if len(context.prev_tactics) > 1:
        prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
        prev_tactic_index = emap_lookup(self.tactic_map, 32, prev_tactic)

    goal_head_index = 0
    if context.goal != "":
        goal_head = tokenizer.get_words(context.goal)[0]
        goal_head_index = emap_lookup(self.token_map, 128, goal_head)

    goal_word_count = len(tokenizer.get_words(context.goal))
    goal_length_feature = min(goal_word_count, 100) / 100
    num_hyps_feature = min(len(context.hypotheses), 30) / 30

    word_features = [prev_tactic_index, goal_head_index]
    vec_features = [goal_length_feature, num_hyps_feature, certainty]
    return word_features, vec_features
def from_data(init_dataset: List[TacticContext],
              args: argparse.Namespace) -> 'PrevTactic':
    """Build a ``PrevTactic`` feature from a dataset or a saved keyword file.

    Stems of the last previous tactic are counted over all contexts with
    more than two previous tactics.  When ``args.load_tactic_keywords``
    names an existing path the keywords are loaded from it with
    ``torch.load``; otherwise they are "Proof" followed by the
    ``args.num_tactic_keywords`` most common stems.
    """
    prevTacticsCounts: typing.Counter[str] = Counter(
        serapi_instance.get_stem(prev_tactics[-1])
        for relevant_lemmas, prev_tactics, hyps, goal in init_dataset
        if len(prev_tactics) > 2)
    keywords_path = args.load_tactic_keywords
    if keywords_path and Path2(keywords_path).exists():
        keywords = torch.load(keywords_path)
    else:
        most_common = prevTacticsCounts.most_common(
            args.num_tactic_keywords)
        keywords = ["Proof"] + [word for word, count in most_common]
    result = PrevTactic(keywords)
    eprint("Tactic keywords are {}".format(result.tacticKeywords),
           guard=args.print_keywords)
    return result
def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int,
        correct: str) -> Tuple[List[Prediction], float]:
    """Predict ``k`` tactic stems for ``in_data`` and score ``correct``.

    The loss is the criterion value for the stem of ``correct``, or
    positive infinity when the stem is not in the embedding.  Each
    prediction is the decoded stem followed by ".", paired with ``exp`` of
    its certainty.
    """
    distribution = self.predictDistribution(in_data)
    correct_stem = get_stem(correct)
    if not self.embedding.has_token(correct_stem):
        loss = float("+inf")
    else:
        scores = torch.FloatTensor(distribution).view(1, -1)
        target = Variable(
            torch.LongTensor([self.embedding.encode_token(correct_stem)]))
        loss = self.criterion(scores, target).item()

    indices, probabilities = list_topk(list(distribution), k)
    predictions = []
    for certainty, idx in zip(probabilities, indices):
        tactic = self.embedding.decode_token(idx) + "."
        predictions.append(Prediction(tactic, math.exp(certainty)))
    return predictions, loss
def predictKTacticsWithLoss(self, in_data : TacticContext, k : int,
                            correct : str) -> \
                            Tuple[List[Prediction], float]:
    """Predict ``k`` tactics for ``in_data`` and score ``correct``.

    The loss is the criterion value for the stem of ``correct``, or
    positive infinity when the stem is not in the embedding.  Stems for
    which ``serapi_instance.tacticTakesHypArgs`` is true get the predicted
    hypothesis variable appended as their argument; all others are emitted
    bare.  Every prediction ends with "." and carries ``exp`` of its
    certainty.
    """
    with self._lock:
        distribution, hyp_var = self._predictDistribution(in_data)

    correct_stem = serapi_instance.get_stem(correct)
    if not self._embedding.has_token(correct_stem):
        loss = float("+inf")
    else:
        target = Variable(
            LongTensor([self._embedding.encode_token(correct_stem)]))
        loss = self._criterion(distribution.view(1, -1), target).item()

    indices, probabilities = list_topk(list(distribution), k)
    predictions : List[Prediction] = []
    for certainty, idx in zip(probabilities, indices):
        stem = self._embedding.decode_token(idx)
        if serapi_instance.tacticTakesHypArgs(stem):
            tactic = stem + " " + hyp_var + "."
        else:
            tactic = stem + "."
        predictions.append(Prediction(tactic, math.exp(certainty)))
    return predictions, loss
def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int,
        correct: str) -> Tuple[List[Prediction], float]:
    """Predict up to ``k`` tactic stems for ``in_data`` and score ``correct``.

    Returns the predictions (decoded stem plus ".", with ``exp`` of the
    certainty) and the criterion loss against the stem of ``correct``.  The
    loss is 0 when that stem is not in the embedding.

    ``self.lock`` is held for the whole computation through a ``with``
    block, so it is released even when prediction or decoding raises.  ``k``
    is capped at the number of embedding tokens so that ``topk`` is never
    asked for more entries than exist.
    """
    with self.lock:
        distribution = self.predictDistribution(in_data)
        k = min(k, self.embedding.num_tokens())
        stem = get_stem(correct)
        if self.embedding.has_token(stem):
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor([self.embedding.encode_token(stem)])))
            loss = self.criterion(distribution.view(1, -1),
                                  output_var).item()
        else:
            loss = 0

        certainties, idxs = distribution.squeeze().topk(k)
        predictions_and_certainties = [
            Prediction(self.embedding.decode_token(idx.item()) + ".",
                       math.exp(certainty.item()))
            for certainty, idx in zip(certainties, idxs)
        ]
    return predictions_and_certainties, loss
def encode_seq_classify_data(data : RawDataset,
                             tokenizer_type : Callable[[List[str], int],
                                                       Tokenizer],
                             num_keywords : int,
                             num_reserved_tokens : int,
                             save_tokens : Optional[str] = None,
                             load_tokens : Optional[str] = None,
                             num_relevance_samples : int = 1000) \
    -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    """Encode raw samples into (goal, tactic index) pairs.

    The tokenizer is loaded from ``load_tokens`` when given; otherwise it
    is built from up to ``num_relevance_samples`` randomly chosen samples.
    It is saved to ``save_tokens`` when given.  The data is then encoded in
    chunks of 1024 by a process pool.  Returns the encoded dataset, the
    frozen tokenizer and the tactic embedding.
    """
    embedding = SimpleEmbedding()
    # random.sample raises ValueError when asked for more items than the
    # population holds, so datasets smaller than the requested sample size
    # are used whole.
    sample_size = min(num_relevance_samples, len(data))
    subset = RawDataset(random.sample(data, sample_size))
    if load_tokens:
        print("Loading tokens from {}".format(load_tokens))
        # NOTE(review): torch.load unpickles the file; only point
        # load_tokens at trusted files.
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        # NOTE(review): samples are unpacked as 4-tuples here - confirm
        # this matches the RawDataset layout used by this module.
        tokenizer = make_keyword_tokenizer_relevance(
            [(context, embedding.encode_token(get_stem(tactic)))
             for prev_tactics, hyps, context, tactic in subset],
            tokenizer_type, num_keywords, num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)
    with multiprocessing.Pool(None) as pool:
        # imap is lazy: the results must be drained while the pool is alive.
        result = [(goal, embedding.encode_token(tactic))
                  for goal, tactic in chain.from_iterable(
                      pool.imap(
                          functools.partial(
                              encode_seq_classify_data_worker__, tokenizer),
                          chunks(data, 1024)))]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding
def predictKTacticsWithLoss(
        prediction_distribution: torch.FloatTensor, embedding: Embedding,
        k: int, correct: str,
        criterion: nn.Module) -> Tuple[List[Prediction], float]:
    """Turn a prediction distribution into the top ``k`` stems and a loss.

    ``k`` is capped at the number of embedding tokens.  The loss is
    ``criterion`` evaluated for the stem of ``correct``, or 0 when the stem
    is not in ``embedding``.  Each prediction is the decoded stem followed
    by ".", paired with ``exp`` of its certainty.
    """
    k = min(k, embedding.num_tokens())

    loss = 0
    correct_stem = get_stem(correct)
    if embedding.has_token(correct_stem):
        target = torch.LongTensor([embedding.encode_token(correct_stem)])
        output_var = maybe_cuda(Variable(target))
        loss = criterion(prediction_distribution.view(1, -1),
                         output_var).item()

    certainties, stem_idxs = prediction_distribution.view(-1).topk(k)
    results = []
    for certainty, stem_idx in zip(certainties, stem_idxs):
        tactic = embedding.decode_token(stem_idx.item()) + "."
        results.append(Prediction(tactic, math.exp(certainty.item())))
    return results, loss
def get_data(args: List[str]) -> None:
    """Read a scrape file and print it to stdout in the requested format.

    ``args`` is the command line (without the program name).  The
    positional ``format`` selects the output:

    * "terms": tokenized terms, optionally normalized to ``--max-length``.
    * "goals", "hyps+goal", "hyps+goal+tactic": plain text dumps.
    * "tacvector": one CSV row per sample with word features, vector
      features and the encoded tactic stem.
    * "scrapefile-rd", "scrapefile": one JSON object per sample.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal",
                            "hyps+goal+tactic", "tacvector",
                            "scrapefile-rd", "scrapefile"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=30,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    # -j/--num-threads must be registered exactly once: argparse raises a
    # conflicting-option error when the same option string is added twice.
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions",
                        action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args",
                        action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)

    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            # Everything from the first EOS token onwards is dropped.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
        return

    dataset = data.get_text_data(arg_values)
    if arg_values.sort:
        dataset = data.RawDataset(
            sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))

    if arg_values.format == "goals":
        for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [
            strip_scraped_output(scraped) for scraped in dataset
        ]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data,
                                     arg_values)  # type: ignore
            for word_feature_constructor in
            features.word_feature_constructors
        ]
        vec_features_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in
            features.vec_feature_constructors
        ]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[
            feature(c) for feature in word_feature_functions
        ] for c in stripped_data]
        vec_features = [[
            feature_val for feature in vec_features_functions
            for feature_val in feature(c)
        ] for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(
                list(map(str, word_feat)) + list(map(str, vec_feat)) +
                [str(tactic)]))
    elif arg_values.format == "scrapefile-rd":
        for point in dataset:
            print(
                json.dumps({
                    "relevant_lemmas": point.relevant_lemmas,
                    "prev_tactics": point.prev_tactics,
                    "context": {
                        "fg_goals": [{
                            "hypotheses": point.hypotheses,
                            "goal": point.goal
                        }],
                        "bg_goals": [],
                        "shelved_goals": [],
                        "given_up_goals": []
                    },
                    "tactic": point.tactic
                }))
    elif arg_values.format == "scrapefile":
        for point in dataset:
            print(
                json.dumps({
                    "relevant_lemmas": point.relevant_lemmas,
                    "prev_tactics": point.prev_tactics,
                    "prev_hyps": point.hypotheses,
                    "prev_goal": point.goal,
                    "tactic": point.tactic
                }))
def process_file(self, args : argparse.Namespace, file_idx : int,
                 filename : str) -> None:
    """Run the predictor over every proof command of one file and report.

    Each command of the file is run through Coq.  For commands inside a
    proof, ``num_predictions`` predictions are produced (from ``net``, or
    the fixed baseline tactic when ``self.baseline`` is set), each one is
    tried and graded, and the grades are recorded in a ``FileResult``.  The
    results are then written as CSV and as an HTML details page in
    ``self.output_dir``, and the file result is added to the global result
    and pushed onto the ``rows`` queue.
    """
    global gresult
    fresult = FileResult(filename)

    if self.debug:
        print("Preprocessing...")
    commands = self.get_commands(args, file_idx, filename)

    command_results: List[CommandResult] = []

    with serapi_instance.SerapiContext(self.coqargs, self.includes,
                                       self.prelude) as coq:
        coq.debug = self.debug
        for command in commands:
            in_proof = (coq.proof_context
                        and not re.match(".*Proof.*", command.strip()))
            # Commands starting with a brace are run but never predicted.
            if re.match("[{}]", command):
                coq.run_stmt(command)
                continue
            if in_proof:
                prev_tactics = coq.prev_tactics
                initial_context = coq.proof_context
                assert initial_context
                hyps = coq.hypotheses
                goals = coq.goals
                relevant_lemmas = coq.local_lemmas
                if self.baseline:
                    # One (tactic, certainty) pair per prediction slot.  The
                    # original assigned a misspelled name, built a flat
                    # list, and left `loss` unset, so baseline runs crashed.
                    predictions_and_certainties = \
                        [(baseline_tactic + ".", 1)] * num_predictions
                    loss = 0.
                else:
                    predictions_and_certainties, loss = \
                        net.predictKTacticsWithLoss(
                            TacticContext(relevant_lemmas, prev_tactics,
                                          hyps, goals),
                            num_predictions, command)

                prediction_runs = [
                    run_prediction(coq, prediction)
                    for prediction, certainty in predictions_and_certainties
                ]

                try:
                    coq.run_stmt(command)
                    actual_result_context = coq.proof_context
                    actual_result_goal = coq.goals
                    actual_result_hypotheses = coq.hypotheses
                    actual_result_lemmas = coq.local_lemmas
                    assert isinstance(actual_result_context, str)
                except (AckError, CompletedError, CoqExn, BadResponse,
                        ParseError, LexError, TimeoutError):
                    print("In file {}:".format(filename))
                    raise

                prediction_results = [
                    (prediction,
                     evaluate_prediction(fresult, initial_context, command,
                                         actual_result_context,
                                         prediction_run),
                     certainty)
                    for prediction_run, (prediction, certainty) in zip(
                        prediction_runs, predictions_and_certainties)
                ]
                assert net.training_args
                if self.cfilter(
                        TacticContext(relevant_lemmas, prev_tactics, hyps,
                                      goals),
                        command,
                        TacticContext(actual_result_lemmas,
                                      prev_tactics + [command],
                                      actual_result_hypotheses,
                                      actual_result_goal),
                        net.training_args):
                    fresult.add_command_result(
                        [pred for pred, ctxt, ex in prediction_runs],
                        [grade for pred, grade, certainty
                         in prediction_results],
                        command, loss)
                    command_results.append(
                        (command, hyps, goals, prediction_results))
                else:
                    # Filtered-out commands are reported as plain commands.
                    command_results.append((command, ))
            else:
                try:
                    coq.run_stmt(command)
                except (AckError, CompletedError, CoqExn, BadResponse,
                        ParseError, LexError, TimeoutError):
                    print("In file {}:".format(filename))
                    raise
                command_results.append((command, ))

    write_csv(fresult.details_filename(), self.output_dir, gresult.options,
              command_results)

    doc, tag, text, line = Doc().ttl()

    with tag('html'):
        details_header(tag, doc, text, filename)
        with tag('div', id='overlay', onclick='event.stopPropagation();'):
            with tag('div', id='predicted'):
                pass
            with tag('div', id='context'):
                pass
            with tag('div', id='stats'):
                pass
        with tag('body', onclick='deselectTactic()',
                 onload='setSelectedIdx()'), tag('pre'):
            for idx, command_result in enumerate(command_results):
                if len(command_result) == 1:
                    with tag('code', klass='plaincommand'):
                        text(command_result[0])
                    continue
                command, hyps, goal, prediction_results = \
                    cast(TacticResult, command_result)
                if len(prediction_results) > 0:
                    predictions, grades, certainties = zip(
                        *prediction_results)
                else:
                    # zip(*[]) yields nothing to unpack.
                    predictions, grades, certainties = (), (), ()
                # Index of the first prediction that did not fail outright;
                # stays 0 when every prediction failed.
                search_index = 0
                for pidx, prediction_result in enumerate(
                        prediction_results):
                    prediction, grade, certainty = prediction_result
                    if (grade != "failedcommand"
                            and grade != "superfailedcommand"):
                        search_index = pidx
                        break
                with tag(
                        'span',
                        ('data-hyps', "\n".join(hyps)),
                        ('data-goal', shorten_whitespace(goal)),
                        ('data-num-total', str(fresult.num_tactics)),
                        ('data-predictions',
                         to_list_string(cast(List[str], predictions))),
                        ('data-num-predicteds',
                         to_list_string([
                             fresult.predicted_tactic_frequency.get(
                                 get_stem(prediction), 0)
                             for prediction in cast(List[str], predictions)
                         ])),
                        ('data-num-corrects',
                         to_list_string([
                             fresult.correctly_predicted_frequency.get(
                                 get_stem(prediction), 0)
                             for prediction in cast(List[str], predictions)
                         ])),
                        ('data-certainties',
                         to_list_string(cast(List[float], certainties))),
                        ('data-num-actual-corrects',
                         fresult.correctly_predicted_frequency.get(
                             get_stem(command), 0)),
                        ('data-num-actual-in-file',
                         fresult.actual_tactic_frequency.get(
                             get_stem(command), 0)),
                        ('data-actual-tactic', strip_comments(command)),
                        ('data-grades',
                         to_list_string(cast(List[str], grades))),
                        ('data-search-idx', search_index),
                        id='command-' + str(idx),
                        onmouseover='hoverTactic({})'.format(idx),
                        onmouseout='unhoverTactic()',
                        onclick='selectTactic({}); event.stopPropagation();'
                        .format(idx)):
                    doc.stag("br")
                    # A separate index name keeps the command index `idx`
                    # from being shadowed.
                    for pidx, prediction_result in enumerate(
                            prediction_results):
                        prediction, grade, certainty = prediction_result
                        if search_index == pidx:
                            with tag('code', klass=grade):
                                text(" " + command.strip())
                        else:
                            with tag('span', klass=grade):
                                doc.asis(" ⬤")

    with open(
            "{}/{}.html".format(self.output_dir,
                                fresult.details_filename()),
            "w") as fout:
        fout.write(doc.getvalue())

    gresult.add_file_result(fresult)
    rows.put(fresult)
def write_html(output_dir: Path2, filename: Path2,
               command_results: List[CommandResult],
               stats: 'ResultStats') -> None:
    """Render the detailed HTML report for one file into ``output_dir``.

    The command results are split into regions.  A region whose second
    entry is a plain one-element result is written as highlighted plain
    commands; any other region becomes a collapsible button followed by one
    ``span`` per tactic carrying the prediction data as ``data-*``
    attributes.  The page is written to the escaped ``filename`` with an
    ".html" suffix.
    """
    doc, tag, text, line = Doc().ttl()

    def details_header(tag: Any, doc: Doc, text: Text,
                       filename: Path2) -> None:
        title = "Proverbot Detailed Report for {}".format(filename)
        header(tag, doc, text, details_css, details_javascript, title)

    def write_highlighted(vernac: str) -> None:
        # Colored pieces get their own styled span; the rest is plain text.
        for piece in syntax_highlight(vernac):
            if not isinstance(piece, ColoredString):
                text(piece)
                continue
            with tag('span', style=f'color:{piece.color}'):
                text(piece.contents)

    def write_plain_region(region) -> None:
        for command_result in region:
            assert isinstance(command_result[0], str)
            with tag('code', klass='plaincommand'):
                write_highlighted(command_result[0])

    def write_region_button(region_idx: int, region) -> None:
        with tag('button', klass='collapsible',
                 id='collapsible-{}'.format(region_idx)):
            with tag('code', klass='buttontext'):
                assert isinstance(region[0][0], str), region
                write_highlighted(region[0][0].strip("\n"))
            num_unfiltered = count_region_unfiltered(region)
            fill_class = 'nonempty' if num_unfiltered > 3 else 'empty'
            with tag('code', klass='numtacs ' + fill_class):
                text(num_unfiltered)

    def write_tactic(region_idx: int, cmd_idx: int, command_result) -> None:
        command, hyps, goal, prediction_results = \
            cast(TacticResult, command_result)
        predictions: List[str]
        grades: List[str]
        certainties: List[float]
        if len(prediction_results) > 0:
            predictions, grades, certainties = zip(
                *prediction_results)  # type: ignore
        else:
            predictions, grades, certainties = [], [], []

        predicted_counts = [
            stats.predicted_tactic_frequency.get(get_stem(prediction), 0)
            for prediction in cast(List[str], predictions)
        ]
        correct_counts = [
            stats.correctly_predicted_frequency.get(get_stem(prediction), 0)
            for prediction in cast(List[str], predictions)
        ]
        actual_stem = get_stem(command)
        with tag('span',
                 ('data-hyps', "\n".join(hyps)),
                 ('data-goal', goal),
                 ('data-num-total', str(stats.num_tactics)),
                 ('data-predictions',
                  to_list_string(cast(List[str], predictions))),
                 ('data-num-predicteds', to_list_string(predicted_counts)),
                 ('data-num-corrects', to_list_string(correct_counts)),
                 ('data-certainties',
                  to_list_string(cast(List[float], certainties))),
                 ('data-num-actual-corrects',
                  stats.correctly_predicted_frequency.get(actual_stem, 0)),
                 ('data-num-actual-in-file',
                  stats.actual_tactic_frequency.get(actual_stem, 0)),
                 ('data-actual-tactic', strip_comments(command)),
                 ('data-grades',
                  to_list_string(cast(List[str], grades))),
                 ('data-search-idx', 0),
                 id='command-{}-{}'.format(region_idx, cmd_idx),
                 onmouseover='hoverTactic("{}-{}")'
                 .format(region_idx, cmd_idx),
                 onmouseout='unhoverTactic()',
                 onclick='selectTactic("{}-{}"); event.stopPropagation();'
                 .format(region_idx, cmd_idx)):
            doc.stag("br")
            if len(grades) == 0:
                with tag('code', klass="plaincommand"):
                    write_highlighted(command.strip("\n"))
                return
            # The command carries the top grade; every further prediction
            # is shown as a dot styled with its own grade.
            with tag('code', klass=grades[0]):
                text(command.strip("\n"))
            for grade in grades[1:]:
                with tag('span', klass=grade):
                    doc.asis(" ⬤")

    with tag('html'):
        details_header(tag, doc, text, filename)
        with tag('div', id='overlay', onclick='event.stopPropagation();'):
            with tag('div', id='predicted'):
                pass
            with tag('div', id='context'):
                pass
            with tag('div', id='stats'):
                pass
        with tag('body', onclick='deselectTactic()',
                 onload='init()'), tag('pre'):
            for region_idx, region in enumerate(
                    split_into_regions(command_results)):
                if len(region) > 1 and len(region[1]) == 1:
                    write_plain_region(region)
                    continue
                doc.stag("br")
                write_region_button(region_idx, region)
                with tag('div', klass='region'):
                    for cmd_idx, command_result in enumerate(region[1:]):
                        write_tactic(region_idx, cmd_idx, command_result)

    report_path = (output_dir / escape_filename(str(filename)))\
        .with_suffix(".html")
    with report_path.open(mode='w') as fout:
        fout.write(doc.getvalue())