def tactic_substitutions(substitutions: Dict[str, str], sample: ScrapedTactic) \
        -> ScrapedTactic:
    relevant_lemmas, prev_tactics, context, tactic = sample
    return ScrapedTactic(
        relevant_lemmas, prev_tactics, context,
        tactic if get_stem(tactic) not in substitutions
        else substitutions[get_stem(tactic)])

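# A hedged usage sketch for tactic_substitutions, in doctest style.  It
# assumes the repo's ScrapedTactic unpacks as (relevant_lemmas,
# prev_tactics, context, tactic) and that get_stem("eauto.") == "eauto";
# the point is just that a tactic is swapped out when its stem appears
# in the substitution map, and left alone otherwise.
#
#     >>> sample = ScrapedTactic([], ["intros."], "forall n, n = n", "eauto.")
#     >>> tactic_substitutions({"eauto": "auto."}, sample).tactic
#     'auto.'
#     >>> tactic_substitutions({"intro": "intros."}, sample).tactic
#     'eauto.'
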
def add_tactic(self, predictions: List[PredictionResult],
               correct: str) -> None:
    self.num_tactics += 1
    if predictions[0].grade == "goodcommand" or \
       predictions[0].grade == "mostlygoodcommand":
        self.num_correct += 1
        self.num_partial += 1
        self.correctly_predicted_frequency[get_stem(correct)] += 1
    elif predictions[0].grade == "okaycommand":
        self.num_partial += 1
    else:
        self.num_failed += 1
    # Top-N accuracy: count a hit if any of the k predictions is good.
    for prediction, grade, certainty in predictions:
        if grade == "goodcommand" or \
           grade == "mostlygoodcommand":
            self.num_topN += 1
            break
    # Top-N partial accuracy: an "okaycommand" (right stem, wrong
    # arguments) also counts as a hit.
    for prediction, grade, certainty in predictions:
        if grade == "goodcommand" or \
           grade == "mostlygoodcommand":
            self.num_topNPartial += 1
            break
        if grade == "okaycommand":
            self.num_topNPartial += 1
            break
    self.actual_tactic_frequency[get_stem(correct)] += 1
    self.predicted_tactic_frequency[get_stem(predictions[0].prediction)] += 1

def _get_prev(self, in_data: TacticContext) -> int:
    stem = get_stem(in_data.prev_tactics[-1]) \
        if len(in_data.prev_tactics) > 1 else "Proof"
    if self._embedding.has_token(stem):
        return self._embedding.encode_token(stem)
    else:
        return self._embedding.encode_token("eauto")

def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int, correct: str) \
        -> Tuple[List[Prediction], float]:
    with self._lock:
        distribution = self.predictDistribution(in_data)
        stem = get_stem(correct)
        if self._embedding.has_token(stem):
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor([self._embedding.encode_token(stem)])))
            loss = self._criterion(distribution, output_var).item()
        else:
            loss = 0
        if k > self._embedding.num_tokens():
            k = self._embedding.num_tokens()
        probs_and_indices = distribution.squeeze().topk(k)
        predictions = [
            Prediction(
                self._embedding.decode_token(idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, idx in zip(*probs_and_indices)
        ]
    return predictions, loss

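# Hedged sketch of the embedding API the predictors here rely on
# (has_token / encode_token / decode_token / num_tokens): a
# SimpleEmbedding-style class maps tactic stems to contiguous integer
# ids and back.  _MiniEmbedding is a hypothetical stand-in for
# illustration only (it assumes typing.Dict and typing.List are
# imported, as elsewhere in this file); the repo's real SimpleEmbedding
# may differ in details.
class _MiniEmbedding:
    def __init__(self) -> None:
        self._token_to_idx: Dict[str, int] = {}
        self._idx_to_token: List[str] = []

    def encode_token(self, token: str) -> int:
        # Assign a fresh id the first time a stem is seen.
        if token not in self._token_to_idx:
            self._token_to_idx[token] = len(self._idx_to_token)
            self._idx_to_token.append(token)
        return self._token_to_idx[token]

    def has_token(self, token: str) -> bool:
        return token in self._token_to_idx

    def decode_token(self, idx: int) -> str:
        return self._idx_to_token[idx]

    def num_tokens(self) -> int:
        return len(self._idx_to_token)
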
def predictKTacticsWithLoss_batch(self, in_data: List[TacticContext],
                                  k: int, corrects: List[str]) \
        -> Tuple[List[List[Prediction]], float]:
    assert self.training_args
    with self._lock:
        input_tensor = Variable(
            FloatTensor([
                encode_ngram_classify_input(in_data_point.goal,
                                            self.training_args.num_grams,
                                            self._tokenizer)
                for in_data_point in in_data
            ]))
        prediction_distributions = self._lsoftmax(self._model(input_tensor))
        correct_stems = [get_stem(correct) for correct in corrects]
        # Stems missing from the embedding are mapped to token 0 so the
        # batch loss is still well-defined.
        output_var = maybe_cuda(
            Variable(
                torch.LongTensor([
                    self._embedding.encode_token(correct_stem)
                    if self._embedding.has_token(correct_stem) else 0
                    for correct_stem in correct_stems
                ])))
        loss = self._criterion(prediction_distributions, output_var).item()
        if k > self._embedding.num_tokens():
            k = self._embedding.num_tokens()
        certainties_and_idxs_list = \
            [single_distribution.view(-1).topk(k)
             for single_distribution in list(prediction_distributions)]
        results = [[
            Prediction(
                self._embedding.decode_token(stem_idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ] for certainties_and_idxs in certainties_and_idxs_list]
    return results, loss

def __call__(self, context: TacticContext) -> int:
    prev_tactic = (serapi_instance.get_stem(context.prev_tactics[-1])
                   if len(context.prev_tactics) > 1 else "Proof")
    if prev_tactic in self.tacticKeywords:
        return self.tacticKeywords.index(prev_tactic) + 1
    else:
        return 0

def __call__(self, context: TacticContext) -> List[float]:
    prev_tactic = (serapi_instance.get_stem(context.prev_tactics[-1])
                   if len(context.prev_tactics) > 1 else "Proof")
    oneHotPrevs = [0.] * len(self.tacticKeywords)
    if prev_tactic in self.tacticKeywords:
        oneHotPrevs[self.tacticKeywords.index(prev_tactic)] = 1.
    return oneHotPrevs

def predictKTacticsWithLoss(self, in_data: TacticContext, k: int,
                            correct: str) -> \
        Tuple[List[Prediction], float]:
    assert self.training_args
    assert self._embedding
    with self._lock:
        prediction_distribution = self._predictDistributions([in_data])[0]
        if k > self._embedding.num_tokens():
            k = self._embedding.num_tokens()
        correct_stem = serapi_instance.get_stem(correct)
        if self._embedding.has_token(correct_stem):
            output_var = maybe_cuda(
                Variable(
                    LongTensor([self._embedding.encode_token(correct_stem)])))
            loss = self._criterion(prediction_distribution.view(1, -1),
                                   output_var).item()
        else:
            loss = 0
        if len(in_data.hypotheses) == 0:
            # With no hypotheses in context, filter out stems that would
            # need a hypothesis argument.
            certainties, idxs = topk_with_filter(
                prediction_distribution.view(-1), k,
                lambda certainty, idx:
                not serapi_instance.tacticTakesHypArgs(
                    cast(Embedding, self._embedding).decode_token(idx)))
        else:
            certainties, idxs = prediction_distribution.view(-1).topk(k)
        results = [
            Prediction(
                self.add_arg(self._embedding.decode_token(stem_idx.item()),
                             in_data.goal, in_data.hypotheses,
                             self.training_args.max_length),
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(certainties, idxs)
        ]
    return results, loss

def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int, correct: str) \
        -> Tuple[List[Prediction], float]:
    # Use the lock as a context manager so it is released even if
    # prediction raises.
    with self.lock:
        prediction_distribution = self.predictDistribution(in_data)
        correct_stem = get_stem(correct)
        if self.embedding.has_token(correct_stem):
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor(
                        [self.embedding.encode_token(correct_stem)])))
            loss = self.criterion(prediction_distribution,
                                  output_var).item()
        else:
            loss = 0
        certainties_and_idxs = prediction_distribution.view(-1).topk(k)
        results = [
            Prediction(
                self.embedding.decode_token(stem_idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ]
    return results, loss

def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int, correct: str) \
        -> Tuple[List[Prediction], float]:
    with self._lock:
        prediction_distribution = self._predictDistributions([in_data])[0]
        correct_stem = get_stem(correct)
        if self._embedding.has_token(correct_stem):
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor(
                        [self._embedding.encode_token(correct_stem)])))
            loss = self._criterion(prediction_distribution.view(1, -1),
                                   output_var).item()
        else:
            loss = 0
        if k > self._embedding.num_tokens():
            k = self._embedding.num_tokens()
        certainties_and_idxs = prediction_distribution.view(-1).topk(k)
        results = [
            Prediction(
                self._embedding.decode_token(stem_idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ]
    return results, loss

def encode_seq_classify_data(data: RawDataset,
                             tokenizer_type: Callable[[List[str], int],
                                                      Tokenizer],
                             num_keywords: int,
                             num_reserved_tokens: int,
                             save_tokens: Optional[str] = None,
                             load_tokens: Optional[str] = None,
                             num_relevance_samples: int = 1000) \
        -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    embedding = SimpleEmbedding()
    subset = RawDataset(random.sample(data, num_relevance_samples))
    if load_tokens:
        print("Loading tokens from {}".format(load_tokens))
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        tokenizer = make_keyword_tokenizer_relevance(
            [(context, embedding.encode_token(get_stem(tactic)))
             for prev_tactics, hyps, context, tactic in subset],
            tokenizer_type, num_keywords, num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)
    with multiprocessing.Pool(None) as pool:
        result = [(goal, embedding.encode_token(tactic))
                  for goal, tactic in chain.from_iterable(
                      pool.imap(
                          functools.partial(
                              encode_seq_classify_data_worker__, tokenizer),
                          chunks(data, 1024)))]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding

def get_tokens(args: List[str]):
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)
    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))
    embedding = SimpleEmbedding()
    subset = data.RawDataset(random.sample(raw_data,
                                           arg_values.num_samples))
    relevance_pairs = [
        (goal, embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, hyps, goal, tactic in subset
    ]
    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)
    with (open(arg_values.dest, mode='w') if arg_values.dest != "-"
          else contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")

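# The file-or-stdout idiom above is worth pulling out:
# contextlib.nullcontext wraps sys.stdout in a no-op context manager, so
# a single `with` works for both destinations and never closes stdout.
# A self-contained sketch (_write_lines is a hypothetical helper, not
# part of this repo; it assumes typing.List is imported as elsewhere in
# this file):
def _write_lines(dest: str, lines: List[str]) -> None:
    import contextlib
    import sys
    with (open(dest, mode='w') if dest != "-"
          else contextlib.nullcontext(sys.stdout)) as f:
        for line in lines:
            f.write(line + "\n")
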
def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int, correct: str) \
        -> Tuple[List[Prediction], float]:
    with self._lock:
        distribution, hyp_var = self._predictDistribution(in_data)
        correct_stem = serapi_instance.get_stem(correct)
        if self._embedding.has_token(correct_stem):
            loss = self._criterion(
                distribution.view(1, -1),
                Variable(
                    LongTensor([
                        self._embedding.encode_token(correct_stem)
                    ]))).item()
        else:
            loss = float("+inf")
        indices, probabilities = list_topk(list(distribution), k)
        predictions: List[Prediction] = []
        for certainty, idx in zip(probabilities, indices):
            stem = self._embedding.decode_token(idx)
            if serapi_instance.tacticTakesHypArgs(stem):
                predictions.append(
                    Prediction(stem + " " + hyp_var + ".",
                               math.exp(certainty)))
            else:
                predictions.append(Prediction(stem + ".",
                                              math.exp(certainty)))
    return predictions, loss

def grade_prediction(correct_inter: ScrapedTactic, prediction: str):
    correct_tactic = correct_inter.tactic
    correct_tactic_normalized = \
        serapi_instance.normalizeNumericArgs(correct_inter).tactic
    prediction_normalized = \
        serapi_instance.normalizeNumericArgs(ScrapedTactic(
            correct_inter.prev_tactics, correct_inter.hypotheses,
            correct_inter.goal, prediction)).tactic
    if correct_tactic.strip() == prediction.strip() or \
       correct_tactic_normalized.strip() == prediction_normalized.strip():
        return "goodcommand"
    elif get_stem(correct_tactic).strip() == get_stem(prediction).strip():
        return "okaycommand"
    elif correct_tactic.strip() in proper_subs and \
            proper_subs[correct_tactic.strip()] == prediction.strip():
        return "mostlygoodcommand"
    else:
        return "badcommand"

def grade_command_result(self, initial_context: str,
                         predicted: str, predicted_context: str,
                         actual: str, actual_context: str,
                         exception: Optional[Exception]) -> str:
    if actual.strip() == predicted.strip():
        return "goodcommand"
    elif get_stem(actual) == get_stem(predicted):
        return "okaycommand"
    elif type(exception) in (ParseError, LexError):
        return "superfailedcommand"
    elif exception is not None:
        return "failedcommand"
    elif predicted_context == actual_context:
        return "mostlygoodcommand"
    elif predicted_context == initial_context:
        return "uselesscommand"
    else:
        return "badcommand"

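# Hedged summary of the grade vocabulary used throughout this file,
# reconstructed from the two graders above (names only; the exact
# semantics live in the graders themselves):
#
#   "goodcommand"        - textually identical (possibly after
#                          normalizing numeric arguments)
#   "mostlygoodcommand"  - different text, but the same resulting proof
#                          context (or a known-proper substitution)
#   "okaycommand"        - same tactic stem, different arguments
#   "uselesscommand"     - ran, but left the context unchanged
#   "failedcommand"      - raised an exception when run
#   "superfailedcommand" - did not even parse or lex
#   "badcommand"         - anything else
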
def _encode_tokenized_data(self, data: TokenizedDataset,
                           arg_values: Namespace,
                           tokenizer: Tokenizer,
                           embedding: Embedding) \
        -> PECDataset:
    return PECDataset([
        PECSample(
            embedding.encode_token(
                get_stem(prev_tactics[-1])
                if len(prev_tactics) > 1 else "Proof"),
            goal, tactic)
        for prev_tactics, goal, tactic in data
    ])

def embed_data(data: RawDataset) -> Tuple[Embedding, StrictEmbeddedDataset]:
    embedding = SimpleEmbedding()
    start = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    dataset = StrictEmbeddedDataset(
        [EmbeddedSample(
            prev_tactics, hypotheses, goal,
            embedding.encode_token(get_stem(tactic)))
         for prev_tactics, hypotheses, goal, tactic in data])
    print("{:.2f}s".format(time.time() - start))
    return embedding, dataset

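# A hedged, doctest-style sketch of what embed_data produces, assuming
# the repo's get_stem maps both "induction n." and "induction m." to the
# stem "induction", and that StrictEmbeddedDataset samples can be
# indexed (with the encoded stem id as the last field).  Both samples
# then share one stem id: the classifier learns stems, not full tactic
# text.
#
#     >>> emb, dataset = embed_data(RawDataset([
#     ...     ([], [], "forall n, n = n", "induction n."),
#     ...     ([], [], "forall m, m = m", "induction m.")]))
#     >>> dataset[0][-1] == dataset[1][-1]
#     True
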
def __init__(self, init_dataset: List[TacticContext],
             args: argparse.Namespace) -> None:
    prevTacticsCounts: typing.Counter[str] = Counter()
    for prev_tactics, hyps, goal in init_dataset:
        if len(prev_tactics) > 2:
            prevTacticsCounts[serapi_instance.get_stem(
                prev_tactics[-1])] += 1
    self.tacticKeywords = ["Proof"] + \
        [word for word, count in
         prevTacticsCounts.most_common(args.num_tactic_keywords)]
    eprint("Tactic keywords are {}".format(self.tacticKeywords),
           guard=args.print_keywords)

def predictKTacticsWithLoss(self, in_data: TacticContext, k: int,
                            correct: str) -> Tuple[List[Prediction], float]:
    distribution = self.predictDistribution(in_data)
    correct_stem = get_stem(correct)
    if self.embedding.has_token(correct_stem):
        loss = self.criterion(
            torch.FloatTensor(distribution).view(1, -1),
            Variable(torch.LongTensor(
                [self.embedding.encode_token(correct_stem)]))).item()
    else:
        loss = float("+inf")
    indices, probabilities = list_topk(list(distribution), k)
    predictions = [Prediction(self.embedding.decode_token(idx) + ".",
                              math.exp(certainty))
                   for certainty, idx in zip(probabilities, indices)]
    return predictions, loss

def add_command_result(self, predictions: List[str], grades: List[str],
                       actual: str, loss: float) -> None:
    add_to_freq_table(self.actual_tactic_frequency, get_stem(actual))
    add_to_freq_table(self.predicted_tactic_frequency,
                      get_stem(predictions[0]))
    self.total_loss += loss
    self.num_tactics += 1
    if (grades[0] == "goodcommand" or grades[0] == "mostlygoodcommand"):
        add_to_freq_table(self.correctly_predicted_frequency,
                          get_stem(predictions[0]))
        self.num_correct += 1
        self.num_partial += 1
    elif (grades[0] == "okaycommand"):
        self.num_partial += 1
    elif (grades[0] == "failedcommand" or
          grades[0] == "superfailedcommand"):
        self.num_failed += 1
    # Top-N: count a hit if any prediction was good; an okay prediction
    # only counts toward the partial tally.
    for grade in grades:
        if (grade == "goodcommand" or grade == "mostlygoodcommand"):
            self.num_topN += 1
            self.num_topNPartial += 1
            break
        if (grade == "okaycommand"):
            self.num_topNPartial += 1
            break
    # num_searched: scan past predictions that failed or did nothing,
    # and count a hit if a good prediction appears before the first one
    # that actually changed the proof context.
    for grade in grades:
        if (grade == "goodcommand" or grade == "mostlygoodcommand"):
            self.num_searched += 1
            break
        if (grade != "failedcommand" and grade != "superfailedcommand"
                and grade != "uselesscommand"):
            break

def _features(self, context: TacticContext) \
        -> Tuple[List[int], List[float]]:
    if len(context.prev_tactics) > 1:
        prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
        prev_tactic_index = emap_lookup(self.tactic_map, 32, prev_tactic)
    else:
        prev_tactic_index = 0
    if context.goal != "":
        goal_head_index = emap_lookup(self.token_map, 128,
                                      tokenizer.get_words(context.goal)[0])
    else:
        goal_head_index = 0
    goal_length_feature = min(len(tokenizer.get_words(context.goal)),
                              100) / 100
    num_hyps_feature = min(len(context.hypotheses), 30) / 30
    return [prev_tactic_index, goal_head_index], \
        [goal_length_feature, num_hyps_feature]

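# Hedged illustration of the two scalar features above: both are clipped
# and rescaled into [0, 1].  For example, a goal of 40 tokens with 7
# hypotheses gives
#
#     goal_length_feature = min(40, 100) / 100  # = 0.4
#     num_hyps_feature    = min(7, 30) / 30     # ~= 0.233
#
# so goals longer than 100 tokens (or contexts with more than 30
# hypotheses) saturate at 1.0.
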
def from_data(init_dataset: List[TacticContext],
              args: argparse.Namespace) -> 'PrevTactic':
    prevTacticsCounts: typing.Counter[str] = Counter()
    for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
        if len(prev_tactics) > 2:
            prevTacticsCounts[
                serapi_instance.get_stem(prev_tactics[-1])] += 1
    if args.load_tactic_keywords and \
       Path2(args.load_tactic_keywords).exists():
        result = PrevTactic(torch.load(args.load_tactic_keywords))
    else:
        result = PrevTactic(["Proof"] +
                            [word for word, count in
                             prevTacticsCounts.most_common(
                                 args.num_tactic_keywords)])
    eprint("Tactic keywords are {}".format(result.tacticKeywords),
           guard=args.print_keywords)
    return result

def predictKTacticsWithLoss_batch(self, in_data: List[TacticContext],
                                  k: int, corrects: List[str]) -> \
        Tuple[List[List[Prediction]], float]:
    assert self.training_args
    if len(in_data) == 0:
        return [], 0
    with self._lock:
        tokenized_goals = [
            self._tokenizer.toTokenList(goal)
            for prev_tactics, hypotheses, goal in in_data
        ]
        input_tensor = LongTensor([
            inputFromSentence(tokenized_goal, self.training_args.max_length)
            for tokenized_goal in tokenized_goals
        ])
        prediction_distributions = self._model.run(input_tensor,
                                                   batch_size=len(in_data))
        correct_stems = [get_stem(correct) for correct in corrects]
        output_var = maybe_cuda(
            Variable(
                torch.LongTensor([
                    self._embedding.encode_token(correct_stem)
                    if self._embedding.has_token(correct_stem) else 0
                    for correct_stem in correct_stems
                ])))
        loss = self._criterion(prediction_distributions, output_var).item()
        if k > self._embedding.num_tokens():
            k = self._embedding.num_tokens()
        certainties_and_idxs_list = [
            single_distribution.view(-1).topk(k)
            for single_distribution in list(prediction_distributions)
        ]
        results = [[
            Prediction(
                self._embedding.decode_token(stem_idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ] for certainties_and_idxs in certainties_and_idxs_list]
    return results, loss

def predictKTacticsWithLoss(prediction_distribution: torch.FloatTensor,
                            embedding: Embedding,
                            k: int, correct: str,
                            criterion: nn.Module) \
        -> Tuple[List[Prediction], float]:
    if k > embedding.num_tokens():
        k = embedding.num_tokens()
    correct_stem = get_stem(correct)
    if embedding.has_token(correct_stem):
        output_var = maybe_cuda(Variable(
            torch.LongTensor([embedding.encode_token(correct_stem)])))
        loss = criterion(prediction_distribution.view(1, -1),
                         output_var).item()
    else:
        loss = 0
    certainties_and_idxs = prediction_distribution.view(-1).topk(k)
    results = [Prediction(embedding.decode_token(stem_idx.item()) + ".",
                          math.exp(certainty.item()))
               for certainty, stem_idx in zip(*certainties_and_idxs)]
    return results, loss

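# Why math.exp shows up in every predictor above: the distributions are
# log-probabilities (log-softmax outputs), so exponentiating a top-k
# certainty recovers an ordinary probability.  A self-contained sketch
# (_demo_topk_certainties is illustrative, not part of the repo):
def _demo_topk_certainties() -> None:
    import math
    import torch
    scores = torch.tensor([2.0, 1.0, 0.1])
    log_probs = torch.log_softmax(scores, dim=0)
    # topk on log-probs picks the most likely indices first.
    certainties, idxs = log_probs.topk(2)
    probs = [math.exp(c.item()) for c in certainties]
    # The exponentiated certainties are genuine probabilities in (0, 1].
    assert all(0.0 < p <= 1.0 for p in probs)
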
def predictKTacticsWithLoss_batch(self, in_data: List[TacticContext],
                                  k: int, corrects: List[str]) -> \
        Tuple[List[List[Prediction]], float]:
    assert self._embedding
    assert self.training_args
    with self._lock:
        prediction_distributions = self._predictDistributions(in_data)
        correct_stems = [
            serapi_instance.get_stem(correct) for correct in corrects
        ]
        output_var = maybe_cuda(
            Variable(
                LongTensor([
                    self._embedding.encode_token(correct_stem)
                    if self._embedding.has_token(correct_stem) else 0
                    for correct_stem in correct_stems
                ])))
        loss = self._criterion(prediction_distributions, output_var).item()
        if k > self._embedding.num_tokens():
            k = self._embedding.num_tokens()
        certainties_and_idxs_list = \
            [single_distribution.view(-1).topk(k)
             if len(context.hypotheses) > 0 else
             topk_with_filter(
                 single_distribution.view(-1), k,
                 lambda certainty, idx:
                 not serapi_instance.tacticTakesHypArgs(
                     cast(Embedding, self._embedding).decode_token(idx)))
             for single_distribution, context in
             zip(prediction_distributions, in_data)]
        results = [[
            Prediction(
                self.add_arg(self._embedding.decode_token(stem_idx.item()),
                             in_datum.goal, in_datum.hypotheses,
                             self.training_args.max_length),
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ] for certainties_and_idxs, in_datum in zip(
            certainties_and_idxs_list, in_data)]
    return results, loss

def predictKTacticsWithLoss(
        self, in_data: TacticContext, k: int, correct: str) \
        -> Tuple[List[Prediction], float]:
    # Use the lock as a context manager so it is released even if
    # prediction raises.
    with self.lock:
        distribution = self.predictDistribution(in_data)
        stem = get_stem(correct)
        if self.embedding.has_token(stem):
            output_var = maybe_cuda(
                Variable(torch.LongTensor(
                    [self.embedding.encode_token(stem)])))
            loss = self.criterion(distribution.view(1, -1),
                                  output_var).item()
        else:
            loss = 0
        certainties, idxs = distribution.squeeze().topk(k)
        predictions_and_certainties = \
            [Prediction(self.embedding.decode_token(idx.item()) + ".",
                        math.exp(certainty.item()))
             for certainty, idx in zip(list(certainties), list(idxs))]
    return predictions_and_certainties, loss

def encode_hyparg_data(data: RawDataset,
                       tokenizer_type: Callable[[List[str], int], Tokenizer],
                       num_keywords: int,
                       num_reserved_tokens: int,
                       max_args: int, max_hyps: int,
                       encoded_length: int,
                       entropy_data_size: int,
                       num_threads: Optional[int] = None) -> \
        Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    stem_embedding = SimpleEmbedding()
    data_list = list(data)
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context, stem_embedding.encode_token(
            serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
    termEncoder = functools.partial(getNGramTokenbagVector, 1,
                                    tokenizer.numTokens())
    with multiprocessing.Pool(num_threads) as pool:
        # Samples are five-tuples, matching the comprehension above;
        # unpacking them into three fields would fail.
        relevant_lemmas, prev_tactics, hyps, contexts, tactics = \
            zip(*data_list)
        encoded_contexts = pool.imap(
            functools.partial(_encode, tokenizer, termEncoder), contexts)
        # Encode the hypotheses (not the contexts) for the hypothesis
        # channel.
        encoded_hyps = pool.imap(
            functools.partial(_encode_hyps, tokenizer, termEncoder,
                              max_hyps, encoded_length), hyps)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding,
                              max_args),
            zip(hyps, tactics))
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding

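# The Pool.imap + functools.partial pattern above (also used in
# encode_seq_classify_data) fixes the leading arguments of a worker so
# that only the per-item argument streams through the pool.  A
# self-contained sketch; _scale and _demo_pool_partial are hypothetical
# names, not part of this repo:
def _scale(factor: int, x: int) -> int:
    # Module-level worker so it stays picklable for multiprocessing.
    return factor * x


def _demo_pool_partial() -> None:
    import functools
    import multiprocessing
    with multiprocessing.Pool(2) as pool:
        # Equivalent to mapping (lambda x: _scale(10, x)), but parallel.
        assert list(pool.imap(functools.partial(_scale, 10),
                              [1, 2, 3])) == [10, 20, 30]
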
def main(arg_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    parser.add_argument("scrape_file")
    parser.add_argument("autoencoder_weights")
    parser.add_argument("save_file")
    parser.add_argument("--num-epochs", dest="num_epochs",
                        default=15, type=int)
    parser.add_argument("--batch-size", dest="batch_size",
                        default=256, type=int)
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--print-every", dest="print_every",
                        default=10, type=int)
    parser.add_argument("--learning-rate", dest="learning_rate",
                        default=.7, type=float)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", dest="epoch_step",
                        default=5, type=int)
    parser.add_argument("--optimizer",
                        choices=list(stdargs.optimizers.keys()), type=str,
                        default=list(stdargs.optimizers.keys())[0])
    parser.add_argument("--num-classifier-layers",
                        dest="num_classifier_layers", default=3, type=int)
    parser.add_argument("--classifier-hidden-size",
                        dest="classifier_hidden_size", default=128, type=int)
    parser.add_argument("--train-autoencoder", dest="train_autoencoder",
                        default=False, const=True, action='store_const')
    args = parser.parse_args(arg_list)
    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    cfilter = autoenc_state['context-filter']
    text_data = get_text_data(args)
    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    dataset = [(tokenizer.toTokenList(goal),
                embedding.encode_token(get_stem(tactic)))
               for prev_tactics, hyps, goal, tactic in text_data]
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}".format(timeTaken))
    loadedAutoencoder = maybe_cuda(
        EncoderRNN(tokenizer.numTokens(), autoenc_state['hidden-size'],
                   autoenc_state['num-encoder-layers'], args.batch_size))
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(
        dataset, loadedAutoencoder, args.train_autoencoder,
        autoenc_state['max-length'], autoenc_state['hidden-size'],
        args.classifier_hidden_size, embedding.num_tokens(),
        args.num_classifier_layers, args.batch_size, args.learning_rate,
        args.gamma, args.epoch_step, args.num_epochs, args.print_every,
        stdargs.optimizers[args.optimizer])
    for epoch, (decoder_state, autoencoder_state, training_loss) \
            in enumerate(checkpoints):
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'autoenc-training-loss': autoenc_state['training-loss'],
            'autoenc-epoch': autoenc_state['epoch'],
            'tokenizer': tokenizer,
            'tokenizer-name': autoenc_state['tokenizer-name'],
            'optimizer': args.optimizer,
            'autoenc-optimizer': autoenc_state['optimizer'],
            'learning-rate': args.learning_rate,
            'autoenc-learning-rate': autoenc_state['learning-rate'],
            'encoder': autoencoder_state,
            'decoder': decoder_state,
            'num-decoder-layers': args.num_classifier_layers,
            'num-encoder-layers': autoenc_state['num-encoder-layers'],
            'context-filter': cfilter,
            'max-length': autoenc_state['max-length'],
            'encoded-size': autoenc_state['hidden-size'],
            'hidden-size': args.classifier_hidden_size,
            'num-keywords': autoenc_state['num-keywords'],
            'stem-embedding': embedding,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)

def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=["terms", "goals", "hyps+goal",
                                 "hyps+goal+tactic", "tacvector"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()), type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length", dest="max_length",
                        default=None, type=int)
    parser.add_argument("--lineend", dest="lineend",
                        default=False, const=True, action='store_const')
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [strip_scraped_output(scraped)
                         for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor
            in features.word_feature_constructors
        ]
        vec_features_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in features.vec_feature_constructors
        ]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        vec_features = [[
            feature_val for feature in vec_features_functions
            for feature_val in feature(c)
        ] for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(
                list(map(str, word_feat)) + list(map(str, vec_feat)) +
                [str(tactic)]))

def write_html(output_dir: Path2, filename: Path2,
               command_results: List[CommandResult],
               stats: 'ResultStats') -> None:
    def details_header(tag: Any, doc: Doc, text: Text,
                       filename: Path2) -> None:
        header(tag, doc, text, details_css, details_javascript,
               "Proverbot Detailed Report for {}".format(filename))
    doc, tag, text, line = Doc().ttl()
    with tag('html'):
        details_header(tag, doc, text, filename)
        with tag('div', id='overlay', onclick='event.stopPropagation();'):
            with tag('div', id='predicted'):
                pass
            with tag('div', id='context'):
                pass
            with tag('div', id='stats'):
                pass
        with tag('body', onclick='deselectTactic()',
                 onload='init()'), tag('pre'):
            for region_idx, region in \
                    enumerate(split_into_regions(command_results)):
                if len(region) > 1 and len(region[1]) == 1:
                    for cmd_idx, command_result in enumerate(region):
                        assert isinstance(command_result[0], str)
                        with tag('code', klass='plaincommand'):
                            text("\n" + command_result[0].strip('\n'))
                else:
                    doc.stag("br")
                    with tag('button', klass='collapsible',
                             id='collapsible-{}'.format(region_idx)):
                        with tag('code', klass='buttontext'):
                            assert isinstance(region[0][0], str), region
                            text(region[0][0].strip("\n"))
                        num_unfiltered = count_region_unfiltered(region)
                        with tag('code', klass='numtacs ' +
                                 ('nonempty' if num_unfiltered > 3
                                  else 'empty')):
                            text(num_unfiltered)
                    with tag('div', klass='region'):
                        for cmd_idx, command_result in enumerate(region[1:]):
                            if len(command_result) == 1:
                                assert isinstance(command_result[0], str)
                                with tag('code', klass='plaincommand'):
                                    text("\n" +
                                         command_result[0].strip('\n'))
                            else:
                                command, hyps, goal, prediction_results = \
                                    cast(TacticResult, command_result)
                                predictions: List[str]
                                grades: List[str]
                                certainties: List[float]
                                if len(prediction_results) > 0:
                                    predictions, grades, certainties = \
                                        zip(*prediction_results)  # type: ignore
                                else:
                                    predictions, grades, certainties = \
                                        [], [], []
                                with tag('span',
                                         ('data-hyps', "\n".join(hyps)),
                                         ('data-goal', format_goal(goal)),
                                         ('data-num-total',
                                          str(stats.num_tactics)),
                                         ('data-predictions',
                                          to_list_string(
                                              cast(List[str], predictions))),
                                         ('data-num-predicteds',
                                          to_list_string(
                                              [stats.predicted_tactic_frequency
                                               .get(get_stem(prediction), 0)
                                               for prediction in
                                               cast(List[str],
                                                    predictions)])),
                                         ('data-num-corrects',
                                          to_list_string(
                                              [stats.correctly_predicted_frequency
                                               .get(get_stem(prediction), 0)
                                               for prediction in
                                               cast(List[str],
                                                    predictions)])),
                                         ('data-certainties',
                                          to_list_string(
                                              cast(List[float],
                                                   certainties))),
                                         ('data-num-actual-corrects',
                                          stats.correctly_predicted_frequency
                                          .get(get_stem(command), 0)),
                                         ('data-num-actual-in-file',
                                          stats.actual_tactic_frequency
                                          .get(get_stem(command), 0)),
                                         ('data-actual-tactic',
                                          strip_comments(command)),
                                         ('data-grades',
                                          to_list_string(
                                              cast(List[str], grades))),
                                         ('data-search-idx', 0),
                                         id='command-{}-{}'.format(
                                             region_idx, cmd_idx),
                                         onmouseover='hoverTactic("{}-{}")'
                                         .format(region_idx, cmd_idx),
                                         onmouseout='unhoverTactic()',
                                         onclick='selectTactic("{}-{}");'
                                         ' event.stopPropagation();'
                                         .format(region_idx, cmd_idx)):
                                    doc.stag("br")
                                    if len(grades) == 0:
                                        with tag('code',
                                                 klass="plaincommand"):
                                            text(command.strip("\n"))
                                    else:
                                        with tag('code', klass=grades[0]):
                                            text(command.strip("\n"))
                                        for grade in grades[1:]:
                                            with tag('span', klass=grade):
                                                doc.asis(" ⬤")
    with (output_dir / escape_filename(str(filename))) \
            .with_suffix(".html").open(mode='w') as fout:
        fout.write(doc.getvalue())