def main(args_list: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="A second-tier predictor which predicts tactic "
        "stems based on word frequency in the goal")
    parser.add_argument("--context-filter", dest="context_filter",
                        type=str, default="default")
    parser.add_argument("--num-keywords", dest="num_keywords",
                        type=int, default=100)
    parser.add_argument("--max-tuples", dest="max_tuples",
                        type=int, default=None)
    parser.add_argument("scrape_file")
    parser.add_argument("save_file")
    args = parser.parse_args(args_list)

    dataset = get_text_data(args)
    samples, tokenizer, embedding = encode_bag_classify_data(
        dataset, tokenizers["no-fallback"], args.num_keywords, 2)

    classifier, loss = train(samples, embedding.num_tokens())

    state = {'stem-embeddings': embedding,
             'tokenizer': tokenizer,
             'classifier': classifier,
             'options': [
                 ("dataset size", str(len(samples))),
                 ("context filter", args.context_filter),
                 ("training loss", loss),
                 ("# stems", embedding.num_tokens()),
                 ("# tokens", args.num_keywords),
             ]}
    with open(args.save_file, 'wb') as f:
        pickle.dump(state, f)
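# Hedged sketch of the load-side counterpart to the pickle.dump above. The
# helper name and path are illustrative, not part of the project; the dict
# keys mirror exactly what this trainer saves.
import pickle
from typing import Any, Dict


def load_wordbag_state(path: str) -> Dict[str, Any]:
    """Read a checkpoint written by the word-bag stem classifier above."""
    with open(path, 'rb') as f:
        state = pickle.load(f)
    # state['classifier'], state['tokenizer'], and state['stem-embeddings']
    # hold the trained classifier, the goal tokenizer, and the stem embedding.
    return state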
def main(arg_list: List[str]) -> None:
    args = take_std_args(arg_list, "non-recurrent neural network "
                         "model for Proverbot9001")
    raw_dataset = get_text_data(args)
    dataset, tokenizer, embedding = encode_bag_classify_data(
        raw_dataset, tokenizers[args.tokenizer], args.num_keywords, 2)
    checkpoints = train(dataset, tokenizer.numTokens(), args.hidden_size,
                        embedding.num_tokens(), args.num_decoder_layers,
                        args.batch_size, args.learning_rate, args.gamma,
                        args.epoch_step, args.num_epochs, args.print_every,
                        optimizers[args.optimizer])

    for epoch, (network_state, training_loss) in enumerate(checkpoints):
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'tokenizer': tokenizer,
            'embedding': embedding,
            'network-state': network_state,
            'training-args': args,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
def main(args_list: List[str]) -> None:
    # Set up cleanup handler for Ctrl-C
    signal.signal(signal.SIGINT, exit_early)

    args = take_args(args_list)
    print("Reading dataset...")
    raw_dataset = get_text_data(args)
    dataset, context_tokenizer, tactic_tokenizer = \
        encode_seq_seq_data(raw_dataset,
                            lambda keywords, num_reserved:
                            KeywordTokenizer(context_keywords, num_reserved),
                            lambda keywords, num_reserved:
                            KeywordTokenizer(tactic_keywords, num_reserved),
                            0, 2)
    checkpoints = train(dataset, args.hidden_size, args.learning_rate,
                        args.num_encoder_layers, args.num_decoder_layers,
                        args.max_length, args.num_epochs, args.batch_size,
                        args.print_every,
                        context_tokenizer.numTokens(),
                        tactic_tokenizer.numTokens())

    for epoch, (encoder_state, decoder_state) in enumerate(checkpoints):
        state = {
            'epoch': epoch,
            'context-tokenizer': context_tokenizer,
            'tactic-tokenizer': tactic_tokenizer,
            'neural-encoder': encoder_state,
            'neural-decoder': decoder_state,
            'num-encoder-layers': args.num_encoder_layers,
            'num-decoder-layers': args.num_decoder_layers,
            'hidden-size': args.hidden_size,
            'max-length': args.max_length,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
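# The exit_early handler registered above lies outside this excerpt; a
# minimal stand-in with the (signum, frame) signature that signal.signal
# expects might look like this (an assumption, not the project's handler):
import sys


def exit_early(signum, frame):
    """Stop training cleanly on Ctrl-C instead of unwinding mid-checkpoint."""
    print("Interrupted; exiting early.")
    sys.exit(1)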
def train(self, args: List[str]) -> None:
    argparser = argparse.ArgumentParser(self._description())
    self.add_args_to_parser(argparser)
    arg_values = argparser.parse_args(args)
    text_data = get_text_data(arg_values)
    encoded_data, encdec_state = self._encode_data(text_data, arg_values)
    del text_data
    gc.collect()
    self._optimize_model_to_disc(encoded_data, encdec_state, arg_values)
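# A minimal sketch of how the train template method above is driven.
# ExamplePredictor and its hook implementations are hypothetical; only the
# call shape and the hook names come from the method itself.
#
#     predictor = ExamplePredictor()   # subclass providing _description,
#                                      # add_args_to_parser, _encode_data,
#                                      # and _optimize_model_to_disc
#     predictor.train(["scrape.txt", "predictor-weights.dat"])
#
# train only sequences the hooks, and frees the raw text data (del plus
# gc.collect) between encoding and optimization to bound peak memory.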
def main(arg_list: List[str]) -> None:
    argparser = argparse.ArgumentParser(
        description="a structural pytorch model for proverbot")
    add_std_args(argparser)
    argparser.add_argument("--num-decoder-layers", dest="num_decoder_layers",
                           default=3, type=int)
    args = argparser.parse_args(arg_list)
    filtered_data = get_text_data(args.scrape_file, args.context_filter,
                                  verbose=True)
    print("Encoding data...")
    start = time.time()
    dataset, tokenizer, embedding = encode_seq_structural_data(
        filtered_data, tokenizers[args.tokenizer], args.num_keywords, 2)
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}s".format(timeTaken))
    checkpoints = train(dataset, tokenizer.numTokens(), embedding.num_tokens(),
                        args.hidden_size, args.learning_rate,
                        args.num_encoder_layers, args.num_decoder_layers,
                        args.max_length, args.num_epochs, args.batch_size,
                        args.print_every, optimizers[args.optimizer])

    for epoch, (encoder_state, stem_decoder_state,
                arg_decoder_state, training_loss) in enumerate(checkpoints):
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'tokenizer': tokenizer,
            'tokenizer-name': args.tokenizer,
            'optimizer': args.optimizer,
            'learning-rate': args.learning_rate,
            'embedding': embedding,
            'encoder': encoder_state,
            'stem-decoder': stem_decoder_state,
            'arg-decoder': arg_decoder_state,
            'num-encoder-layers': args.num_encoder_layers,
            'num-decoder-layers': args.num_decoder_layers,
            'max-length': args.max_length,
            'hidden-size': args.hidden_size,
            'num-keywords': args.num_keywords,
            'context-filter': args.context_filter,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
def main(args_list: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="A second-tier predictor which predicts tactic "
        "stems based on word frequency in the goal")
    parser.add_argument("--learning-rate", dest="learning_rate",
                        default=.5, type=float)
    parser.add_argument("--num-epochs", dest="num_epochs",
                        default=10, type=int)
    parser.add_argument("--batch-size", dest="batch_size",
                        default=256, type=int)
    parser.add_argument("--print-every", dest="print_every",
                        default=10, type=int)
    parser.add_argument("--epoch-step", dest="epoch_step",
                        default=5, type=int)
    parser.add_argument("--gamma", dest="gamma", default=0.5, type=float)
    parser.add_argument("--optimizer", default="SGD",
                        choices=list(optimizers.keys()), type=str)
    parser.add_argument("--context-filter", dest="context_filter",
                        type=str, default="default")
    parser.add_argument("scrape_file")
    parser.add_argument("save_file")
    args = parser.parse_args(args_list)

    print("Loading dataset...")
    text_dataset = get_text_data(args)
    samples, tokenizer, embedding = encode_bag_classify_data(
        text_dataset, tokenizers["char-fallback"], 100, 2)

    checkpoints = train(samples, args.learning_rate, args.num_epochs,
                        args.batch_size, embedding.num_tokens(),
                        args.print_every, args.gamma, args.epoch_step,
                        args.optimizer)

    for epoch, (linear_state, loss) in enumerate(checkpoints, start=1):
        state = {'epoch': epoch,
                 'text-encoder': tokenizer,
                 'linear-state': linear_state,
                 'stem-embeddings': embedding,
                 'options': [
                     ("# epochs", str(epoch)),
                     ("learning rate", str(args.learning_rate)),
                     ("batch size", str(args.batch_size)),
                     ("epoch step", str(args.epoch_step)),
                     ("gamma", str(args.gamma)),
                     ("dataset size", str(len(samples))),
                     ("optimizer", args.optimizer),
                     ("training loss", "{:10.2f}".format(loss)),
                     ("context filter", args.context_filter),
                 ]}
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
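# Hedged sketch of reloading one of the torch.save checkpoints written
# above. The helper name and path are illustrative; the keys come from the
# saved dict, and map_location='cpu' is standard torch.load usage for
# reading GPU-trained weights on a CPU-only machine.
import torch


def load_classifier_checkpoint(path: str):
    """Return (tokenizer, stem embedding, linear state) from a checkpoint
    saved by the trainer above."""
    with open(path, 'rb') as f:
        state = torch.load(f, map_location='cpu')
    return (state['text-encoder'], state['stem-embeddings'],
            state['linear-state'])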
def main(args_list: List[str]) -> None:
    arg_parser = start_std_args("A proverbot9001 model template that can "
                                "predict tactics with hypothesis arguments")
    arg_parser.add_argument("--max-hyps", dest="max_hyps",
                            default=10, type=int)
    arg_parser.add_argument("--max-args", dest="max_args",
                            default=2, type=int)
    arg_parser.add_argument("--entropy-data-size", dest="entropy_data_size",
                            default=1000, type=int)
    args = arg_parser.parse_args(args_list)

    dataset = get_text_data(args)
    curtime = time.time()
    print("Encoding data...", end="")
    sys.stdout.flush()
    encoded_term_size = args.num_keywords + TOKEN_START + 1
    samples, tokenizer, embedding = encode_hyparg_data(
        dataset, tokenizers[args.tokenizer], args.num_keywords, TOKEN_START,
        args.max_args, args.max_hyps, encoded_term_size,
        args.entropy_data_size)
    print(" {:.2f}s".format(time.time() - curtime))

    checkpoints: List[Checkpoint] = train(samples, args,
                                          embedding.num_tokens(),
                                          encoded_term_size)
    for initial_encoder, stem_decoder, arg_decoder, loss in checkpoints:
        state = {'max-args': args.max_args,
                 'max-hyps': args.max_hyps,
                 'hidden-size': args.hidden_size,
                 'tokenizer': tokenizer,
                 'embedding': embedding,
                 'stem-decoder': stem_decoder,
                 'arg-decoder': arg_decoder,
                 'initial-encoder': initial_encoder,
                 'options': [
                     ("dataset size", str(len(samples))),
                     ("context filter", args.context_filter),
                     ("training loss", loss),
                     ("# stems", embedding.num_tokens()),
                     ("# tokens", args.num_keywords),
                     ("hidden size", args.hidden_size),
                     ("max # tactic args", args.max_args),
                     ("max # of hypotheses", args.max_hyps),
                     ("tokenizer entropy sample size",
                      args.entropy_data_size),
                 ]}
        with open(args.save_file, 'wb') as f:
            torch.save(state, f)
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format", choices=["terms", "goals", "hyps+goal",
                                           "hyps+goal+tactic", "tacvector",
                                           "scrapefile-rd", "scrapefile"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer", choices=list(tokenizers.keys()),
                        type=str, default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons", dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length", dest="max_length",
                        default=30, type=int)
    parser.add_argument("--lineend", dest="lineend", default=False,
                        const=True, action='store_const')
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions", action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args", action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)

    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(list(itertools.islice(
                data.read_text_data(arg_values.scrape_file),
                arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term, arg_values.max_length)
                     for term in terms]
        for term in terms:
            print(tokenizer.toString(
                      list(itertools.takewhile(lambda x: x != data.EOS_token,
                                               term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses),
                       reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [embedding.encode_token(
                           serapi_instance.get_stem(datum.tactic))
                       for datum in dataset]
            stripped_data = [strip_scraped_output(scraped)
                             for scraped in dataset]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor
                in features.word_feature_constructors]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values)
                for vec_feature_constructor
                in features.vec_feature_constructors]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[feature(c)
                              for feature in word_feature_functions]
                             for c in stripped_data]
            vec_features = [[feature_val
                             for feature in vec_features_functions
                             for feature_val in feature(c)]
                            for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features,
                                                   vec_features, answers):
                print(",".join(list(map(str, word_feat)) +
                               list(map(str, vec_feat)) +
                               [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(json.dumps(
                    {"relevant_lemmas": point.relevant_lemmas,
                     "prev_tactics": point.prev_tactics,
                     "context": {"fg_goals": [{"hypotheses": point.hypotheses,
                                               "goal": point.goal}],
                                 "bg_goals": [],
                                 "shelved_goals": [],
                                 "given_up_goals": []},
                     "tactic": point.tactic}))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(json.dumps(
                    {"relevant_lemmas": point.relevant_lemmas,
                     "prev_tactics": point.prev_tactics,
                     "prev_hyps": point.hypotheses,
                     "prev_goal": point.goal,
                     "tactic": point.tactic}))
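# A hedged usage sketch for get_data above. The format names and flags are
# taken from the parser it builds; "scrape.txt" is a placeholder path.
if __name__ == "__main__":
    get_data(["goals", "scrape.txt"])             # one goal per line
    get_data(["hyps+goal+tactic", "scrape.txt"])  # hyps, goal, chosen tactic
    get_data(["tacvector", "scrape.txt", "-v"])   # CSV feature vectors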
def main(arg_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    parser.add_argument("scrape_file")
    parser.add_argument("autoencoder_weights")
    parser.add_argument("save_file")
    parser.add_argument("--num-epochs", dest="num_epochs",
                        default=15, type=int)
    parser.add_argument("--batch-size", dest="batch_size",
                        default=256, type=int)
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--print-every", dest="print_every",
                        default=10, type=int)
    parser.add_argument("--learning-rate", dest="learning_rate",
                        default=.7, type=float)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", dest="epoch_step",
                        default=5, type=int)
    parser.add_argument("--optimizer",
                        choices=list(stdargs.optimizers.keys()), type=str,
                        default=list(stdargs.optimizers.keys())[0])
    parser.add_argument("--num-classifier-layers",
                        dest="num_classifier_layers", default=3, type=int)
    parser.add_argument("--classifier-hidden-size",
                        dest="classifier_hidden_size", default=128, type=int)
    parser.add_argument("--train-autoencoder", dest="train_autoencoder",
                        default=False, const=True, action='store_const')
    args = parser.parse_args(arg_list)

    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    cfilter = autoenc_state['context-filter']
    text_data = get_text_data(args)

    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    dataset = [(tokenizer.toTokenList(goal),
                embedding.encode_token(get_stem(tactic)))
               for prev_tactics, hyps, goal, tactic in text_data]
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}s".format(timeTaken))

    loadedAutoencoder = maybe_cuda(
        EncoderRNN(tokenizer.numTokens(), autoenc_state['hidden-size'],
                   autoenc_state['num-encoder-layers'], args.batch_size))
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(dataset, loadedAutoencoder, args.train_autoencoder,
                        autoenc_state['max-length'],
                        autoenc_state['hidden-size'],
                        args.classifier_hidden_size,
                        embedding.num_tokens(), args.num_classifier_layers,
                        args.batch_size, args.learning_rate, args.gamma,
                        args.epoch_step, args.num_epochs, args.print_every,
                        stdargs.optimizers[args.optimizer])

    for epoch, (decoder_state, autoencoder_state, training_loss) \
            in enumerate(checkpoints):
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'autoenc-training-loss': autoenc_state['training-loss'],
            'autoenc-epoch': autoenc_state['epoch'],
            'tokenizer': tokenizer,
            'tokenizer-name': autoenc_state['tokenizer-name'],
            'optimizer': args.optimizer,
            'autoenc-optimizer': autoenc_state['optimizer'],
            'learning-rate': args.learning_rate,
            'autoenc-learning-rate': autoenc_state['learning-rate'],
            'encoder': autoencoder_state,
            'decoder': decoder_state,
            'num-decoder-layers': args.num_classifier_layers,
            'num-encoder-layers': autoenc_state['num-encoder-layers'],
            'context-filter': cfilter,
            'max-length': autoenc_state['max-length'],
            'encoded-size': autoenc_state['hidden-size'],
            'hidden-size': args.classifier_hidden_size,
            'num-keywords': autoenc_state['num-keywords'],
            'stem-embedding': embedding,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format", choices=["terms", "goals", "hyps+goal",
                                           "hyps+goal+tactic", "tacvector"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer", choices=list(tokenizers.keys()),
                        type=str, default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length", dest="max_length",
                        default=None, type=int)
    parser.add_argument("--lineend", dest="lineend", default=False,
                        const=True, action='store_const')
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)

    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(list(itertools.islice(
                data.read_text_data(arg_values.scrape_file),
                arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term, arg_values.max_length)
                     for term in terms]
        for term in terms:
            print(tokenizer.toString(
                      list(itertools.takewhile(lambda x: x != data.EOS_token,
                                               term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [embedding.encode_token(
                       serapi_instance.get_stem(datum.tactic))
                   for datum in dataset]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor
            in features.word_feature_constructors]
        vec_features_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor
            in features.vec_feature_constructors]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        vec_features = [[feature_val
                         for feature in vec_features_functions
                         for feature_val in feature(c)]
                        for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features,
                                               vec_features, answers):
            print(",".join(list(map(str, word_feat)) +
                           list(map(str, vec_feat)) +
                           [str(tactic)]))