def encode_seq_structural_data(data: RawDataset,
                               context_tokenizer_type:
                               Callable[[List[str], int], Tokenizer],
                               num_keywords: int,
                               num_reserved_tokens: int) -> \
                               Tuple[StructDataset, Tokenizer, SimpleEmbedding]:

    embedding = SimpleEmbedding()

    # Pair every hypothesis and goal with the encoded tactic that was run on
    # it, for relevance-based keyword selection.
    hyps_and_goals = [
        hyp_or_goal
        for hyp_and_goal in [
            zip(hyps + [goal],
                itertools.repeat(embedding.encode_token(tactic)))
            for prev_tactics, hyps, goal, tactic in data]
        for hyp_or_goal in hyp_and_goal]
    context_tokenizer = make_keyword_tokenizer_relevance(
        hyps_and_goals, context_tokenizer_type,
        num_keywords, num_reserved_tokens)

    encodedData = []
    for prev_tactics, hyps, goal, tactic in data:
        stem, rest = serapi_instance.split_tactic(tactic)
        encodedData.append(
            ([context_tokenizer.toTokenList(hyp) for hyp in hyps],
             context_tokenizer.toTokenList(goal),
             (embedding.encode_token(stem),
              [hyp_index(hyps, arg) for arg in get_symbols(rest)])))

    return encodedData, context_tokenizer, embedding
def get_tokens(args: List[str]):
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)

    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))

    embedding = SimpleEmbedding()
    subset = data.RawDataset(random.sample(raw_data, arg_values.num_samples))
    relevance_pairs = [
        (context.focused_goal,
         embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, context, tactic in subset]

    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)

    with (open(arg_values.dest, mode='w') if arg_values.dest != "-"
          else contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
def encode_hyparg_data(data: RawDataset,
                       tokenizer_type: Callable[[List[str], int], Tokenizer],
                       num_keywords: int,
                       num_reserved_tokens: int,
                       max_args: int,
                       max_hyps: int,
                       encoded_length: int,
                       entropy_data_size: int,
                       num_threads: Optional[int] = None) -> \
                       Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    stem_embedding = SimpleEmbedding()
    data_list = list(data)

    # Pick the keyword tokenizer from a bounded random subset of the data.
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context,
          stem_embedding.encode_token(serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
    termEncoder = functools.partial(getNGramTokenbagVector, 1,
                                    tokenizer.numTokens())

    with multiprocessing.Pool(num_threads) as pool:
        _relevant_lemmas, _prev_tactics, hyps, contexts, tactics = \
            zip(*data_list)
        encoded_contexts = pool.imap(
            functools.partial(_encode, tokenizer, termEncoder), contexts)
        encoded_hyps = pool.imap(
            functools.partial(_encode_hyps, tokenizer, termEncoder,
                              max_hyps, encoded_length), hyps)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding,
                              max_args),
            zip(hyps, tactics))
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding
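# The term encoder above is a functools.partial over getNGramTokenbagVector
# with n=1, i.e. a bag-of-tokens count vector over the tokenizer's vocabulary.
# The helper below is only an illustrative sketch of that idea under that
# assumption (the real getNGramTokenbagVector lives elsewhere in the project
# and may differ); token_list is assumed to be tokenizer.toTokenList() output.
def _unigram_tokenbag_sketch(num_tokens: int, token_list: List[int]) \
        -> List[int]:
    counts = [0] * num_tokens          # one counter slot per vocabulary entry
    for token in token_list:
        if 0 <= token < num_tokens:    # ignore out-of-vocabulary indices
            counts[token] += 1
    return counts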
def encode_seq_classify_data(data: RawDataset,
                             tokenizer_type: Callable[[List[str], int],
                                                      Tokenizer],
                             num_keywords: int,
                             num_reserved_tokens: int,
                             save_tokens: Optional[str] = None,
                             load_tokens: Optional[str] = None,
                             num_relevance_samples: int = 1000) \
        -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    embedding = SimpleEmbedding()
    subset = RawDataset(random.sample(data, num_relevance_samples))
    if load_tokens:
        print("Loading tokens from {}".format(load_tokens))
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        tokenizer = make_keyword_tokenizer_relevance(
            [(context, embedding.encode_token(get_stem(tactic)))
             for prev_tactics, hyps, context, tactic in subset],
            tokenizer_type, num_keywords, num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)

    with multiprocessing.Pool(None) as pool:
        result = [(goal, embedding.encode_token(tactic))
                  for goal, tactic in chain.from_iterable(
                      pool.imap(
                          functools.partial(
                              encode_seq_classify_data_worker__, tokenizer),
                          chunks(data, 1024)))]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding
def embed_data(data: RawDataset) -> Tuple[Embedding, StrictEmbeddedDataset]:
    embedding = SimpleEmbedding()
    start = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    dataset = StrictEmbeddedDataset(
        [EmbeddedSample(prev_tactics, hypotheses, goal,
                        embedding.encode_token(get_stem(tactic)))
         for prev_tactics, hypotheses, goal, tactic in data])
    print("{:.2f}s".format(time.time() - start))
    return embedding, dataset
def _encode_data(self, data: RawDataset, args: Namespace) \
        -> Tuple[DatasetType, TokenizerEmbeddingState]:
    preprocessed_data = self._preprocess_data(data, args)
    embedding = SimpleEmbedding()
    embedded_data: EmbeddedDataset
    with multiprocessing.Pool(args.num_threads) as pool:
        stemmed_data = pool.imap(stemmify_data, preprocessed_data,
                                 chunksize=10240)
        lazy_embedded_data = LazyEmbeddedDataset(
            (EmbeddedSample(prev_tactics, hypotheses, goal,
                            embedding.encode_token(tactic))
             for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
        if args.load_tokens:
            print("Loading tokens from {}".format(args.load_tokens))
            with open(args.load_tokens, 'rb') as f:
                tokenizer = pickle.load(f)
                assert isinstance(tokenizer, Tokenizer)
            embedded_data = lazy_embedded_data
        else:
            # Force the embedded data for picking keywords
            forced_embedded_data = StrictEmbeddedDataset(
                list(lazy_embedded_data.data))
            subset = StrictEmbeddedDataset(
                random.sample(forced_embedded_data,
                              args.num_relevance_samples))
            embedded_data = forced_embedded_data
            start = time.time()
            print("Picking tokens...", end="")
            sys.stdout.flush()
            tokenizer = make_keyword_tokenizer_relevance(
                [(goal, next_tactic)
                 for prev_tactics, hypotheses, goal, next_tactic in subset],
                tokenizers[args.tokenizer], args.num_keywords,
                TOKEN_START, args.num_threads)
            del subset
            print("{}s".format(time.time() - start))
        if args.save_tokens:
            print("Saving tokens to {}".format(args.save_tokens))
            assert isinstance(tokenizer, Tokenizer)
            with open(args.save_tokens, 'wb') as f:
                pickle.dump(tokenizer, f)
        if args.print_keywords:
            print("Keywords are {}".format(tokenizer.listTokens()))
        print("Tokenizing...")
        tokenized_data = tokenize_data(tokenizer, embedded_data,
                                       args.num_threads)
        gc.collect()
    return (self._encode_tokenized_data(tokenized_data, args,
                                        tokenizer, embedding),
            TokenizerEmbeddingState(tokenizer, embedding))
def decode_tactic_structure(stem_embedding: SimpleEmbedding,
                            struct: TacticStructure, hyps: List[str]) -> str:
    stem_idx, arg_hyp_idxs = struct
    return " ".join(
        [stem_embedding.decode_token(stem_idx)] +
        [serapi_instance.get_first_var_in_hyp(hyps[hyp_idx - TOKEN_START])
         for hyp_idx in arg_hyp_idxs])
def decode_tactic_structure(term_tokenizer: Tokenizer,
                            stem_embedding: SimpleEmbedding,
                            struct: TacticStructure, hyps: List[str]) -> str:
    def get_var(idx: int) -> str:
        if idx == 0:
            return "UNKNOWN"
        else:
            return serapi_instance.get_first_var_in_hyp(hyps[idx - 1])

    stem_idx, arg_hyp_idxs = struct
    return " ".join(
        [stem_embedding.decode_token(stem_idx)] +
        [get_var(hyp_idx)
         for hyp_idx in takewhile(lambda idx: idx > 0, arg_hyp_idxs)])
def encode_tactic_structure(stem_embedding: SimpleEmbedding, max_args: int,
                            hyps_and_tactic: Tuple[List[str], str]) \
        -> TacticStructure:
    hyps, tactic = hyps_and_tactic
    tactic_stem, args_str = serapi_instance.split_tactic(tactic)
    arg_strs = args_str.split()[:max_args]

    stem_idx = stem_embedding.encode_token(tactic_stem)
    arg_idxs = [get_arg_idx(hyps, arg.strip()) for arg in arg_strs]
    if len(arg_idxs) < max_args:
        arg_idxs += [EOS_token] * (max_args - len(arg_idxs))
    # If any arguments aren't hypotheses, ignore the arguments
    if not all(arg_idxs):
        arg_idxs = [EOS_token] * max_args

    return TacticStructure(stem_idx=stem_idx, hyp_idxs=arg_idxs)
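# A minimal, self-contained sketch of the encoding above, using hypothetical
# stand-ins: a plain dict in place of SimpleEmbedding, 1-based hypothesis
# indices, and 0 for both "not a hypothesis" and padding.  It only shows the
# shape of a TacticStructure (stem index plus a fixed-length list of
# hypothesis-argument indices); it is not the project's real implementation.
def _encode_tactic_sketch(stems, hyp_names, tactic: str, max_args: int):
    # stems: mutable dict mapping tactic stems to indices;
    # hyp_names: names of the hypotheses in the current context.
    stem, *args = tactic.rstrip(".").split()
    stem_idx = stems.setdefault(stem, len(stems))
    arg_idxs = [hyp_names.index(arg) + 1 if arg in hyp_names else 0
                for arg in args[:max_args]]
    if not all(arg_idxs):      # some argument is not a hypothesis: drop all
        arg_idxs = []
    arg_idxs += [0] * (max_args - len(arg_idxs))   # pad to max_args slots
    return stem_idx, arg_idxs

# For example, _encode_tactic_sketch({}, ["H0", "H1"], "apply H0.", 2)
# returns (0, [1, 0]) under this sketch.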
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format", choices=["terms", "goals", "hyps+goal",
                                           "hyps+goal+tactic", "tacvector",
                                           "scrapefile-rd", "scrapefile"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()), type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples", default=None,
                        type=int)
    parser.add_argument("--num-keywords", dest="num_keywords", default=100,
                        type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons", dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length", dest="max_length", default=30,
                        type=int)
    parser.add_argument("--lineend", dest="lineend", default=False,
                        const=True, action='store_const')
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions", action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args", action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)

    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(list(itertools.islice(
                data.read_text_data(arg_values.scrape_file),
                arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term, arg_values.max_length)
                     for term in terms]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses),
                       reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [embedding.encode_token(
                serapi_instance.get_stem(datum.tactic))
                       for datum in dataset]
            stripped_data = [strip_scraped_output(scraped)
                             for scraped in dataset]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor
                in features.word_feature_constructors]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values)
                for vec_feature_constructor
                in features.vec_feature_constructors]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[feature(c) for feature in word_feature_functions]
                             for c in stripped_data]
            vec_features = [[feature_val
                             for feature in vec_features_functions
                             for feature_val in feature(c)]
                            for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features,
                                                   vec_features, answers):
                print(",".join(list(map(str, word_feat)) +
                               list(map(str, vec_feat)) +
                               [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(json.dumps(
                    {"relevant_lemmas": point.relevant_lemmas,
                     "prev_tactics": point.prev_tactics,
                     "context": {"fg_goals": [{"hypotheses": point.hypotheses,
                                               "goal": point.goal}],
                                 "bg_goals": [],
                                 "shelved_goals": [],
                                 "given_up_goals": []},
                     "tactic": point.tactic}))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(json.dumps(
                    {"relevant_lemmas": point.relevant_lemmas,
                     "prev_tactics": point.prev_tactics,
                     "prev_hyps": point.hypotheses,
                     "prev_goal": point.goal,
                     "tactic": point.tactic}))
def main(arg_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    parser.add_argument("scrape_file")
    parser.add_argument("autoencoder_weights")
    parser.add_argument("save_file")
    parser.add_argument("--num-epochs", dest="num_epochs", default=15,
                        type=int)
    parser.add_argument("--batch-size", dest="batch_size", default=256,
                        type=int)
    parser.add_argument("--max-tuples", dest="max_tuples", default=None,
                        type=int)
    parser.add_argument("--print-every", dest="print_every", default=10,
                        type=int)
    parser.add_argument("--learning-rate", dest="learning_rate", default=.7,
                        type=float)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", dest="epoch_step", default=5,
                        type=int)
    parser.add_argument("--optimizer",
                        choices=list(stdargs.optimizers.keys()), type=str,
                        default=list(stdargs.optimizers.keys())[0])
    parser.add_argument("--num-classifier-layers",
                        dest="num_classifier_layers", default=3, type=int)
    parser.add_argument("--classifier-hidden-size",
                        dest="classifier_hidden_size", default=128, type=int)
    parser.add_argument("--train-autoencoder", dest="train_autoencoder",
                        default=False, const=True, action='store_const')
    args = parser.parse_args(arg_list)

    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    cfilter = autoenc_state['context-filter']

    text_data = get_text_data(args)
    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    dataset = [(tokenizer.toTokenList(goal),
                embedding.encode_token(get_stem(tactic)))
               for prev_tactics, hyps, goal, tactic in text_data]
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}".format(timeTaken))

    loadedAutoencoder = maybe_cuda(
        EncoderRNN(tokenizer.numTokens(), autoenc_state['hidden-size'],
                   autoenc_state['num-encoder-layers'], args.batch_size))
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(dataset, loadedAutoencoder, args.train_autoencoder,
                        autoenc_state['max-length'],
                        autoenc_state['hidden-size'],
                        args.classifier_hidden_size,
                        embedding.num_tokens(), args.num_classifier_layers,
                        args.batch_size, args.learning_rate, args.gamma,
                        args.epoch_step, args.num_epochs, args.print_every,
                        stdargs.optimizers[args.optimizer])

    for epoch, (decoder_state, autoencoder_state, training_loss) \
            in enumerate(checkpoints):
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {'epoch': epoch,
                 'training-loss': training_loss,
                 'autoenc-training-loss': autoenc_state['training-loss'],
                 'autoenc-epoch': autoenc_state['epoch'],
                 'tokenizer': tokenizer,
                 'tokenizer-name': autoenc_state['tokenizer-name'],
                 'optimizer': args.optimizer,
                 'autoenc-optimizer': autoenc_state['optimizer'],
                 'learning-rate': args.learning_rate,
                 'autoenc-learning-rate': autoenc_state['learning-rate'],
                 'encoder': autoencoder_state,
                 'decoder': decoder_state,
                 'num-decoder-layers': args.num_classifier_layers,
                 'num-encoder-layers': autoenc_state['num-encoder-layers'],
                 'context-filter': cfilter,
                 'max-length': autoenc_state['max-length'],
                 'encoded-size': autoenc_state['hidden-size'],
                 'hidden-size': args.classifier_hidden_size,
                 'num-keywords': autoenc_state['num-keywords'],
                 'stem-embedding': embedding}
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
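# A hedged sketch of reading back a checkpoint written by main() above: the
# filename is a placeholder, and the keys are the ones stored in its `state`
# dict.  Loading requires the project's classes to be importable, since the
# tokenizer and stem embedding are pickled objects.
def _example_load_classifier_checkpoint(path: str = "classifier.dat") -> None:
    checkpoint = torch.load(path)
    print("epoch {}, training loss {:.4f}".format(
        checkpoint['epoch'], checkpoint['training-loss']))
    tokenizer = checkpoint['tokenizer']
    stem_embedding = checkpoint['stem-embedding']
    print("{} tokenizer tokens, {} tactic stems".format(
        tokenizer.numTokens(), stem_embedding.num_tokens()))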
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format", choices=["terms", "goals", "hyps+goal",
                                           "hyps+goal+tactic", "tacvector"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()), type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples", default=None,
                        type=int)
    parser.add_argument("--num-keywords", dest="num_keywords", default=100,
                        type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length", dest="max_length", default=None,
                        type=int)
    parser.add_argument("--lineend", dest="lineend", default=False,
                        const=True, action='store_const')
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)

    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(list(itertools.islice(
                data.read_text_data(arg_values.scrape_file),
                arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term, arg_values.max_length)
                     for term in terms]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [embedding.encode_token(
            serapi_instance.get_stem(datum.tactic))
                   for datum in dataset]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor
            in features.word_feature_constructors]
        vec_features_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in features.vec_feature_constructors]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        vec_features = [[feature_val
                         for feature in vec_features_functions
                         for feature_val in feature(c)]
                        for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(list(map(str, word_feat)) +
                           list(map(str, vec_feat)) +
                           [str(tactic)]))