def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[ApplyDataset, Tokenizer]:
    """Encode scraped data into per-hypothesis relevance samples.

    Each interaction contributes one sample per hypothesis, pairing the
    n-gram-encoded hypothesis and goal with a relevance flag from
    ``self._determine_relevance``.  Returns the dataset and the fitted
    tokenizer (the term embedding is intermediate only here, unlike
    sibling predictors that also return it).
    """
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    # One relevance list per interaction, aligned with its hypothesis list.
    isRelevants = [self._determine_relevance(inter)
                   for inter in preprocessed_data]
    # Only the embedded data is consumed; the embedding itself was an
    # unused local in the original, so bind it to `_`.
    _, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    tokenized_hyp_lists = tokenize_hyps(RawDataset(preprocessed_data),
                                        arg_values, tokenizer)
    # NOTE(review): Pool(None) defaults to os.cpu_count() workers; sibling
    # predictors use arg_values.num_threads -- confirm which is intended.
    with multiprocessing.Pool(None) as pool:
        encoded_hyp_lists = list(
            pool.imap(functools.partial(encodeHypList,
                                        arg_values.num_grams,
                                        tokenizer.numTokens()),
                      tokenized_hyp_lists))
        encoded_goals = list(
            pool.imap(functools.partial(getNGramTokenbagVector,
                                        arg_values.num_grams,
                                        tokenizer.numTokens()),
                      tokenized_goals))
    # Flatten: pair every hypothesis encoding with its goal's encoding
    # and the matching relevance flag.
    samples = ApplyDataset([
        HypothesisRelevanceSample(encoded_hyp, encoded_goal, isRelevant)
        for encoded_goal, encoded_hyps_list, relevanceList in zip(
            encoded_goals, encoded_hyp_lists, isRelevants)
        for encoded_hyp, isRelevant in zip(encoded_hyps_list,
                                           relevanceList)])
    return samples, tokenizer
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[HypStemDataset, Tuple[Tokenizer, Embedding]]:
    """Encode scraped data into (hypothesis, relevance, goal, stem) samples.

    Picks the most relevant hypothesis for each interaction in parallel,
    bag-of-ngrams encodes that hypothesis and the goal, and pairs them
    with the interaction's tactic.  Returns the dataset along with the
    tokenizer and tactic embedding needed to encode future inputs.
    """
    preprocessed = list(self._preprocess_data(data, arg_values))
    embedding, embedded_data = embed_data(RawDataset(preprocessed))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    # Hoist the loop-invariant encoder parameters.
    num_grams = arg_values.num_grams
    num_tokens = tokenizer.numTokens()
    print("Encoding hyps...")
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        hyp_relevance_pairs = list(pool.imap(most_relevant_hyp,
                                             preprocessed))
    relevant_hyps, relevances = zip(*hyp_relevance_pairs)
    # Encode the chosen hypothesis term of each interaction.
    encoded_relevant_hyps = []
    for hyp_term in relevant_hyps:
        token_list = tokenizer.toTokenList(hyp_term)
        encoded_relevant_hyps.append(
            getNGramTokenbagVector(num_grams, num_tokens, token_list))
    print("Encoding goals...")
    encoded_goals = [getNGramTokenbagVector(num_grams, num_tokens, term)
                     for term in tokenized_goals]
    print("Done")
    samples = [HypStemSample(hyp, relevance, goal, inter.tactic)
               for hyp, relevance, goal, inter
               in zip(encoded_relevant_hyps, relevances,
                      encoded_goals, embedded_data)]
    return HypStemDataset(samples), (tokenizer, embedding)
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[CopyArgDataset, Tuple[Tokenizer, Embedding,
                                   List[WordFeature], List[VecFeature]]]:
    """Encode scraped data into copy-argument samples.

    Fits the word/vec feature extractors on the stripped data, embeds
    tactics and tokenizes goals, then builds samples in parallel via
    ``mkCopySample``.  Returns the dataset plus the tokenizer, embedding,
    and fitted feature extractors.
    """
    for datum in data:
        # Fixed: raw string -- "\s"/"\d" in a non-raw literal are invalid
        # escape sequences (deprecated in modern CPython); the regex
        # itself is unchanged.
        assert not re.match(r"induction\s+\d+\.", datum.tactic)
    stripped_data = [strip_scraped_output(dat) for dat in data]
    # Fit feature extractors; stored on self and also returned to callers.
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values) for  # type: ignore
        feature_constructor in word_feature_constructors]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values) for  # type: ignore
        feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(data)
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        # Lazy imap: consumed by the zip below inside the same pool scope.
        arg_idxs = pool.imap(functools.partial(get_arg_idx,
                                               arg_values.max_length),
                             data)
        start = time.time()
        print("Creating dataset...", end="")
        sys.stdout.flush()
        result_data = CopyArgDataset(list(
            pool.imap(functools.partial(mkCopySample,
                                        arg_values.max_length,
                                        self._word_feature_functions,
                                        self._vec_feature_functions),
                      zip(embedded_data, tokenized_goals, arg_idxs))))
        print("{:.2f}s".format(time.time() - start))
    return result_data, (tokenizer, embedding,
                         self._word_feature_functions,
                         self._vec_feature_functions)
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                       List[VecFeature], List[WordFeature]]]:
    """Encode scraped data into feature-vector samples.

    Fits the vec/word feature extractors on the stripped data, then for
    each interaction pairs its extracted features with the
    length-normalized tokenized goal and its tactic.  Returns the
    dataset plus the tokenizer, embedding, and fitted extractors.
    """
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    stripped_data = [strip_scraped_output(dat)
                     for dat in preprocessed_data]
    # Fit feature extractors; stored on self and also returned to callers.
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values) for  # type: ignore
        feature_constructor in vec_feature_constructors]
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values) for  # type: ignore
        feature_constructor in word_feature_constructors]
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    samples = []
    for (prev_tactics, hypotheses, goal, tactic), tokenized_goal in \
            zip(embedded_data, tokenized_goals):
        # Fixed: build the context once per sample; the original
        # constructed an identical TacticContext twice per sample.
        context = TacticContext(prev_tactics, hypotheses, goal)
        samples.append(EncFeaturesSample(
            self._get_vec_features(context),
            self._get_word_features(context),
            normalizeSentenceLength(tokenized_goal,
                                    arg_values.max_length),
            tactic))
    result_data = EncFeaturesDataset(samples)
    return result_data, (tokenizer, embedding,
                         self._vec_feature_functions,
                         self._word_feature_functions)
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[HypFeaturesDataset, Tuple[Tokenizer, Embedding,
                                       List[WordFeature], List[VecFeature]]]:
    """Encode scraped data into goal/closest-hypothesis feature samples.

    Returns the encoded dataset plus everything needed to encode new
    inputs later: the tokenizer, the tactic embedding, and the fitted
    word/vec feature extractors.
    """
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    start = time.time()
    print("Stripping...", end="")
    sys.stdout.flush()
    stripped_data = [strip_scraped_output(dat)
                     for dat in preprocessed_data]
    print("{:.2f}s".format(time.time() - start))
    # Fit the feature extractors on the stripped data; stored on self
    # and also returned to the caller.
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values) for  # type: ignore
        feature_constructor in word_feature_constructors]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values) for  # type: ignore
        feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        start = time.time()
        print("Getting closest hyps...", end="")
        sys.stdout.flush()
        # Per interaction, tokenize the hypothesis type selected by
        # get_closest_hyp_type (presumably closest to the goal by some
        # distance metric -- helper defined elsewhere; confirm).
        tokenized_hyps = list(
            pool.imap(functools.partial(get_closest_hyp_type,
                                        tokenizer,
                                        arg_values.max_length),
                      preprocessed_data))
        print("{:.2f}s".format(time.time() - start))
        start = time.time()
        print("Creating dataset...", end="")
        sys.stdout.flush()
        result_data = HypFeaturesDataset(list(
            pool.imap(functools.partial(mkHFSample,
                                        arg_values.max_length,
                                        self._word_feature_functions,
                                        self._vec_feature_functions),
                      zip(embedded_data, tokenized_goals,
                          tokenized_hyps))))
        print("{:.2f}s".format(time.time() - start))
    return result_data, (tokenizer, embedding,
                         self._word_feature_functions,
                         self._vec_feature_functions)