Example #1
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[ApplyDataset, Tokenizer]:
        preprocessed_data = list(self._preprocess_data(data, arg_values))
        isRelevants = [
            self._determine_relevance(inter) for inter in preprocessed_data
        ]
        embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        tokenized_hyp_lists = tokenize_hyps(RawDataset(preprocessed_data),
                                            arg_values, tokenizer)
        # Encode hypothesis lists and goals in parallel as n-gram token-bag vectors.
        with multiprocessing.Pool(None) as pool:
            encoded_hyp_lists = list(
                pool.imap(
                    functools.partial(encodeHypList, arg_values.num_grams,
                                      tokenizer.numTokens()),
                    tokenized_hyp_lists))
            encoded_goals = list(
                pool.imap(
                    functools.partial(getNGramTokenbagVector,
                                      arg_values.num_grams,
                                      tokenizer.numTokens()), tokenized_goals))
        # Pair each encoded hypothesis with its goal and its relevance label.
        samples = ApplyDataset([
            HypothesisRelevanceSample(encoded_hyp, encoded_goal, isRelevant)
            for encoded_goal, encoded_hyps_list, relevanceList in zip(
                encoded_goals, encoded_hyp_lists, isRelevants) for encoded_hyp,
            isRelevant in zip(encoded_hyps_list, relevanceList)
        ])

        return samples, tokenizer
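
Example #1 leans on `getNGramTokenbagVector` and `encodeHypList` from the surrounding codebase, which are not shown here. Below is a minimal sketch of the bag-of-n-grams encoding they appear to implement, assuming token ids lie in [0, num_tokens) and that n-grams are counted into a fixed-size vector; the signatures match the calls above, but the bodies are an assumption, not the project's implementation.

from typing import List

def getNGramTokenbagVector(num_grams: int, num_tokens: int,
                           token_list: List[int]) -> List[int]:
    # Sketch: count every n-gram of token ids into a fixed-size vector of
    # num_tokens ** num_grams entries (a bag-of-n-grams encoding).
    vector = [0] * (num_tokens ** num_grams)
    for i in range(len(token_list) - num_grams + 1):
        index = 0
        for token in token_list[i:i + num_grams]:
            index = index * num_tokens + token
        vector[index] += 1
    return vector

def encodeHypList(num_grams: int, num_tokens: int,
                  tokenized_hyps: List[List[int]]) -> List[List[int]]:
    # Sketch: apply the same encoding to every hypothesis in a list.
    return [getNGramTokenbagVector(num_grams, num_tokens, hyp)
            for hyp in tokenized_hyps]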
Example #2
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[HypStemDataset, Tuple[Tokenizer, Embedding]]:
        preprocessed_data = list(self._preprocess_data(data, arg_values))
        embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        print("Encoding hyps...")
        # Pick, in parallel, the most relevant hypothesis and its relevance
        # score for each proof state.
        with multiprocessing.Pool(arg_values.num_threads) as pool:
            relevant_hyps, relevances = \
                zip(*list(pool.imap(most_relevant_hyp, preprocessed_data)))
        # Encode the chosen hypotheses and the goals as n-gram token-bag vectors.
        encoded_relevant_hyps = [
            getNGramTokenbagVector(arg_values.num_grams, tokenizer.numTokens(),
                                   tokenizer.toTokenList(hyp_term))
            for hyp_term in relevant_hyps
        ]
        print("Encoding goals...")
        encoded_goals = [
            getNGramTokenbagVector(arg_values.num_grams, tokenizer.numTokens(),
                                   term) for term in tokenized_goals
        ]
        print("Done")
        return HypStemDataset([
            HypStemSample(hyp, relevance, goal, inter.tactic)
            for hyp, relevance, goal, inter in zip(
                encoded_relevant_hyps, relevances, encoded_goals, embedded_data)
        ]), (tokenizer, embedding)
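
These examples repeatedly use the same parallelization pattern: `functools.partial` binds the fixed arguments of a module-level worker, and `multiprocessing.Pool.imap` maps the resulting one-argument callable over the data. Here is a self-contained sketch of that pattern with a placeholder worker (`encode_item` and the sample data are illustrative, not project code).

import functools
import multiprocessing
from typing import List

def encode_item(num_grams: int, num_tokens: int, item: List[int]) -> int:
    # Placeholder worker: the fixed parameters come first so functools.partial
    # can bind them, leaving only the per-item argument for imap.
    return sum(item) % (num_tokens ** num_grams)

if __name__ == "__main__":
    items = [[1, 2, 3], [4, 5], [6]]
    with multiprocessing.Pool(None) as pool:
        encoded = list(pool.imap(functools.partial(encode_item, 2, 10), items))
    print(encoded)  # [6, 9, 6]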
Example #3
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[CopyArgDataset, Tuple[Tokenizer, Embedding,
                                       List[WordFeature], List[VecFeature]]]:
        for datum in data:
            assert not re.match(r"induction\s+\d+\.", datum.tactic)
        stripped_data = [strip_scraped_output(dat) for dat in data]
        self._word_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in word_feature_constructors
        ]
        self._vec_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in vec_feature_constructors
        ]
        embedding, embedded_data = embed_data(data)
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        # Compute argument indices and build the samples in parallel.
        with multiprocessing.Pool(arg_values.num_threads) as pool:
            arg_idxs = pool.imap(
                functools.partial(get_arg_idx, arg_values.max_length), data)

            start = time.time()
            print("Creating dataset...", end="")
            sys.stdout.flush()
            result_data = CopyArgDataset(
                list(
                    pool.imap(
                        functools.partial(mkCopySample, arg_values.max_length,
                                          self._word_feature_functions,
                                          self._vec_feature_functions),
                        zip(embedded_data, tokenized_goals, arg_idxs))))
            print("{:.2f}s".format(time.time() - start))
        return result_data, (tokenizer, embedding,
                             self._word_feature_functions,
                             self._vec_feature_functions)
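
Examples #3 and #4 build their feature lists by calling each entry of `word_feature_constructors` / `vec_feature_constructors` with the stripped data and the argument namespace, and later apply the resulting objects to a `TacticContext`. A minimal sketch of that assumed interface follows; the feature class is illustrative, not one of the project's features, and only the `TacticContext` field names are taken from the code above.

from typing import List, NamedTuple

class TacticContext(NamedTuple):
    # Mirrors the fields unpacked from embedded_data in Example #4.
    prev_tactics: List[str]
    hypotheses: List[str]
    goal: str

class ExampleVecFeature:
    def __init__(self, stripped_data: List[TacticContext], arg_values) -> None:
        # A real feature would gather statistics from the training data here.
        hyp_counts = [len(c.hypotheses) for c in stripped_data]
        self.max_hyps = max(hyp_counts, default=0) or 1

    def __call__(self, context: TacticContext) -> List[float]:
        # One normalized component: how many hypotheses the context carries.
        return [len(context.hypotheses) / self.max_hyps]

vec_feature_constructors = [ExampleVecFeature]
# Built the same way the examples do it:
# self._vec_feature_functions = [ctor(stripped_data, arg_values)
#                                for ctor in vec_feature_constructors]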
Example #4
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                           List[VecFeature], List[WordFeature]]]:
        preprocessed_data = list(self._preprocess_data(data, arg_values))
        stripped_data = [
            strip_scraped_output(dat) for dat in preprocessed_data
        ]
        self._vec_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in vec_feature_constructors
        ]
        self._word_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in word_feature_constructors
        ]
        embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        result_data = EncFeaturesDataset([
            EncFeaturesSample(
                self._get_vec_features(
                    TacticContext(prev_tactics, hypotheses, goal)),
                self._get_word_features(
                    TacticContext(prev_tactics, hypotheses, goal)),
                normalizeSentenceLength(tokenized_goal, arg_values.max_length),
                tactic)
            for (prev_tactics, hypotheses, goal,
                 tactic), tokenized_goal in zip(embedded_data, tokenized_goals)
        ])
        return result_data, (tokenizer, embedding, self._vec_feature_functions,
                             self._word_feature_functions)
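
The method above pads or truncates every tokenized goal with `normalizeSentenceLength` before building samples. A sketch of the assumed behavior follows; the padding token value and the choice to truncate from the right are assumptions, not the project's actual implementation.

from typing import List

def normalizeSentenceLength(sentence: List[int], max_length: int,
                            pad_token: int = 0) -> List[int]:
    # Truncate to max_length, or right-pad with pad_token, so every encoded
    # goal has the same fixed length.
    if len(sentence) > max_length:
        return sentence[:max_length]
    return sentence + [pad_token] * (max_length - len(sentence))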
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[HypFeaturesDataset, Tuple[Tokenizer, Embedding,
                                           List[WordFeature], List[VecFeature]]]:
        preprocessed_data = list(self._preprocess_data(data, arg_values))
        start = time.time()
        print("Stripping...", end="")
        sys.stdout.flush()
        stripped_data = [
            strip_scraped_output(dat) for dat in preprocessed_data
        ]
        print("{:.2f}s".format(time.time() - start))
        self._word_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in word_feature_constructors
        ]
        self._vec_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in vec_feature_constructors
        ]
        embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        with multiprocessing.Pool(arg_values.num_threads) as pool:
            start = time.time()
            print("Getting closest hyps...", end="")
            sys.stdout.flush()
            tokenized_hyps = list(
                pool.imap(
                    functools.partial(get_closest_hyp_type, tokenizer,
                                      arg_values.max_length),
                    preprocessed_data))
            print("{:.2f}s".format(time.time() - start))
            start = time.time()
            print("Creating dataset...", end="")
            sys.stdout.flush()
            result_data = HypFeaturesDataset(
                list(
                    pool.imap(
                        functools.partial(mkHFSample, arg_values.max_length,
                                          self._word_feature_functions,
                                          self._vec_feature_functions),
                        zip(embedded_data, tokenized_goals, tokenized_hyps))))
            print("{:.2f}s".format(time.time() - start))
        return result_data, (tokenizer, embedding,
                             self._word_feature_functions,
                             self._vec_feature_functions)