Exemplo n.º 1
0
 def _predictDistributions(
         self, in_datas: List[TacticContext]) -> torch.FloatTensor:
     """Compute the model output for a batch of tactic contexts.

     For every context, the goal and the type of the hypothesis chosen by
     ``get_closest_hyp`` are tokenized and passed through
     ``normalizeSentenceLength`` with ``training_args.max_length``.  Word
     and vector features come from ``_get_word_features`` and
     ``_get_vec_features``.  The four batches are wrapped in tensors and
     handed to ``self._model``.

     Returns whatever ``self._model`` produces for the batch.
     """
     assert self._tokenizer
     assert self._embedding
     assert self.training_args
     max_length = self.training_args.max_length

     # One encoded goal per context.
     encoded_goals = []
     for _, _, _, goal in in_datas:
         goal_tokens = self._tokenizer.toTokenList(goal)
         encoded_goals.append(
             normalizeSentenceLength(goal_tokens, max_length))

     # Select one hypothesis per context; only its type is encoded.
     chosen_hyps = [
         get_closest_hyp(hypotheses, goal, max_length)
         for _, _, hypotheses, goal in in_datas
     ]
     chosen_hyp_types = list(
         map(serapi_instance.get_hyp_type, chosen_hyps))
     encoded_hyps = []
     for hyp_type in chosen_hyp_types:
         hyp_tokens = self._tokenizer.toTokenList(hyp_type)
         encoded_hyps.append(
             normalizeSentenceLength(hyp_tokens, max_length))

     word_features = list(map(self._get_word_features, in_datas))
     vec_features = list(map(self._get_vec_features, in_datas))

     # Model argument order: goals, hypotheses, vector features, word
     # features.
     return self._model(LongTensor(encoded_goals),
                        LongTensor(encoded_hyps),
                        FloatTensor(vec_features),
                        LongTensor(word_features))
def mkHFSample(max_length : int,
               word_feature_functions : List[WordFeature],
               vec_feature_functions : List[VecFeature],
               zipped : Tuple[EmbeddedSample, List[int], List[int]]) \
    -> HypFeaturesSample:
    """Build a ``HypFeaturesSample`` from one zipped training triple.

    ``zipped`` holds an embedded sample, the goal sequence and the
    best-hypothesis sequence.  The embedded sample is unpacked into
    previous tactics, hypotheses, goal string and tactic; the first three
    form the ``TacticContext`` that every feature function is applied to.
    Vector feature outputs are flattened into a single list, and both
    sequences go through ``normalizeSentenceLength`` with ``max_length``.
    """
    embedded_sample, goal_tokens, hyp_tokens = zipped
    prev_tactic_list, hypotheses, goal_str, tactic = embedded_sample
    tac_context = TacticContext(prev_tactic_list, hypotheses, goal_str)

    word_features = [
        word_feature(tac_context) for word_feature in word_feature_functions
    ]
    # Each vector feature yields several values; concatenate them in order.
    vec_features = []
    for vec_feature in vec_feature_functions:
        vec_features.extend(vec_feature(tac_context))

    padded_goal = normalizeSentenceLength(goal_tokens, max_length)
    padded_hyp = normalizeSentenceLength(hyp_tokens, max_length)
    return HypFeaturesSample(word_features, vec_features, padded_goal,
                             padded_hyp, tactic)
Exemplo n.º 3
0
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                        List[VecFeature], List[WordFeature]]]:
     """Turn a raw dataset into an ``EncFeaturesDataset`` plus encoder state.

     The data is preprocessed, the feature functions are constructed from
     the stripped samples and stored on ``self``, tactics are embedded and
     goals tokenized.  Each resulting sample carries its vector features,
     word features, the goal passed through ``normalizeSentenceLength``
     with ``arg_values.max_length``, and the tactic from the embedded data.

     Returns the dataset together with a tuple of
     ``(tokenizer, embedding, vec_feature_functions, word_feature_functions)``.
     """
     preprocessed_data = list(self._preprocess_data(data, arg_values))
     stripped_data = list(map(strip_scraped_output, preprocessed_data))

     # Feature functions are built from the stripped samples before being
     # stored on the instance, because _get_*_features below rely on them.
     vec_functions = []
     for make_vec_feature in vec_feature_constructors:
         vec_functions.append(
             make_vec_feature(stripped_data, arg_values))  # type: ignore
     self._vec_feature_functions = vec_functions

     word_functions = []
     for make_word_feature in word_feature_constructors:
         word_functions.append(
             make_word_feature(stripped_data, arg_values))  # type: ignore
     self._word_feature_functions = word_functions

     embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
     tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)

     samples = []
     for embedded_sample, tokenized_goal in zip(embedded_data,
                                                tokenized_goals):
         prev_tactics, hypotheses, goal, tactic = embedded_sample
         vec_features = self._get_vec_features(
             TacticContext(prev_tactics, hypotheses, goal))
         word_features = self._get_word_features(
             TacticContext(prev_tactics, hypotheses, goal))
         padded_goal = normalizeSentenceLength(tokenized_goal,
                                               arg_values.max_length)
         samples.append(
             EncFeaturesSample(vec_features, word_features, padded_goal,
                               tactic))

     encoder_state = (tokenizer, embedding, self._vec_feature_functions,
                      self._word_feature_functions)
     return EncFeaturesDataset(samples), encoder_state
Exemplo n.º 4
0
 def _predictCompositeDistributionFromStemDistribution(
         self, beam_width : int, stem_distribution : torch.FloatTensor,
         in_datas : List[TacticContext]) \
     -> Tuple[torch.FloatTensor, torch.LongTensor]:
     """Combine top stem scores with per-stem argument distributions.

     The ``min(beam_width, number of stems)`` best stems are taken from
     every row of ``stem_distribution``.  Each goal is repeated once per
     selected stem, ``self._model.find_arg_rnn`` is run on the
     (goal, stem) pairs, and the stem score is added to every entry of the
     matching conditional distribution.  The addition suggests log-space
     scores; confirm against the model.

     Returns a tensor viewed as ``(batch_size, stem_width * num_probs)``
     and the indices of the selected stems.
     """
     assert self.training_args
     assert self._tokenizer
     max_length = self.training_args.max_length

     encoded_goals = [
         normalizeSentenceLength(self._tokenizer.toTokenList(goal),
                                 max_length)
         for _, _, _, goal in in_datas
     ]
     goal_tensor = torch.LongTensor(encoded_goals)

     batch_size = stem_distribution.size()[0]
     num_stem_poss = stem_distribution.size()[1]
     stem_width = min(beam_width, num_stem_poss)
     flat_size = batch_size * stem_width

     top_probs, top_stems = stem_distribution.topk(stem_width)
     flat_stems = top_stems.view(flat_size)
     flat_probs = top_probs.view(flat_size)

     # Tile every goal stem_width times so row i lines up with flat_stems[i].
     tiled_goals = goal_tensor.view(batch_size, 1, max_length)
     tiled_goals = tiled_goals.expand(-1, stem_width, -1).contiguous()
     tiled_goals = tiled_goals.view(flat_size, max_length)

     # The first column of the argument model's output is dropped.
     arg_distributions = \
         self._model.find_arg_rnn(tiled_goals, flat_stems)[:, 1:]
     num_probs = arg_distributions.size()[1]

     # Transposing lets the per-row stem score broadcast over the columns.
     combined = (arg_distributions.t() + flat_probs.view(-1)).t()
     composite = combined.contiguous().view(batch_size,
                                            stem_width * num_probs)
     return composite, top_stems
Exemplo n.º 5
0
 def predictDistribution(self, in_data : TacticContext) \
     -> torch.FloatTensor:
     """Run the encoder and then the decoder on a single context's goal.

     The goal is tokenized, passed through ``normalizeSentenceLength`` with
     ``self.max_length`` and viewed as a one-row tensor before encoding.
     """
     goal_tokens = self.tokenizer.toTokenList(in_data.goal)
     padded_goal = normalizeSentenceLength(goal_tokens, self.max_length)
     goal_tensor = LongTensor(padded_goal).view(1, -1)
     encoded_goal = self.encoder.run(goal_tensor)
     return self.decoder.run(encoded_goal)
Exemplo n.º 6
0
 def _data_tensors(self, encoded_data : PECDataset, arg_values : Namespace) \
     -> List[torch.Tensor]:
     """Convert an encoded dataset into ``[goals, prevs, nexts]`` tensors.

     Every sample is a ``(prev, goal, next)`` triple.  Goals go through
     ``normalizeSentenceLength`` with ``arg_values.max_length``; the other
     two columns are converted as they are.
     """
     prev_column, goal_column, next_column = zip(*encoded_data)
     padded_goals = []
     for goal in goal_column:
         padded_goals.append(
             normalizeSentenceLength(goal, arg_values.max_length))
     return [
         torch.LongTensor(padded_goals),
         torch.LongTensor(prev_column),
         torch.LongTensor(next_column),
     ]
Exemplo n.º 7
0
 def _predictDistributions(self, in_datas : List[TacticContext]) \
     -> torch.FloatTensor:
     """Run the model on the goals and ``_get_prev`` values of a batch.

     Goals are tokenized, passed through ``normalizeSentenceLength`` with
     ``training_args.max_length`` and stacked into a tensor with one row
     per context.  The values from ``self._get_prev`` are stacked the same
     way.  Returns the result of ``self._model.run`` on both tensors.
     """
     assert self.training_args
     batch_size = len(in_datas)

     token_lists = []
     for in_data in in_datas:
         token_lists.append(self._tokenizer.toTokenList(in_data.goal))
     padded_goals = []
     for token_list in token_lists:
         padded_goals.append(
             normalizeSentenceLength(token_list,
                                     self.training_args.max_length))
     goal_tensor = LongTensor(padded_goals).view(batch_size, -1)

     prev_values = [self._get_prev(in_data) for in_data in in_datas]
     prev_tensor = LongTensor(prev_values).view(batch_size, -1)
     return self._model.run(goal_tensor, prev_tensor)
Exemplo n.º 8
0
    def run(self, hidden: torch.FloatTensor, max_length: int) -> Sentence:
        """Greedily decode up to ``max_length`` tokens starting from ``hidden``.

        At each step the module is called on the previous top-1 prediction
        and the current hidden state.  Decoding stops as soon as
        ``EOS_token`` is produced, in which case the prediction is passed
        through ``normalizeSentenceLength`` before being returned.
        Otherwise the raw list of ``max_length`` tokens is returned.
        Only a batch size of 1 is supported.
        """
        assert self.batch_size == 1
        state = hidden
        next_input = self.initInput()
        prediction: Sentence = []

        for _ in range(max_length):
            output, state = self(next_input, state)
            # Greedy choice: the index of the best entry becomes the next input.
            _, next_input = output.view(1, -1).topk(1)
            token = next_input.item()
            prediction.append(token)
            if token == EOS_token:
                return normalizeSentenceLength(prediction, max_length)
        return prediction
Exemplo n.º 9
0
def mkCopySample(max_length : int,
                 word_feature_functions : List[WordFeature],
                 vec_feature_functions : List[VecFeature],
                 zipped : Tuple[EmbeddedSample, List[int], int]) \
                 -> CopyArgSample:
    """Build a ``CopyArgSample`` from one zipped training triple.

    ``zipped`` holds an embedded sample, the goal sequence and the argument
    index.  Features are computed on a ``TacticContext`` made from the
    sample's previous tactics, hypotheses and goal string.  Exactly three
    word features are required (asserted).  Vector feature outputs are
    flattened into one list.
    """
    embedded_sample, goal_tokens, arg_idx = zipped
    prev_tactic_list, hypotheses, goal_str, tactic_idx = embedded_sample
    tac_context = TacticContext(prev_tactic_list, hypotheses, goal_str)

    word_features = []
    for word_feature in word_feature_functions:
        word_features.append(word_feature(tac_context))
    assert len(word_features) == 3

    padded_goal = normalizeSentenceLength(goal_tokens, max_length)
    # Each vector feature yields several values; concatenate them in order.
    vec_features = []
    for vec_feature in vec_feature_functions:
        vec_features.extend(vec_feature(tac_context))
    return CopyArgSample(padded_goal, word_features, vec_features,
                         tactic_idx, arg_idx)
Exemplo n.º 10
0
 def _predictDistributions(
         self, in_datas: List[TacticContext]) -> torch.FloatTensor:
     """Run the model on vector features, word features and encoded goals.

     Each context is unpacked as a three-element tuple whose last element
     is the goal.  Goals are tokenized and passed through
     ``normalizeSentenceLength`` with ``training_args.max_length``.
     Returns the result of calling ``self._model`` on the three tensors.
     """
     assert self.training_args
     vec_features = list(map(self._get_vec_features, in_datas))
     word_features = list(map(self._get_word_features, in_datas))

     encoded_goals = []
     for _, _, goal in in_datas:
         goal_tokens = self._tokenizer.toTokenList(goal)
         encoded_goals.append(
             normalizeSentenceLength(goal_tokens,
                                     self.training_args.max_length))

     vec_tensor = torch.FloatTensor(vec_features)
     word_tensor = torch.LongTensor(word_features)
     goal_tensor = torch.LongTensor(encoded_goals)
     return self._model(vec_tensor, word_tensor, goal_tensor)
Exemplo n.º 11
0
def run_test(args_list: List[str]):
    """Load a saved encoder/decoder pair and run it on terms from stdin.

    ``args_list`` carries the checkpoint path and the optional
    ``--print-inputs`` flag.  Every line read from stdin is tokenized,
    passed through ``normalizeSentenceLength`` with the checkpoint's
    ``max-length``, encoded and decoded.  The decoded tokens up to the
    first ``EOS_token`` are printed as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("save_file", type=str)
    parser.add_argument("--print-inputs", dest="print_inputs",
                        default=False, action='store_const', const=True)
    arg_values = parser.parse_args(args_list)

    # NOTE: torch.load unpickles the file; only use trusted checkpoints.
    checkpoint = torch.load(arg_values.save_file)

    # Every entry must be present and truthy before the models are rebuilt.
    required_keys = ('max-length', 'tokenizer', 'tokenizer-name', 'encoder',
                     'num-encoder-layers', 'decoder', 'num-decoder-layers',
                     'hidden-size', 'context-filter')
    for key in required_keys:
        assert checkpoint[key]

    tokenizer = checkpoint['tokenizer']
    max_length = checkpoint['max-length']
    hidden_size = checkpoint['hidden-size']

    encoder = maybe_cuda(
        EncoderRNN(tokenizer.numTokens(), hidden_size,
                   checkpoint['num-encoder-layers']))
    encoder.load_state_dict(checkpoint['encoder'])

    decoder = maybe_cuda(
        DecoderRNN(hidden_size, tokenizer.numTokens(),
                   checkpoint['num-decoder-layers']))
    decoder.load_state_dict(checkpoint['decoder'])

    for term in sys.stdin:
        padded_term = normalizeSentenceLength(tokenizer.toTokenList(term),
                                              max_length)
        data_in = torch.LongTensor(padded_term).view(1, -1)
        if arg_values.print_inputs:
            print("{} ({}) -> ".format(term.strip(), data_in), end="")
        data_out = decoder.run(encoder.run(data_in), max_length)

        # Keep only the tokens that precede the first end-of-sentence marker.
        decoded_tokens = []
        for token in data_out:
            if token != EOS_token:
                decoded_tokens.append(token)
            else:
                break
        print(tokenizer.toString(decoded_tokens))
Exemplo n.º 12
0
 def predictKTacticsWithLoss_batch(self,
                                   in_data : List[TacticContext],
                                   k : int, corrects : List[str]) -> \
                                   Tuple[List[List[Prediction]], float]:
     """Predict the top ``k`` tactic stems per context and the batch loss.

     An empty batch yields ``([], 0)``.  Otherwise, while holding
     ``self._lock``, the model is run on the encoded goals and the
     ``_get_prev`` values.  The loss is ``self._criterion`` applied to the
     model output and the encoded stems of ``corrects``; stems unknown to
     the embedding are encoded as 0.  ``k`` is capped at the number of
     tokens in the embedding.

     Each prediction is the decoded stem followed by ``"."`` together with
     ``math.exp`` of its score.
     """
     assert self.training_args
     if len(in_data) == 0:
         return [], 0
     with self._lock:
         encoded_goals = []
         for relevant_lemmas, prev_tactics, hypotheses, goal in in_data:
             encoded_goals.append(
                 normalizeSentenceLength(self._tokenizer.toTokenList(goal),
                                         self.training_args.max_length))
         goals_tensor = LongTensor(encoded_goals)
         prevs_tensor = LongTensor(
             [self._get_prev(in_datum) for in_datum in in_data])
         correct_stems = list(map(get_stem, corrects))

         prediction_distributions = self._model.run(goals_tensor,
                                                    prevs_tensor)

         # Stems the embedding does not know fall back to index 0.
         target_indices = []
         for correct_stem in correct_stems:
             if self._embedding.has_token(correct_stem):
                 target_indices.append(
                     self._embedding.encode_token(correct_stem))
             else:
                 target_indices.append(0)
         output_var = maybe_cuda(Variable(torch.LongTensor(target_indices)))
         loss = self._criterion(prediction_distributions, output_var).item()

         # topk cannot return more entries than there are tokens.
         k = min(k, self._embedding.num_tokens())

         top_k_per_context = []
         for single_distribution in prediction_distributions:
             top_k_per_context.append(single_distribution.view(-1).topk(k))

         results = []
         for certainties, stem_idxs in top_k_per_context:
             predictions = []
             for certainty, stem_idx in zip(certainties, stem_idxs):
                 stem = self._embedding.decode_token(stem_idx.item())
                 predictions.append(
                     Prediction(stem + ".", math.exp(certainty.item())))
             results.append(predictions)
     return results, loss
Exemplo n.º 13
0
def get_data(args: List[str]) -> None:
    """Parse a scrape file and print its contents in the requested format.

    ``args`` is the command-line argument list.  The positional ``format``
    argument selects the output:

    * ``terms``: tokenized terms from ``data.term_data``, optionally passed
      through ``data.normalizeSentenceLength``, printed up to the first
      ``data.EOS_token``.
    * ``goals`` / ``hyps+goal`` / ``hyps+goal+tactic``: plain-text dumps of
      every data point.
    * ``tacvector``: one comma-separated line per data point holding its
      word features, vector features and encoded tactic stem.
    * ``scrapefile-rd`` / ``scrapefile``: one JSON object per data point.

    Everything is written to stdout; nothing is returned.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector", "scrapefile-rd", "scrapefile"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=30,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    # Register -j/--num-threads exactly once: argparse raises ArgumentError
    # ("conflicting option strings") when an option string is added twice.
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions",
                        action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args",
                        action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        # Terms are read directly from the scrape file, limited to the first
        # max_tuples entries (no limit when max_tuples is None).
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            # With --lineend a literal backslash-n precedes the real newline.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            # Data points with the most hypotheses come first.
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [
                embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                for datum in dataset
            ]
            stripped_data = [
                strip_scraped_output(scraped) for scraped in dataset
            ]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor in
                features.word_feature_constructors
            ]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values) for
                vec_feature_constructor in features.vec_feature_constructors
            ]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[
                feature(c) for feature in word_feature_functions
            ] for c in stripped_data]
            # Every vector feature yields several values; flatten per sample.
            vec_features = [[
                feature_val for feature in vec_features_functions
                for feature_val in feature(c)
            ] for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                                   answers):
                print(",".join(
                    list(map(str, word_feat)) + list(map(str, vec_feat)) +
                    [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "context": {
                            "fg_goals": [{
                                "hypotheses": point.hypotheses,
                                "goal": point.goal
                            }],
                            "bg_goals": [],
                            "shelved_goals": [],
                            "given_up_goals": []
                        },
                        "tactic": point.tactic
                    }))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "prev_hyps": point.hypotheses,
                        "prev_goal": point.goal,
                        "tactic": point.tactic
                    }))
Exemplo n.º 14
0
def train(dataset : ClassifySequenceDataset,
          autoencoder : EncoderRNN, train_autoencoder: bool, max_length : int,
          encoder_hidden_size : int, classifier_hidden_size : int,
          output_vocab_size : int, num_layers : int, batch_size : int,
          learning_rate : float, gamma : float, epoch_step : int, num_epochs : int,
          print_every : int, optimizer_f : Callable[..., Optimizer]) \
          -> Iterable[Checkpoint]:
    """Train a ``ClassifierDNN`` on autoencoder outputs, one checkpoint per epoch.

    ``dataset`` yields ``(goal, tactic)`` pairs.  Goals are passed through
    ``normalizeSentenceLength`` with ``max_length``, batched with a shuffling
    ``DataLoader`` that drops the last incomplete batch, run through
    ``autoencoder.run`` and then classified.  The autoencoder's parameters
    are optimized too when ``train_autoencoder`` is true.  Every optimizer
    gets a ``StepLR`` scheduler built from ``epoch_step`` and ``gamma``.

    After each epoch a ``Checkpoint`` is yielded with both state dicts and
    the running average training loss.
    """
    print("Initializing PyTorch...")
    padded_goals = [
        normalizeSentenceLength(goal, max_length) for goal, _ in dataset
    ]
    tactics = [tactic for _, tactic in dataset]
    tensor_dataset = torchdata.TensorDataset(torch.LongTensor(padded_goals),
                                             torch.LongTensor(tactics))
    dataloader = torchdata.DataLoader(tensor_dataset,
                                      batch_size=batch_size, num_workers=0,
                                      shuffle=True, pin_memory=True,
                                      drop_last=True)

    classifier = maybe_cuda(
        ClassifierDNN(encoder_hidden_size, classifier_hidden_size,
                      output_vocab_size, num_layers, batch_size))
    optimizers = [optimizer_f(classifier.parameters(), lr=learning_rate)]
    if train_autoencoder:
        optimizers.append(
            optimizer_f(autoencoder.parameters(), lr=learning_rate))
    criterion = maybe_cuda(nn.NLLLoss())
    adjusters = []
    for optimizer in optimizers:
        adjusters.append(scheduler.StepLR(optimizer, epoch_step, gamma))

    start = time.time()
    dataset_size = len(dataset)
    num_items = dataset_size * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        # The schedulers are stepped at the start of every epoch.
        for adjuster in adjusters:
            adjuster.step()

        for batches_done, (input_batch, output_batch) in enumerate(dataloader,
                                                                   1):
            # Clear the gradients left over from the previous batch.
            for optimizer in optimizers:
                optimizer.zero_grad()

            # Encode the goals, then classify the encodings.
            encoded_input_batch = autoencoder.run(
                cast(torch.LongTensor, input_batch))
            prediction_distribution = classifier.run(encoded_input_batch)

            # Compare the predictions with the expected tactics.
            expected = maybe_cuda(Variable(output_batch))
            loss = criterion(prediction_distribution, expected)

            # Backpropagate and update all trained parameters.
            loss.backward()
            for optimizer in optimizers:
                optimizer.step()

            # Bookkeeping for progress reports and the checkpoint loss.
            items_processed = batches_done * batch_size + epoch * dataset_size
            total_loss += loss.item() * batch_size
            assert isinstance(total_loss, float)

            if batches_done % print_every == 0:
                progress = items_processed / num_items
                print("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(start, progress), items_processed,
                    progress * 100, total_loss / items_processed))

        # NOTE(review): items_processed is only bound once a batch has run,
        # so a loader that yields no batches makes this line fail.
        yield Checkpoint(classifier_state=classifier.state_dict(),
                         autoencoder_state=autoencoder.state_dict(),
                         training_loss=total_loss / items_processed)
Exemplo n.º 15
0
def use_tokenizer(tokenizer: tk.Tokenizer, max_length: int,
                  term_strings: List[str]):
    """Tokenize every string in ``term_strings`` and normalize its length.

    ``term_strings`` is iterated element by element, so it must be a
    collection of strings.  A bare ``str`` would be processed one character
    at a time.

    Returns a list holding, for each input string in order, the result of
    ``normalizeSentenceLength(tokenizer.toTokenList(s), max_length)``.
    """
    return [
        normalizeSentenceLength(tokenizer.toTokenList(term_string), max_length)
        for term_string in term_strings
    ]
Exemplo n.º 16
0
def get_data(args: List[str]) -> None:
    """Parse a scrape file and print its contents in the requested format.

    ``args`` is the command-line argument list.  The positional ``format``
    argument selects the output:

    * ``terms``: tokenized terms from ``data.term_data``, optionally passed
      through ``data.normalizeSentenceLength``, printed up to the first
      ``data.EOS_token``.
    * ``goals`` / ``hyps+goal`` / ``hyps+goal+tactic``: plain-text dumps of
      every data point.
    * ``tacvector``: one comma-separated line per data point holding its
      word features, vector features and encoded tactic stem.

    Everything is written to stdout; nothing is returned.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    output_formats = [
        "terms", "goals", "hyps+goal", "hyps+goal+tactic", "tacvector"
    ]
    tokenizer_names = list(tokenizers.keys())
    parser.add_argument("format", choices=output_formats)
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer", choices=tokenizer_names, type=str,
                        default=tokenizer_names[0])
    parser.add_argument("--max-tuples", dest="max_tuples", default=None,
                        type=int)
    parser.add_argument("--num-keywords", dest="num_keywords", default=100,
                        type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length", dest="max_length", default=None,
                        type=int)
    parser.add_argument("--lineend", dest="lineend", default=False,
                        const=True, action='store_const')
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    output_format = arg_values.format

    if output_format == "terms":
        # Terms are read directly from the scrape file, limited to the first
        # max_tuples entries (no limit when max_tuples is None).
        raw_samples = list(
            itertools.islice(data.read_text_data(arg_values.scrape_file),
                             arg_values.max_tuples))
        terms, tokenizer = data.term_data(data.RawDataset(raw_samples),
                                          tokenizers[arg_values.tokenizer],
                                          arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        # With --lineend a literal backslash-n precedes the real newline.
        line_end = "\\n\n" if arg_values.lineend else "\n"
        for term in terms:
            visible_tokens = list(
                itertools.takewhile(lambda x: x != data.EOS_token, term))
            print(tokenizer.toString(visible_tokens), end=line_end)
        return

    if output_format == "goals":
        dataset = data.get_text_data(arg_values)
        for _, _, goal, _ in dataset:
            print(goal)
        return

    if output_format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for _, hyps, goal, _ in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
        return

    if output_format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for _, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
        return

    if output_format == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = []
        for datum in dataset:
            stem = serapi_instance.get_stem(datum.tactic)
            answers.append(embedding.encode_token(stem))
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]

        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = []
        for make_word_feature in features.word_feature_constructors:
            word_feature_functions.append(
                make_word_feature(stripped_data, arg_values))
        vec_features_functions = []
        for make_vec_feature in features.vec_feature_constructors:
            vec_features_functions.append(
                make_vec_feature(stripped_data, arg_values))

        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = []
        for context in stripped_data:
            word_features.append(
                [feature(context) for feature in word_feature_functions])
        # Every vector feature yields several values; flatten per sample.
        vec_features = []
        for context in stripped_data:
            flattened = []
            for feature in vec_features_functions:
                flattened.extend(feature(context))
            vec_features.append(flattened)
        eprint("Done", guard=arg_values.verbose)

        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            fields = [str(value) for value in word_feat]
            fields += [str(value) for value in vec_feat]
            fields.append(str(tactic))
            print(",".join(fields))