예제 #1
0
    def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
        """Build the word, answer, and character vocabularies from *data*.

        Fills and freezes the word vocab (copying any pretrained embedding
        vectors into a dense matrix on shared_resources), creates and
        freezes the answer vocab if needed, records the answer vocab size
        in the config, and derives a character vocab from the word vocab.
        """
        resources = self.shared_resources
        word_vocab = resources.vocab
        if not word_vocab.frozen:
            questions = (setting for setting, _ in data)
            preprocessing.fill_vocab(
                questions,
                word_vocab,
                lowercase=resources.config.get('lowercase', True))
            word_vocab.freeze()
            if word_vocab.emb is not None:
                # Dense embedding matrix: rows for known pretrained vectors,
                # zeros for words without one.
                resources.embeddings = np.zeros(
                    [len(word_vocab), word_vocab.emb_length])
                for sym, idx in resources.vocab.sym2id.items():
                    vec = word_vocab.emb.get(sym)
                    if vec is not None:
                        resources.embeddings[idx] = vec

        has_frozen_answers = (hasattr(resources, 'answer_vocab')
                              and resources.answer_vocab.frozen)
        if not has_frozen_answers:
            resources.answer_vocab = util.create_answer_vocab(
                qa_settings=(setting for setting, _ in data),
                answers=(answer for _, answer_list in data
                         for answer in answer_list))
            resources.answer_vocab.freeze()
        resources.config['answer_size'] = len(resources.answer_vocab)
        resources.char_vocab = preprocessing.char_vocab_from_vocab(
            resources.vocab)
예제 #2
0
 def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
     """Populate and freeze the word vocab, then derive the char vocab."""
     resources = self.shared_resources
     if not resources.vocab.frozen:
         questions = (setting for setting, _ in data)
         # Third positional argument of fill_vocab is the lowercase flag.
         preprocessing.fill_vocab(
             questions,
             resources.vocab,
             resources.config.get("lowercase", False))
         resources.vocab.freeze()
     resources.char_vocab = preprocessing.char_vocab_from_vocab(
         resources.vocab)
예제 #3
0
 def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
     """Set up word, answer, and character vocabularies from *data*."""
     sr = self.shared_resources
     word_vocab = sr.vocab
     if not word_vocab.frozen:
         lowercase = sr.config.get('lowercase', True)
         preprocessing.fill_vocab(
             (qa for qa, _ in data), word_vocab, lowercase=lowercase)
         word_vocab.freeze()
     # Rebuild the answer vocab unless a frozen one is already present.
     have_frozen_answers = hasattr(sr, 'answer_vocab') and sr.answer_vocab.frozen
     if not have_frozen_answers:
         sr.answer_vocab = util.create_answer_vocab(
             qa_settings=(qa for qa, _ in data),
             answers=(ans for _, answer_list in data for ans in answer_list))
         sr.answer_vocab.freeze()
     sr.char_vocab = preprocessing.char_vocab_from_vocab(sr.vocab)
예제 #4
0
 def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
     """Build word/answer vocabularies and a byte-level char vocab.

     Fix: the lowercase flag is now read from the shared config
     (default True) instead of being hard-coded, consistent with the
     other setup_from_data implementations in this file. Default True
     preserves the previous behavior for configs without the key.
     """
     if not self.shared_resources.vocab.frozen:
         # NOTE(review): assumes fill_vocab returns the (filled) vocab —
         # sibling implementations rely only on its side effect; confirm.
         self.shared_resources.vocab = preprocessing.fill_vocab(
             (q for q, _ in data), self.shared_resources.vocab,
             lowercase=self.shared_resources.config.get('lowercase', True))
         self.shared_resources.vocab.freeze()
     if not hasattr(self.shared_resources, 'answer_vocab') or not self.shared_resources.answer_vocab.frozen:
         self.shared_resources.answer_vocab = create_answer_vocab(answers=(a for _, ass in data for a in ass))
         self.shared_resources.answer_vocab.freeze()
     # Keep a preconfigured answer_size; otherwise derive it from the vocab.
     self.shared_resources.config['answer_size'] = self.shared_resources.config.get(
         'answer_size', len(self.shared_resources.answer_vocab))
     # Fixed 256-entry byte-level character vocabulary.
     self.shared_resources.char_vocab = {chr(i): i for i in range(256)}
예제 #5
0
def test_vocab():
    """Exercise vocab filling, freezing, pruning, and question encoding
    on a single hand-written QA example, printing intermediate state."""
    sample = QASetting(
        question='A person is training his horse for a competition.',
        support=['A person on a horse jumps over a broken down airplane.'],
        candidates=['entailment', 'neutral', 'contradiction'])
    train_data = [sample]

    print('build vocab based on train data')
    train_vocab = preprocessing.fill_vocab(train_data)
    train_vocab.freeze()
    for table in (train_vocab._sym2freqs, train_vocab._sym2id):
        pprint(table)

    # Prune rare symbols and cap the vocab size, then inspect the result.
    min_freq, max_cnt = 2, 10
    train_vocab = train_vocab.prune(min_freq, max_cnt)

    for table in (train_vocab._sym2freqs, train_vocab._sym2id):
        pprint(table)

    print('encode train data')
    encoded = preprocessing.nlp_preprocess(train_data[0].question, train_vocab)[0]
    print(encoded)