def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
    """Build word, answer and character vocabularies from training data.

    Fills and freezes the shared word vocab (loading pretrained embeddings
    into a dense matrix when ``vocab.emb`` is available), creates a frozen
    answer vocab if one is not already present, records the answer-space
    size in the config, and derives a character vocab from the word vocab.

    Args:
        data: pairs of ``(QASetting, answers)``; may be a one-shot iterator.
    """
    # Materialize once: the generators below iterate `data` up to three
    # times, which would silently exhaust a one-shot iterator.
    data = list(data)
    vocab = self.shared_resources.vocab
    if not vocab.frozen:
        preprocessing.fill_vocab(
            (q for q, _ in data), vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        vocab.freeze()
        if vocab.emb is not None:
            # Dense embedding matrix aligned with vocab ids; rows for words
            # without a pretrained vector stay zero.
            self.shared_resources.embeddings = np.zeros(
                [len(vocab), vocab.emb_length])
            for w, i in self.shared_resources.vocab.sym2id.items():
                e = vocab.emb.get(w)
                if e is not None:
                    self.shared_resources.embeddings[i] = e
    if not hasattr(self.shared_resources, 'answer_vocab') \
            or not self.shared_resources.answer_vocab.frozen:
        self.shared_resources.answer_vocab = util.create_answer_vocab(
            qa_settings=(q for q, _ in data),
            answers=(a for _, ass in data for a in ass))
        self.shared_resources.answer_vocab.freeze()
    self.shared_resources.config['answer_size'] = len(
        self.shared_resources.answer_vocab)
    self.shared_resources.char_vocab = preprocessing.char_vocab_from_vocab(
        self.shared_resources.vocab)
def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
    """Populate the shared word vocabulary and derive a character vocab.

    Fills the word vocab from the questions in ``data`` (unless it is
    already frozen), freezes it, and builds the per-character vocabulary
    from it.
    """
    shared = self.shared_resources
    if not shared.vocab.frozen:
        questions = (setting for setting, _ in data)
        preprocessing.fill_vocab(
            questions, shared.vocab,
            shared.config.get("lowercase", False))
        shared.vocab.freeze()
    # character vocab + word lengths + char ids per word
    shared.char_vocab = preprocessing.char_vocab_from_vocab(shared.vocab)
def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
    """Build word, answer and character vocabularies from training data.

    Fills and freezes the shared word vocab, creates a frozen answer vocab
    when none exists yet, and derives a character vocab from the word vocab.

    Args:
        data: pairs of ``(QASetting, answers)``; may be a one-shot iterator.
    """
    # Materialize once: `data` is traversed by several generators below,
    # which would silently exhaust a one-shot iterator after the first pass.
    data = list(data)
    vocab = self.shared_resources.vocab
    if not vocab.frozen:
        preprocessing.fill_vocab(
            (q for q, _ in data), vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        vocab.freeze()
    if not hasattr(self.shared_resources, 'answer_vocab') \
            or not self.shared_resources.answer_vocab.frozen:
        self.shared_resources.answer_vocab = util.create_answer_vocab(
            qa_settings=(q for q, _ in data),
            answers=(a for _, ass in data for a in ass))
        self.shared_resources.answer_vocab.freeze()
    self.shared_resources.char_vocab = preprocessing.char_vocab_from_vocab(
        self.shared_resources.vocab)
def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
    """Build word and answer vocabularies plus a byte-range char vocab.

    Args:
        data: pairs of ``(QASetting, answers)``; may be a one-shot iterator.
    """
    # Materialize once: `data` is consumed by two separate generator passes
    # below, which would silently exhaust a one-shot iterator.
    data = list(data)
    if not self.shared_resources.vocab.frozen:
        # Read lowercasing from config (default True preserves the previous
        # hard-coded behavior) for consistency with the sibling setups.
        # NOTE(review): the reassignment assumes fill_vocab returns the
        # vocab; sibling implementations ignore its return value — confirm.
        self.shared_resources.vocab = preprocessing.fill_vocab(
            (q for q, _ in data), self.shared_resources.vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        self.shared_resources.vocab.freeze()
    if not hasattr(self.shared_resources, 'answer_vocab') \
            or not self.shared_resources.answer_vocab.frozen:
        self.shared_resources.answer_vocab = create_answer_vocab(
            answers=(a for _, ass in data for a in ass))
        self.shared_resources.answer_vocab.freeze()
    # Keep a pre-configured 'answer_size' if one was supplied.
    self.shared_resources.config['answer_size'] = self.shared_resources.config.get(
        'answer_size', len(self.shared_resources.answer_vocab))
    # NOTE(review): maps the full byte range rather than using
    # preprocessing.char_vocab_from_vocab as the siblings do — presumably
    # deliberate (fixed-size char space); verify against consumers.
    self.shared_resources.char_vocab = {chr(i): i for i in range(256)}
def test_vocab():
    """Smoke-test vocab construction: fill, freeze, prune, then encode."""
    sample = QASetting(
        question='A person is training his horse for a competition.',
        support=['A person on a horse jumps over a broken down airplane.'],
        candidates=['entailment', 'neutral', 'contradiction'])
    train_data = [sample]

    print('build vocab based on train data')
    train_vocab = preprocessing.fill_vocab(train_data)
    train_vocab.freeze()
    for table in (train_vocab._sym2freqs, train_vocab._sym2id):
        pprint(table)

    # Prune rare symbols and cap the vocab size, then inspect again.
    min_freq, max_cnt = 2, 10
    train_vocab = train_vocab.prune(min_freq, max_cnt)
    for table in (train_vocab._sym2freqs, train_vocab._sym2id):
        pprint(table)

    print('encode train data')
    encoded = preprocessing.nlp_preprocess(train_data[0].question, train_vocab)[0]
    print(encoded)