def build_data(self, inputs: Iterable[str], outputs: Iterable[str], splits: Iterable[str]):
    """Build per-example TreeDecoderState objects and bucket them by split.

    For every (input, output, split) triple, encodes the input sentence and
    output query to tensors, remaps rare output tokens to UNK, and appends
    the resulting state to ``self.data[split]``.  Also records the maximum
    observed input/output token lengths on ``self.maxlen_input`` /
    ``self.maxlen_output``.

    Args:
        inputs: natural-language input strings.
        outputs: target query strings (parallel to ``inputs``).
        splits: split name ("train"/"test"/...) for each example.
    """
    maxlen_in, maxlen_out = 0, 0
    # Identity index map over the full output vocab; entries for rare
    # output tokens that the sentence encoder has never seen are redirected
    # to the output vocab's UNK id, so gold tensors never contain ids the
    # model cannot copy or predict.
    gold_map = torch.arange(
        0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
    rare_tokens = self.query_encoder.vocab.rare_tokens - set(
        self.sentence_encoder.vocab.D.keys())
    unk_id = self.query_encoder.vocab[self.query_encoder.vocab.unktoken]
    for rare_token in rare_tokens:
        gold_map[self.query_encoder.vocab[rare_token]] = unk_id
    # enumerate replaces the original manual eid counter.
    for eid, (inp, out, split) in enumerate(zip(inputs, outputs, splits)):
        inp_tensor, inp_tokens = self.sentence_encoder.convert(
            inp, return_what="tensor,tokens")
        out_tensor, out_tokens = self.query_encoder.convert(
            out, return_what="tensor,tokens")
        out_tensor = gold_map[out_tensor]  # rare -> UNK remap
        # [None, :] adds a batch dimension of size 1.
        state = TreeDecoderState(
            [inp], [out], inp_tensor[None, :], out_tensor[None, :],
            [inp_tokens], [out_tokens],
            self.sentence_encoder.vocab, self.query_encoder.vocab)
        state.eids = np.asarray([eid], dtype="int64")
        maxlen_in = max(maxlen_in, len(state.inp_tokens[0]))
        maxlen_out = max(maxlen_out, len(state.gold_tokens[0]))
        # setdefault replaces the original membership-test-then-insert.
        self.data.setdefault(split, []).append(state)
    self.maxlen_input, self.maxlen_output = maxlen_in, maxlen_out
def build_data(self, inputs: Iterable[str], outputs: Iterable[str], splits: Iterable[str]):
    """Build per-example TreeDecoderState objects and bucket them by split.

    For every (input, output, split) triple, constructs a TreeDecoderState
    (which performs the encoding internally via the two encoders) and
    appends it to ``self.data[split]``.  Also records the maximum observed
    input/output token lengths on ``self.maxlen_input`` /
    ``self.maxlen_output``.

    Args:
        inputs: natural-language input strings.
        outputs: target query strings (parallel to ``inputs``).
        splits: split name ("train"/"test"/...) for each example.
    """
    maxlen_in, maxlen_out = 0, 0
    # enumerate replaces the original manual eid counter.
    for eid, (inp, out, split) in enumerate(zip(inputs, outputs, splits)):
        state = TreeDecoderState(
            [inp], [out], self.sentence_encoder, self.query_encoder)
        state.eids = np.asarray([eid], dtype="int64")
        maxlen_in = max(maxlen_in, len(state.inp_tokens[0]))
        maxlen_out = max(maxlen_out, len(state.gold_tokens[0]))
        # setdefault replaces the original membership-test-then-insert.
        self.data.setdefault(split, []).append(state)
    self.maxlen_input, self.maxlen_output = maxlen_in, maxlen_out