def build_data(self, inputs: Iterable[str], outputs: Iterable[str], splits: Iterable[str]):
    """Encode (input, output) pairs into TreeDecoderStates and bucket them by split.

    Output ids are passed through ``gold_map``: rare query-vocab tokens that do not
    occur in the sentence-encoder vocabulary are redirected to the UNK id.
    Also records ``self.maxlen_input`` / ``self.maxlen_output`` over all examples.
    """
    qvocab = self.query_encoder.vocab
    # Identity map over output ids, then redirect every rare output token that
    # cannot be produced from the input-side vocabulary to the UNK id.
    gold_map = torch.arange(0, qvocab.number_of_ids(last_nonrare=False))
    uncopyable = qvocab.rare_tokens - set(self.sentence_encoder.vocab.D.keys())
    unk_id = qvocab[qvocab.unktoken]
    for tok in uncopyable:
        gold_map[qvocab[tok]] = unk_id

    longest_in = longest_out = 0
    for eid, (inp, out, split) in enumerate(zip(inputs, outputs, splits)):
        inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
        out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")
        out_tensor = gold_map[out_tensor]
        state = TreeDecoderState([inp], [out], inp_tensor[None, :], out_tensor[None, :],
                                 [inp_tokens], [out_tokens],
                                 self.sentence_encoder.vocab, qvocab)
        state.eids = np.asarray([eid], dtype="int64")
        longest_in = max(longest_in, len(state.inp_tokens[0]))
        longest_out = max(longest_out, len(state.gold_tokens[0]))
        self.data.setdefault(split, []).append(state)

    self.maxlen_input, self.maxlen_output = longest_in, longest_out
def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
               splits: Iterable[str], unktokens: Set[str] = None):
    """Encode examples into TreeDecoderStates; for the train split, expand each
    example into up to ``self.max_lins_allowed`` reorderings ("lins") of its
    gold tree over the orderless operators {"and", "or"}.

    :param inputs:    natural-language inputs, one per example.
    :param outputs:   gold queries in lisp notation, one per example.
    :param splits:    split name per example ("train" gets permutation expansion).
    :param unktokens: optional set of query-vocab tokens to remap to UNK in the
                      gold tensors (via ``gold_map``); None disables remapping.

    Side effects: appends states to ``self.data[split]`` and records
    ``self.maxlen_input`` / ``self.maxlen_output``.
    """
    gold_map = None
    maxlen_in, maxlen_out = 0, 0
    maxlins = 0
    # numlins_counts[i] counts examples that produced at least i linearizations.
    numlins_counts = [0] * (self.max_lins_allowed + 1)
    if unktokens is not None:
        gold_map = torch.arange(0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
        for rare_token in unktokens:
            gold_map[self.query_encoder.vocab[rare_token]] = \
                self.query_encoder.vocab[self.query_encoder.vocab.unktoken]
    for inp, out, split in zip(inputs, outputs, splits):
        inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
        gold_tree = lisp_to_tree(out)
        assert gold_tree is not None
        out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")
        if split == "train":
            gold_tree_ = tensor2tree(out_tensor, self.query_encoder.vocab)
            numlins = 0
            for gold_tree_reordered in get_tree_permutations(gold_tree_, orderless={"and", "or"}):
                if numlins >= self.max_lins_allowed:
                    break
                out_ = tree_to_lisp(gold_tree_reordered)
                out_tensor_, out_tokens_ = self.query_encoder.convert(out_, return_what="tensor,tokens")
                # BUGFIX: the original mapped `out_tensor` here but stored the
                # UNmapped `out_tensor_`, so unk-remapping never reached the
                # training permutations. Map the tensor that is actually stored
                # (same as the non-train branch and the reorder_random variant).
                if gold_map is not None:
                    out_tensor_ = gold_map[out_tensor_]
                state = TreeDecoderState([inp], [gold_tree_reordered],
                                         inp_tensor[None, :], out_tensor_[None, :],
                                         [inp_tokens], [out_tokens_],
                                         self.sentence_encoder.vocab, self.query_encoder.vocab,
                                         token_specs=self.token_specs)
                if split not in self.data:
                    self.data[split] = []
                self.data[split].append(state)
                numlins += 1
                numlins_counts[numlins] += 1
                maxlins = max(maxlins, numlins)
        else:
            if gold_map is not None:
                out_tensor = gold_map[out_tensor]
            state = TreeDecoderState([inp], [gold_tree],
                                     inp_tensor[None, :], out_tensor[None, :],
                                     [inp_tokens], [out_tokens],
                                     self.sentence_encoder.vocab, self.query_encoder.vocab,
                                     token_specs=self.token_specs)
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
        maxlen_in = max(maxlen_in, len(inp_tokens))
        # Permutations only reorder tokens, so the canonical tensor's length
        # is also the length of every reordered linearization.
        maxlen_out = max(maxlen_out, len(out_tensor))
    self.maxlen_input = maxlen_in
    self.maxlen_output = maxlen_out
def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
               splits: Iterable[str], unktokens: Set[str] = None):
    """Encode examples into TreeDecoderStates, bucketed by split.

    If ``unktokens`` is given, those query-vocab tokens are remapped to UNK in
    the gold tensors. When ``self.reorder_random`` is True, each train example
    additionally gets its gold tensor replaced by one randomly chosen
    permutation of its tree over the orderless operator {"and"}.
    Records ``self.maxlen_input`` / ``self.maxlen_output``.
    """
    gold_map = None
    if unktokens is not None:
        qvocab = self.query_encoder.vocab
        gold_map = torch.arange(0, qvocab.number_of_ids())
        unk_id = qvocab[qvocab.unktoken]
        for tok in unktokens:
            gold_map[qvocab[tok]] = unk_id

    longest_in = longest_out = 0
    for inp, out, split in zip(inputs, outputs, splits):
        inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
        gold_tree = lisp_to_tree(out)
        assert gold_tree is not None
        out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")
        if gold_map is not None:
            out_tensor = gold_map[out_tensor]
        state = TreeDecoderState([inp], [gold_tree], inp_tensor[None, :], out_tensor[None, :],
                                 [inp_tokens], [out_tokens],
                                 self.sentence_encoder.vocab, self.query_encoder.vocab,
                                 token_specs=self.token_specs)
        if split == "train" and self.reorder_random is True:
            # NOTE(review): tensor2tree runs on the already unk-mapped tensor here,
            # unlike the permutation-expansion variant — confirm this is intended.
            # NOTE(review): random.choice requires a sequence; verify
            # get_tree_permutations returns one rather than a generator.
            gold_tree_ = tensor2tree(out_tensor, self.query_encoder.vocab)
            random_gold_tree = random.choice(get_tree_permutations(gold_tree_, orderless={"and"}))
            reordered_lisp = tree_to_lisp(random_gold_tree)
            reordered_tensor, _ = self.query_encoder.convert(reordered_lisp,
                                                             return_what="tensor,tokens")
            if gold_map is not None:
                reordered_tensor = gold_map[reordered_tensor]
            state.gold_tensor = reordered_tensor[None]
        self.data.setdefault(split, []).append(state)
        longest_in = max(longest_in, len(inp_tokens))
        longest_out = max(longest_out, len(out_tensor))

    self.maxlen_input = longest_in
    self.maxlen_output = longest_out
def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
               splits: Iterable[str], unktokens: Set[str] = None):
    """Parse each output into a gold tree, encode both sides, and store
    TreeDecoderStates per split.

    ``unktokens`` (optional) lists query-vocab tokens whose ids are redirected
    to UNK in the gold tensors. Records ``self.maxlen_input`` and
    ``self.maxlen_output`` over all examples.
    """
    gold_map = None
    if unktokens is not None:
        qvocab = self.query_encoder.vocab
        gold_map = torch.arange(0, qvocab.number_of_ids(last_nonrare=False))
        unk_id = qvocab[qvocab.unktoken]
        for tok in unktokens:
            gold_map[qvocab[tok]] = unk_id

    longest_in = longest_out = 0
    for inp, out, split in zip(inputs, outputs, splits):
        inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
        gold_tree = lisp_to_tree(out)
        assert gold_tree is not None
        out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")
        if gold_map is not None:
            out_tensor = gold_map[out_tensor]
        state = TreeDecoderState([inp], [gold_tree], inp_tensor[None, :], out_tensor[None, :],
                                 [inp_tokens], [out_tokens],
                                 self.sentence_encoder.vocab, self.query_encoder.vocab)
        self.data.setdefault(split, []).append(state)
        longest_in = max(longest_in, len(inp_tokens))
        longest_out = max(longest_out, len(out_tensor))

    self.maxlen_input = longest_in
    self.maxlen_output = longest_out
def build_data(self, inputs: Iterable[str], outputs: Iterable[str], splits: Iterable[str]):
    """Wrap each (input, output) pair in a TreeDecoderState, tag it with its
    example id, and bucket it by split.

    Records ``self.maxlen_input`` / ``self.maxlen_output`` as the longest
    tokenized input and gold sequence seen.
    """
    longest_in = longest_out = 0
    for eid, (inp, out, split) in enumerate(zip(inputs, outputs, splits)):
        state = TreeDecoderState([inp], [out], self.sentence_encoder, self.query_encoder)
        state.eids = np.asarray([eid], dtype="int64")
        longest_in = max(longest_in, len(state.inp_tokens[0]))
        longest_out = max(longest_out, len(state.gold_tokens[0]))
        self.data.setdefault(split, []).append(state)
    self.maxlen_input, self.maxlen_output = longest_in, longest_out
def build_data(self, inputs: Iterable[str], outputs: Iterable[str], splits: Iterable[str]):
    """Tokenize examples, replace output tokens that can be neither copied from
    the input nor generated from the (non-rare) query vocabulary with UNK,
    and store the resulting TreeDecoderStates per split.

    The gold tree is parsed from the output tokens minus the final token
    (presumably an end marker — verify against the encoder). Records
    ``self.maxlen_input`` / ``self.maxlen_output``.
    """
    longest_in = longest_out = 0
    for inp, out, split in zip(inputs, outputs, splits):
        # Tokenize both sides.
        inp_tokens = self.sentence_encoder.convert(inp, return_what="tokens")[0]
        out_tokens = self.query_encoder.convert(out, return_what="tokens")[0]

        # Parse the gold tree, dropping the last output token.
        gold_tree = lisp_to_tree(" ".join(out_tokens[:-1]))
        assert gold_tree is not None

        # An output token survives if it can be copied from the input, or it is
        # a non-rare member of the query vocabulary; otherwise it becomes UNK.
        qvocab = self.query_encoder.vocab
        unktoken = qvocab.unktoken
        copyable = set(inp_tokens)
        out_tokens = [
            tok if (tok in copyable or (tok in qvocab and tok not in qvocab.rare_tokens))
            else unktoken
            for tok in out_tokens
        ]

        # Convert token sequences to id tensors.
        inp_tensor = self.sentence_encoder.convert(inp_tokens, return_what="tensor")[0]
        out_tensor = self.query_encoder.convert(out_tokens, return_what="tensor")[0]

        state = TreeDecoderState([inp], [gold_tree], inp_tensor[None, :], out_tensor[None, :],
                                 [inp_tokens], [out_tokens],
                                 self.sentence_encoder.vocab, qvocab,
                                 token_specs=self.token_specs)
        self.data.setdefault(split, []).append(state)
        longest_in = max(longest_in, len(inp_tokens))
        longest_out = max(longest_out, len(out_tensor))

    self.maxlen_input = longest_in
    self.maxlen_output = longest_out