def tokenize_data(data, token_to_id, char_to_id, limit=None):
    """
    Tokenize a data set, with a mapping of tokens to indexes in the origin.
    Also create and update the vocabularies.

    :param data: a flat, organized view of the data, as a list of qid,
        passage, query and answer indexes.
    :param token_to_id: a dict of token to id; updated.
    :param char_to_id: a dict of char to id; updated.
    :return: a tokenized view of the data, as a list of qid, passage,
        query, answer indexes, and token-to-char indexes mapping.
        Passages and queries are tokenized into a tuple (tokens, chars).
        Answer indexes are a start:stop range of tokens.
    """
    tokenized = []
    for qid, passage, query, (start, stop) in data:
        q_tokens, q_chars, _, _, _ = \
            rich_tokenize(query, token_to_id, char_to_id, update=True)
        p_tokens, p_chars, _, _, mapping = \
            rich_tokenize(passage['passage_text'],
                          token_to_id, char_to_id, update=True)

        # Convert the answer's character positions to token positions.
        if start == 0 and stop == 0:
            pass  # No answer; nop, since 0 == 0.
        elif start == 0 and stop == len(passage):
            stop = len(p_tokens)  # Now point to just after last token.
        else:
            t_start = None  # Token index of the first answer token.
            t_end = len(p_tokens)
            # mapping[i] is the (char_start, char_end) span of token i.
            for t_ind, (_start, _end) in enumerate(mapping):
                if start < _end:  # start is a character index.
                    t_start = t_ind
                    break
            assert t_start is not None
            # enumerate's second argument offsets the yielded indexes,
            # e.g. enumerate(mapping[2:], 2) yields 2, 3, ..., so t_ind
            # stays aligned with the full mapping; scanning from t_start
            # also guarantees that t_end is never smaller than t_start.
            for t_ind, (_start, _end) in \
                    enumerate(mapping[t_start:], t_start):
                if stop < _start:
                    t_end = t_ind
                    break
            start = t_start  # Now point to first token in answer.
            stop = t_end  # Now point to after the last token in answer.

        # Keep or not based on length of passage (limit is None by default).
        if limit is not None and len(p_tokens) > limit:
            if stop <= limit:
                # Passage is too long, but it can be trimmed: the answer
                # ends within the first `limit` tokens.
                p_tokens = p_tokens[:limit]
            else:
                # Passage is too long and cannot be trimmed.
                continue

        tokenized.append(
            (qid,  # Query id.
             (p_tokens, p_chars),
             (q_tokens, q_chars),
             (start, stop),
             mapping))

    return tokenized
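The subtle step above is turning the answer's character offsets into a token range via `mapping`, the per-token list of `(char_start, char_end)` spans that `rich_tokenize` returns for the passage. Below is a minimal, self-contained sketch of that conversion, using a hand-built mapping instead of `rich_tokenize` (whose implementation is not shown in this section); the helper name is hypothetical.

def char_span_to_token_span(start, stop, mapping, n_tokens):
    # Hypothetical stand-alone version of the loops above.
    # mapping[i] is the (char_start, char_end) span of token i in the text;
    # start/stop are character offsets of the answer.
    t_start = None
    t_end = n_tokens
    for t_ind, (_start, _end) in enumerate(mapping):
        if start < _end:  # First token ending after the answer starts.
            t_start = t_ind
            break
    assert t_start is not None
    for t_ind, (_start, _end) in enumerate(mapping[t_start:], t_start):
        if stop < _start:  # First token starting after the answer ends.
            t_end = t_ind
            break
    return t_start, t_end  # Token range, end-exclusive.


# "The cat sat": tokens "The", "cat", "sat" with these character spans.
mapping = [(0, 3), (4, 7), (8, 11)]
# The answer "cat" covers characters 4:7, which maps to tokens 1:2.
print(char_span_to_token_span(4, 7, mapping, 3))  # (1, 2)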
def _to_batch(self, texts):
    # Tokenize each text with the fixed vocabularies, then pad everything
    # to a common size so the per-text arrays can be batched together.
    mappings = []
    lengths = []
    c_lengths = []
    tokens = []
    chars = []
    for text in texts:
        _tokens, _chars, length, _c_lengths, mapping = \
            rich_tokenize(text, self.vocab, self.c_vocab, update=False)
        mappings.append(mapping)
        lengths.append(length)
        c_lengths.append(_c_lengths)
        tokens.append(_tokens)
        chars.append(_chars)

    lengths = np.array(lengths)
    p_length = lengths.max()  # Longest text in the batch, in tokens.
    # Longest token in the batch, in characters.
    p_c_length = max(max(_c_lengths) for _c_lengths in c_lengths)

    b_tokens = []
    b_chars = []
    for _tokens, _chars in zip(tokens, chars):
        # The character dimension is padded to at least 5.
        _tokens, _chars = pad_to_size(_tokens, _chars,
                                      p_length, max(5, p_c_length))
        b_tokens.append(_tokens)
        b_chars.append(_chars)
    b_tokens = np.concatenate(b_tokens)
    b_chars = np.concatenate(b_chars)

    return b_tokens, b_chars, lengths, mappings
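`pad_to_size` itself is not shown in this section; from the call above it presumably right-pads each text's token ids to `p_length` and each token's character ids to the batch-wide maximum character length (floored at 5), returning arrays with a leading batch axis of 1 so they can be concatenated. A rough sketch under those assumptions, with 0 as the padding id and a hypothetical helper name:

import numpy as np

def pad_to_size_sketch(token_ids, char_ids, t_length, c_length):
    # Hypothetical stand-in for pad_to_size; the real helper is not shown here.
    # token_ids: list of token ids for one text.
    # char_ids: list of per-token lists of character ids.
    # Returns zero-padded arrays of shape (1, t_length) and
    # (1, t_length, c_length), ready to be concatenated along the batch axis.
    tokens = np.zeros((1, t_length), dtype=np.int64)
    chars = np.zeros((1, t_length, c_length), dtype=np.int64)
    tokens[0, :len(token_ids)] = token_ids
    for t, cs in enumerate(char_ids):
        chars[0, t, :len(cs)] = cs[:c_length]
    return tokens, chars


# Two "texts" of different lengths padded to the same shape:
a = pad_to_size_sketch([3, 7, 2], [[1, 2], [4], [5, 6, 7]], t_length=4, c_length=5)
b = pad_to_size_sketch([9], [[8, 8]], t_length=4, c_length=5)
batch_tokens = np.concatenate([a[0], b[0]])  # shape (2, 4)
batch_chars = np.concatenate([a[1], b[1]])   # shape (2, 4, 5)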