def __init__(
    self,
    lowercase_tokens: bool = False,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
    token_min_padding_length: int = 0,
) -> None:
    super().__init__(token_min_padding_length)
    self.lowercase_tokens = lowercase_tokens
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
def tokenize(self, text: str) -> List[Token]:
    konoha_tokens = self._tokenizer.tokenize(text)
    tokens = [
        Token(text=token.surface, lemma_=token.base_form, pos_=token.postag)
        for token in konoha_tokens
    ]
    for start_token in self._start_tokens:
        tokens.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        tokens.append(Token(end_token, -1))
    return tokens
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["elmo_tokens"]
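
# Usage sketch for batch_to_ids above (assumes AllenNLP and its ELMo components are
# installed; the sentences are illustrative). Each word is mapped to 50 character ids,
# so the result has shape (batch size, longest sentence length, 50).
sentences = [["The", "cat", "sat"], ["Dogs", "bark"]]
character_ids = batch_to_ids(sentences)
print(character_ids.shape)  # e.g. torch.Size([2, 3, 50])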
def _create_dummy_input(self):
    sentence = "<S> placeholder </S>"
    tokens = [Token(word) for word in sentence.split()]
    character_indices = self.indexer.tokens_to_indices(tokens, self.vocab)["elmo_tokens"]
    indices_tensor = torch.LongTensor([character_indices])
    return indices_tensor
def generate_sentence_embeddings(sentence="", max_len=200):
    sen_list = []
    if isinstance(sentence, str):
        tokens = [Token(word) for word in sentence.split() if isinstance(word, str)]
        embed = google_word_2_vec()
        for idx, t in enumerate(tokens):
            try:
                if idx >= max_len:
                    break
                sen_list.append(embed[t.text])
            except Exception:
                # Out-of-vocabulary word: fall back to a zero vector.
                sen_list.append([0.0] * embedding_dim)
        # Pad the rest of the sentence with zero vectors up to max_len.
        for x in range(len(tokens), max_len):
            sen_list.append([0.0] * embedding_dim)
    else:
        print("no vector for a sentence")
        for x in range(0, max_len):
            sen_list.append([0.0] * embedding_dim)
    return np.asarray(sen_list, dtype=float)
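
# Hypothetical usage of generate_sentence_embeddings above. It assumes the surrounding
# module defines google_word_2_vec() (returning a gensim-style keyed-vector lookup) and a
# global embedding_dim (300 for the Google News vectors); both names come from that module,
# not from any library.
vectors = generate_sentence_embeddings("the quick brown fox", max_len=200)
print(vectors.shape)  # (max_len, embedding_dim), e.g. (200, 300)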
def _encode_concepts(self, concepts):
    concept_tensors = []
    for concept in concepts:
        concept = [Token(word) for word in concept.split()]
        concept_indices = self.indexer.tokens_to_indices(concept, self.vocab)["elmo_tokens"]
        concept_tensors.append(torch.LongTensor(concept_indices))
    return concept_tensors
def tokenize(self, text: str) -> List[Token]: """ This method only handles a single sentence (or sequence) of text. """ max_length = self._max_length if max_length is not None and not self._add_special_tokens: max_length += self.num_special_tokens_for_sequence() encoded_tokens = self.tokenizer.encode_plus( text=text, add_special_tokens=True, max_length=max_length, stride=self._stride, return_tensors=None, return_offsets_mapping=self.tokenizer.is_fast, return_attention_mask=False, return_token_type_ids=True, return_special_tokens_mask=True, ) # token_ids contains a final list with ids for both regular and special tokens token_ids, token_type_ids, special_tokens_mask, token_offsets = ( encoded_tokens["input_ids"], encoded_tokens["token_type_ids"], encoded_tokens["special_tokens_mask"], encoded_tokens.get("offset_mapping"), ) # If we don't have token offsets, try to calculate them ourselves. if token_offsets is None: token_offsets = self._estimate_character_indices(text, token_ids) tokens = [] for token_id, token_type_id, special_token_mask, offsets in zip( token_ids, token_type_ids, special_tokens_mask, token_offsets): # In `special_tokens_mask`, 1s indicate special tokens and 0s indicate regular tokens. # NOTE: in transformers v3.4.0 (and probably older versions) the docstring # for `encode_plus` was incorrect as it had the 0s and 1s reversed. # https://github.com/huggingface/transformers/pull/7949 fixed this. if not self._add_special_tokens and special_token_mask == 1: continue if offsets is None or offsets[0] >= offsets[1]: start = None end = None else: start, end = offsets tokens.append( Token( text=self.tokenizer.convert_ids_to_tokens( token_id, skip_special_tokens=False), text_id=token_id, type_id=token_type_id, idx=start, idx_end=end, )) return tokens
def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
    sentence = json_dict["sentence"]
    if "verbs" in json_dict.keys():
        text = sentence.split()
        pos = ["VERB" if i == json_dict["verbs"] else "NOUN" for i, _ in enumerate(text)]
        tokens = [
            Token(t, i, i + len(text), pos_=p)
            for i, (t, p) in enumerate(zip(text, pos))
        ]
    else:
        tokens = self._tokenizer.tokenize(sentence)
    return self.tokens_to_instances(tokens)
def text_to_instance(
    self,
    sentences: List[str],
    labels: List[str] = None,
    confidences: List[float] = None,
    additional_features: List[float] = None,
) -> Instance:
    if not self.predict:
        assert len(sentences) == len(labels)

    if confidences is not None:
        assert len(sentences) == len(confidences)

    if additional_features is not None:
        assert len(sentences) == len(additional_features)

    if self.use_sep:
        tokenized_sentences = [
            self._tokenizer.tokenize(s)[: self.sent_max_len] + [Token("[SEP]")]
            for s in sentences
        ]
        sentences = [list(itertools.chain.from_iterable(tokenized_sentences))[:-1]]
    else:
        # Tokenize the sentences
        sentences = [
            self._tokenizer.tokenize(sentence_text)[: self.sent_max_len]
            for sentence_text in sentences
        ]

    fields: Dict[str, Field] = {}
    fields["sentences"] = ListField([TextField(sentence) for sentence in sentences])

    if labels is not None:
        if isinstance(labels[0], list):
            fields["labels"] = ListField([MultiLabelField(label) for label in labels])
        else:
            # make the labels strings for easier identification of the neutral label
            # probably not strictly necessary
            if self.sci_sum:
                fields["labels"] = ArrayField(np.array(labels))
            else:
                fields["labels"] = ListField(
                    [LabelField(str(label) + "_label") for label in labels]
                )

    if confidences is not None:
        fields["confidences"] = ArrayField(np.array(confidences))
    if additional_features is not None:
        fields["additional_features"] = ArrayField(np.array(additional_features))

    return Instance(fields)
def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
    """
    Converts spaCy tokens to allennlp tokens. Is a no-op if
    keep_spacy_tokens is True
    """
    if not self._keep_spacy_tokens:
        tokens = [
            Token(
                token.text,
                token.idx,
                token.idx + len(token.text),
                token.lemma_,
                token.pos_,
                token.tag_,
                token.dep_,
                token.ent_type_,
            )
            for token in tokens
        ]
    for start_token in self._start_tokens:
        tokens.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        tokens.append(Token(end_token, -1))
    return tokens
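
# Usage sketch, assuming the _sanitize() method above sits on AllenNLP's SpacyTokenizer
# (whose implementation it matches) and that the en_core_web_sm spaCy model is installed.
from allennlp.data.tokenizers import SpacyTokenizer

spacy_tokenizer = SpacyTokenizer(language="en_core_web_sm", pos_tags=True)
for token in spacy_tokenizer.tokenize("The quick brown fox jumped."):
    # After sanitizing, each allennlp Token keeps the text, character offsets, lemma and POS.
    print(token.text, token.idx, token.idx_end, token.lemma_, token.pos_)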
def _intra_word_tokenize(
    self, string_tokens: List[str]
) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
    tokens: List[Token] = []
    offsets: List[Optional[Tuple[int, int]]] = []
    for token_string in string_tokens:
        wordpieces = self.tokenizer.encode_plus(
            token_string,
            add_special_tokens=False,
            return_tensors=None,
            return_offsets_mapping=False,
            return_attention_mask=False,
        )
        wp_ids = wordpieces["input_ids"]

        if len(wp_ids) > 0:
            offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
            tokens.extend(
                Token(text=wp_text, text_id=wp_id)
                for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids))
            )
        else:
            offsets.append(None)
    return tokens, offsets
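
# Hedged illustration of the offset bookkeeping in _intra_word_tokenize above. Assuming a
# wordpiece vocabulary that splits "playing" into ["play", "##ing"] and keeps "cat" whole:
#
#   string_tokens = ["playing", "cat"]
#   tokens        = [Token("play"), Token("##ing"), Token("cat")]
#   offsets       = [(0, 1), (2, 2)]
#
# i.e. offsets[i] is the inclusive (first, last) wordpiece index for string_tokens[i],
# and None when a word maps to no wordpieces at all.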
def tokenize(self, text: str) -> List[Token]:
    if self._lowercase_characters:
        text = text.lower()
    if self._byte_encoding is not None:
        # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
        # of this.
        tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
    else:
        tokens = [Token(t) for t in list(text)]
    for start_token in self._start_tokens:
        if isinstance(start_token, int):
            token = Token(text_id=start_token, idx=0)
        else:
            token = Token(text=start_token, idx=0)
        tokens.insert(0, token)
    for end_token in self._end_tokens:
        if isinstance(end_token, int):
            token = Token(text_id=end_token, idx=0)
        else:
            token = Token(text=end_token, idx=0)
        tokens.append(token)
    return tokens
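
# Usage sketch, assuming the tokenize() method above belongs to AllenNLP's CharacterTokenizer
# (whose implementation it matches).
from allennlp.data.tokenizers import CharacterTokenizer

char_tokenizer = CharacterTokenizer(lowercase_characters=True)
print([t.text for t in char_tokenizer.tokenize("Hi!")])  # ['h', 'i', '!']

# With byte_encoding set, tokens carry text_id = byte value + 1 instead of text,
# so id 0 stays free for padding/masking.
byte_tokenizer = CharacterTokenizer(byte_encoding="utf-8")
print([t.text_id for t in byte_tokenizer.tokenize("Hi")])  # [73, 106] (byte value + 1)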
def tokenize(self, text: str) -> List[Token]:
    # We use the [^\W\d_] pattern as a trick to match unicode letters
    tokens = [Token(m.group(), idx=m.start()) for m in re.finditer(r"[^\W\d_]+|\d+|\S", text)]
    return tokens
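
# Standalone sketch of the regex used above: letter runs, digit runs, and any other
# non-space character each become their own token, with idx recording the match start.
import re

sample = "state-of-the-art NLP in 2021!"
print([(m.group(), m.start()) for m in re.finditer(r"[^\W\d_]+|\d+|\S", sample)])
# [('state', 0), ('-', 5), ('of', 6), ('-', 8), ('the', 9), ('-', 12), ('art', 13),
#  ('NLP', 17), ('in', 21), ('2021', 24), ('!', 28)]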
def tokenize(self, text: str) -> List[Token]:
    tokens = self.tokenizer.parse(text).split(' ')
    return [Token(t) for t in tokens]
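
# Hedged usage sketch for the tokenize() method above: it only assumes that
# self.tokenizer.parse() returns a single space-separated string, which is what MeCab
# produces in wakati mode, for example. The tagger flags and sentence are illustrative.
import MeCab  # assumed dependency of the surrounding class

wakati = MeCab.Tagger("-Owakati")
surfaces = wakati.parse("私は学生です").split(' ')
print(surfaces)  # e.g. ['私', 'は', '学生', 'です', '\n'], then wrapped as [Token(t) for t in surfaces]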
def _reverse_engineer_special_tokens(
    self,
    token_a: str,
    token_b: str,
    model_name: str,
    tokenizer_kwargs: Optional[Dict[str, Any]],
):
    # storing the special tokens
    self.sequence_pair_start_tokens = []
    self.sequence_pair_mid_tokens = []
    self.sequence_pair_end_tokens = []
    # storing token type ids for the sequences
    self.sequence_pair_first_token_type_id = None
    self.sequence_pair_second_token_type_id = None

    # storing the special tokens
    self.single_sequence_start_tokens = []
    self.single_sequence_end_tokens = []
    # storing token type id for the sequence
    self.single_sequence_token_type_id = None

    # Reverse-engineer the tokenizer for two sequences
    from allennlp.common import cached_transformers

    tokenizer_with_special_tokens = cached_transformers.get_tokenizer(
        model_name, add_special_tokens=True, **(tokenizer_kwargs or {})
    )
    dummy_output = tokenizer_with_special_tokens.encode_plus(
        token_a,
        token_b,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=False,
    )
    if len(dummy_output["token_type_ids"]) != len(dummy_output["input_ids"]):
        logger.warning(
            "Tokenizer library did not return valid token type ids. We will assume they are all zero."
        )
        dummy_output["token_type_ids"] = [0] * len(dummy_output["input_ids"])

    dummy_a = self.tokenizer.encode(token_a, add_special_tokens=False)[0]
    assert dummy_a in dummy_output["input_ids"]
    dummy_b = self.tokenizer.encode(token_b, add_special_tokens=False)[0]
    assert dummy_b in dummy_output["input_ids"]
    assert dummy_a != dummy_b

    seen_dummy_a = False
    seen_dummy_b = False
    for token_id, token_type_id in zip(dummy_output["input_ids"], dummy_output["token_type_ids"]):
        if token_id == dummy_a:
            if seen_dummy_a or seen_dummy_b:  # seeing a twice or b before a
                raise ValueError("Cannot auto-determine the number of special tokens added.")
            seen_dummy_a = True
            assert (
                self.sequence_pair_first_token_type_id is None
                or self.sequence_pair_first_token_type_id == token_type_id
            ), "multiple different token type ids found for the first sequence"
            self.sequence_pair_first_token_type_id = token_type_id
            continue

        if token_id == dummy_b:
            if seen_dummy_b:  # seeing b twice
                raise ValueError("Cannot auto-determine the number of special tokens added.")
            seen_dummy_b = True
            assert (
                self.sequence_pair_second_token_type_id is None
                or self.sequence_pair_second_token_type_id == token_type_id
            ), "multiple different token type ids found for the second sequence"
            self.sequence_pair_second_token_type_id = token_type_id
            continue

        token = Token(
            tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
            text_id=token_id,
            type_id=token_type_id,
        )
        if not seen_dummy_a:
            self.sequence_pair_start_tokens.append(token)
        elif not seen_dummy_b:
            self.sequence_pair_mid_tokens.append(token)
        else:
            self.sequence_pair_end_tokens.append(token)

    assert (
        len(self.sequence_pair_start_tokens)
        + len(self.sequence_pair_mid_tokens)
        + len(self.sequence_pair_end_tokens)
    ) == self.tokenizer.num_special_tokens_to_add(pair=True)

    # Reverse-engineer the tokenizer for one sequence
    dummy_output = tokenizer_with_special_tokens.encode_plus(
        token_a,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=False,
    )
    if len(dummy_output["token_type_ids"]) != len(dummy_output["input_ids"]):
        logger.warning(
            "Tokenizer library did not return valid token type ids. We will assume they are all zero."
        )
        dummy_output["token_type_ids"] = [0] * len(dummy_output["input_ids"])

    seen_dummy_a = False
    for token_id, token_type_id in zip(dummy_output["input_ids"], dummy_output["token_type_ids"]):
        if token_id == dummy_a:
            if seen_dummy_a:
                raise ValueError("Cannot auto-determine the number of special tokens added.")
            seen_dummy_a = True
            assert (
                self.single_sequence_token_type_id is None
                or self.single_sequence_token_type_id == token_type_id
            ), "multiple different token type ids found for the sequence"
            self.single_sequence_token_type_id = token_type_id
            continue

        token = Token(
            tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
            text_id=token_id,
            type_id=token_type_id,
        )
        if not seen_dummy_a:
            self.single_sequence_start_tokens.append(token)
        else:
            self.single_sequence_end_tokens.append(token)

    assert (
        len(self.single_sequence_start_tokens) + len(self.single_sequence_end_tokens)
    ) == self.tokenizer.num_special_tokens_to_add(pair=False)
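
# Hedged illustration of what the reverse engineering above typically recovers for a
# BERT-style tokenizer ("[CLS] A [SEP] B [SEP]"); actual values depend on the model:
#
#   sequence_pair_start_tokens = [Token("[CLS]")]
#   sequence_pair_mid_tokens   = [Token("[SEP]")]
#   sequence_pair_end_tokens   = [Token("[SEP]")]
#   sequence_pair_first_token_type_id  = 0
#   sequence_pair_second_token_type_id = 1
#   single_sequence_start_tokens  = [Token("[CLS]")]
#   single_sequence_end_tokens    = [Token("[SEP]")]
#   single_sequence_token_type_id = 0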
def tokenize(self, text: str) -> List[Token]:
    return [Token(t) for t in text.split()]
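
# Usage sketch, assuming the one-liner above is AllenNLP's WhitespaceTokenizer.tokenize
# (which it matches). str.split() collapses whitespace runs and never yields empty tokens.
from allennlp.data.tokenizers import WhitespaceTokenizer

print([t.text for t in WhitespaceTokenizer().tokenize("a  b\tc\n")])  # ['a', 'b', 'c']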
def tokenize(self, text: str) -> List[Token]:
    return [Token(t) for t in jieba.lcut(text)]
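
# Usage sketch for the jieba-based tokenize() above; the sentence is illustrative and the
# exact segmentation depends on jieba's dictionary and mode (lcut uses the default accurate mode).
import jieba

print(jieba.lcut("我来到北京清华大学"))  # e.g. ['我', '来到', '北京', '清华大学']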