def _dict_to_samples(self, dictionary, all_dicts=None):
    assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
    doc = dictionary["doc"]
    samples = []
    # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
    for idx in range(len(doc) - 1):
        tokenized = {}
        if self.next_sent_pred:
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "nextsentence_label": is_next_label,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            tokenized["text_b"] = tokenize_with_metadata(text_b, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=tokenized["text_b"][seq_name],
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len)
            samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
        # if we don't do next sentence prediction, we should feed in a single sentence
        else:
            text_a = doc[idx]
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": None,
                "nextsentence_label": None,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=None,
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len)
            samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    return samples

def test_fast_tokenizer_with_metadata_with_examples(caplog, model_name):
    fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
    tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)

    for text in TEXTS:
        # our tokenizer with metadata on "whitespace tokenized words"
        tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)
        fast_tokenized_meta = tokenize_with_metadata(text=text, tokenizer=fast_tokenizer)

        # verify that the fast tokenizer produces the same metadata as the slow tokenizer
        assert tokenized_meta == fast_tokenized_meta, f"Failed using {tokenizer.__class__.__name__}"

def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    if len(tokenized["tokens"]) == 0:
        text = dictionary["text"]
        logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}")
        return []
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name],
                                                       seq_b=None,
                                                       tokenizer=self.tokenizer,
                                                       max_seq_len=self.max_seq_len)
    # Samples don't have labels during Inference mode
    for task_name, task in self.tasks.items():
        if task_name in dictionary:
            label = float(dictionary[task_name])
            scaled_label = (label - task["label_list"][0]) / task["label_list"][1]
            dictionary[task_name] = scaled_label
    if self.features:
        feats_embed = dictionary.pop("features")
        return [FeaturesEmbeddingSample(id=None,
                                        clear_text=dictionary,
                                        tokenized=tokenized,
                                        feat_embeds=feats_embed)]
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    if len(tokenized["tokens"]) == 0:
        text = dictionary["text"]
        logger.warning(f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}")
        return []
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name],
                                                       seq_b=None,
                                                       tokenizer=self.tokenizer,
                                                       max_seq_len=self.max_seq_len)
    # Samples don't have labels during Inference mode
    for task_name, task in self.tasks.items():
        if task_name in dictionary:
            scaled_dict_labels = []
            for label in dictionary[task_name]:
                label = float(label)
                scaled_label = (label - task["label_list"][0]) / task["label_list"][1]
                scaled_dict_labels.append(scaled_label)
            dictionary[task_name] = scaled_dict_labels
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

def test_bert_tokenizer_all_meta(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    # original tokenizer from transformer repo
    tokenized = tokenizer.tokenize(basic_text)
    assert tokenized == ['Some', 'Text', 'with', 'never', '##see', '##nto', '##ken', '##s',
                         'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-',
                         'token', '_', 'with', '/', 'ch', '##ars']

    # ours with metadata
    tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
    assert tokenized_meta["tokens"] == tokenized
    assert tokenized_meta["offsets"] == [0, 5, 10, 15, 20, 23, 26, 29, 31, 36, 37, 40, 41, 42,
                                         44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
    assert tokenized_meta["start_of_word"] == [True, True, True, True, False, False, False, False,
                                               True, True, False, False, False, False, True, True,
                                               True, False, False, False, False, False, False, False]

def test_all_tokenizer_on_special_cases(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        tokenizers.append(t)

    texts = [
        "This is a sentence",
        "Der entscheidende Pass",
        "This is a sentence with multiple spaces",
        "力加勝北区ᴵᴺᵀᵃছজটডণত",
        "Thiso text is included tolod makelio sure Unicodeel is handled properly:",
        "This is a sentence...",
        "Let's see all on this text and. !23# neverseenwordspossible",
        """This is a sentence.
        With linebreak""",
        "This is a sentence with tab"]

    for tokenizer in tokenizers:
        for text in texts:
            # Important: we don't assume to preserve whitespaces after tokenization.
            # This means: \t, \n " " etc will all resolve to a single " ".
            # This doesn't make a difference for BERT + XLNet but it does for roBERTa

            # 1. original tokenize function from transformer repo on full sentence
            standardized_whitespace_text = ' '.join(text.split())  # remove multiple whitespaces
            tokenized = tokenizer.tokenize(standardized_whitespace_text)

            # 2. original tokenize function from transformer repo on "whitespace tokenized words"
            tokenized_by_word = []
            for i, tok in enumerate(text.split(" ")):
                if i == 0:
                    tokenized_tok = tokenizer.tokenize(tok)
                else:
                    try:
                        tokenized_tok = tokenizer.tokenize(tok, add_prefix_space=True)
                    except TypeError:
                        tokenized_tok = tokenizer.tokenize(tok)
                tokenized_by_word.extend(tokenized_tok)
            assert tokenized == tokenized_by_word

            # 3. our tokenizer with metadata on "whitespace tokenized words"
            tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)

            # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
            assert tokenized_meta["tokens"] == tokenized, f"Failed using {tokenizer.__class__.__name__}"

            # verify that offsets align back to original text
            if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
                # contains [UNK] that are impossible to match back to original text space
                continue
            for tok, offset in zip(tokenized_meta["tokens"], tokenized_meta["offsets"]):
                # subword-tokens have special chars depending on model type. In order to align with
                # original text we need to get rid of them
                tok = re.sub(r"^(##|Ġ|▁)", "", tok)
                # tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
                original_tok = text[offset:offset + len(tok)]
                assert tok == original_tok, f"Offset alignment wrong for {tokenizer.__class__.__name__} and text '{text}'"

def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
    # Samples don't have labels during Inference mode
    if "label" in dict:
        dict["label"] = float(dict["label"])
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]

def split_text_token_wise_with_metadata(text, tokenizer, min_chunk_size=30, max_chunk_size=100):
    tokenized_text = tokenize_with_metadata(text, tokenizer)
    token_len = len(tokenized_text["tokens"])
    chunk_size = random.randint(min_chunk_size, max_chunk_size)
    # calculate nr of even chunks with chunksize < chunk_size
    nr_of_chunks = math.ceil(token_len / chunk_size)

    chunks = []
    for i, key in enumerate(tokenized_text.keys()):
        key_chunks = np.array_split(np.array(tokenized_text[key]), nr_of_chunks)
        # update each dict with chunked key
        for j in range(nr_of_chunks):
            if len(chunks) > j:
                chunks[j][key] = key_chunks[j].tolist()
            else:
                chunks.append({key: key_chunks[j].tolist()})

    # reconstruct clear text from offsets
    for k in range(nr_of_chunks):
        chunks[k]["clear_text"] = text[chunks[k]["offsets"][0]:chunks[k]["offsets"][-1] + 1]
    return chunks

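# A minimal usage sketch for split_text_token_wise_with_metadata (not part of the original code).
# Assumptions: the function above is in scope, a FARM Tokenizer can be loaded locally, and the
# model name and chunk sizes are purely illustrative.
def _example_split_text_token_wise_with_metadata():
    from farm.modeling.tokenization import Tokenizer

    tokenizer = Tokenizer.load("bert-base-cased", do_lower_case=False)
    text = "FARM stores token offsets so that chunks can be mapped back to the original text."
    chunks = split_text_token_wise_with_metadata(text, tokenizer, min_chunk_size=5, max_chunk_size=10)
    for chunk in chunks:
        # each chunk keeps tokens, offsets, start_of_word and the reconstructed clear_text
        print(len(chunk["tokens"]), chunk["clear_text"])
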
def test_fast_bert_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False, use_fast=True)

    # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt")
    tokenizer.add_tokens(new_tokens=["neverseentokens"])

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    # original tokenizer from transformer repo
    tokenized = tokenizer.tokenize(basic_text)
    assert tokenized == ['Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?', '#', '.',
                         'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars']

    # ours with metadata
    tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
    assert tokenized_meta["tokens"] == tokenized
    assert tokenized_meta["offsets"] == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58,
                                         59, 64, 65, 69, 70, 72]
    assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False,
                                               True, True, True, False, False, False, False, False, False, False]

def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name],
                                                       seq_b=None,
                                                       tokenizer=self.tokenizer,
                                                       max_seq_len=self.max_seq_len)
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
    # Samples don't have labels during Inference mode
    if "label" in dict:
        label = float(dict["label"])
        scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
        dict["label"] = scaled_label
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]

def apply_tokenization(self, dictionary):
    """
    This performs tokenization on all documents and questions. The result is a list (unnested)
    where each entry is a dictionary for one document-question pair (potentially multiple answers).
    """
    raw_baskets = []
    document_text = dictionary["context"]
    document_tokenized = tokenize_with_metadata(document_text, self.tokenizer)
    document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]]
    questions = dictionary["qas"]
    for question in questions:
        answers = []
        # For training and dev where labelled samples are read in from a SQuAD style file
        try:
            squad_id = question["id"]
            question_text = question["question"]
            for answer in question["answers"]:
                a = {"text": answer["text"],
                     "offset": answer["answer_start"]}
                answers.append(a)
        # For inference where samples are read in as dicts without an id or answers
        except TypeError:
            squad_id = None
            question_text = question
        question_tokenized = tokenize_with_metadata(question_text, self.tokenizer)
        question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]]
        if "is_impossible" not in question:
            is_impossible = False
        else:
            is_impossible = question["is_impossible"]
        raw = {"document_text": document_text,
               "document_tokens": document_tokenized["tokens"],
               "document_offsets": document_tokenized["offsets"],
               "document_start_of_word": document_start_of_word,
               "question_text": question_text,
               "question_tokens": question_tokenized["tokens"],
               "question_offsets": question_tokenized["offsets"],
               "question_start_of_word": question_start_of_word,
               "answers": answers,
               "is_impossible": is_impossible,
               "squad_id": squad_id}
        raw_baskets.append(raw)
    return raw_baskets

def apply_tokenization(self, dictionary):
    """
    This performs tokenization on all documents and questions. The result is a list (unnested)
    where each entry is a dictionary for one document-question pair (potentially multiple answers).
    """
    raw_baskets = []
    document_text = dictionary["context"]
    document_tokenized = tokenize_with_metadata(document_text, self.tokenizer)
    document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]]
    questions = dictionary["qas"]
    for question in questions:
        squad_id = question["id"]
        question_text = question["question"]
        question_tokenized = tokenize_with_metadata(question_text, self.tokenizer)
        question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]]
        answers = []
        for answer in question["answers"]:
            a = {"text": answer["text"],
                 "offset": answer["answer_start"]}
            answers.append(a)
        if "is_impossible" not in question:
            is_impossible = False
        else:
            is_impossible = question["is_impossible"]
        raw = {"document_text": document_text,
               "document_tokens": document_tokenized["tokens"],
               "document_offsets": document_tokenized["offsets"],
               "document_start_of_word": document_start_of_word,
               "question_text": question_text,
               "question_tokens": question_tokenized["tokens"],
               "question_offsets": question_tokenized["offsets"],
               "question_start_of_word": question_start_of_word,
               "answers": answers,
               "is_impossible": is_impossible,
               "squad_id": squad_id}
        raw_baskets.append(raw)
    return raw_baskets

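# Hedged sketch (not from the original code) of the input dict shape that apply_tokenization above
# consumes, inferred from the keys it reads ("context", "qas", "id", "question", "answers",
# "answer_start", "is_impossible"); the concrete values are made up for illustration.
EXAMPLE_SQUAD_STYLE_DICT = {
    "context": "FARM is built on top of the transformers library.",
    "qas": [
        {
            "id": "example-0",  # becomes squad_id
            "question": "What is FARM built on?",
            "answers": [{"text": "the transformers library", "answer_start": 24}],
            "is_impossible": False,
        }
    ],
}
# processor.apply_tokenization(EXAMPLE_SQUAD_STYLE_DICT) would then return one raw basket per question.
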
def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    if "paragraphs" not in dictionary:  # TODO change this inference mode hack
        dictionary = self._convert_rest_api_dict(infer_dict=dictionary)
    samples = create_samples_squad(entry=dictionary)
    for sample in samples:
        tokenized = tokenize_with_metadata(text=" ".join(sample.clear_text["doc_tokens"]),
                                           tokenizer=self.tokenizer)
        sample.tokenized = tokenized
    return samples

def create_samples_sentence_pairs(baskets, tokenizer, max_seq_len):
    """Creates examples for Language Model Finetuning that consist of two sentences and the isNext label
    indicating if the two are subsequent sentences from one doc"""
    all_docs = [b.raw["doc"] for b in baskets]
    for basket in tqdm(baskets):
        doc = basket.raw["doc"]
        basket.samples = []
        for idx in range(len(doc) - 1):
            id = "%s-%s" % (basket.id, idx)
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_docs, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "is_next_label": is_next_label,
            }
            tokenized = {}
            tokenized["text_a"] = tokenize_with_metadata(text_a, tokenizer, max_seq_len)
            tokenized["text_b"] = tokenize_with_metadata(text_b, tokenizer, max_seq_len)
            basket.samples.append(Sample(id=id, clear_text=sample_in_clear_text, tokenized=tokenized))
    return baskets

def test_save_load(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        t.add_tokens(new_tokens=["neverseentokens"])
        tokenizers.append(t)

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    for tokenizer in tokenizers:
        save_dir = "testsave"
        tokenizer_type = tokenizer.__class__.__name__
        tokenizer.save_pretrained(save_dir)
        tokenizer_loaded = Tokenizer.load(save_dir, tokenizer_class=tokenizer_type)
        tokenized_before = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
        tokenized_after = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer_loaded)
        assert tokenized_before == tokenized_after

def _dict_to_samples(cls, dict, all_dicts=None):
    doc = dict["doc"]
    samples = []
    for idx in range(len(doc) - 1):
        text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
        sample_in_clear_text = {
            "text_a": text_a,
            "text_b": text_b,
            "is_next_label": is_next_label,
        }
        tokenized = {}
        tokenized["text_a"] = tokenize_with_metadata(text_a, cls.tokenizer, cls.max_seq_len)
        tokenized["text_b"] = tokenize_with_metadata(text_b, cls.tokenizer, cls.max_seq_len)
        samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    return samples

def _get_predictions_inner(sentence, tokenizer, model, device):
    meta = tokenize_with_metadata(sentence, tokenizer)
    sent_tokens, offsets, start_of_words = meta["tokens"], meta["offsets"], meta["start_of_word"]
    indexed_tokens = tokenizer.convert_tokens_to_ids(sent_tokens)

    # create 1 * T input token tensor
    tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
    tokens_tensor = tokens_tensor.to(device)

    with torch.no_grad():
        log_probs = model(tokens_tensor)[0].log_softmax(dim=2).squeeze()

    return list(zip(sent_tokens, indexed_tokens, (None,) + log_probs.unbind(), offsets, start_of_words))

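# Hedged usage sketch for _get_predictions_inner (not part of the original code). Assumptions: any
# language model whose first output is a [batch, seq_len, vocab] logits tensor works here, the FARM
# tokenizer is compatible with the chosen model, and "bert-base-cased" is only an illustrative choice.
def _example_get_predictions_inner():
    import torch
    from transformers import BertForMaskedLM
    from farm.modeling.tokenization import Tokenizer

    device = torch.device("cpu")
    tokenizer = Tokenizer.load("bert-base-cased", do_lower_case=False)
    model = BertForMaskedLM.from_pretrained("bert-base-cased").to(device).eval()

    predictions = _get_predictions_inner("FARM tracks token offsets.", tokenizer, model, device)
    for token, token_id, log_prob, offset, start_of_word in predictions:
        # log_prob is None for the first token, otherwise a per-vocab log-probability tensor
        print(token, token_id, offset, start_of_word)
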
def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name],
                                                       seq_b=None,
                                                       tokenizer=self.tokenizer,
                                                       max_seq_len=self.max_seq_len)
    # Samples don't have labels during Inference mode
    if "label" in dictionary:
        label = float(dictionary["label"])
        scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
        dictionary["label"] = scaled_label
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
    # TODO split samples that are too long in this function, related to todo in self._sample_to_features
    if "paragraphs" not in dict:  # TODO change this inference mode hack
        dict = cls._convert_inference(infer_dict=dict)
    samples = create_samples_squad(entry=dict)
    for sample in samples:
        tokenized = tokenize_with_metadata(text=" ".join(sample.clear_text["doc_tokens"]),
                                           tokenizer=cls.tokenizer,
                                           max_seq_len=cls.max_seq_len)
        sample.tokenized = tokenized
    return samples

def test_detokenization_in_fast_tokenizers(model_name):
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name, use_fast=True)
    for text in TEXTS:
        tokens_with_metadata = tokenize_with_metadata(text, tokenizer)
        tokens = tokens_with_metadata["tokens"]

        detokenized = " ".join(tokens)
        detokenized = re.sub(r"(^|\s+)(##)", "", detokenized)

        detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"]
        detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids]

        assert tokens == detokenized_tokens

def _dict_to_samples(cls, dict, all_dicts=None):
    """
    Converts a dict with a document to a sample (which will subsequently be featurized).
    It is used during prediction. This is a modified version of
    BertStyleLMProcessor._dict_to_samples from farm/data_handler/processor.py. It has been
    modified to create samples with just a single text, rather than two, as is the case for a
    normal BERT model.
    """
    doc = dict["doc"]
    samples = []
    for idx in range(len(doc) - 1):
        tokenized = {}
        tokenized["text_a"] = tokenize_with_metadata(doc[idx], cls.tokenizer, cls.max_seq_len)
        samples.append(Sample(id=None, clear_text={"doc": doc[idx]}, tokenized=tokenized))
    return samples

def get_sequence_pair(doc, chunk, chunk_clear_text, all_baskets, tokenizer, max_num_tokens, prob_next_sentence=0.5):
    """
    Get one sample from corpus consisting of two sequences. A sequence can consist of more than one sentence.
    With prob. 50% these are two subsequent sequences from one doc. With 50% the second sequence will be a
    random one from another document.

    :param doc: The current document.
    :type doc: [str]
    :param chunk: List of subsequent, tokenized sentences.
    :type chunk: [dict]
    :param chunk_clear_text: List of subsequent sentences.
    :type chunk_clear_text: [str]
    :param all_baskets: SampleBaskets containing multiple other docs from which we can sample the second
                        sequence if we need a random one.
    :type all_baskets: [dict]
    :param tokenizer: Used to split a sentence (str) into tokens.
    :param max_num_tokens: Samples are truncated after this many tokens.
    :type max_num_tokens: int
    :return: (list, list, dict, int)
        tokenized seq a, tokenized seq b, sample in clear text with label, number of unused sentences in chunk
    """
    sequence_a = []
    sequence_b = []
    sample_in_clear_text = {"text_a": "", "text_b": ""}

    # determine how many segments from chunk go into sequence_a
    len_sequence_a = 0
    a_end = 1
    if len(chunk) >= 2:
        a_end = random.randrange(1, len(chunk))
    for i in range(a_end):
        sequence_a.append(chunk[i])
        sample_in_clear_text["text_a"] += f"{chunk_clear_text[i]} "
        len_sequence_a += len(chunk[i]["tokens"])
    sample_in_clear_text["text_a"] = sample_in_clear_text["text_a"].strip()

    # actual next sequence
    if (random.random() > prob_next_sentence) and (len(chunk) > 1):
        label = True
        for i in range(a_end, len(chunk)):
            sequence_b.append(chunk[i])
            sample_in_clear_text["text_b"] += f"{chunk_clear_text[i]} "
        sample_in_clear_text["text_b"] = sample_in_clear_text["text_b"].strip()
        sample_in_clear_text["nextsentence_label"] = True
        num_unused_segments = 0

    # edge case: split sequence in half
    elif (len(chunk) == 1) and len_sequence_a >= max_num_tokens:
        sequence_a = {}
        sequence_b = {}
        if int(len(chunk[0]["tokens"]) / 2) >= max_num_tokens:
            boundary = int(max_num_tokens / 2)
        else:
            boundary = int(len(chunk[0]["tokens"]) / 2)
        sequence_a["tokens"] = chunk[0]["tokens"][:boundary]
        sequence_a["offsets"] = chunk[0]["offsets"][:boundary]
        sequence_a["start_of_word"] = chunk[0]["start_of_word"][:boundary]
        sequence_b["tokens"] = chunk[0]["tokens"][boundary:]
        sequence_b["start_of_word"] = chunk[0]["start_of_word"][boundary:]
        # get offsets for sequence_b right
        seq_b_offset_start = chunk[0]["offsets"][boundary]
        sequence_b["offsets"] = [offset - seq_b_offset_start for offset in chunk[0]["offsets"][boundary:]]
        # get clear text
        clear_text_boundary = chunk[0]["offsets"][boundary]
        sample_in_clear_text["text_a"] = chunk_clear_text[0][:clear_text_boundary].strip()
        sample_in_clear_text["text_b"] = chunk_clear_text[0][clear_text_boundary:].strip()
        sample_in_clear_text["nextsentence_label"] = True
        return [sequence_a], [sequence_b], sample_in_clear_text, 0

    # random next sequence
    else:
        label = False
        sequence_b_length = 0
        target_b_length = max_num_tokens - len_sequence_a
        random_doc = _get_random_doc(all_baskets, forbidden_doc=doc)
        random_start = random.randrange(len(random_doc))
        for i in range(random_start, len(random_doc)):
            current_sentence_tokenized = tokenize_with_metadata(random_doc[i], tokenizer)
            sequence_b.append(current_sentence_tokenized)
            sample_in_clear_text["text_b"] += f"{random_doc[i]} "
            sequence_b_length += len(current_sentence_tokenized["tokens"])
            if sequence_b_length >= target_b_length:
                break
        sample_in_clear_text["text_b"] = sample_in_clear_text["text_b"].strip()
        sample_in_clear_text["nextsentence_label"] = False

        # We didn't use all of the segments in chunk => put them back
        num_unused_segments = len(chunk) - a_end

    assert len(sequence_a) > 0
    assert len(sequence_b) > 0
    return sequence_a, sequence_b, sample_in_clear_text, num_unused_segments

def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]

def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]

def fit_s3e_on_corpus(processor, model, corpus, n_clusters=10, mean_removal=True,
                      pca_removal=True, pca_n_components=300, pca_n_top_components=10,
                      default_token_weight=1, min_token_occurrences=0,
                      svd_postprocessing=False, use_gpu=False, batch_size=50):
    """
    Pooling of word/token embeddings as described by Wang et al in the paper
    "Efficient Sentence Embedding via Semantic Subspace Analysis" (https://arxiv.org/abs/2002.09620)
    Adjusted their implementation from here: https://github.com/BinWang28/Sentence-Embedding-S3E

    This method fits the "model" on a custom corpus. This includes the derivation of token_weights
    depending on token occurrences in the corpus, creation of the semantic clusters via k-means and
    a couple of pre-/post-processing steps to normalize the embeddings.

    The resulting objects can be saved or directly passed to the Inferencer to get the actual
    embeddings for your sentences.
    Note: Some operations like `mean_removal` imply changes on the AdaptiveModel or Processor.
    That's why we return them.

    :param processor: FARM Processor with a Tokenizer used for reading the corpus (e.g. Inference Processor)
    :param model: FARM AdaptiveModel with an embedding layer in the LM (currently only supporting
                  'WordEmbedding_LM' as a language model)
    :param corpus: Path to a text file or a str
    :param n_clusters: Number of clusters for S3E. The more clusters, the higher the dimensionality
                       of the resulting embeddings.
    :param mean_removal: Bool, whether to remove the mean from the token embeddings (preprocessing)
    :param pca_removal: Bool, whether to remove pca components from the token embeddings (preprocessing)
    :param pca_n_components: int, how many PCA components to fit if `pca_removal` is enabled
    :param pca_n_top_components: int, how many top PCA components to remove if `pca_removal` is enabled
    :param default_token_weight: float, what weight to assign for tokens that are in vocab but not in corpus
    :param min_token_occurrences: int, minimum number of token occurrences in the corpus for keeping it
                                  in the vocab. Helps to shrink the model & speed it up.
    :param svd_postprocessing: Bool, whether to remove the top truncated SVD / LSA components from the
                               sentence embeddings (postprocessing).
                               Note: Requires creating all sentence embeddings once for the corpus,
                               slowing down this method substantially. Doesn't impact later inference
                               speed though.
    :param use_gpu: bool, whether to use a GPU
    :param batch_size: int, size of batch for the inferencer (only needed when `svd_postprocessing` is enabled)
    :return: model, processor, s3e_stats
    """
    from farm.infer import Inferencer
    from farm.modeling.tokenization import tokenize_with_metadata

    # Get tokens of corpus
    if isinstance(corpus, Path):
        logger.info("Reading corpus for fitting S3E ")
        with open(corpus, "r") as f:
            corpus = f.read()
    else:
        assert type(corpus) == str, "`corpus` must be of type str or Path()"
    tokenized_corpus = tokenize_with_metadata(corpus, processor.tokenizer)["tokens"]
    token_counts = dict(Counter(tokenized_corpus))
    n_tokens = sum(token_counts.values())

    # Trim vocab & embeddings to most frequent tokens (only to improve speed & ram consumption)
    model.language_model.trim_vocab(token_counts, processor, min_threshold=min_token_occurrences)

    # Normalize embeddings
    model.language_model.normalize_embeddings(zero_mean=mean_removal,
                                              pca_removal=pca_removal,
                                              pca_n_components=pca_n_components,
                                              pca_n_top_components=pca_n_top_components)
    normalized_word_embs = model.language_model.model.embeddings.cpu().numpy()

    # Get token weights
    token_weights = {}
    eps = 1e-3
    for word, id in processor.tokenizer.vocab.items():
        if word in token_counts:
            token_weights[id] = eps / (eps + token_counts[word] / n_tokens)
        else:
            # words that are in vocab but not present in corpus get the default weight
            token_weights[id] = default_token_weight

    # Construct Cluster
    weight_list = np.array(list(token_weights.values()))
    logger.info('Creating clusters for S3E embeddings')
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(normalized_word_embs, sample_weight=weight_list)

    s3e_stats = {"token_to_cluster": kmeans.labels_,
                 "centroids": kmeans.cluster_centers_,
                 "token_weights": token_weights,
                 "svd_components": None}

    if svd_postprocessing:
        logger.info('Post processing sentence embeddings using principal component removal')

        # Input
        sentences = [{"text": s} for s in corpus.split("\n") if len(s.strip()) > 0]

        # Get embeddings
        try:
            inferencer = Inferencer(model=model, processor=processor, task_type="embeddings",
                                    gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e",
                                    extraction_layer=-1, s3e_stats=s3e_stats)
            result = inferencer.inference_from_dicts(dicts=sentences)
        finally:
            inferencer.close_multiprocessing_pool()
        sentence_embeddings = [s["vec"] for s in result]
        sentence_embeddings = np.vstack(sentence_embeddings)

        # Principal Component Removal
        svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
        svd.fit(sentence_embeddings)
        s3e_stats["svd_components"] = svd.components_

    return model, processor, s3e_stats

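# Minimal, hedged usage sketch for fit_s3e_on_corpus (not part of the original code). It assumes you
# already built a FARM processor and an AdaptiveModel with a WordEmbedding_LM language model as
# described in the docstring above; their construction is deliberately left out here.
def _example_fit_s3e_on_corpus(processor, model):
    corpus = "One sentence per line works well here.\nAnother sentence for the toy corpus."
    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus,
                                                    n_clusters=3,
                                                    svd_postprocessing=False)
    # s3e_stats can then be passed to an Inferencer with extraction_strategy="s3e"
    # to obtain sentence embeddings for new texts.
    return model, processor, s3e_stats
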
def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    words = re.findall(r"<t>(.*?)</t>", dict["text"], flags=0)
    word_one = words[0]
    term_one_idx = -1
    term_two_idx = -1

    # find which occurrence of word_one is the one wrapped in <t>...</t>
    term_one_idxs = [m.start() for m in re.finditer(re.escape(word_one), dict["text"])]
    for idx, k in enumerate(term_one_idxs):
        try:
            if dict["text"][k-3:k] == '<t>':
                term_one_idx = idx
        except:
            pass

    if len(words) > 1:
        word_two = words[1]
        word_two_tokenized = tokenize_with_metadata(word_two, self.tokenizer, self.max_seq_len)['tokens']
        # find which occurrence of word_two is the one wrapped in <t>...</t>
        term_two_idxs = [m.start() for m in re.finditer(re.escape(word_two), dict["text"])]
        for idx, k in enumerate(term_two_idxs):
            try:
                if dict["text"][k-3:k] == '<t>':
                    term_two_idx = idx
            except:
                pass

    # remove the <t> markers before tokenizing the full text
    dict["text"] = re.sub(r'<t>', '', dict["text"])
    dict["text"] = re.sub(r'</t>', '', dict["text"])
    tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
    word_one_tokenized = tokenize_with_metadata(word_one, self.tokenizer, self.max_seq_len)['tokens']

    # initialize per-token labels: special tokens get their own ids, everything else is "N"
    x1, y = [], []
    for token in tokenized['tokens']:
        if token == '[CLS]':
            x1.append(5)
            y.append('[CLS]')
        elif token == '[SEP]':
            x1.append(4)
            y.append('[SEP]')
        else:
            x1.append(0)
            y.append('N')

    # mark the tokens of the first tagged term
    idx = find_overlap(word_one_tokenized, tokenized['tokens'], term_one_idx)
    if idx > -1:
        for x in range(0, len(word_one_tokenized)):
            x1[idx+x] = 1
            y[idx+x] = 'Y'
    else:
        print("-1--")
        print(word_one_tokenized)
        print(tokenized['tokens'])
        # reset labels if the first term could not be located
        x1, y = [], []
        for token in tokenized['tokens']:
            if token == '[CLS]':
                x1.append(5)
                y.append('[CLS]')
            elif token == '[SEP]':
                x1.append(4)
                y.append('[SEP]')
            else:
                x1.append(0)
                y.append('N')

    # mark the tokens of the second tagged term, if present
    if len(words) > 1:
        idx = find_overlap(word_two_tokenized, tokenized['tokens'], term_two_idx)
        if idx > -1:
            for x in range(0, len(word_two_tokenized)):
                y[idx+x] = 'Y'
                x1[idx+x] = 1
        else:
            print("-2--")
            print(word_two_tokenized)
            print(tokenized['tokens'])
            # reset labels if the second term could not be located
            x1, y = [], []
            for token in tokenized['tokens']:
                if token == '[CLS]':
                    x1.append(5)
                    y.append('[CLS]')
                elif token == '[SEP]':
                    x1.append(4)
                    y.append('[SEP]')
                else:
                    x1.append(0)
                    y.append('N')

    tokenized['custom_data'] = x1
    tokenized['ner_label'] = y
    dict['custom_data'] = x1
    dict['ner_label'] = y
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]