Example #1
    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        # this tokenization also stores offsets and a start_of_word mask
        tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
        # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
        for seq_name in tokenized.keys():
            tokenized[seq_name], _, _ = truncate_sequences(
                seq_a=tokenized[seq_name],
                seq_b=None,
                tokenizer=self.tokenizer,
                max_seq_len=self.max_seq_len)
        return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
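The tokenized dict returned by tokenize_with_metadata holds parallel sequences (tokens, offsets, start_of_word), and the loop truncates every one of them with the same max_seq_len so they stay aligned. Below is a minimal sketch of that alignment idea using a hypothetical truncate_single helper; it is an illustration, not FARM's truncate_sequences.

# Minimal sketch, not FARM code: a hypothetical truncate_single() illustrating why
# every parallel sequence must be cut to the same length.
def truncate_single(seq, max_seq_len, num_special_tokens=2):
    # leave room for special tokens such as [CLS]/[SEP], which the real
    # truncate_sequences accounts for via the tokenizer
    return seq[: max_seq_len - num_special_tokens]

tokenized = {
    "tokens": ["Hello", "world", "!"],
    "offsets": [0, 6, 11],
    "start_of_word": [True, True, False],
}
tokenized = {name: truncate_single(seq, max_seq_len=4) for name, seq in tokenized.items()}
assert len({len(seq) for seq in tokenized.values()}) == 1  # sequences stay aligned
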
Example #2
File: processor.py Project: svmihar/FARM
    def _dict_to_samples(self, dictionary, all_dicts=None):
        assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
        doc = dictionary["doc"]
        samples = []

        # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
        for idx in range(len(doc) - 1):
            tokenized = {}
            if self.next_sent_pred:
                text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": text_b,
                    "nextsentence_label": is_next_label,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer
                )
                tokenized["text_b"] = tokenize_with_metadata(
                    text_b, self.tokenizer
                )
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=tokenized["text_b"][seq_name],
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
            # if we don't do next sentence prediction, we should feed in a single sentence
            else:
                text_a = doc[idx]
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": None,
                    "nextsentence_label": None,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer
                )
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=None,
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
        return samples
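For reference, dictionary["doc"] is expected to be a list of sentences and all_dicts a collection of several such documents (hence the assert above). A rough, made-up sketch of those input shapes follows; the actual sentence-pair sampling inside get_sentence_pair is not reproduced here.

# Hypothetical input shapes for _dict_to_samples (illustration only; values are made up).
dictionary = {"doc": ["First sentence.", "Second sentence.", "Third sentence."]}
all_dicts = [
    dictionary,
    {"doc": ["An unrelated document.", "It supplies random 'next' sentences."]},
]
# With next_sent_pred enabled, the loop above would yield len(doc) - 1 == 2 samples,
# each carrying text_a / text_b / nextsentence_label in clear_text plus the truncated
# tokenized sequences for both texts.
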
Example #3
File: processor.py Project: svmihar/FARM
    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        # this tokenization also stores offsets
        tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
        # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
        for seq_name in tokenized.keys():
            tokenized[seq_name], _, _ = truncate_sequences(
                seq_a=tokenized[seq_name],
                seq_b=None,
                tokenizer=self.tokenizer,
                max_seq_len=self.max_seq_len)
        # Samples don't have labels during Inference mode
        if "label" in dictionary:
            label = float(dictionary["label"])
            scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
            dictionary["label"] = scaled_label
        return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
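The regression branch rescales the raw label with the two values stored in self.tasks["regression"]["label_list"]; from the formula they act as a shift and a scale (for instance a mean and a standard deviation, though that interpretation is an assumption here). A small sketch of the arithmetic and its inverse:

# Sketch of the label scaling above; treating label_list as [shift, scale] is an assumption.
shift, scale = 2.5, 1.2
label = 4.0
scaled_label = (label - shift) / scale      # what the processor stores on the sample
restored = scaled_label * scale + shift     # inverse mapping, e.g. applied to predictions
assert abs(restored - label) < 1e-9
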
Example #4
import logging

from farm.modeling.tokenization import Tokenizer, truncate_sequences


def test_truncate_sequences(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        tokenizers.append(t)

    # artificial sequences (could be tokens, offsets, or anything else)
    seq_a = list(range(10))
    seq_b = list(range(15))
    max_seq_len = 20
    for tokenizer in tokenizers:
        for strategy in ["longest_first", "only_first", "only_second"]:
            trunc_a, trunc_b, overflow = truncate_sequences(
                seq_a=seq_a, seq_b=seq_b, tokenizer=tokenizer,
                max_seq_len=max_seq_len, truncation_strategy=strategy)

            assert len(trunc_a) + len(trunc_b) + tokenizer.num_added_tokens(pair=True) == max_seq_len
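The assertion states that, for every strategy, the two truncated sequences plus the tokenizer's special tokens for a pair (num_added_tokens(pair=True), e.g. [CLS] and two [SEP] for BERT) fill max_seq_len exactly. The following is a rough sketch of the "longest_first" idea on plain lists; it illustrates the strategy, not FARM's implementation.

# Illustrative sketch of "longest_first": trim the currently longer sequence one item
# at a time until the pair plus special tokens fits into max_seq_len.
def longest_first(seq_a, seq_b, max_seq_len, num_special_tokens):
    seq_a, seq_b = list(seq_a), list(seq_b)
    while len(seq_a) + len(seq_b) + num_special_tokens > max_seq_len:
        if len(seq_a) >= len(seq_b):
            seq_a.pop()
        else:
            seq_b.pop()
    return seq_a, seq_b

trunc_a, trunc_b = longest_first(list(range(10)), list(range(15)), max_seq_len=20, num_special_tokens=3)
assert len(trunc_a) + len(trunc_b) + 3 == 20  # 3 special tokens, as for a BERT sentence pair
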
Example #5
    def parts_to_sample(self, admission_part, discharge_part, label) -> Sample:
        tokenized = {"text_a": admission_part, "text_b": discharge_part}
        sample_in_clear_text = {
            "text_a": admission_part["clear_text"],
            "text_b": discharge_part["clear_text"],
            "nextsentence_label": label,
        }

        # truncate to max_seq_len
        for seq_name in ["tokens", "offsets", "start_of_word"]:
            tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                seq_a=tokenized["text_a"][seq_name],
                seq_b=tokenized["text_b"][seq_name],
                tokenizer=self.tokenizer,
                max_seq_len=self.max_seq_len)

        return Sample(id=None,
                      clear_text=sample_in_clear_text,
                      tokenized=tokenized)
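parts_to_sample expects each part to be a dict that already carries the clear text plus the tokenized parallel sequences; the required keys can be read off the code above. A hypothetical input, with values made up purely for illustration:

# Hypothetical inputs for parts_to_sample, inferred from the keys accessed above.
admission_part = {
    "clear_text": "Patient admitted with chest pain.",
    "tokens": ["Patient", "admitted", "with", "chest", "pain", "."],
    "offsets": [0, 8, 17, 22, 28, 32],
    "start_of_word": [True, True, True, True, True, False],
}
discharge_part = {
    "clear_text": "Discharged in stable condition.",
    "tokens": ["Discharged", "in", "stable", "condition", "."],
    "offsets": [0, 11, 14, 21, 30],
    "start_of_word": [True, True, True, True, False],
}
# sample = some_processor.parts_to_sample(admission_part, discharge_part, label=True)
# where some_processor is a hypothetical processor instance holding a tokenizer and max_seq_len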