Example #1
    def _dict_to_samples(self, dictionary, all_dicts=None):
        assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
        doc = dictionary["doc"]
        samples = []

        # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
        for idx in range(len(doc) - 1):
            tokenized = {}
            if self.next_sent_pred:
                text_a, text_b, is_next_label = get_sentence_pair(
                    doc, all_dicts, idx)
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": text_b,
                    "nextsentence_label": is_next_label,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer)
                tokenized["text_b"] = tokenize_with_metadata(
                    text_b, self.tokenizer)
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], tokenized["text_b"][
                        seq_name], _ = truncate_sequences(
                            seq_a=tokenized["text_a"][seq_name],
                            seq_b=tokenized["text_b"][seq_name],
                            tokenizer=self.tokenizer,
                            max_seq_len=self.max_seq_len)
                # append one Sample per sentence pair, after all three sequences are truncated
                samples.append(
                    Sample(id=None,
                           clear_text=sample_in_clear_text,
                           tokenized=tokenized))
            # if we don't do next sentence prediction, we should feed in a single sentence
            else:
                text_a = doc[idx]
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": None,
                    "nextsentence_label": None,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer)
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=None,
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                # append one Sample per sentence, after all three sequences are truncated
                samples.append(
                    Sample(id=None,
                           clear_text=sample_in_clear_text,
                           tokenized=tokenized))
        return samples
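
For context, a minimal, self-contained sketch of the pair-sampling idea behind get_sentence_pair is shown below. The name sample_sentence_pair, the plain list-of-sentence-lists document format, and the exact 50/50 split are assumptions for illustration only, not the FARM implementation: half the time it keeps the true next sentence, otherwise it draws a random sentence from another document as the "not next" negative example.

import random

def sample_sentence_pair(doc, all_docs, idx, rng=random):
    """Simplified sketch of next-sentence-prediction pair sampling.

    doc: the current document as a list of sentences
    all_docs: all documents, each a list of sentences
    idx: index of the first sentence of the pair
    Returns (text_a, text_b, is_next_label).
    """
    text_a = doc[idx]
    # keep the true next sentence half of the time, otherwise sample a random
    # sentence from a different document as the negative example
    if rng.random() < 0.5:
        return text_a, doc[idx + 1], True
    other_docs = [d for d in all_docs if d is not doc and len(d) > 0]
    other_doc = rng.choice(other_docs)
    return text_a, rng.choice(other_doc), False

# toy usage with two documents
docs = [["Sentence A1.", "Sentence A2."], ["Sentence B1.", "Sentence B2."]]
print(sample_sentence_pair(docs[0], docs, 0))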
Example #2
def create_samples_sentence_pairs(baskets):
    """Creates examples for Language Model Finetuning that consist of two sentences and the isNext label indicating if
     the two are subsequent sentences from one doc"""
    all_docs = [b.raw["doc"] for b in baskets]
    for basket in baskets:
        doc = basket.raw["doc"]
        basket.samples = []
        for idx in range(len(doc) - 1):
            id = "%s-%s" % (basket.id, idx)
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_docs, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "is_next_label": is_next_label,
            }
            basket.samples.append(Sample(id=id, clear_text=sample_in_clear_text))
    return baskets
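
Assuming create_samples_sentence_pairs from Example #2 and the sample_sentence_pair sketch from Example #1 are both in scope, the toy invocation below shows the expected basket shape and the resulting per-sentence samples. The SimpleNamespace baskets and the minimal Sample class are stand-ins (assumptions, not FARM's SampleBasket/Sample API) used only to make the sketch runnable.

from types import SimpleNamespace

class Sample:                     # minimal stand-in for FARM's Sample (assumption)
    def __init__(self, id, clear_text, tokenized=None):
        self.id = id
        self.clear_text = clear_text
        self.tokenized = tokenized

get_sentence_pair = sample_sentence_pair  # reuse the sketch above as a stand-in

# each basket carries a document (list of sentences) under raw["doc"]
baskets = [
    SimpleNamespace(id="doc-0", raw={"doc": ["A1.", "A2.", "A3."]}, samples=None),
    SimpleNamespace(id="doc-1", raw={"doc": ["B1.", "B2."]}, samples=None),
]
for basket in create_samples_sentence_pairs(baskets):
    for sample in basket.samples:
        print(sample.id, sample.clear_text)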