Example #1
    def __init__(self,
                 text: str,
                 context: List[str],
                 tokenizers: Optional[Dict[str, BaseTokenizer]] = None):
        if tokenizers is None:
            tokenizers = {"tokens": WordTokenizer()}
        self.text = text
        self.context = context
        self.tokenizers = tokenizers
        self.tokens: Dict[str, List[Any]] = defaultdict(list)
        self.namespaces = list(tokenizers.keys())
        for namespace in tokenizers.keys():
            self.namespaces.append(f"contextual_{namespace}")

        # add tokens for the word tokens
        for namespace, tokenizer in self.tokenizers.items():
            tokens = tokenizer.tokenize(text)
            for token in tokens:
                self.add_token(token=token, namespace=namespace)

        # add tokens for the contextual lines
        for namespace, tokenizer in self.tokenizers.items():
            for contextual_line in self.context:
                tokens = tokenizer.tokenize(contextual_line)
                tokens = [Token(tok) for tok in tokens]
                self.tokens[f"contextual_{namespace}"].append(tokens)

        self.line = Line(text=text, tokenizers=self.tokenizers)
        self.context_lines = []
        for context_text in self.context:
            context_line = Line(text=context_text, tokenizers=self.tokenizers)
            self.context_lines.append(context_line)
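A minimal usage sketch for the constructor above. The class name LineWithContext and the sciwing-style import path are assumptions for illustration, not confirmed by this section:

from sciwing.tokenizers.word_tokenizer import WordTokenizer

# Assumed class name for the __init__ above (hypothetical; adjust to the real class).
line_with_context = LineWithContext(
    text="The main sentence",
    context=["A previous sentence", "Another nearby sentence"],
    tokenizers={"tokens": WordTokenizer()},
)
# Each tokenizer namespace is mirrored by a "contextual_" namespace.
print(line_with_context.namespaces)  # ["tokens", "contextual_tokens"]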
Example #2
    def get_docs_labels_refs(
            self) -> Tuple[List[List[Line]], List[SeqLabel], List[Line]]:
        docs: List[List[Line]] = []
        labels: List[SeqLabel] = []
        refs: List[Line] = []

        with open(self.filename, "r", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                line_sents, line_labels, line_ref = line.split("###")
                sents: List[str] = [
                    sent.strip() for sent in line_sents.split("\t")
                ]
                sents_labels: List[str] = [
                    sent_label.strip() for sent_label in line_labels.split(",")
                ]
                sents_refs: str = line_ref

                doc = [
                    Line(text=sent, tokenizers=self.tokenizers)
                    for sent in sents
                ]
                label = SeqLabel(labels={"seq_label": sents_labels})
                ref = Line(text=sents_refs, tokenizers=self.tokenizers)
                docs.append(doc)
                labels.append(label)
                refs.append(ref)

        return docs, labels, refs
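The reader above expects one document per line, with "###" separating tab-joined sentences, comma-joined per-sentence labels, and a reference line. A small standalone sketch of that format, inferred from the parsing code (the sample line is illustrative):

# Illustrative input line for get_docs_labels_refs; the format is inferred
# from the split("###"), split("\t"), and split(",") calls above.
sample = "First sentence.\tSecond sentence.###keep,drop###A reference summary line."

line_sents, line_labels, line_ref = sample.split("###")
assert [s.strip() for s in line_sents.split("\t")] == ["First sentence.", "Second sentence."]
assert [l.strip() for l in line_labels.split(",")] == ["keep", "drop"]
assert line_ref == "A reference summary line."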
Example #3
def setup_scorer(abs_sum_dataset_manager):
    dataset_manager = abs_sum_dataset_manager
    scorer = SummarizationMetrics(dataset_manager)

    lines = [
        Line("word11_train word21_train"),
        Line("word12_train word22_train word32_train"),
    ]
    true_summary = [
        Line("word11_label word21_label"),
        Line("word11_label word22_label"),
    ]
    true_summary_tokens = ["word11_label", "word22_label", "word33_label"]
    pred_summary_tokens = [
        "word11_label",
        "word22_label",
        "word23_label",
        "word33_label",
    ]
    predicted_tags = {"predicted_tags_tokens": [[0, 2], [1, 4, 5]]}
    return (
        scorer,
        (lines, true_summary, predicted_tags),
        (true_summary_tokens, pred_summary_tokens),
    )
Example #4
def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5

    lines = []
    words = []
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)
    flat_texts = [[word for sentence in texts for word in sentence.split()]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()

    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder_outputs = (
        torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) * 0.5
        if request.param[2]
        else None
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )

    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )
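As a quick sanity check on the hidden/cell state shapes built above, a standalone sketch with illustrative parameter values:

import torch

# Illustrative values mirroring the fixture above.
NUM_LAYERS, BIDIRECTIONAL, HIDDEN_DIM, batch_size = 1, True, 1024, 3

num_direction = 2 if BIDIRECTIONAL else 1
h0 = torch.ones(NUM_LAYERS, batch_size, num_direction * HIDDEN_DIM) * 0.1
print(h0.shape)  # torch.Size([1, 3, 2048])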
Example #5
def setup_lstm2vecencoder(request):
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    embedder = WordEmbedder(embedding_type="glove_6B_50")

    encoder = LSTM2VecEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )

    texts = ["First sentence", "second sentence"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return (
        encoder,
        {
            "hidden_dim": 2 * hidden_dimension
            if bidirectional and combine_strategy == "concat"
            else hidden_dimension,
            "bidirectional": False,
            "combine_strategy": combine_strategy,
            "lines": lines,
        },
    )
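A hedged sketch of a test consuming this fixture. That LSTM2VecEncoder maps a list of Line objects to one vector per line is inferred from the fixture's structure, not stated in this section:

# Hedged sketch; pytest would inject the fixture by name.
def test_encoding_dimension(setup_lstm2vecencoder):
    encoder, options = setup_lstm2vecencoder
    encoding = encoder(lines=options["lines"])
    assert encoding.size() == (len(options["lines"]), options["hidden_dim"])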
Example #6
def setup_lines():
    texts = ["first line", "second line"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)
    return lines
Example #7
    def get_lines_labels(self) -> Tuple[List[Line], List[SeqLabel]]:
        lines: List[Line] = []
        labels: List[SeqLabel] = []

        with open(self.filename, "r", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                lines_and_labels = line.split(" ")
                words: List[str] = []
                word_labels: List[str] = []
                for word_line_labels in lines_and_labels:
                    word, word_label = word_line_labels.split("###")
                    word = word.strip()
                    word_label = word_label.strip()
                    words.append(word)
                    word_labels.append(word_label)

                line = Line(text=" ".join(words), tokenizers=self.tokenizers)
                label = SeqLabel(labels={"seq_label": word_labels})
                lines.append(line)
                labels.append(label)

        return lines, labels
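The reader splits each line into space-separated word###label pairs; a standalone sketch of the format, inferred directly from the parsing code:

# Illustrative input line for this reader (word###label pairs).
sample = "Alice###B-PER visited###O Paris###B-LOC"

words, word_labels = [], []
for word_line_labels in sample.split(" "):
    word, word_label = word_line_labels.split("###")
    words.append(word.strip())
    word_labels.append(word_label.strip())

assert words == ["Alice", "visited", "Paris"]
assert word_labels == ["B-PER", "O", "B-LOC"]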
Example #8
def setup_elmo_embedder():
    elmo_embedder = ElmoEmbedder()
    texts = ["I like to test elmo", "Elmo context embedder"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)
    return elmo_embedder, lines
Example #9
def setup_lines():
    texts = ["First sentence", "Second Sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)
    return lines
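A hedged sketch of what the two namespaces above expose, mirroring the character-tokenizer test later in this section (token contents assumed from that test's behavior):

# Hedged sketch; if setup_lines is a pytest fixture it would be injected
# rather than called directly.
lines = setup_lines()
tokens = lines[0].tokens
assert [tok.text for tok in tokens["tokens"]] == ["First", "sentence"]
assert [tok.text for tok in tokens["char_tokens"]][:5] == ["F", "i", "r", "s", "t"]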
Example #10
def lines():
    texts = ["First line", "Second Line which is longer"]
    lines = []
    for text in texts:
        line = Line(
            text=text, tokenizers={"tokens": WordTokenizer(tokenizer="vanilla")}
        )
        lines.append(line)

    return lines
Example #11
    def test_line_word_tokenizers(self):
        text = "This is a single line"
        line = Line(text=text, tokenizers={"tokens": WordTokenizer()})
        tokens = line.tokens
        assert [token.text for token in tokens["tokens"]] == [
            "This",
            "is",
            "a",
            "single",
            "line",
        ]
Example #12
def setup_bow_encoder(request):
    aggregation_type = request.param
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder, aggregation_type=aggregation_type)
    texts = ["First sentence", "second sentence"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return encoder, lines
Example #13
    def _form_line_label(self, text: str, labels: List[List[str]]):
        line = Line(text=text, tokenizers=self.tokenizers)
        labels_ = zip(*labels)
        labels_ = zip(self.column_names, labels_)
        labels_ = dict(labels_)
        if self.train_only:
            column_index = 0
            column_name = self.column_names[column_index]
            labels_ = {column_name: labels_[column_name]}
        label = SeqLabel(labels=labels_)
        return line, label
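The chained zip calls implement a transpose: per-token rows of labels become per-column label sequences keyed by column name. A standalone illustration of the same idiom:

# Standalone illustration of the zip(*rows) transpose used above.
column_names = ["pos", "dep", "ner"]
labels = [["NN", "nsubj", "O"], ["VB", "root", "O"]]  # one row per token

labels_ = dict(zip(column_names, zip(*labels)))
assert labels_ == {"pos": ("NN", "VB"), "dep": ("nsubj", "root"), "ner": ("O", "O")}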
Example #14
def setup_bow_elmo_encoder(request):
    layer_aggregation = request.param
    strings = [
        "I like to eat carrot", "I like to go out on long drives in a car"
    ]

    lines = []
    for string in strings:
        line = Line(text=string)
        lines.append(line)

    bow_elmo_embedder = BowElmoEmbedder(layer_aggregation=layer_aggregation)
    return bow_elmo_embedder, lines
Example #15
    def get_lines_labels(self,
                         start_token: str = "<SOS>",
                         end_token: str = "<EOS>") -> Tuple[List[Line], List[Line]]:
        lines: List[Line] = []
        labels: List[Line] = []

        with open(self.filename) as fp:
            for line in fp:
                line, label = line.split("###")
                line = line.strip()
                label = label.strip()
                line_instance = Line(text=line, tokenizers=self.tokenizers)
                label_instance = Line(text=label, tokenizers=self.tokenizers)
                for namespace, tokenizer in self.tokenizers.items():
                    line_instance.tokens[namespace].insert(
                        0, Token(start_token))
                    line_instance.tokens[namespace].append(Token(end_token))
                    label_instance.tokens[namespace].insert(
                        0, Token(start_token))
                    label_instance.tokens[namespace].append(Token(end_token))
                lines.append(line_instance)
                labels.append(label_instance)

        return lines, labels
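This reader expects one source###target pair per line, then wraps both sides with start and end tokens. A minimal sketch of the line format, inferred from the split("###") call (the sample is illustrative):

# Illustrative source###target line for this reader.
sample = "a source sentence###a target sentence"
line, label = sample.split("###")
assert line.strip() == "a source sentence"
assert label.strip() == "a target sentence"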
Example #16
    def make_line(self, line: str):
        """ Makes a line object from string, having some characteristics as the lines used
        by the datasets

        Parameters
        ----------
        line : str

        Returns
        -------
        Line

        """
        line_ = Line(text=line, tokenizers=self.train_dataset.tokenizers)
        return line_
Example #17
    def get_lines_labels(self) -> Tuple[List[Line], List[Label]]:
        lines: List[Line] = []
        labels: List[Label] = []

        with open(self.filename) as fp:
            for line in fp:
                line, label = line.split("###")
                line = line.strip()
                label = label.strip()
                line_instance = Line(text=line, tokenizers=self.tokenizers)
                label_instance = Label(text=label)
                lines.append(line_instance)
                labels.append(label_instance)

        return lines, labels
Example #18
    def test_line_char_tokenizer(self):
        text = "Word"
        line = Line(
            text=text,
            tokenizers={
                "tokens": WordTokenizer(),
                "chars": CharacterTokenizer()
            },
        )
        tokens = line.tokens
        word_tokens = tokens["tokens"]
        char_tokens = tokens["chars"]

        word_tokens = [tok.text for tok in word_tokens]
        char_tokens = [tok.text for tok in char_tokens]

        assert word_tokens == ["Word"]
        assert char_tokens == ["W", "o", "r", "d"]
Example #19
def setup_char_embedder(request, clf_dataset_manager):
    char_embedding_dim, hidden_dim = request.param
    dataset_manager = clf_dataset_manager
    embedder = CharEmbedder(
        char_embedding_dimension=char_embedding_dim,
        hidden_dimension=hidden_dim,
        datasets_manager=dataset_manager,
    )
    texts = ["This is sentence", "This is another sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)

    return embedder, lines
Example #20
def setup_lstm2seqencoder(request):
    HIDDEN_DIM = 1024
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]
    ADD_PROJECTION_LAYER = request.param[3]
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
        add_projection_layer=ADD_PROJECTION_LAYER,
    )

    lines = []
    texts = ["First sentence", "second sentence"]
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return (
        encoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": (
                2 * HIDDEN_DIM
                if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL and not ADD_PROJECTION_LAYER
                else HIDDEN_DIM
            ),
            "NUM_LAYERS": NUM_LAYERS,
            "LINES": lines,
            "TIME_STEPS": 2,
        },
    )
Example #21
    def _form_line_label(self, text: str, labels: List[List[str]]):
        line = Line(text=text, tokenizers=self.tokenizers)
        labels_ = zip(*labels)
        labels_ = zip(self.column_names, labels_)
        labels_ = dict(labels_)
        if self.train_only:
            if self.train_only == "pos":
                column_index = 0
            elif self.train_only == "dep":
                column_index = 1
            elif self.train_only == "ner":
                column_index = 2
            else:
                raise ValueError(
                    "train_only parameter can be one of [pos, dep, ner]")

            column_name = self.column_names[column_index]
            labels_ = {column_name: labels_[column_name]}
        label = SeqLabel(labels=labels_)
        return line, label
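The if/elif chain maps train_only to a column index; an equivalent lookup-table refactor, sketched here as an illustrative alternative rather than the source's implementation:

# Illustrative refactor of the train_only -> column-index mapping above.
TRAIN_ONLY_COLUMNS = {"pos": 0, "dep": 1, "ner": 2}

def column_index_for(train_only: str) -> int:
    try:
        return TRAIN_ONLY_COLUMNS[train_only]
    except KeyError:
        raise ValueError("train_only parameter can be one of [pos, dep, ner]")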
Example #22
def setup_bert_embedder(request):
    dropout_value = 0.0
    bert_type, aggregation_type = request.param

    bert_embedder = BertEmbedder(
        dropout_value=dropout_value,
        aggregation_type=aggregation_type,
        bert_type=bert_type,
    )
    strings = [
        "Lets start by talking politics",
        "there are radical ways to test your code",
    ]

    lines = []
    for string in strings:
        line = Line(text=string)
        lines.append(line)

    return bert_embedder, lines
Example #23
    def test_line_namespaces(self):
        text = "Single line"
        line = Line(text=text, tokenizers={"tokens": WordTokenizer()})
        assert line.namespaces == ["tokens"]
Example #24
    def _generate_lines_with_start_token(self):
        line = Line("")
        line.add_token(self.start_token, "tokens")
        return line
Example #25
    def forward(
        self,
        lines: List[Line],
        c0: torch.FloatTensor,
        h0: torch.FloatTensor,
        encoder_outputs: Optional[torch.FloatTensor] = None,
        teacher_forcing_ratio: float = 0,
    ) -> torch.Tensor:
        """

        Parameters
        ----------
        lines : list of Line objects
            Batched tokenized source sentence of shape [batch size].

        h0, c0 : 3d torch.FloatTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]

        Returns
        -------
        outputs : 3d torch.FloatTensor
            For each line in the batch, the predicted scores over the target
            vocabulary at every decoding step.
            Shape [batch size, max length, vocab size]
        """
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        if use_teacher_forcing:
            max_length = max(len(line.tokens["tokens"]) for line in lines)
        else:
            max_length = self.max_length
        batch_size = len(lines)

        # tensor to store decoder's output
        outputs = torch.zeros(max_length, batch_size,
                              self.vocab_size).to(self.device)

        # last hidden & cell state of the encoder is used as the decoder's initial hidden state
        if use_teacher_forcing:
            prediction, _, _ = self.forward_step(
                lines=lines, h0=h0, c0=c0, encoder_outputs=encoder_outputs)
            outputs[1:] = prediction.permute(1, 0, 2)[:-1]
        else:
            lines = [self._generate_lines_with_start_token()] * batch_size
            for i in range(1, max_length):
                prediction, hn, cn = self.forward_step(
                    lines=lines, h0=h0, c0=c0, encoder_outputs=encoder_outputs)
                prediction = prediction.squeeze(1)
                outputs[i] = prediction
                line_token_indexes = prediction.argmax(1)
                line_tokens = [
                    self.vocab.idx2token[line_token_index]
                    for line_token_index in line_token_indexes.cpu().numpy()
                ]
                lines = []
                for token in line_tokens:
                    line = Line("")
                    line.add_token(token, "tokens")
                    lines.append(line)
                h0, c0 = hn, cn
        outputs = outputs.permute(1, 0, 2)
        return outputs
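Finally, a hedged sketch of driving this forward with the setup_lstm2seqdecoder fixture from earlier in this section; the call pattern and shape check are inferred from the fixture and this method, not confirmed elsewhere:

# Hedged sketch; in a real test, pytest would inject the fixture and request.
decoder, options, encoder_outputs, (h0, c0) = setup_lstm2seqdecoder(request)
outputs = decoder(
    lines=options["LINES"],
    c0=c0,
    h0=h0,
    encoder_outputs=encoder_outputs,
    teacher_forcing_ratio=0.0,  # force the non-teacher-forcing branch
)
# Without teacher forcing, the decode length is options["MAX_LENGTH"].
assert outputs.shape == (len(options["LINES"]), options["MAX_LENGTH"], options["VOCAB_SIZE"])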