Example #1
    def _parse_line_to_sentence(self, line: str, label_prefix: str,
                                tokenizer: Callable[[str], List[Token]]):
        words = line.split()

        labels = []
        l_len = 0

        for i in range(len(words)):
            if words[i].startswith(label_prefix):
                l_len += len(words[i]) + 1
                label = words[i].replace(label_prefix, "")
                labels.append(label)
            else:
                break

        text = line[l_len:].strip()

        if self.truncate_to_max_chars > 0:
            text = text[:self.truncate_to_max_chars]

        if text and labels:
            sentence = Sentence(text, use_tokenizer=tokenizer)

            for label in labels:
                sentence.add_label(self.label_type, label)

            if 0 < self.truncate_to_max_tokens < len(sentence):
                sentence.tokens = sentence.tokens[:self.truncate_to_max_tokens]

            return sentence
        return None
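
A minimal sketch of the fastText-style input this parser handles: each line starts with one or more prefixed labels followed by the text. The standalone helper and the "__label__" prefix below are illustrative, not part of the original class.

from flair.data import Sentence

def parse_fasttext_line(line, label_prefix="__label__"):
    # split off the leading "__label__xyz" tokens, keep the rest as text
    words = line.split()
    labels, consumed = [], 0
    for word in words:
        if not word.startswith(label_prefix):
            break
        labels.append(word[len(label_prefix):])
        consumed += len(word) + 1
    text = line[consumed:].strip()
    if not (text and labels):
        return None
    sentence = Sentence(text)
    for label in labels:
        sentence.add_label("class", label)
    return sentence

sentence = parse_fasttext_line("__label__positive __label__movie A surprisingly good film .")
print(sentence.get_labels("class"))  # two "class" labels: positive and movie
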
Example #2
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label('label', 'class_1')

    dev_sentence = Sentence("The sun is shining.", use_tokenizer=True).add_label('label', 'class_2')

    test_sentence = Sentence("Berlin is sunny.", use_tokenizer=True)
    test_sentence.add_label('label', 'class_1')
    test_sentence.add_label('label', 'class_2')

    class_to_count_dict = Corpus._count_sentence_labels(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 2 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
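
In normal use these private helpers are reached through Corpus.obtain_statistics(). A rough usage sketch, assuming a ClassificationCorpus folder with train/dev/test splits exists at the given path:

from flair.datasets import ClassificationCorpus

corpus = ClassificationCorpus("resources/my_corpus")
stats = corpus.obtain_statistics()  # summary of label counts and token statistics per split
print(stats)
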
Example #3
    def _get_tars_formatted_sentence(self, label, original_text, tars_label=None):
        label_text_pair = " ".join([self._get_cleaned_up_label(label),
                                    self.tars_model.document_embeddings.tokenizer.sep_token,
                                    original_text])
        label_text_pair_sentence = Sentence(label_text_pair)
        if tars_label is not None:
            if tars_label:
                label_text_pair_sentence.add_label(self.tars_model.label_type,
                                                   TARSClassifier.static_label_yes)
            else:
                label_text_pair_sentence.add_label(self.tars_model.label_type,
                                                   TARSClassifier.static_label_no)
        return label_text_pair_sentence
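
For intuition, the pair built above is simply "<cleaned label> <separator token> <original text>" wrapped in a Sentence, with a yes/no label attached when the gold answer is known. A purely illustrative example (the "[SEP]" separator and the label type/value are placeholders):

from flair.data import Sentence

pair = Sentence("sports [SEP] France won the world cup")
pair.add_label("tars_label", "YES")  # gold answer: the text does match the label "sports"
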
Example #4
def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
    for _, row in dataset.iterrows():
        # encode the SMILES string; skip rows that cannot be encoded
        res = encoder(row.smiles)
        if not res:
            continue
        # insert spaces after "]" and replace "." so whitespace tokenization works
        res = res.replace("]", "] ").replace(".", "DOT ")
        sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
        # turn every binary column into a positive (_P) or negative (_N) label
        for col, val in row.items():
            if isinstance(val, float):
                if val == 1.0:
                    sent.add_label(None, col.replace(" ", "_") + "_P ")
                if val == 0.0:
                    sent.add_label(None, col.replace(" ", "_") + "_N ")
        yield sent
Example #5
    def _parse_document_to_sentence(
        self,
        text: str,
        labels: List[str],
        tokenizer: Union[Callable[[str], List[Token]], Tokenizer],
    ):
        if self.max_chars_per_doc > 0:
            text = text[:self.max_chars_per_doc]

        if text and labels:
            sentence = Sentence(text, use_tokenizer=tokenizer)
            for label in labels:
                sentence.add_label(self.tag_type, label)

            if self.max_tokens_per_doc > 0:
                sentence.tokens = sentence.tokens[:min(len(sentence), self.max_tokens_per_doc)]

            return sentence
        return None
Example #6
def test_mixed_labels():
    # example sentence
    sentence = Sentence("I love New York")

    # has sentiment value
    sentence.add_label("sentiment", "positive")

    # has 4 part of speech tags
    sentence[1].add_label("pos", "verb")
    sentence[2].add_label("pos", "proper noun")
    sentence[3].add_label("pos", "proper noun")
    sentence[0].add_label("pos", "pronoun")

    # has 1 NER tag
    sentence[2:4].add_label("ner", "City")

    # should be in total 6 labels
    assert 6 == len(sentence.labels)
    assert 4 == len(sentence.get_labels("pos"))
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.get_labels("ner"))
Example #7
    def __getitem__(self, index: int = 0) -> Sentence:
        if self.in_memory:
            return self.sentences[index]
        else:
            row = self.raw_data[index]

            text = " ".join(
                [row[text_column] for text_column in self.text_columns])

            if self.max_chars_per_doc > 0:
                text = text[:self.max_chars_per_doc]

            sentence = Sentence(text, use_tokenizer=self.tokenizer)
            for column in self.column_name_map:
                if self.column_name_map[column].startswith(
                        "label") and row[column]:
                    sentence.add_label(self.label_type, row[column])

            if 0 < self.max_tokens_per_doc < len(sentence):
                sentence.tokens = sentence.tokens[:self.max_tokens_per_doc]

            return sentence
Example #8
def test_sentence_labels():
    # example sentence
    sentence = Sentence("I love Berlin")
    sentence.add_label("sentiment", "positive")
    sentence.add_label("topic", "travelling")

    assert 2 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.get_labels("topic"))

    # add another topic label
    sentence.add_label("topic", "travelling")
    assert 3 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 2 == len(sentence.get_labels("topic"))

    sentence.remove_labels("topic")
    assert 1 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 0 == len(sentence.get_labels("topic"))
Example #9
    def predict(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
        mini_batch_size: int = 32,
        **kwargs,
    ) -> List[Sentence]:
        """Predict method for running inference using the pre-trained sequence classifier model

        * **text** - String, list of strings, sentences, or list of sentences to run inference on
        * **mini_batch_size** - Mini batch size
        * **kwargs** (Optional) - Optional arguments for the Transformers classifier
        """
        id2label = self.model.config.id2label
        sentences = text
        results: List[Sentence] = []

        with torch.no_grad():
            if not sentences:
                return sentences

            if isinstance(sentences, DataPoint) or isinstance(sentences, str):
                sentences = [sentences]

            # filter empty sentences
            if isinstance(sentences[0], Sentence):
                sentences = [
                    sentence for sentence in sentences if len(sentence) > 0
                ]
            if len(sentences) == 0:
                return sentences

            # reverse sort all sequences by their length
            rev_order_len_index = sorted(range(len(sentences)),
                                         key=lambda k: len(sentences[k]),
                                         reverse=True)
            original_order_index = sorted(range(len(rev_order_len_index)),
                                          key=lambda k: rev_order_len_index[k])

            reordered_sentences: List[Union[DataPoint, str]] = [
                sentences[index] for index in rev_order_len_index
            ]
            # Turn all Sentence objects into strings
            if isinstance(reordered_sentences[0], Sentence):
                str_reordered_sentences = [
                    sentence.to_original_text()
                    for sentence in reordered_sentences
                ]
            else:
                str_reordered_sentences = reordered_sentences

            # Tokenize and get dataset
            dataset = self._tokenize(str_reordered_sentences)
            dataloader = DataLoader(dataset, batch_size=mini_batch_size)
            predictions: List[Tuple[str, float]] = []

            logger.info(f"Running prediction on {len(dataset)} text sequences")
            logger.info(f"Batch size = {mini_batch_size}")
            for batch in tqdm(dataloader, desc="Predicting text"):
                self.model.eval()
                batch = tuple(t.to(self.device) for t in batch)

                if len(batch) == 3:
                    inputs = {
                        "input_ids": batch[0],
                        "attention_mask": batch[1],
                        "token_type_ids": batch[2],
                    }
                else:
                    inputs = {
                        "input_ids": batch[0],
                        "attention_mask": batch[1]
                    }
                outputs = self.model(**inputs)
                logits = outputs[0]
                preds = torch.softmax(logits, dim=1).tolist()

                predictions += preds

            for text, pred in zip(str_reordered_sentences, predictions):
                # Initialize and assign labels to each class in each datapoint prediction
                text_sent = Sentence(text)
                for k, v in id2label.items():
                    text_sent.add_label(label_type="sc",
                                        value=v,
                                        score=pred[k])
                results.append(text_sent)

        # Order results back into original order
        results = [results[index] for index in original_order_index]

        return results
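
A usage sketch, assuming the surrounding class wraps a Hugging Face sequence-classification checkpoint; the class name and the load() call below are hypothetical:

classifier = TransformersSequenceClassifier.load("distilbert-base-uncased-finetuned-sst-2-english")
results = classifier.predict(["I really enjoyed this.", "Terrible service."], mini_batch_size=16)

for sentence in results:
    # each result carries one "sc" label per class id, scored with its softmax probability
    print(sentence.to_original_text(), sentence.get_labels("sc"))
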
Example #10
# print the sentence with all tags of this type
print(sentence.to_tagged_string())

###

from flair.data import Label

tag: Label = sentence[3].get_tag('ner')

print(
    f'"{sentence[3]}" is tagged as "{tag.value}" with confidence score "{tag.score}"'
)

###

sentence = Sentence('France is the current world cup winner.')

# add a label to a sentence
sentence.add_label('sports')

# a sentence can also belong to multiple classes
sentence.add_labels(['sports', 'world cup'])

# you can also set the labels while initializing the sentence
sentence = Sentence('France is the current world cup winner.',
                    labels=['sports', 'world cup'])

print(sentence)
for label in sentence.labels:
    print(label)
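
Note that the single-argument add_label, add_labels, and the labels= constructor argument shown here come from an older Flair release; in more recent versions every label carries a type. A rough modern equivalent (the type name 'topic' is the caller's choice):

from flair.data import Sentence

sentence = Sentence('France is the current world cup winner.')
sentence.add_label('topic', 'sports')
sentence.add_label('topic', 'world cup')

for label in sentence.get_labels('topic'):
    print(label)
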
Example #11
    def __init__(
        self,
        path_to_file: Union[str, Path],
        column_name_map: Dict[int, str],
        label_type: str = "class",
        max_tokens_per_doc: int = -1,
        max_chars_per_doc: int = -1,
        tokenizer=segtok_tokenizer,
        in_memory: bool = True,
        skip_header: bool = False,
        encoding: str = 'utf-8',
        **fmtparams,
    ):
        """
        Instantiates a Dataset for text classification from CSV column formatted data

        :param path_to_file: path to the file with the CSV data
        :param column_name_map: a column name map that indicates which column is text and which the label(s)
        :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
        :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
        :param use_tokenizer: If True, tokenizes the dataset, otherwise uses whitespace tokenization
        :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
        :param skip_header: If True, skips first line because it is header
        :param fmtparams: additional parameters for the CSV file reader
        :return: a Corpus with annotated train, dev and test data
        """

        if isinstance(path_to_file, str):
            path_to_file = Path(path_to_file)

        assert path_to_file.exists()

        # variables
        self.path_to_file = path_to_file
        self.in_memory = in_memory
        self.tokenizer = tokenizer
        self.column_name_map = column_name_map
        self.max_tokens_per_doc = max_tokens_per_doc
        self.max_chars_per_doc = max_chars_per_doc

        self.label_type = label_type

        # different handling of in_memory data than streaming data
        if self.in_memory:
            self.sentences = []
        else:
            self.raw_data = []

        self.total_sentence_count: int = 0

        # most data sets have the token text in the first column, if not, pass 'text' as column
        self.text_columns: List[int] = []
        for column in column_name_map:
            if column_name_map[column] == "text":
                self.text_columns.append(column)

        with open(self.path_to_file, encoding=encoding) as csv_file:

            csv_reader = csv.reader(csv_file, **fmtparams)

            if skip_header:
                next(csv_reader, None)  # skip the headers

            for row in csv_reader:

                # test if format is OK
                wrong_format = False
                for text_column in self.text_columns:
                    if text_column >= len(row):
                        wrong_format = True

                if wrong_format:
                    continue

                # test if at least one label given
                has_label = False
                for column in self.column_name_map:
                    if self.column_name_map[column].startswith(
                            "label") and row[column]:
                        has_label = True
                        break

                if not has_label:
                    continue

                if self.in_memory:

                    text = " ".join([
                        row[text_column] for text_column in self.text_columns
                    ])

                    if self.max_chars_per_doc > 0:
                        text = text[:self.max_chars_per_doc]

                    sentence = Sentence(text, use_tokenizer=self.tokenizer)

                    for column in self.column_name_map:
                        if (self.column_name_map[column].startswith("label")
                                and row[column]):
                            sentence.add_label(label_type, row[column])

                    if 0 < self.max_tokens_per_doc < len(sentence):
                        sentence.tokens = sentence.tokens[:self.max_tokens_per_doc]
                    self.sentences.append(sentence)

                else:
                    self.raw_data.append(row)

                self.total_sentence_count += 1
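
A usage sketch, assuming this constructor belongs to a CSVClassificationDataset-style class and that a file resources/reviews.csv with a header row exists; both the class name and the file are assumptions:

dataset = CSVClassificationDataset(
    path_to_file="resources/reviews.csv",
    column_name_map={0: "label_topic", 1: "text"},  # column 0 holds the label, column 1 the text
    label_type="topic",
    max_tokens_per_doc=256,
    skip_header=True,
    delimiter=",",  # forwarded to csv.reader via **fmtparams
)
print(dataset.total_sentence_count, "documents loaded")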