Example #1
def store_embeddings(data_points: Union[List[DataPoint], Dataset],
                     storage_mode: str):
    if isinstance(data_points, Dataset):
        data_points = list(_iter_dataset(data_points))

    # if memory mode option 'none', delete everything
    if storage_mode == "none":
        delete_keys = None
    # else delete only dynamic embeddings (otherwise autograd will keep everything in memory)
    else:
        # find out which ones are dynamic embeddings
        delete_keys = []
        data_point = data_points[0]
        if isinstance(data_point, Sentence):
            first_token = data_point[0]
            for name, vector in first_token._embeddings.items():
                if vector.requires_grad:
                    delete_keys.append(name)

        for name, vector in data_point._embeddings.items():
            if vector.requires_grad:
                delete_keys.append(name)

    # delete the selected embedding keys (all embeddings if delete_keys is None)
    for data_point in data_points:
        data_point.clear_embeddings(delete_keys)

    # memory management - option 1: send everything to CPU (pin to memory if we train on GPU)
    if storage_mode == "cpu":
        pin_memory = str(flair.device) != "cpu"
        for data_point in data_points:
            data_point.to("cpu", pin_memory=pin_memory)

    # record current embedding storage mode to allow optimization (for instance in FlairEmbeddings class)
    flair.embedding_storage_mode = storage_mode
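A minimal usage sketch for context (the training-loop objects batches, model, optimizer and the helper compute_loss are illustrative assumptions, not part of the snippet above):

# hypothetical training-loop excerpt
for batch in batches:
    loss = compute_loss(model, batch)  # stands in for the model's forward pass
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # "none" frees every embedding, "cpu" keeps only static embeddings and moves
    # them off the GPU, "gpu" leaves the static embeddings on the device
    store_embeddings(batch, storage_mode="cpu")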
Example #2
    def evaluate(
        self,
        data_points: Union[List[Sentence], Dataset],
        gold_label_type: str,
        out_path: Optional[Union[str, Path]] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 1,  # unnecessary, but trainer.train calls evaluate with this parameter
        num_workers: Optional[int] = 8,
        **kwargs,
    ) -> Result:

        if isinstance(data_points, Dataset):
            data_points = list(_iter_dataset(data_points))

        if self.regression:
            return self.evaluate_regression(
                sentences=data_points,
                out_path=out_path,
                embedding_storage_mode=embedding_storage_mode,
            )

        return self.evaluate_classification(
            sentences=data_points,
            out_path=out_path,
            embedding_storage_mode=embedding_storage_mode,
        )
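For context, a hedged call sketch (the classifier, corpus, and label-type name are assumptions for illustration only):

# evaluate a trained classifier on the test split; embeddings are discarded after scoring
result = classifier.evaluate(
    data_points=corpus.test,
    gold_label_type="class",          # illustrative label type
    out_path="predictions.txt",
    embedding_storage_mode="none",
)
print(result.main_score)  # the returned Result is assumed to expose an aggregate score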
Example #3
def test_sanity_not_too_many_entities(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    n_entities_per_sentence = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        n_entities_per_sentence.append(len(entities))
    avg_entities_per_sentence = sum(n_entities_per_sentence) / len(
        n_entities_per_sentence)

    assert avg_entities_per_sentence <= 5
Example #4
    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):

        folder_path = Path(folder_path)
        folder_path = folder_path / "WNLI.tsv"

        with open(folder_path, mode="w") as tsv_file:
            tsv_file.write("index\tprediction\n")
            datapoint: DataPair
            for index, datapoint in enumerate(_iter_dataset(self.eval_dataset)):
                tsv_file.write(str(index) + "\t" + datapoint.get_labels("entailment")[0].value + "\n")
Example #5
def test_sanity_no_long_entities(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    longest_entity: List[str] = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        for entity in entities:
            if len(entity.tokens) > len(longest_entity):
                longest_entity = [t.text for t in entity.tokens]

    assert len(longest_entity) < 10, " ".join(longest_entity)
Example #6
    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):

        folder_path = Path(folder_path)
        folder_path = folder_path / "MRPC.tsv"

        with open(folder_path, mode="w") as tsv_file:
            tsv_file.write("index\tprediction\n")
            datapoint: DataPair
            for index, datapoint in enumerate(_iter_dataset(self.test)):
                label = datapoint.get_labels("paraphrase")[0].value
                tsv_file.write(str(index) + "\t" + label + "\n")
Example #7
def test_sanity_no_unmatched_parentheses(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    unbalanced_entities = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        for entity in entities:
            entity_text = "".join(t.text for t in entity.tokens)
            if not has_balanced_parantheses(entity_text):
                unbalanced_entities.append(entity_text)

    assert unbalanced_entities == []
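The helper has_balanced_parantheses is not shown above; a minimal sketch of such a check, assuming it only needs to track round brackets:

def has_balanced_parantheses(text: str) -> bool:
    # True when every "(" is closed by a later ")" and no ")" appears unopened
    depth = 0
    for char in text:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth < 0:
                return False
    return depth == 0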
Example #8
    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):

        folder_path = Path(folder_path)
        glue_eval_tsv = "MNLI-m.tsv" if self.evaluate_on_matched else "MNLI-mm.tsv"
        folder_path = folder_path / glue_eval_tsv

        with open(folder_path, mode="w") as tsv_file:
            tsv_file.write("index\tprediction\n")
            datapoint: DataPair
            for index, datapoint in enumerate(_iter_dataset(self.eval_dataset)):
                label = datapoint.get_labels("textual_entailment")[0].value
                tsv_file.write(str(index) + "\t" + label + "\n")
Example #9
def test_sanity_not_starting_with_minus(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    entities_starting_with_minus = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        entities = sentence.get_spans("ner")
        for entity in entities:
            if str(entity.tokens[0].text).startswith("-"):
                entities_starting_with_minus.append(" ".join(
                    [t.text for t in entity.tokens]))

    assert len(entities_starting_with_minus) == 0, "|".join(
        entities_starting_with_minus)
Example #10
    def jsonl_from_eval_dataset(self, folder_path: Union[str, Path]):

        folder_path = Path(folder_path)
        folder_path = folder_path / "RTE.jsonl"

        with open(folder_path, mode="w") as jsonl_file:
            datapoint: DataPair
            for index, datapoint in enumerate(_iter_dataset(self.eval_dataset)):
                entry = {
                    "idx": index,
                    "label": datapoint.get_labels("textual_entailment")[0].value,
                }
                jsonl_file.write(str(entry) + "\n")
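Note that str(entry) writes the Python dict repr (single-quoted), which is not strict JSON. If valid JSON Lines output were required, a hedged alternative for the write call (not what the snippet above does) would be:

import json

# json.dumps produces double-quoted JSON that line-based parsers can read back
jsonl_file.write(json.dumps(entry) + "\n")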
Example #11
def test_sanity_no_repeating_Bs(CorpusType: Type[ColumnCorpus]):
    corpus = CorpusType()  # type: ignore
    longest_repeat_tokens: List[Token] = []
    repeat_tokens: List[Token] = []
    for sentence in _iter_dataset(corpus.get_all_sentences()):
        for token in sentence.tokens:
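            # a token opens a new span when its tag starts with "B" or "S" (BIOES scheme)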
            if token.get_labels()[0].value.startswith(
                    "B") or token.get_labels()[0].value.startswith("S"):
                repeat_tokens.append(token)
            else:
                if len(repeat_tokens) > len(longest_repeat_tokens):
                    longest_repeat_tokens = repeat_tokens
                repeat_tokens = []

    assert len(longest_repeat_tokens) < 4
Example #12
def store_embeddings(data_points: Union[List[DataPoint], Dataset],
                     storage_mode: str,
                     dynamic_embeddings: Optional[List[str]] = None):

    if isinstance(data_points, Dataset):
        data_points = list(_iter_dataset(data_points))

    # if memory mode option 'none' delete everything
    if storage_mode == "none":
        dynamic_embeddings = None

    # if dynamic embedding keys not passed, identify them automatically
    elif not dynamic_embeddings:
        dynamic_embeddings = identify_dynamic_embeddings(data_points[0])

    # always delete dynamic embeddings
    for data_point in data_points:
        data_point.clear_embeddings(dynamic_embeddings)

    # if storage mode is "cpu", send everything to CPU (pin to memory if we train on GPU)
    if storage_mode == "cpu":
        pin_memory = str(flair.device) != "cpu"
        for data_point in data_points:
            data_point.to("cpu", pin_memory=pin_memory)