Example #1
def load_dicts(
    train_ds: FieldedDataset,
    ignore_e_x=False,
    pretrained_word_embeddings_file=None,
    morphlex_embeddings_file=None,
    known_chars_file=None,
):
    """Load all the modules for the model."""
    embeddings: Dict[Dicts, Tensor] = {}
    dictionaries: Dict[Dicts, VocabMap] = {}

    # Pretrained
    if pretrained_word_embeddings_file:
        m_map, m_embedding = read_pretrained_word_embeddings(
            pretrained_word_embeddings_file)
        embeddings[Dicts.Pretrained] = m_embedding
        dictionaries[Dicts.Pretrained] = m_map

    dictionaries[Dicts.Tokens] = train_ds.get_vocab_map(
        special_tokens=VocabMap.UNK_PAD)

    # MorphLex
    if morphlex_embeddings_file:
        # File is provided, use it.
        m_map, m_embedding = read_morphlex(morphlex_embeddings_file)
        embeddings[Dicts.MorphLex] = m_embedding
        dictionaries[Dicts.MorphLex] = m_map

    # Character mappings: if a file is provided, use it; otherwise build them from the dataset.
    if known_chars_file:
        char_vocab = Vocab.from_file(known_chars_file)
    else:
        char_vocab = train_ds.get_char_vocab()
    c_map = VocabMap(
        char_vocab,
        special_tokens=VocabMap.UNK_PAD_EOS_SOS,
    )
    dictionaries[Dicts.Chars] = c_map

    # TAGS (POS)
    # Copy the special tokens so that VocabMap.UNK_PAD is not mutated in place.
    special_tokens = list(VocabMap.UNK_PAD)
    tags = train_ds.get_vocab(field=Fields.GoldTags)
    if ignore_e_x:
        # Map "e" and "x" to PAD and drop them from the tag vocabulary.
        special_tokens.append(("e", PAD_ID))
        tags.remove("e")
        special_tokens.append(("x", PAD_ID))
        tags.remove("x")
    dictionaries[Dicts.FullTag] = VocabMap(tags, special_tokens=special_tokens)
    return embeddings, dictionaries
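
A minimal usage sketch for load_dicts, assuming a training TSV (file name is hypothetical) and no embedding files, so only the token, character and tag dictionaries are built:

# Hypothetical usage: build dictionaries from the training set alone,
# without pretrained or morphological-lexicon embeddings.
train_ds = FieldedDataset.from_file(
    "train.tsv", fields=(Fields.Tokens, Fields.GoldTags, Fields.GoldLemmas))
embeddings, dictionaries = load_dicts(train_ds, ignore_e_x=True)
print(list(dictionaries))  # e.g. [Dicts.Tokens, Dicts.Chars, Dicts.FullTag]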
Example #2
File: api.py Project: cadia-lvl/POS
def cast_types(sentences: Union[Sentences, FieldedDataset]) -> FieldedDataset:
    """Convert list/tuple to TokenizedDataset."""
    if type(sentences) == FieldedDataset:
        return cast(FieldedDataset, sentences)
    else:
        sentences = cast(Sentences, sentences)
        return FieldedDataset((sentences, ), fields=(Fields.Tokens, ))
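
A short usage sketch (the sentences are made up for illustration):

sentences = (("Þetta", "er", "setning", "."), ("Hæ", ))
ds = cast_types(sentences)   # wrapped in a FieldedDataset with a Tokens field
assert isinstance(ds, FieldedDataset)
assert cast_types(ds) is ds  # an existing FieldedDataset is passed through unchanged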
Example #3
File: api.py Project: cadia-lvl/POS
    def lemma_bulk(
        self,
        dataset: Union[Tuple[Sentences, Sentences], FieldedDataset],
        batch_size=16,
    ) -> Sentences:
        """Lemmatize multiple sentence. This is a faster alternative to "lemma_sent", used for batch processing.

        Args:
            dataset: A collection of tokenized (first) sentence and their tags; Tuple[Sentences, Sentences], Sentences=Tuple[Sentence, ...], Sentence=Tuple[str, ...] or FieldedDataset.
            batch_size: The number of sentences to process at once. Set it to as high as possible without blowing up the memory.

        Returns: The lemmas a Tuple[Tuple[str, ...], ...] or FieldedDataset.
        """
        if type(dataset) == tuple:
            dataset = cast(Tuple[Sentences, Sentences], dataset)
            ds = FieldedDataset(dataset,
                                fields=(Fields.Tokens, Fields.GoldTags))
        elif type(dataset) == FieldedDataset:
            dataset = cast(FieldedDataset, dataset)
            ds = dataset
        else:
            raise ValueError(
                "Bad input type. Use Tuple[Sentences, Sentences] or FieldedDataset"
            )

        return self._infer(ds, batch_size=batch_size).get_field(Fields.Lemmas)
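
A usage sketch with the tuple input format described in the docstring; lemmatizer is assumed to be an instance of the surrounding class, and the tag strings are illustrative only:

tokens = (("Þetta", "er", "setning", "."), ("Hæ", ))
tags = (("fahen", "sfg3en", "nven", "pl"), ("au", ))
lemmas = lemmatizer.lemma_bulk((tokens, tags), batch_size=2)
# One tuple of lemmas per input sentence, aligned with the tokens.
assert len(lemmas) == 2 and len(lemmas[0]) == 4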
Example #4
def test_dataset_from_file(test_tsv_lemma_file):
    test_ds = FieldedDataset.from_file(test_tsv_lemma_file,
                                       fields=(Fields.Tokens, Fields.GoldTags,
                                               Fields.GoldLemmas))
    tmp = test_ds[0]
    print(tmp)
    assert tmp == (("Hæ", ), ("a", ), ("hæ", ))
    assert len(test_ds) == 3
Example #5
def run_model(model, data_in, output, batch_size, field):
    log.info("Reading dataset")
    ds = FieldedDataset.from_file(data_in)
    predicted_tags = model.tag_bulk(dataset=ds, batch_size=batch_size)
    ds = ds.add_field(predicted_tags, field)
    log.info("Writing results")
    ds.to_tsv_file(output)
    log.info("Done!")
Example #6
def test_add_field(test_tsv_lemma_file):
    test_ds = FieldedDataset.from_file(test_tsv_lemma_file,
                                       fields=(Fields.Tokens, Fields.GoldTags,
                                               Fields.GoldLemmas))
    test_ds = test_ds.add_field(test_ds.get_field(Fields.GoldTags),
                                Fields.Tags)
    assert len(test_ds.fields) == 4
    for element in test_ds:
        assert len(element) == 4
Example #7
def test_tagger(test_tsv_untagged_file, pretrained_tagger: pos.Tagger):
    """Test all methods of the Tagger."""
    # Tag a single sentence
    tags = pretrained_tagger.tag_sent(("Þetta", "er", "setning", "."))
    assert tags == ("fahen", "sfg3en", "nven", "pl")

    # Tag a correctly formatted file.
    dataset = FieldedDataset.from_file(test_tsv_untagged_file)
    tags = pretrained_tagger.tag_bulk(dataset=dataset)
    print(tags)
    assert tags == (("au", ), ("fahen", "sfg3en", "nhen"), ("au", "aa"))
Example #8
def test_read_predicted(tagged_test_tsv_file):
    fields = (Fields.Tokens, Fields.GoldTags, Fields.Tags)
    pred_ds = FieldedDataset.from_file(tagged_test_tsv_file, fields=fields)
    assert pred_ds.get_field(Fields.Tokens) == (
        ("Hæ", ),
        ("Þetta", "er", "test"),
        ("Já", "Kannski"),
    )
    assert pred_ds.get_field(Fields.GoldTags) == (("a", ), ("f", "s", "n"),
                                                  ("a", "a"))
    assert pred_ds.get_field(Fields.Tags) == (("a", ), ("n", "s", "a"), ("a",
                                                                         "a"))
Example #9
File: api.py Project: cadia-lvl/POS
    def lemma_sent(self, sent: Sentence, tags: Sentence) -> Sentence:
        """Lemmatize a (single) sentence. To lemmatize multiple sentences at once (faster) use "lemma_bulk".

        Args:
            sent: A tokenized sentence; a Tuple[str, ...] (a tuple of strings)
            tags: The POS tags of the sentence; a Tuple[str, ...] (a tuple of strings)

        Returns: The lemmas for a sentence, a Tuple[str, ...] where the first element in the tuple corresponds to the first token in the input sentence.
        """
        ds = FieldedDataset(((sent, ), (tags, )),
                            fields=(Fields.Tokens, Fields.GoldTags))
        return self.lemma_bulk(ds, batch_size=1)[0]
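
A one-sentence usage sketch, assuming lemmatizer is an instance of the class above; the tags are the ones the tagger example elsewhere on this page produces for the same sentence:

lemmas = lemmatizer.lemma_sent(
    ("Þetta", "er", "setning", "."),
    ("fahen", "sfg3en", "nven", "pl"),
)
# One lemma per input token, in the same order.
assert len(lemmas) == 4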
Example #10
def evaluate_experiments(
    directories,
    fields,
    pretrained_vocab,
    morphlex_vocab,
    criteria,
    feature,
    up_to,
    skip_gold_ex,
):
    """Evaluate the model predictions in the directory. If the directory contains other directories, it will recurse into it."""
    directories = [pathlib.Path(directory) for directory in directories]
    fields = fields.split(",")
    accuracy_results = []
    profile = Counter()
    for directory in directories:
        ds = FieldedDataset.from_file(str(directory / "predictions.tsv"),
                                      fields=fields)
        train_tokens = str(directory / "known_toks.txt")
        train_lemmas = str(directory / "known_lemmas.txt")
        if criteria == "accuracy":
            accuracy_results.append(
                evaluate.get_accuracy_from_files(
                    feature,
                    ds,
                    train_tokens=train_tokens,
                    train_lemmas=train_lemmas,
                    morphlex_vocab=morphlex_vocab,
                    pretrained_vocab=pretrained_vocab,
                    skip_gold_ex=skip_gold_ex,
                ))
        elif criteria == "profile":
            profile += evaluate.get_profile_from_files(
                feature,
                ds,
                train_tokens=train_tokens,
                train_lemmas=train_lemmas,
                morphlex_vocab=morphlex_vocab,
                pretrained_vocab=pretrained_vocab,
                skip_gold_ex=skip_gold_ex,
            )
    if criteria == "accuracy":
        click.echo(
            evaluate.format_results(
                evaluate.all_accuracy_average(accuracy_results)))
    elif criteria == "profile":
        click.echo(f"Total errors: {sum(profile.values())}")
        pred_x_e_pattern = re.compile("^[ex] >")
        click.echo(
            f"Errors caused by model predicting 'x' and 'e': {sum(value for key, value in profile.items() if pred_x_e_pattern.search(key) is not None)}"
        )
        click.echo(evaluate.format_profile(profile, up_to=up_to))
Example #11
def test_tokenizer_preprocessing_and_postprocessing(ds: FieldedDataset,
                                                    electra_model):
    assert len(ds.fields) == len(ds.data)  # sanity check
    assert len(ds) == 3
    assert ds.get_lengths() == (1, 3, 2)
    # make sure the dataset contains sentences that are too long
    assert any(
        len(field) > 2 for sentence_fields in ds for field in sentence_fields)
    max_sequence_length = 2 + 2 + 6  # 2 extra for [SEP] and [CLS] and extra defined in function

    wemb = TransformerEmbedding(Modules.BERT, electra_model)
    chunked_ds = chunk_dataset(ds,
                               wemb.tokenizer,
                               max_sequence_length=max_sequence_length)
    assert len(chunked_ds) == 4
    chunked_lengths = chunked_ds.get_lengths()
    assert chunked_lengths == (1, 2, 1, 2)
    # All should be of acceptable length
    assert all(length <= max_sequence_length for length in chunked_lengths)
    dechunked_ds = dechunk_dataset(ds, chunked_ds)
    dechunked_lengths = dechunked_ds.get_lengths()
    assert dechunked_lengths == ds.get_lengths()
Example #12
def test_dataset_from_file_lemmas(test_tsv_lemma_file):
    test_ds = FieldedDataset.from_file(test_tsv_lemma_file,
                                       fields=(Fields.Tokens, Fields.GoldTags,
                                               Fields.GoldLemmas))
    tmp = test_ds[0]
    print(tmp)
    assert tmp == (("Hæ", ), ("a", ), ("hæ", ))
    assert len(test_ds) == 3
    assert test_ds.get_field(Fields.GoldLemmas) == (
        ("hæ", ),
        ("þetta", "vera", "test"),
        ("já", "kannski"),
    )
Example #13
def evaluate_predictions(predictions, fields, morphlex_vocab, pretrained_vocab,
                         train_tokens, train_lemmas, criteria, feature, up_to):
    """Evaluate predictions.

    Evaluate a single prediction file.

    Args:
        predictions: The tagged test file.
        fields: The fields present in the test file. Separated with ',', f.ex. 'tokens,gold_tags,tags'.
        morphlex_vocab: The location of the morphlex vocab.
        pretrained_vocab: The location of the pretrained vocab.
        train_tokens: The location of the tokens used in training.
        train_lemmas: The location of the lemmas used in training.
        criteria: The evaluation criteria.
        feature: Lemmas or tags?
        up_to: The number of errors for profile.
    """
    click.echo(f"Evaluating: {predictions}")
    ds = FieldedDataset.from_file(predictions, fields=tuple(fields.split(",")))
    if criteria == "accuracy":
        result = evaluate.get_accuracy_from_files(
            feature,
            ds,
            train_tokens=train_tokens,
            train_lemmas=train_lemmas,
            morphlex_vocab=morphlex_vocab,
            pretrained_vocab=pretrained_vocab,
        )
        click.echo(evaluate.format_result(result))
    elif criteria == "profile":
        result = evaluate.get_profile_from_files(
            feature,
            ds,
            train_tokens=train_tokens,
            train_lemmas=train_lemmas,
            morphlex_vocab=morphlex_vocab,
            pretrained_vocab=pretrained_vocab,
        )
        click.echo(evaluate.format_profile(result, up_to=up_to))
    else:  # confusion
        train_lemmas = Vocab.from_file(train_lemmas)
        morphlex_vocab = Vocab.from_file(morphlex_vocab)
        pretrained_vocab = Vocab.from_file(pretrained_vocab)
        evaluation = evaluate.TaggingLemmatizationEvaluation(
            test_dataset=ds,
            train_vocab=train_tokens,
            external_vocabs=evaluate.ExternalVocabularies(
                morphlex_vocab, pretrained_vocab),
            train_lemmas=train_lemmas,
        )
        click.echo(evaluation.lemma_tag_confusion_matrix())
Example #14
File: dataset.py Project: cadia-lvl/POS
def read_datasets(
    file_paths: List[str],
    fields=None,
) -> FieldedDataset:
    """Read tagged datasets from multiple files.

    Args:
        file_paths: The paths to the datasets.
        fields: The tagged fields in the dataset.
    """
    log.debug(f"Reading files={file_paths}")
    combined = reduce(
        add,
        (FieldedDataset.from_file(training_file, fields) for training_file in file_paths),
    )
    log.debug(f"Dataset length={len(combined)}")
    return combined
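
A usage sketch, assuming two training files in the same TSV format (the file names are hypothetical):

train_ds = read_datasets(
    ["train_part1.tsv", "train_part2.tsv"],
    fields=(Fields.Tokens, Fields.GoldTags, Fields.GoldLemmas),
)
# The datasets are concatenated, so the length is the sum of both files.
print(len(train_ds))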
Example #15
def lemma(data_in, output, device, batch_size, force_reload, force_download):
    """Lemma using tokens and PoS tags in a file.

    Args:
        data_in: A filepath of a file formatted as: token TAB PoS-tag per line, sentences separated with newlines (empty line).
        output: A filepath. Output is formatted like the input, but after each token TAB PoS-tag there is a tab and then the lemma.
        device: cpu or cuda
    """
    model: Tagger = torch.hub.load(
        repo_or_dir="cadia-lvl/POS",
        model="lemma",
        device=device,
        force_reload=force_reload,
        force_download=force_download,
    )
    log.info("Reading dataset")
    ds = FieldedDataset.from_file(data_in)
    predicted_tags = model.lemma_bulk(dataset=ds, batch_size=batch_size)
    ds = ds.add_field(predicted_tags, Fields.Lemmas)
    log.info("Writing results")
    ds.to_tsv_file(output)
    log.info("Done!")
Example #16
File: dataset.py Project: cadia-lvl/POS
def dechunk_dataset(original_ds: FieldedDataset, chunked_ds: FieldedDataset) -> FieldedDataset:
    """Reverse the chunking from the original dataset."""
    log.info("Reversing the splitting of sentences in order to fit BERT-like model")
    original_lengths = original_ds.get_lengths()
    return chunked_ds.adjust_lengths(original_lengths, shorten=False)
Example #17
File: dataset.py Project: cadia-lvl/POS
def chunk_dataset(ds: FieldedDataset, tokenizer: PreTrainedTokenizerFast, max_sequence_length) -> FieldedDataset:
    """Split up sentences which are too long."""
    log.info("Splitting sentences in order to fit BERT-like model")
    tokens = ds.get_field()
    lengths = get_adjusted_lengths(tokens, tokenizer, max_sequence_length=max_sequence_length)
    return ds.adjust_lengths(lengths, shorten=True)
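
A roundtrip sketch combining chunk_dataset and dechunk_dataset, assuming a FieldedDataset ds and a transformer wrapper that exposes its tokenizer (as in the tests elsewhere on this page):

wemb = TransformerEmbedding(Modules.BERT, electra_model)
chunked_ds = chunk_dataset(ds, wemb.tokenizer, max_sequence_length=512)
# ... run inference on chunked_ds ...
restored_ds = dechunk_dataset(ds, chunked_ds)
assert restored_ds.get_lengths() == ds.get_lengths()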
Example #18
def test_more_tokenization(electra_model):
    max_sequence_length = 512
    wemb = TransformerEmbedding(Modules.BERT, electra_model)
    tok = wemb.tokenizer
    # fmt: off
    test = [('Báðar', 'segjast', 'þær', 'hafa', 'verið', 'látnar', 'matast',
             'í', 'eldhúsinu', ',', 'eins', 'og', 'hjúum', 'var', 'gjarnan',
             'skipað', 'erlendis', ',', 'og', 'ekki', 'líkað', 'það', 'par',
             'vel', 'enda', 'vanar', 'meiri', 'virðingu', 'að', 'heiman', '.')]
    # fmt: on
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert lengts == (len(test[0]), )
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)
    # fmt: off
    test = [('Fræðslu-', 'og', 'kynningarfundur', 'kl.', '14', '.')]
    # fmt: on
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)
    # fmt: off
    test = [("9,754", "kl.")]
    # fmt: on
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert lengts == (len(test[0]), )
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)
    # fmt: off
    test = [('Kl.', '9', 'handavinna', ',', 'útskurður', ',', 'fótaaðgerð',
             'og', 'hárgreiðsla', '.')]
    # fmt: on
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert lengts == (len(test[0]), )
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)

    # fmt: off
    test = [
        ('ýsa', '36', '36', '36', '257', '9,252', 'Und.', 'þorskur', '62',
         '62', '62', '276', '17,112', 'Ýsa', '169', '34', '129', '5,967',
         '767,608', 'Þorskur', '204', '160', '174', '460', '79,831',
         'Þykkvalúra', '214', '214', '214', '1,070', 'Samtals', '143',
         '11,129', '1,592,544', 'FISKMARKAÐUR', 'VESTMANNAEYJA', 'Blálanga',
         '31', '31', '31', '76', '2,356', 'Grálúða', '180', '177', '178', '44',
         '7,833', 'Gullkarfi', '87', '87', '87', '124', '10,788', 'Hlýri',
         '76', '76', '76', '899', '68,325', 'Keila', '32', '32', '32', '41',
         '1,312', 'Keilubróðir', '4', 'Langa', '74', '67', '72', '1,111',
         '79,799', 'Lúða', '204', '181', '192', '83', '15,917', 'Skarkoli',
         '21', '21', '21', '5', '105', 'Skata', '97', '17', '68', '25',
         '1,705', 'Skötuselur', '181', '140', '165', '576', '94,846',
         'Steinbítur', '76', '69', '75', '454', '34,203', 'Stórkjafta', '12',
         '12', '12', '283', '3,396', 'Ufsi', '35', '25', '34', '5,384',
         '181,863', 'Und.', 'ufsi', '6', '6', '6', '30', '180', 'Ósundurliðað',
         '3', 'Ýsa', '55', '67', '89', '6,005', 'Þorskur', '186', '76', '141',
         '236', '33,375', 'Þykkvalúra', '190', '7', '160', '133', '21,244',
         'Samtals', '59', '9,600', '563,252', 'FISKMARKAÐUR', 'ÞÓRSHAFNAR',
         'Hlýri', '81', '81', '81', '770', '62,371', 'Langa', '59', '59', '59',
         '43', '2,537', 'Náskata', '15', '15', '15', '103', '1,545',
         'Steinbítur', '70', '70', '70', '131', '9,170', 'Ufsi', '22', '22',
         '22', '777', '17,094', 'Und.', 'ýsa', '29', '29', '29', '192',
         '5,568', 'Ýsa', '36', '36', '36', '120', '4,320', 'Samtals', '48',
         '2,136', '102,605', 'FISKMARKAÐURINN', 'Á', 'SKAGASTRÖND', 'Lúða',
         '459', '151', '298', '21', '6,251', 'Skata', '47', '47', '47', '12',
         '564', 'Ufsi', '15', '15', '15', '117', '1,755', 'Und.', 'þorskur',
         '63', '63', '63', '31,500', 'Ýsa', '172', '69', '133', '800',
         '106,700', 'Þorskur', '229', '90', '129', '6,690', '862,380',
         'Samtals', '124', '8,140', '1,009,150', 'FM', 'PATREKSFJARÐAR',
         'Lúða', '161', '161', '161', '24', '3,864', 'Skarkoli', '100', '100',
         '100', '129', '12,900', 'Steinbítur', '68', '68', '68', '75', '5,100',
         'Ufsi', '22', '22', '22', '1,042', '22,924', 'Und.', 'þorskur', '64',
         '64', '64', '1,258', '80,512', 'Ýsa', '122', '41', '86', '370',
         '31,937', 'Þorskur', '128', '72', '101', '14,733', '1,493,633',
         'Samtals', '94', '17,631')
    ]
    # fmt: on
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert sum(lengts) == len(test[0])
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)
    test = [("Síðan", "kom", "Gern", "hab", "'", "ich", "die", "Frau", "'",
             "n", "geküßt", "úr")]
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert sum(lengts) == len(test[0])
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)
    # fmt: off
    test = [('qt', '/', 'qt-1', '<', '1', '.')]
    # fmt: on
    lengts = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert sum(lengts) == len(test[0])
    ds = FieldedDataset((tuple(test), ), fields=("tokens", ))
    chunked_ds = chunk_dataset(ds,
                               tok,
                               max_sequence_length=max_sequence_length)
    assert chunked_ds is not None
Example #19
def test_adjust_lens(ds: FieldedDataset):
    lengths = tuple(1 for _ in range(sum(ds.get_lengths())))
    ds = ds.adjust_lengths(lengths, shorten=True)
    assert ds.get_lengths() == lengths
Example #20
def ds(test_tsv_lemma_file):
    """Return a sequence tagged dataset."""
    return FieldedDataset.from_file(test_tsv_lemma_file,
                                    fields=(Fields.Tokens, Fields.GoldTags,
                                            Fields.GoldLemmas))