def load_dicts(
    train_ds: FieldedDataset,
    ignore_e_x=False,
    pretrained_word_embeddings_file=None,
    morphlex_embeddings_file=None,
    known_chars_file=None,
):
    """Load all the modules for the model."""
    embeddings: Dict[Dicts, Tensor] = {}
    dictionaries: Dict[Dicts, VocabMap] = {}
    # Pretrained word embeddings
    if pretrained_word_embeddings_file:
        m_map, m_embedding = read_pretrained_word_embeddings(pretrained_word_embeddings_file)
        embeddings[Dicts.Pretrained] = m_embedding
        dictionaries[Dicts.Pretrained] = m_map
    dictionaries[Dicts.Tokens] = train_ds.get_vocab_map(special_tokens=VocabMap.UNK_PAD)
    # MorphLex
    if morphlex_embeddings_file:
        # A file is provided, use it.
        m_map, m_embedding = read_morphlex(morphlex_embeddings_file)
        embeddings[Dicts.MorphLex] = m_embedding
        dictionaries[Dicts.MorphLex] = m_map
    # Character mappings: if a file is provided, use it; otherwise build the vocabulary from the dataset.
    if known_chars_file:
        char_vocab = Vocab.from_file(known_chars_file)
    else:
        char_vocab = train_ds.get_char_vocab()
    c_map = VocabMap(
        char_vocab,
        special_tokens=VocabMap.UNK_PAD_EOS_SOS,
    )
    dictionaries[Dicts.Chars] = c_map
    # Tags (POS)
    special_tokens = list(VocabMap.UNK_PAD)  # copy so the shared UNK_PAD constant is not mutated
    tags = train_ds.get_vocab(field=Fields.GoldTags)
    if ignore_e_x:
        # Map the ignored tags to PAD and drop them from the tag vocabulary.
        special_tokens.append(("e", PAD_ID))
        tags.remove("e")
        special_tokens.append(("x", PAD_ID))
        tags.remove("x")
    dictionaries[Dicts.FullTag] = VocabMap(tags, special_tokens=special_tokens)
    return embeddings, dictionaries
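# Hedged usage sketch (added for illustration, not part of the original source): shows
# which keys load_dicts populates when no embedding files are given. `train_ds` is any
# training FieldedDataset.
def example_load_dicts(train_ds: FieldedDataset):
    embeddings, dictionaries = load_dicts(train_ds, ignore_e_x=False)
    # Token, character, and tag vocabularies are always built from the training data.
    assert Dicts.Tokens in dictionaries
    assert Dicts.Chars in dictionaries
    assert Dicts.FullTag in dictionaries
    assert not embeddings  # no pretrained or morphlex embedding files were given
    return dictionaries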
def cast_types(sentences: Union[Sentences, FieldedDataset]) -> FieldedDataset:
    """Convert a list/tuple of sentences to a FieldedDataset (pass a FieldedDataset through unchanged)."""
    if isinstance(sentences, FieldedDataset):
        return cast(FieldedDataset, sentences)
    else:
        sentences = cast(Sentences, sentences)
        return FieldedDataset((sentences,), fields=(Fields.Tokens,))
def lemma_bulk(
    self,
    dataset: Union[Tuple[Sentences, Sentences], FieldedDataset],
    batch_size=16,
) -> Sentences:
    """Lemmatize multiple sentences.

    This is a faster alternative to "lemma_sent", used for batch processing.

    Args:
        dataset: A collection of tokenized sentences (first) and their tags (second);
            a Tuple[Sentences, Sentences], where Sentences=Tuple[Sentence, ...] and
            Sentence=Tuple[str, ...], or a FieldedDataset.
        batch_size: The number of sentences to process at once. Set it as high as
            possible without running out of memory.

    Returns:
        The lemmas as Sentences, i.e. a Tuple[Tuple[str, ...], ...].
    """
    # Check for FieldedDataset first so it is not mistaken for a plain tuple.
    if isinstance(dataset, FieldedDataset):
        ds = dataset
    elif isinstance(dataset, tuple):
        dataset = cast(Tuple[Sentences, Sentences], dataset)
        ds = FieldedDataset(dataset, fields=(Fields.Tokens, Fields.GoldTags))
    else:
        raise ValueError(
            "Bad input type. Use Tuple[Sentences, Sentences] or FieldedDataset"
        )
    return self._infer(ds, batch_size=batch_size).get_field(Fields.Lemmas)
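# Hedged usage sketch (added for illustration, not part of the original source): shows
# the two input shapes lemma_bulk accepts. `lemmatizer` stands for any object exposing
# lemma_bulk, e.g. the model loaded through torch.hub in `lemma` further below; the tag
# strings are taken from the tagger test further below.
def example_lemma_bulk(lemmatizer):
    tokens = (("Þetta", "er", "setning", "."),)   # one tokenized sentence
    tags = (("fahen", "sfg3en", "nven", "pl"),)   # its POS tags, token-aligned
    # A plain Tuple[Sentences, Sentences] ...
    lemmas = lemmatizer.lemma_bulk((tokens, tags), batch_size=2)
    # ... or an equivalent FieldedDataset are both handled.
    ds = FieldedDataset((tokens, tags), fields=(Fields.Tokens, Fields.GoldTags))
    lemmas_from_ds = lemmatizer.lemma_bulk(ds, batch_size=2)
    return lemmas, lemmas_from_ds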
def test_dataset_from_file(test_tsv_lemma_file):
    test_ds = FieldedDataset.from_file(
        test_tsv_lemma_file,
        fields=(Fields.Tokens, Fields.GoldTags, Fields.GoldLemmas),
    )
    tmp = test_ds[0]
    print(tmp)
    assert tmp == (("Hæ",), ("a",), ("hæ",))
    assert len(test_ds) == 3
def run_model(model, data_in, output, batch_size, field):
    """Tag the file `data_in` with `model`, store the predictions in `field`, and write the result to `output`."""
    log.info("Reading dataset")
    ds = FieldedDataset.from_file(data_in)
    predicted_tags = model.tag_bulk(dataset=ds, batch_size=batch_size)
    ds = ds.add_field(predicted_tags, field)
    log.info("Writing results")
    ds.to_tsv_file(output)
    log.info("Done!")
def test_add_field(test_tsv_lemma_file):
    test_ds = FieldedDataset.from_file(
        test_tsv_lemma_file,
        fields=(Fields.Tokens, Fields.GoldTags, Fields.GoldLemmas),
    )
    test_ds = test_ds.add_field(test_ds.get_field(Fields.GoldTags), Fields.Tags)
    assert len(test_ds.fields) == 4
    for element in test_ds:
        assert len(element) == 4
def test_tagger(test_tsv_untagged_file, pretrained_tagger: pos.Tagger):
    """Test all methods of the Tagger."""
    # Tag a single sentence
    tags = pretrained_tagger.tag_sent(("Þetta", "er", "setning", "."))
    assert tags == ("fahen", "sfg3en", "nven", "pl")
    # Tag a correctly formatted file.
    dataset = FieldedDataset.from_file(test_tsv_untagged_file)
    tags = pretrained_tagger.tag_bulk(dataset=dataset)
    print(tags)
    assert tags == (("au",), ("fahen", "sfg3en", "nhen"), ("au", "aa"))
def test_read_predicted(tagged_test_tsv_file):
    fields = (Fields.Tokens, Fields.GoldTags, Fields.Tags)
    pred_ds = FieldedDataset.from_file(tagged_test_tsv_file, fields=fields)
    assert pred_ds.get_field(Fields.Tokens) == (
        ("Hæ",),
        ("Þetta", "er", "test"),
        ("Já", "Kannski"),
    )
    assert pred_ds.get_field(Fields.GoldTags) == (("a",), ("f", "s", "n"), ("a", "a"))
    assert pred_ds.get_field(Fields.Tags) == (("a",), ("n", "s", "a"), ("a", "a"))
def lemma_sent(self, sent: Sentence, tags: Sentence) -> Sentence:
    """Lemmatize a single sentence.

    To lemmatize multiple sentences at once (faster), use "lemma_bulk".

    Args:
        sent: A tokenized sentence; a Tuple[str, ...] (a tuple of strings).
        tags: The POS tags of the sentence; a Tuple[str, ...] (a tuple of strings).

    Returns:
        The lemmas for the sentence, a Tuple[str, ...], where the first element
        corresponds to the first token in the input sentence.
    """
    ds = FieldedDataset(((sent,), (tags,)), fields=(Fields.Tokens, Fields.GoldTags))
    return self.lemma_bulk(ds, batch_size=1)[0]
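# Hedged usage sketch (added for illustration, not part of the original source): a
# single-sentence call mirroring the docstring above; `lemmatizer` is any object
# exposing lemma_sent, and the tags match the tagger test above.
def example_lemma_sent(lemmatizer):
    lemmas = lemmatizer.lemma_sent(
        sent=("Þetta", "er", "setning", "."),
        tags=("fahen", "sfg3en", "nven", "pl"),
    )
    assert len(lemmas) == 4  # one lemma per input token
    return lemmas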
def evaluate_experiments(
    directories,
    fields,
    pretrained_vocab,
    morphlex_vocab,
    criteria,
    feature,
    up_to,
    skip_gold_ex,
):
    """Evaluate the model predictions in the given directories.

    If a directory contains other directories, it will recurse into them.
    """
    directories = [pathlib.Path(directory) for directory in directories]
    fields = fields.split(",")
    accuracy_results = []
    profile = Counter()
    for directory in directories:
        ds = FieldedDataset.from_file(str(directory / "predictions.tsv"), fields=fields)
        train_tokens = str(directory / "known_toks.txt")
        train_lemmas = str(directory / "known_lemmas.txt")
        if criteria == "accuracy":
            accuracy_results.append(
                evaluate.get_accuracy_from_files(
                    feature,
                    ds,
                    train_tokens=train_tokens,
                    train_lemmas=train_lemmas,
                    morphlex_vocab=morphlex_vocab,
                    pretrained_vocab=pretrained_vocab,
                    skip_gold_ex=skip_gold_ex,
                )
            )
        elif criteria == "profile":
            profile += evaluate.get_profile_from_files(
                feature,
                ds,
                train_tokens=train_tokens,
                train_lemmas=train_lemmas,
                morphlex_vocab=morphlex_vocab,
                pretrained_vocab=pretrained_vocab,
                skip_gold_ex=skip_gold_ex,
            )
    if criteria == "accuracy":
        click.echo(
            evaluate.format_results(evaluate.all_accuracy_average(accuracy_results))
        )
    elif criteria == "profile":
        click.echo(f"Total errors: {sum(profile.values())}")
        pred_x_e_pattern = re.compile("^[ex] >")
        x_e_errors = sum(
            value
            for key, value in profile.items()
            if pred_x_e_pattern.search(key) is not None
        )
        click.echo(f"Errors caused by the model predicting 'x' and 'e': {x_e_errors}")
        click.echo(evaluate.format_profile(profile, up_to=up_to))
def test_tokenizer_preprocessing_and_postprocessing(ds: FieldedDataset, electra_model):
    assert len(ds.fields) == len(ds.data)  # sanity check
    assert len(ds) == 3
    assert ds.get_lengths() == (1, 3, 2)
    # Be sure that there are too long sentences
    assert any(len(field) > 2 for sentence_fields in ds for field in sentence_fields)
    max_sequence_length = 2 + 2 + 6  # 2 extra for [SEP] and [CLS] and extra defined in function
    wemb = TransformerEmbedding(Modules.BERT, electra_model)
    chunked_ds = chunk_dataset(ds, wemb.tokenizer, max_sequence_length=max_sequence_length)
    assert len(chunked_ds) == 4
    chunked_lengths = chunked_ds.get_lengths()
    assert chunked_lengths == (1, 2, 1, 2)
    # All should be of acceptable length
    assert all(length <= max_sequence_length for length in chunked_lengths)
    dechunked_ds = dechunk_dataset(ds, chunked_ds)
    dechunked_lengths = dechunked_ds.get_lengths()
    assert dechunked_lengths == ds.get_lengths()
def test_dataset_from_file_lemmas(test_tsv_lemma_file):
    test_ds = FieldedDataset.from_file(
        test_tsv_lemma_file,
        fields=(Fields.Tokens, Fields.GoldTags, Fields.GoldLemmas),
    )
    tmp = test_ds[0]
    print(tmp)
    assert tmp == (("Hæ",), ("a",), ("hæ",))
    assert len(test_ds) == 3
    assert test_ds.get_field(Fields.GoldLemmas) == (
        ("hæ",),
        ("þetta", "vera", "test"),
        ("já", "kannski"),
    )
def evaluate_predictions(
    predictions,
    fields,
    morphlex_vocab,
    pretrained_vocab,
    train_tokens,
    train_lemmas,
    criteria,
    feature,
    up_to,
):
    """Evaluate predictions.

    Evaluate a single prediction file.

    Args:
        predictions: The tagged test file.
        fields: The fields present in the test file, separated with ',',
            e.g. 'tokens,gold_tags,tags'.
        morphlex_vocab: The location of the morphlex vocab.
        pretrained_vocab: The location of the pretrained vocab.
        train_tokens: The location of the tokens used in training.
        train_lemmas: The location of the lemmas used in training.
        criteria: The evaluation criteria.
        feature: Lemmas or tags?
        up_to: The number of errors to show in the profile.
    """
    click.echo(f"Evaluating: {predictions}")
    ds = FieldedDataset.from_file(predictions, fields=tuple(fields.split(",")))
    if criteria == "accuracy":
        result = evaluate.get_accuracy_from_files(
            feature,
            ds,
            train_tokens=train_tokens,
            train_lemmas=train_lemmas,
            morphlex_vocab=morphlex_vocab,
            pretrained_vocab=pretrained_vocab,
        )
        click.echo(evaluate.format_result(result))
    elif criteria == "profile":
        result = evaluate.get_profile_from_files(
            feature,
            ds,
            train_tokens=train_tokens,
            train_lemmas=train_lemmas,
            morphlex_vocab=morphlex_vocab,
            pretrained_vocab=pretrained_vocab,
        )
        click.echo(evaluate.format_profile(result, up_to=up_to))
    else:  # confusion
        train_lemmas = Vocab.from_file(train_lemmas)
        morphlex_vocab = Vocab.from_file(morphlex_vocab)
        pretrained_vocab = Vocab.from_file(pretrained_vocab)
        evaluation = evaluate.TaggingLemmatizationEvaluation(
            test_dataset=ds,
            train_vocab=train_tokens,
            external_vocabs=evaluate.ExternalVocabularies(morphlex_vocab, pretrained_vocab),
            train_lemmas=train_lemmas,
        )
        click.echo(evaluation.lemma_tag_confusion_matrix())
def read_datasets(
    file_paths: List[str],
    fields=None,
) -> FieldedDataset:
    """Read tagged datasets from multiple files.

    Args:
        file_paths: The paths to the datasets.
        fields: The tagged fields in the datasets.
    """
    log.debug(f"Reading files={file_paths}")
    combined = reduce(
        add,
        (FieldedDataset.from_file(training_file, fields) for training_file in file_paths),
    )
    log.debug(f"Dataset length={len(combined)}")
    return combined
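# Hedged usage sketch (added for illustration, not part of the original source): reads
# and concatenates two training files into one FieldedDataset; the file names below are
# hypothetical placeholders.
def example_read_datasets() -> FieldedDataset:
    combined = read_datasets(
        ["train_part1.tsv", "train_part2.tsv"],  # hypothetical paths
        fields=(Fields.Tokens, Fields.GoldTags),
    )
    # len(combined) is the total number of sentences across both files.
    return combined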
def lemma(data_in, output, device, batch_size, force_reload, force_download):
    """Lemmatize a file using its tokens and PoS tags.

    Args:
        data_in: A filepath of a file formatted as: token TAB PoS-tag per line,
            sentences separated with newlines (empty line).
        output: A filepath. The output is formatted like the input, but after each
            token TAB PoS-tag there is a TAB and then the lemma.
        device: cpu or cuda.
    """
    model: Tagger = torch.hub.load(
        repo_or_dir="cadia-lvl/POS",
        model="lemma",
        device=device,
        force_reload=force_reload,
        force_download=force_download,
    )
    log.info("Reading dataset")
    ds = FieldedDataset.from_file(data_in)
    predicted_lemmas = model.lemma_bulk(dataset=ds, batch_size=batch_size)
    ds = ds.add_field(predicted_lemmas, Fields.Lemmas)
    log.info("Writing results")
    ds.to_tsv_file(output)
    log.info("Done!")
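# Illustrative input format for `lemma` (derived from the docstring above; the rows are
# example data, with tags taken from the tagger test earlier in this section). One
# "token<TAB>PoS-tag" pair per line, with a blank line between sentences:
#
#     Þetta<TAB>fahen
#     er<TAB>sfg3en
#     setning<TAB>nven
#     .<TAB>pl
#
# The output file repeats these rows with a third, tab-separated column holding the
# predicted lemma.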
def dechunk_dataset(original_ds: FieldedDataset, chunked_ds: FieldedDataset) -> FieldedDataset:
    """Reverse the chunking from the original dataset."""
    log.info("Reversing the splitting of sentences in order to fit BERT-like model")
    original_lengths = original_ds.get_lengths()
    return chunked_ds.adjust_lengths(original_lengths, shorten=False)
def chunk_dataset(
    ds: FieldedDataset, tokenizer: PreTrainedTokenizerFast, max_sequence_length
) -> FieldedDataset:
    """Split up sentences which are too long."""
    log.info("Splitting sentences in order to fit BERT-like model")
    tokens = ds.get_field()
    lengths = get_adjusted_lengths(tokens, tokenizer, max_sequence_length=max_sequence_length)
    return ds.adjust_lengths(lengths, shorten=True)
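# Hedged usage sketch (added for illustration, not part of the original source): the
# chunk/dechunk round trip used around transformer inference, as exercised by the
# tokenizer test above. `wemb` is a TransformerEmbedding as constructed in that test.
def example_chunk_roundtrip(ds: FieldedDataset, wemb: TransformerEmbedding) -> FieldedDataset:
    chunked = chunk_dataset(ds, wemb.tokenizer, max_sequence_length=512)
    # ... run inference on `chunked` here ...
    return dechunk_dataset(ds, chunked)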
def test_more_tokenization(electra_model):
    max_sequence_length = 512
    wemb = TransformerEmbedding(Modules.BERT, electra_model)
    tok = wemb.tokenizer
    # fmt: off
    test = [('Báðar', 'segjast', 'þær', 'hafa', 'verið', 'látnar', 'matast', 'í', 'eldhúsinu', ',', 'eins', 'og', 'hjúum', 'var', 'gjarnan', 'skipað', 'erlendis', ',', 'og', 'ekki', 'líkað', 'það', 'par', 'vel', 'enda', 'vanar', 'meiri', 'virðingu', 'að', 'heiman', '.')]
    # fmt: on
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert lengths == (len(test[0]),)
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)

    # fmt: off
    test = [('Fræðslu-', 'og', 'kynningarfundur', 'kl.', '14', '.')]
    # fmt: on
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)

    # fmt: off
    test = [("9,754", "kl.")]
    # fmt: on
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert lengths == (len(test[0]),)
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)

    # fmt: off
    test = [('Kl.', '9', 'handavinna', ',', 'útskurður', ',', 'fótaaðgerð', 'og', 'hárgreiðsla', '.')]
    # fmt: on
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert lengths == (len(test[0]),)
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)

    # fmt: off
    test = [('ýsa', '36', '36', '36', '257', '9,252', 'Und.', 'þorskur', '62', '62', '62', '276', '17,112', 'Ýsa', '169', '34', '129', '5,967', '767,608', 'Þorskur', '204', '160', '174', '460', '79,831', 'Þykkvalúra', '214', '214', '214', '1,070', 'Samtals', '143', '11,129', '1,592,544', 'FISKMARKAÐUR', 'VESTMANNAEYJA', 'Blálanga', '31', '31', '31', '76', '2,356', 'Grálúða', '180', '177', '178', '44', '7,833', 'Gullkarfi', '87', '87', '87', '124', '10,788', 'Hlýri', '76', '76', '76', '899', '68,325', 'Keila', '32', '32', '32', '41', '1,312', 'Keilubróðir', '4', 'Langa', '74', '67', '72', '1,111', '79,799', 'Lúða', '204', '181', '192', '83', '15,917', 'Skarkoli', '21', '21', '21', '5', '105', 'Skata', '97', '17', '68', '25', '1,705', 'Skötuselur', '181', '140', '165', '576', '94,846', 'Steinbítur', '76', '69', '75', '454', '34,203', 'Stórkjafta', '12', '12', '12', '283', '3,396', 'Ufsi', '35', '25', '34', '5,384', '181,863', 'Und.', 'ufsi', '6', '6', '6', '30', '180', 'Ósundurliðað', '3', 'Ýsa', '55', '67', '89', '6,005', 'Þorskur', '186', '76', '141', '236', '33,375', 'Þykkvalúra', '190', '7', '160', '133', '21,244', 'Samtals', '59', '9,600', '563,252', 'FISKMARKAÐUR', 'ÞÓRSHAFNAR', 'Hlýri', '81', '81', '81', '770', '62,371', 'Langa', '59', '59', '59', '43', '2,537', 'Náskata', '15', '15', '15', '103', '1,545', 'Steinbítur', '70', '70', '70', '131', '9,170', 'Ufsi', '22', '22', '22', '777', '17,094', 'Und.', 'ýsa', '29', '29', '29', '192', '5,568', 'Ýsa', '36', '36', '36', '120', '4,320', 'Samtals', '48', '2,136', '102,605', 'FISKMARKAÐURINN', 'Á', 'SKAGASTRÖND', 'Lúða', '459', '151', '298', '21', '6,251', 'Skata', '47', '47', '47', '12', '564', 'Ufsi', '15', '15', '15', '117', '1,755', 'Und.', 'þorskur', '63', '63', '63', '31,500', 'Ýsa', '172', '69', '133', '800', '106,700', 'Þorskur', '229', '90', '129', '6,690', '862,380', 'Samtals', '124', '8,140', '1,009,150', 'FM', 'PATREKSFJARÐAR', 'Lúða', '161', '161', '161', '24', '3,864', 'Skarkoli', '100', '100', '100', '129', '12,900', 'Steinbítur', '68', '68', '68', '75', '5,100', 'Ufsi', '22', '22', '22', '1,042', '22,924', 'Und.', 'þorskur', '64', '64', '64', '1,258', '80,512', 'Ýsa', '122', '41', '86', '370', '31,937', 'Þorskur', '128', '72', '101', '14,733', '1,493,633', 'Samtals', '94', '17,631')]
    # fmt: on
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert sum(lengths) == len(test[0])
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)

    test = [("Síðan", "kom", "Gern", "hab", "'", "ich", "die", "Frau", "'", "n", "geküßt", "úr")]
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert sum(lengths) == len(test[0])
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)

    # fmt: off
    test = [('qt', '/', 'qt-1', '<', '1', '.')]
    # fmt: on
    lengths = get_adjusted_lengths(
        tuple(test),
        tok,
        max_sequence_length=max_sequence_length,
    )
    assert sum(lengths) == len(test[0])
    ds = FieldedDataset((tuple(test),), fields=("tokens",))
    chunked_ds = chunk_dataset(ds, tok, max_sequence_length=max_sequence_length)
    assert chunked_ds is not None
def test_adjust_lens(ds: FieldedDataset):
    lengths = tuple(1 for _ in range(sum(ds.get_lengths())))
    ds = ds.adjust_lengths(lengths, shorten=True)
    assert ds.get_lengths() == lengths
def ds(test_tsv_lemma_file):
    """Return a sequence tagged dataset."""
    return FieldedDataset.from_file(
        test_tsv_lemma_file,
        fields=(Fields.Tokens, Fields.GoldTags, Fields.GoldLemmas),
    )