Exemplo n.º 1
0
def load_chunks(fpath, limit=None):
    """Lazily parse and yield Chunk objects from the XML file at *fpath*.

    Streams <chunk> elements with pulldom to keep memory bounded; each
    <tok> inside a chunk becomes a Token built from its <orth> and <lex>
    children.  Stops after *limit* chunks when *limit* is not None.
    """
    with open(fpath, 'rb') as f:
        events = pulldom.parse(f, parser=_create_parser())
        chunk_id = 0
        # Iterate start events of top-level <chunk> elements.
        chunk_events = _start_events(events, 'chunk')
        for chunk in tqdm(chunk_events, desc=f'Loading chunks from {fpath}'):
            # NOTE(review): the loop variable shadows the outer `chunk`;
            # presumably this descends into nested <chunk> elements of the
            # corpus format — confirm against the XML schema.
            for chunk in _findall(chunk, 'chunk'):
                if chunk_id == limit:
                    return
                chunk_id += 1
                tokens = []
                for tok in _findall(chunk, 'tok'):
                    orth = _findvalue(tok, 'orth')  # surface form
                    lemmas = []
                    ctags = []
                    disamb_lemma = None
                    disamb_ctag = None
                    for lex in _findall(tok, 'lex'):
                        lemma = _findvalue(lex, 'base')
                        ctag = _findvalue(lex, 'ctag')
                        # A <lex disamb="1"> reading is the disambiguated
                        # (gold) interpretation; every other reading is kept
                        # as a candidate analysis.  If several readings carry
                        # disamb="1", the last one wins.
                        if lex.getAttribute('disamb') == '1':
                            disamb_lemma = lemma
                            disamb_ctag = ctag
                        else:
                            lemmas.append(lemma)
                            ctags.append(ctag)
                    token = Token(orth, lemmas, ctags, disamb_lemma,
                                  disamb_ctag)
                    tokens.append(token)

                yield Chunk(tokens)
Exemplo n.º 2
0
def test_MorphAnalyzer_unknown_word():
    """A word absent from the dictionary keeps itself as lemma, tagged 'ign'."""
    unknown_word = 'Kotkowicach'
    chunks = [Chunk([Token(unknown_word)])]

    analyzer = MorphAnalyzer(Dictionary({}))
    analyzer.analyze(chunks)

    analyzed = chunks[0].tokens[0]
    assert analyzed.lemmas == [unknown_word]
    assert analyzed.ctags == ['ign']
Exemplo n.º 3
0
def test_WordEmbedEncoder():
    """Encoding a chunk yields one 300-dim embedding per token."""
    # TODO we should mock this
    embeddings = KeyedVectors.load_word2vec_format(
        'data/nkjp+wiki-forms-all-300-skipg-ns.txt', limit=50)

    words = ['5', 'kilogramów', 'pomiodorów']
    chunks = [Chunk([Token(w) for w in words])]

    X = WordEmbedEncoder(embeddings).fit_transform(chunks)

    assert type(X) == np.ndarray
    assert X.shape == (1, 3, 300)
Exemplo n.º 4
0
def merge_chunks(analyzed_fpath, gold_fpath, limit=None):
    """Yield chunks combining analyzed readings with gold disambiguation.

    Pairs chunks from *analyzed_fpath* and *gold_fpath* positionally; each
    merged token takes orth/lemmas/ctags from the analyzed side and the
    disambiguated lemma/ctag from the gold side.

    Raises ValueError when a paired chunk has a different token count.
    """
    chunk_pairs = zip(load_chunks(analyzed_fpath, limit),
                      load_chunks(gold_fpath, limit))
    for analyzed_chunk, gold_chunk in chunk_pairs:
        if len(analyzed_chunk.tokens) != len(gold_chunk.tokens):
            raise ValueError('Invalid tokens number')
        merged_tokens = [
            Token(a.orth, a.lemmas, a.ctags, g.disamb_lemma, g.disamb_ctag)
            for a, g in zip(analyzed_chunk.tokens, gold_chunk.tokens)
        ]
        yield Chunk(merged_tokens)
Exemplo n.º 5
0
def test_MorphAnalyzer():
    """A dictionary word receives its dictionary lemma and ctag."""
    known_word = 'pomidorów'
    chunks = [Chunk([Token(known_word)])]

    entries = {known_word: [DictEntry(known_word, 'pomidor', 'xyz', '')]}
    analyzer = MorphAnalyzer(Dictionary(entries))

    analyzer.analyze(chunks)

    analyzed = chunks[0].tokens[0]
    assert analyzed.lemmas == ['pomidor']
    assert analyzed.ctags == ['xyz']
Exemplo n.º 6
0
def test_DisambCTagEncoder():
    """Encoder returns a per-category dict of padded one-hot arrays."""
    def make_chunk(pairs):
        # One Token per (orth, disambiguated ctag) pair.
        return Chunk([Token(orth, disamb_ctag=tag) for orth, tag in pairs])

    chunks = [
        make_chunk([('5', 'brev:pun'),
                    ('kilogramów', 'qub'),
                    ('pomidorów', 'conj')]),
        make_chunk([('5', 'brev:pun'),
                    ('kilogramów', 'qub'),
                    ('pomidorów', 'conj'),
                    ('i', 'conj'),
                    ('ogórków', 'conj')]),
    ]

    y = DisambCTagEncoder(tagset.categories).fit_transform(chunks)

    assert isinstance(y, dict)
    pos = y['pos']
    assert type(pos) == np.ndarray
    # Padded to the longest chunk (5 tokens), 35 part-of-speech classes.
    assert pos.shape == (2, 5, 35)
Exemplo n.º 7
0
def test_CTagsEncoder():
    """Candidate ctags encode to one 88-wide vector per token."""
    ambiguous = ['brev:pun', 'conj', 'prep:nom']
    chunks = [
        Chunk([
            # Fresh list copies so tokens do not share mutable state.
            Token('5', ctags=list(ambiguous)),
            Token('kilogramów', ctags=['qub']),
            Token('pomidorów', ctags=list(ambiguous)),
        ])
    ]

    X = CTagsEncoder(tagset.categories).fit_transform(chunks)

    assert type(X) == np.ndarray
    assert X.shape == (1, 3, 88)
Exemplo n.º 8
0
def test_KerasInputFormatter():
    """Smoke test: formatter combines word and tag encoders over one chunk."""
    # TODO Move somewhere else
    # TODO we should mock this
    embeddings = KeyedVectors.load_word2vec_format(
        'data/nkjp+wiki-forms-all-300-skipg-ns.txt', limit=5)

    formatter = KerasInputFormatter([
        ('word2vec', WordEmbedEncoder(embeddings)),
        ('tag2vec', CTagsEncoder(tagset.categories)),
    ])

    ambiguous = ['brev:pun', 'conj', 'prep:nom']
    chunks = [
        Chunk([
            Token('5', ctags=list(ambiguous)),
            Token('kilogramów', ctags=['qub']),
            Token('pomidorów', ctags=list(ambiguous)),
        ])
    ]

    X = formatter.fit_transform(chunks)

    print(X)