from textblob.base import BaseTagger
from nltk.tag.hunpos import HunposTagger

from es_text_analytics.tokenizer import NOTokenizer
# NNO_TAGGER_DEFAULT_MODEL_FN, hunpos_tag_bin and clean_input are helpers
# defined elsewhere in the surrounding module (not shown in this excerpt).


class NNOTagger(BaseTagger, object):
    """
    TextBlob-compatible Norsk Nynorsk POS tagger class based on the NLTK HunPos wrapper.
    """
    def __init__(self, model_fn=None):
        self.tokenizer = NOTokenizer()
        # Fall back to the bundled Nynorsk model when no explicit model file is given.
        self.tagger = HunposTagger(model_fn or NNO_TAGGER_DEFAULT_MODEL_FN,
                                   hunpos_tag_bin(), encoding='utf-8')

    def tag(self, text, tokenize=True):
        # Normalize the raw input before tokenization and tagging.
        text = clean_input(text)

        if tokenize:
            text = self.tokenizer.tokenize(text)

        return self.tagger.tag(text)
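A usage sketch (not part of the original example), assuming the es_text_analytics package and its bundled Nynorsk HunPos model are available; TextBlob accepts custom tokenizer and pos_tagger objects in its constructor:

    from textblob import TextBlob

    blob = TextBlob(u'Dette er vårt hus.',
                    tokenizer=NOTokenizer(),
                    pos_tagger=NNOTagger())
    print(blob.tags)  # list of (word, tag) pairs produced by the HunPos model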
Example #3
"""
Generates word counts from a dataset.

Stores the counts in a Gensim Dictionary text file with id, word and count as tab-separated fields.
"""
import sys

from argparse import ArgumentParser

from gensim.corpora import Dictionary
from textblob import TextBlob

from es_text_analytics.data import newsgroups
from es_text_analytics.data.dataset import download_file, default_dataset_path
from es_text_analytics.data.ndt_dataset import NDTDataset
from es_text_analytics.tokenizer import NOTokenizer

NO_TOKENIZER = NOTokenizer()


def preprocess_ng(doc):
    # Newsgroups documents: lowercase the tokens from TextBlob's default tokenizer.
    return [w.lower() for w in TextBlob(doc['msg']).words]


def preprocess_ndt(doc):
    # NDT documents: tokenize with the Norwegian tokenizer, then lowercase.
    return [
        w.lower()
        for w in TextBlob(doc['content'], tokenizer=NO_TOKENIZER).words
    ]


def main():
    parser = ArgumentParser()
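The main() body is truncated here; per the module docstring, the script ends up storing the counts in a Gensim Dictionary text file. A minimal sketch of that last step (illustrative only; the token lists and output filename are made up) using the Dictionary class already imported above:

    # Illustrative only: in the real script the token lists would come from
    # preprocess_ng/preprocess_ndt applied to the chosen dataset.
    docs = [['dette', 'er', u'vårt', 'hus'], ['dette', 'er', 'huset']]
    vocab = Dictionary(docs)
    vocab.save_as_text('wordcounts.txt')  # writes tab-separated id, word, count lines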
Example #5
    def test_tokenize(self):
        # The Norwegian tokenizer splits the sentence-final period into its own token.
        tokenizer = NOTokenizer()
        self.assertEqual(['Dette', 'er', u'vårt', 'hus', '.'],
                         tokenizer.tokenize(u'Dette er vårt hus.'))