def hunpos_path():
    """
    Returns the system specific default install directory for HunPos binaries.

    The directory is the ``models`` directory under ``project_path()`` joined with the
    ``HUNPOS_SUBDIR_MAP`` entry for the current platform.

    :rtype : str|unicode
    :return: Path of the platform specific HunPos directory.
    :raises KeyError: If ``HUNPOS_SUBDIR_MAP`` has no entry for the current platform.
    """
    # Python 2 reports Linux as 'linux2' while Python 3.3+ reports plain 'linux'. The map has a
    # 'linux2' entry, so fold every Linux variant onto that key instead of failing with KeyError.
    platform_key = 'linux2' if sys.platform.startswith('linux') else sys.platform

    return os.path.join(project_path(), 'models', HUNPOS_SUBDIR_MAP[platform_key])
Пример #2
0
def hunpos_path():
    """
    Returns the system specific default install directory for HunPos binaries.

    The directory is the ``models`` directory under ``project_path()`` joined
    with the ``HUNPOS_SUBDIR_MAP`` entry for the current platform.

    :rtype : str|unicode
    :return: Path of the platform specific HunPos directory.
    :raises KeyError: If ``HUNPOS_SUBDIR_MAP`` has no entry for the current
      platform.
    """
    # Python 2 reports Linux as 'linux2' while Python 3.3+ reports plain
    # 'linux'. The map has a 'linux2' entry, so fold every Linux variant onto
    # that key instead of failing with KeyError.
    if sys.platform.startswith('linux'):
        platform_key = 'linux2'
    else:
        platform_key = sys.platform

    return os.path.join(project_path(), 'models',
                        HUNPOS_SUBDIR_MAP[platform_key])
def install_hunpos():
    """
    Downloads and installs system appropriate HunPos binaries in the default location.

    The archive listed in ``HUNPOS_URL_MAP`` for the current platform is fetched with
    ``download_file``, unpacked into the ``models`` directory under ``project_path()`` and the
    archive file is deleted afterwards.

    :rtype : None
    :raises KeyError: If ``HUNPOS_URL_MAP`` has no entry for the current platform.
    """
    # Python 2 reports Linux as 'linux2' while Python 3.3+ reports plain 'linux'. The map has a
    # 'linux2' entry, so fold every Linux variant onto that key instead of failing with KeyError.
    platform_key = 'linux2' if sys.platform.startswith('linux') else sys.platform

    models_dir = os.path.join(project_path(), 'models')

    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[platform_key], models_dir)

    # NOTE(review): extractall() trusts the member paths stored in the downloaded archive.
    try:
        if platform_key == 'win32':
            with ZipFile(hunpos_archive_fn) as f:
                f.extractall(models_dir)
        else:
            # The non-Windows downloads are gzip compressed tarballs (.tgz). TarFile.open() detects
            # the compression; the bare TarFile() constructor only reads uncompressed archives.
            with TarFile.open(hunpos_archive_fn) as f:
                f.extractall(models_dir)
    finally:
        # Remove the downloaded archive even when unpacking fails.
        os.remove(hunpos_archive_fn)
Пример #4
0
def install_hunpos():
    """
    Downloads and installs system appropriate HunPos binaries in the default location.

    The archive listed in ``HUNPOS_URL_MAP`` for the current platform is fetched with
    ``download_file``, unpacked into the ``models`` directory under ``project_path()`` and the
    archive file is deleted afterwards.

    :rtype : None
    :raises KeyError: If ``HUNPOS_URL_MAP`` has no entry for the current platform.
    """
    # sys.platform is 'linux2' on Python 2 but 'linux' on Python 3.3+. The map has a 'linux2'
    # entry, so every Linux variant is looked up under that key.
    if sys.platform.startswith('linux'):
        platform_key = 'linux2'
    else:
        platform_key = sys.platform

    models_dir = os.path.join(project_path(), 'models')

    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[platform_key], models_dir)

    # NOTE(review): extractall() trusts the member paths stored in the downloaded archive.
    try:
        if platform_key == 'win32':
            with ZipFile(hunpos_archive_fn) as f:
                f.extractall(models_dir)
        else:
            # The .tgz downloads are gzip compressed. TarFile.open() detects the compression,
            # whereas constructing TarFile() directly only handles uncompressed tar files.
            with TarFile.open(hunpos_archive_fn) as f:
                f.extractall(models_dir)
    finally:
        # Clean up the downloaded archive whether or not unpacking succeeded.
        os.remove(hunpos_archive_fn)
# coding=utf-8
import codecs
import os

from es_text_analytics.data.dataset import project_path
from es_text_analytics.tagger import FEATURES_MAP


# Norwegian lemmatizer based on Norsk Ordbank, http://www.edd.uio.no/prosjekt/ordbanken/data/index.html or
# http://www.nb.no/sprakbanken/show?serial=sbr-5&lang=nb
#
# Norsk Ordbank is not freely available and must be obtained from one of the URLs above.

# Default directory for the Norsk Ordbank data: data/ordbank_bm under the project path.
ORDBANK_BM_DEFAULT_PATH = os.path.join(
    project_path(), 'data', 'ordbank_bm')
# File name of the fullform data file.
FULLFORM_BM_FN = 'fullform_bm.txt'

# Field names for a fullform entry (presumably the column order of the data file - confirm
# against the parser).
FULLFORM_FIELDS = [
    'word_id',
    'lemma',
    'fullform',
    'morph_descr',
    'paradigm_code',
    'paradigm_entry',
]


def parse_fullform_file(f, feat_norm='simple'):
    """
    Parses the fullform data file in Norsk Ordbank and returns dicts indexed on the fullform and lemma respectively.

    All fullforms are lowercased.
    Morphological information is normalized to POS tags.

    :param f: file instance for reading the fullform Norsk Ordbank data file.
    :param feat_norm: Type of POS tag to normalize morphological information. Must correspond to POS tagger tagset
      if doing contextual lemmatization.
    :type feat_norm: str|unicode
    :rtype : (dict, dict)
from nltk.tag.hunpos import HunposTagger
from textblob.base import BaseTagger

from es_text_analytics.data.dataset import project_path, download_file
from es_text_analytics.tokenizer import NOTokenizer






# TextBlob compatible part-of-speech tagger for Norwegian.

# Default HunPos model files, both located in the models directory under the project path.
# (NOB/NNO presumably stand for Norwegian Bokmaal and Nynorsk - confirm.)
NOB_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(), 'models', 'nob-tagger-default-model')
NNO_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(), 'models', 'nno-tagger-default-model')

# Download URL of the HunPos binary archive for each supported platform, keyed on sys.platform.
# Linux appears under both 'linux2' (Python 2) and 'linux' (Python 3.3+) so that a lookup with
# sys.platform succeeds on either interpreter.
# NOTE(review): googlecode.com hosting has been retired - confirm these URLs still resolve.
HUNPOS_URL_MAP = {
    'linux2': 'https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz',
    'linux': 'https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz',
    'darwin': 'https://hunpos.googlecode.com/files/hunpos-1.0-macosx.tgz',
    'win32': 'https://hunpos.googlecode.com/files/hunpos-1.0-win.zip'
}

# Sub directory of the models directory that holds the HunPos binaries, keyed on sys.platform
# with the same platform keys as HUNPOS_URL_MAP.
HUNPOS_SUBDIR_MAP = {
    'win32': 'hunpos-1.0-win',
    'darwin': 'hunpos-1.0-macosx',
    'linux2': 'hunpos-1.0-linux',
    'linux': 'hunpos-1.0-linux'
}

Пример #7
0
import re
from tarfile import TarFile
from zipfile import ZipFile
import sys
from subprocess import Popen, PIPE

from nltk.tag.hunpos import HunposTagger
from textblob.base import BaseTagger

from es_text_analytics.data.dataset import project_path, download_file
from es_text_analytics.tokenizer import NOTokenizer

# TextBlob compatible part-of-speech tagger for Norwegian.

# Default HunPos model files, both located in the models directory under the
# project path. (NOB/NNO presumably stand for Norwegian Bokmaal and Nynorsk -
# confirm.)
NOB_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(),
    'models',
    'nob-tagger-default-model',
)
NNO_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(),
    'models',
    'nno-tagger-default-model',
)

# Download URL of the HunPos binary archive for each supported platform, keyed
# on sys.platform. Linux appears under both 'linux2' (Python 2) and 'linux'
# (Python 3.3+) so that a lookup with sys.platform succeeds on either
# interpreter.
# NOTE(review): googlecode.com hosting has been retired - confirm these URLs
# still resolve.
HUNPOS_URL_MAP = {
    'linux2': 'https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz',
    'linux': 'https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz',
    'darwin': 'https://hunpos.googlecode.com/files/hunpos-1.0-macosx.tgz',
    'win32': 'https://hunpos.googlecode.com/files/hunpos-1.0-win.zip'
}

# Sub directory of the models directory that holds the HunPos binaries, keyed
# on sys.platform with the same platform keys as HUNPOS_URL_MAP.
HUNPOS_SUBDIR_MAP = {
    'win32': 'hunpos-1.0-win',
    'darwin': 'hunpos-1.0-macosx',
    'linux2': 'hunpos-1.0-linux',
    'linux': 'hunpos-1.0-linux'
}
Пример #8
0
# coding=utf-8
import codecs
import os

from es_text_analytics.data.dataset import project_path
from es_text_analytics.tagger import FEATURES_MAP

# Norwegian lemmatizer based on Norsk Ordbank, http://www.edd.uio.no/prosjekt/ordbanken/data/index.html or
# http://www.nb.no/sprakbanken/show?serial=sbr-5&lang=nb
#
# Norsk Ordbank is not freely available and must be obtained from one of the URLs above.

# Default directory for the Norsk Ordbank data: data/ordbank_bm under the
# project path.
ORDBANK_BM_DEFAULT_PATH = os.path.join(
    project_path(),
    'data',
    'ordbank_bm',
)
# File name of the fullform data file.
FULLFORM_BM_FN = 'fullform_bm.txt'

# Field names for a fullform entry (presumably the column order of the data
# file - confirm against the parser).
FULLFORM_FIELDS = [
    'word_id',
    'lemma',
    'fullform',
    'morph_descr',
    'paradigm_code',
    'paradigm_entry',
]


def parse_fullform_file(f, feat_norm='simple'):
    """
    Parses the fullform data file in Norsk Ordbank and returns dicts indexed on the fullform and lemma respectively.

    All fullforms are lowercased.
    Morphological information is normalized to POS tags.

    :param f: file instance for reading the fullform Norsk Ordbank data file.
    :param feat_norm: Type of POS tag to normalize morphological information. Must correspond to POS tagger tagset
      if doing contextual lemmatization.