def hunpos_path():
    """
    Returns the system specific default install directory for HunPos binaries.

    The directory is built as ``<project_path()>/models/<subdir>`` where ``<subdir>``
    is looked up in ``HUNPOS_SUBDIR_MAP`` for the current platform.

    :rtype : str|unicode
    :return: Path of the platform specific HunPos directory.
    :raises KeyError: If ``HUNPOS_SUBDIR_MAP`` has no entry for the current platform.
    """
    platform_key = sys.platform

    # sys.platform is not reliably 'linux2' on Linux: it is 'linux' on Python 3.3+
    # and may be 'linux3' on some older builds. Fall back to the 'linux2' entry
    # for any Linux variant that has no entry of its own.
    if platform_key not in HUNPOS_SUBDIR_MAP and platform_key.startswith('linux'):
        platform_key = 'linux2'

    return os.path.join(project_path(), 'models', HUNPOS_SUBDIR_MAP[platform_key])
def hunpos_path():
    """
    Returns the system specific default install directory for HunPos binaries.

    The directory is built as ``<project_path()>/models/<subdir>`` where ``<subdir>``
    is looked up in ``HUNPOS_SUBDIR_MAP`` for the current platform.

    :rtype : str|unicode
    :return: Path of the platform specific HunPos directory.
    :raises KeyError: If ``HUNPOS_SUBDIR_MAP`` has no entry for the current platform.
    """
    platform_key = sys.platform

    # sys.platform is not reliably 'linux2' on Linux: it is 'linux' on Python 3.3+
    # and may be 'linux3' on some older builds. Fall back to the 'linux2' entry
    # for any Linux variant that has no entry of its own.
    if platform_key not in HUNPOS_SUBDIR_MAP and platform_key.startswith('linux'):
        platform_key = 'linux2'

    return os.path.join(project_path(), 'models', HUNPOS_SUBDIR_MAP[platform_key])
def install_hunpos():
    """
    Downloads and installs system appropriate HunPos binaries in the default location.

    The archive URL is looked up in ``HUNPOS_URL_MAP`` for the current platform,
    fetched with ``download_file`` into ``<project_path()>/models``, extracted there
    and then deleted. The archive is only deleted when extraction succeeded.

    :rtype : None
    :raises KeyError: If ``HUNPOS_URL_MAP`` has no entry for the current platform.
    """
    platform_key = sys.platform

    # sys.platform is not reliably 'linux2' on Linux: it is 'linux' on Python 3.3+
    # and may be 'linux3' on some older builds. Fall back to the 'linux2' entry
    # for any Linux variant that has no entry of its own.
    if platform_key not in HUNPOS_URL_MAP and platform_key.startswith('linux'):
        platform_key = 'linux2'

    models_dir = os.path.join(project_path(), 'models')
    # presumably download_file returns the local filename of the fetched archive;
    # it is used as such below -- verify against its definition.
    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[platform_key], models_dir)

    # NOTE(review): extractall() trusts the member paths stored in the downloaded
    # archive. Confirm the download source is trusted.
    if sys.platform == 'win32':
        with ZipFile(hunpos_archive_fn) as f:
            f.extractall(models_dir)
    else:
        # The non-Windows downloads are gzip-compressed (.tgz). The bare TarFile
        # constructor does not handle compression; TarFile.open() detects it
        # transparently.
        with TarFile.open(hunpos_archive_fn) as f:
            f.extractall(models_dir)

    os.remove(hunpos_archive_fn)
def install_hunpos():
    """
    Downloads and installs system appropriate HunPos binaries in the default location.

    The archive URL is looked up in ``HUNPOS_URL_MAP`` for the current platform,
    fetched with ``download_file`` into ``<project_path()>/models``, extracted there
    and then deleted. The archive is only deleted when extraction succeeded.

    :rtype : None
    :raises KeyError: If ``HUNPOS_URL_MAP`` has no entry for the current platform.
    """
    platform_key = sys.platform

    # sys.platform is not reliably 'linux2' on Linux: it is 'linux' on Python 3.3+
    # and may be 'linux3' on some older builds. Fall back to the 'linux2' entry
    # for any Linux variant that has no entry of its own.
    if platform_key not in HUNPOS_URL_MAP and platform_key.startswith('linux'):
        platform_key = 'linux2'

    models_dir = os.path.join(project_path(), 'models')
    # presumably download_file returns the local filename of the fetched archive;
    # it is used as such below -- verify against its definition.
    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[platform_key], models_dir)

    # NOTE(review): extractall() trusts the member paths stored in the downloaded
    # archive. Confirm the download source is trusted.
    if sys.platform == 'win32':
        with ZipFile(hunpos_archive_fn) as f:
            f.extractall(models_dir)
    else:
        # The non-Windows downloads are gzip-compressed (.tgz). The bare TarFile
        # constructor does not handle compression; TarFile.open() detects it
        # transparently.
        with TarFile.open(hunpos_archive_fn) as f:
            f.extractall(models_dir)

    os.remove(hunpos_archive_fn)
# coding=utf-8 import codecs import os from es_text_analytics.data.dataset import project_path from es_text_analytics.tagger import FEATURES_MAP # Norwegian lemmatizer based on Norsk Ordbank, http://www.edd.uio.no/prosjekt/ordbanken/data/index.html or # http://www.nb.no/sprakbanken/show?serial=sbr-5&lang=nb # # Norsk Ordbank is not freely available but must be obtained from one of the urls above. ORDBANK_BM_DEFAULT_PATH = os.path.join(project_path(), 'data', 'ordbank_bm') FULLFORM_BM_FN = 'fullform_bm.txt' FULLFORM_FIELDS = ['word_id', 'lemma', 'fullform', 'morph_descr', 'paradigm_code', 'paradigm_entry'] def parse_fullform_file(f, feat_norm='simple'): """ Parses the fullform data file in Norsk Ordbank and returns dicts indexed on the fullform and lemma respectively. All fullforms are lowercased. Morphological information is normalized to POS tags. :param f: file instance for reading the fullform Norsk Ordbank data file. :param feat_norm: Type of POS tag to normalize morphological information. Must correspond to POS tagger tagset if doing contextual lemmatization. :type feat_norm: str|unicode :rtype : (dict, dict)
from nltk.tag.hunpos import HunposTagger
from textblob.base import BaseTagger

from es_text_analytics.data.dataset import project_path, download_file
from es_text_analytics.tokenizer import NOTokenizer

# TextBlob compatible part-of-speech tagger for Norwegian.

# Default HunPos model locations, both under <project_path()>/models.
NOB_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(), 'models', 'nob-tagger-default-model')
NNO_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(), 'models', 'nno-tagger-default-model')

# Download URL of the HunPos binary archive, keyed on sys.platform values.
HUNPOS_URL_MAP = dict(
    linux2='https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz',
    darwin='https://hunpos.googlecode.com/files/hunpos-1.0-macosx.tgz',
    win32='https://hunpos.googlecode.com/files/hunpos-1.0-win.zip',
)

# Name of the HunPos directory below the models directory, keyed on
# sys.platform values.
HUNPOS_SUBDIR_MAP = dict(
    win32='hunpos-1.0-win',
    darwin='hunpos-1.0-macosx',
    linux2='hunpos-1.0-linux',
)
import re
from tarfile import TarFile
from zipfile import ZipFile
import sys
from subprocess import Popen, PIPE

from nltk.tag.hunpos import HunposTagger
from textblob.base import BaseTagger

from es_text_analytics.data.dataset import project_path, download_file
from es_text_analytics.tokenizer import NOTokenizer

# TextBlob compatible part-of-speech tagger for Norwegian.

# Default HunPos model locations, both under <project_path()>/models.
NOB_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(), 'models', 'nob-tagger-default-model')
NNO_TAGGER_DEFAULT_MODEL_FN = os.path.join(
    project_path(), 'models', 'nno-tagger-default-model')

# Download URL of the HunPos binary archive, keyed on sys.platform values.
HUNPOS_URL_MAP = dict(
    linux2='https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz',
    darwin='https://hunpos.googlecode.com/files/hunpos-1.0-macosx.tgz',
    win32='https://hunpos.googlecode.com/files/hunpos-1.0-win.zip',
)

# Name of the HunPos directory below the models directory, keyed on
# sys.platform values.
HUNPOS_SUBDIR_MAP = dict(
    win32='hunpos-1.0-win',
    darwin='hunpos-1.0-macosx',
    linux2='hunpos-1.0-linux',
)
# coding=utf-8 import codecs import os from es_text_analytics.data.dataset import project_path from es_text_analytics.tagger import FEATURES_MAP # Norwegian lemmatizer based on Norsk Ordbank, http://www.edd.uio.no/prosjekt/ordbanken/data/index.html or # http://www.nb.no/sprakbanken/show?serial=sbr-5&lang=nb # # Norsk Ordbank is not freely available but must be obtained from one of the urls above. ORDBANK_BM_DEFAULT_PATH = os.path.join(project_path(), 'data', 'ordbank_bm') FULLFORM_BM_FN = 'fullform_bm.txt' FULLFORM_FIELDS = [ 'word_id', 'lemma', 'fullform', 'morph_descr', 'paradigm_code', 'paradigm_entry' ] def parse_fullform_file(f, feat_norm='simple'): """ Parses the fullform data file in Norsk Ordbank and returns dicts indexed on the fullform and lemma respectively. All fullforms are lowercased. Morphological information is normalized to POS tags. :param f: file instance for reading the fullform Norsk Ordbank data file. :param feat_norm: Type of POS tag to normalize morphological information. Must correspond to POS tagger tagset if doing contextual lemmatization.