Пример #1
0
    def __init__(self):
        """Initialise the lemmatizer and make sure the shared analyser exists.

        ``morfeusz2`` is imported here rather than at module level, so the
        dependency is only needed when the class is instantiated. The
        analyser is kept on the class as ``PolishLemmatizer._morph`` and is
        created only while that attribute is still ``None``.

        Raises:
            ImportError: if ``morfeusz2`` cannot be imported.
        """
        super(PolishLemmatizer, self).__init__()

        missing_dependency = (
            'The Polish lemmatizer requires the morfeusz2-python library')
        try:
            from morfeusz2 import Morfeusz
        except ImportError:
            raise ImportError(missing_dependency)

        if PolishLemmatizer._morph is not None:
            # The class-level analyser is already in place; nothing to build.
            return
        PolishLemmatizer._morph = Morfeusz(dict_name='polimorf')
Пример #2
0
def process_request(params):
    """Run a Morfeusz analysis or generation request and build the response.

    The request parameters are parsed and validated by
    ``MorfeuszOptionParser``. When validation succeeds, a ``Morfeusz``
    instance is created from the parsed options and the requested action
    is executed:

    * ``analyze`` -- every element yielded by ``morfeusz.analyse`` is
      converted with ``tag_items``; an element that is itself a list
      becomes a nested list of converted items.
    * ``generate`` -- for each requested title, the interpretations
      returned by ``morfeusz.generate`` are converted with ``tag_items``
      and collected in one list per title.

    Args:
        params: raw request parameters handed to ``MorfeuszOptionParser``.

    Returns:
        dict: always contains ``results``. After a successful validation
        it also carries ``version``, ``dictionaryId`` and ``copyright``.
    """
    parser = MorfeuszOptionParser(params)

    # Boolean switches, as (request key, Morfeusz option name) pairs.
    bool_options = (
        ('expandDag', 'expand_dag'),
        ('expandTags', 'expand_tags'),
        ('expandDot', 'expand_dot'),
        ('expandUnderscore', 'expand_underscore'),
    )
    for request_key, option_name in bool_options:
        parser.parse_bool(request_key, option_name)

    parser.parse_string('agglutinationRules', 'aggl', AGGLUTINATION_RULES)
    parser.parse_string('pastTenseSegmentation', 'praet',
                        PAST_TENSE_SEGMENTATION)
    parser.parse_enum('tokenNumbering', 'separate_numbering',
                      TokenNumbering, TokenNumbering.separate)
    parser.parse_enum('caseHandling', 'case_handling', CaseHandling)
    parser.parse_enum('whitespaceHandling', 'whitespace', WhitespaceHandling)
    parser.parse_actions('action')

    results = []
    response = {'results': results}

    # NOTE(review): validate() receives the response dict -- presumably it
    # records validation problems there; confirm against the parser.
    if not parser.validate(response):
        return response

    parser.set_dictionary_path('MORFEUSZ_DICT_PATH')
    morfeusz = Morfeusz(**parser.get_opts())

    if parser.action == 'analyze':
        for entry in morfeusz.analyse(parser.text):
            if isinstance(entry, list):
                results.append([tag_items(item) for item in entry])
            else:
                results.append(tag_items(entry))
    elif parser.action == 'generate':
        for title in parser.titles:
            results.append(
                [tag_items(interp) for interp in morfeusz.generate(title)])

    response['version'] = morfeusz2.__version__
    response['dictionaryId'] = morfeusz.dict_id()
    response['copyright'] = morfeusz.dict_copyright()

    return response
Пример #3
0
from sys import argv, exit
#other imports

# Corpus to process: 'pl.txt' unless a name is passed as the first
# command-line argument. That name must be an entry of the current
# directory; otherwise the problem is reported and the script exits with -1.
corpus_filename = 'pl.txt'
if len(argv) > 1:
    filename = argv[1]
    if filename not in listdir('.'):
        print('File %s not found in the current directory' % filename)
        exit(-1)
    corpus_filename = filename

# The ASCII digit characters from the string module.
exclude = string.digits
# Analyser shared by the functions of this script.
morph = Morfeusz()


def lemm(line):
    """Split *line* into word tokens and return the lemma of each one.

    The line is lower-cased and split on runs of digits, runs of non-word
    characters and underscores. Every non-empty token is analysed with the
    module-level ``morph`` analyser. For a token with at least one
    interpretation, the lemma of the first interpretation is kept, cut at
    the first ``:``.

    Args:
        line: text to lemmatize.

    Returns:
        list: lemmas in token order; tokens for which the analyser returns
        nothing are skipped.
    """
    # Raw string: '\d' and '\W' are regex escapes, not string escapes.
    tokens = re.split(r'\d+|\W+|_', line.lower(), flags=re.UNICODE)

    norm_sentence = []
    for token in tokens:
        if not token:
            # re.split yields empty strings around adjacent separators.
            continue
        w_desc = morph.analyse(token)
        if w_desc:
            # First interpretation only. In the Morfeusz 2 Python binding,
            # element [2] is the interpretation tuple and its index 1 is
            # the lemma; text after the first ':' is dropped.
            norm_sentence.append(w_desc[0][2][1].split(':')[0])
    return norm_sentence
Пример #4
0
#! /usr/bin/python
# *-* coding: utf-8 *-*

from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Demo: analyse a sentence with Morfeusz and disambiguate it with Concraft,
# first through the DAG API and then through the string API.
#
# The analyser and the server are created *before* the try block. If either
# constructor fails there is no server to stop, and the failure must not be
# masked by a NameError from `server.terminate()` in the finally clause.
text = u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.'
morfeusz = Morfeusz(expand_tags=True)
server = Server(model_path="/home/kuba/work/ipipan/concraft/pre-trained/Sep-18/model-04-09-2018.gz", port=3001)
try:
  concraft = Concraft(port=3001)

  # Disambiguate the analysis DAG directly.
  dag = morfeusz.analyse(text)
  res = concraft.disamb(dag)
  print(res)

  # Same sentence again, going through the string form of the DAG.
  dag = morfeusz.analyse(text)
  dag_str = concraft.dag_to_str(dag)
  dag_disamb_str = concraft.disamb_str(dag_str)
  print(dag_disamb_str)
finally:
  # Always stop the server that was started above.
  server.terminate()
Пример #5
0
import pathlib, random, re, sys
from typing import Callable, Optional

from morfeusz2 import Morfeusz
from wordnet import query

# Module-wide Morfeusz instance, constructed with analyse=False.
# NOTE(review): presumably only the generator side is used here -- confirm.
morfeusz = Morfeusz(analyse=False)

# NOTE(review): looks like the list of dataset names to process -- confirm.
DATASETS = ["new"]
# Module-level dictionaries, empty at import time.
# NOTE(review): presumably filled in by code further down the file -- confirm.
DICT_LINES = {}
DICT_FUNCTIONS = {}

# Empty at import time.
# NOTE(review): presumably holds the loaded thesaurus entries -- confirm.
THESAURUS = {}

# Words from the thesaurus containing these tags will be ignored:
BLACKLISTED_TAGS = [
	"(bardzo potocznie)",
	"(potocznie)",
	"(częściej, ale wg niektórych niepoprawnie)",
	"(eufemistycznie)",  # :(
	#"(nieco potocznie)",  # Eh, it's fine
	"(obraźliwe)",
	"(obraźliwie)",
	#"(pieszczotliwie)",
	"(pogardliwie)",
	"(potoczne)",
	"(potocznie)",
	"(przestarzale)",
	"(ptoocznie)",
	"(regionalnie)",  # Contains some inappropriate words
	"(rzadko, wg niektórych niepoprawnie)",
Пример #6
0
 def __init__(self):
     """Create a Morfeusz object and store it in ``self.morf``.

     ``Morfeusz()`` is called without arguments, so the library's default
     options apply.
     """
     self.morf = Morfeusz()
0
    r'<([a-z]*-[a-z]*)(\s+[a-z]*="\w*((\s+\w*)+)?")?>(.+?)<\/[a-z]*-[a-z]*>',
    letter_contents)
# print('one word tags')
# for tag in tags_word:
#     print(tag)
#     print('\n')
# print('two word tags')
# for tag in tags_words:
#     print(tag)
#     print('\n')


def remove_dashes(text):
    """Return *text* with every ``'-'`` character removed.

    Args:
        text: string to clean; any iterable of strings is accepted, in
            which case elements equal to ``'-'`` are dropped.

    Returns:
        The remaining characters joined in their original order, or an
        empty string when nothing remains.
    """
    # str.join builds the result in one pass instead of re-creating the
    # accumulator string for every kept character.
    return ''.join(letter for letter in text if letter != '-')


# Strip the markup from the first letter's contents and join the resulting
# pieces into one space-separated string.
letter1_no_tags = remove_tags(letters[0].contents)
letter1_nt_str = ' '.join(letter1_no_tags)
# letter_ntnd_str = remove_dashes(letter1_nt_str.decode('utf-8'))
# print(letter_ntnd_str)

# Analyse the joined text with a default-configured Morfeusz instance.
morf = Morfeusz()
# print(morf)
# NOTE(review): .decode() on the joined string assumes a Python 2 byte
# string; a Python 3 str has no .decode() -- confirm the target interpreter.
print(letter1_nt_str.decode('utf8'))
# The undecoded string is what gets passed to the analyser.
letter1_analysed = morf.analyse(letter1_nt_str)
print(letter1_analysed)