def __init__(self):
    """Build the tokenize -> POS-tag -> lemmatize pipeline.

    Each component must be explicitly load()-ed before first use.
    """
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.tagger = rupostagger.RuPosTagger()
    self.tagger.load()
    # rulemma is used here instead of the previously commented-out Mystem.
    self.lemm = rulemma.Lemmatizer()
    self.lemm.load()
def __init__(self):
    """Create the NLP resources.

    Only the tokenizer is load()-ed here; the remaining resources are
    constructed unloaded and presumably loaded elsewhere.
    """
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    # Lemmatization backends and dictionaries (created, not loaded).
    self.lemmatizer = Mystem()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    # Vocabulary of known word forms, filled later.
    self.known_words = set()
def test_posstager():
    """Smoke-test RuPosTagger on a tiny Russian sentence, printing word->tag pairs."""
    import rupostagger
    tagger = rupostagger.RuPosTagger()
    tagger.load()
    for token, tag in tagger.tag(u'кошки спят'.split()):
        print(u'{} -> {}'.format(token, tag))

# test_posstager()
def __init__(self):
    """Construct the full set of text-processing resources.

    Only the tokenizer is load()-ed here; everything else is created
    unloaded, and ``syntan`` / ``word_embeddings`` stay ``None`` until
    supplied externally.
    """
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.chunker = ruchunker.Chunker()
    self.word2tags = ruword2tags.RuWord2Tags()
    self.flexer = ruword2tags.RuFlexer()
    self.syntan = None
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    # rulemma is used instead of the previously commented-out Mystem.
    self.lemmatizer = rulemma.Lemmatizer()
    self.word_embeddings = None
def test(self):
    """Demo: tokenize, tag and lemmatize one sentence, printing each token.

    Output columns: word form, lemma, morphological tags.
    """
    lemmer = rulemma.Lemmatizer()
    lemmer.load()
    tok = rutokenizer.Tokenizer()
    tok.load()
    pos_tagger = rupostagger.RuPosTagger()
    pos_tagger.load()

    sentence = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tagged = pos_tagger.tag(tok.tokenize(sentence))
    for word, tags, lemma, *_ in lemmer.lemmatize(tagged):
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))
def main():
    """Load the global POS tagger, then serve HTTP requests until interrupted."""
    global TAGGER
    try:
        log('Loading tagger')
        TAGGER = rupostagger.RuPosTagger()
        TAGGER.load()
    except Exception as error:
        # Without a tagger the service is useless: report and bail out.
        log('Can not load tagger: "%s"', error)
        return

    server = HTTPServer((HOST, PORT), HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        log('Quiting')
    finally:
        # Always release the listening socket.
        server.server_close()
def load(self, model_dir=None):
    """Load chunker configuration, optional resources and the CRF model.

    :param model_dir: directory with the model files; if None, try the
        ``../tmp`` folder next to this module, falling back to the module
        folder itself when ``../tmp`` does not exist.
    """
    if model_dir is None:
        package_dir = str(pathlib.Path(__file__).resolve().parent)
        model_dir = os.path.join(package_dir, '../tmp')
        if not os.path.exists(model_dir):
            model_dir = package_dir

    cfg_path = os.path.join(model_dir, 'chunker_NP.config')
    self.chunker_params = ChunkerCrfParams.load(cfg_path)

    # Optional resources, driven by the loaded configuration.
    if self.chunker_params.use_gren:
        self.word2tags = ruword2tags.RuWord2Tags()
        self.word2tags.load()
    if self.chunker_params.use_postagger:
        self.postagger = rupostagger.RuPosTagger()
        self.postagger.load()

    self.crf_tagger = pycrfsuite.Tagger()
    self.crf_tagger.open(os.path.join(model_dir, self.chunker_params.model_filename))
def __init__(self, stopwordsList=None, lang='russian', *args, **kwargs):
    """Prepare lemmatization resources and the stopword list.

    :param stopwordsList: optional extra stopwords appended to the NLTK list.
    :param lang: 'russian' selects the rulemma/rutokenizer/rupostagger
        pipeline; any other value falls back to WordNet lemmatization.
    """
    nltk.download("stopwords")
    # nltk.download("punkt")
    self.mystem = Mystem()
    self.useLemmas = False
    self.language = lang
    # Penn-style tag prefix -> WordNet POS constant (used by the WordNet path).
    self.tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    # Consolidated language branching (was two separate `if lang == 'russian'`
    # checks with identical conditions).
    if lang == 'russian':
        self.lemmatizer = rulemma.Lemmatizer()
        self.lemmatizer.load()
        self.tokenizer = rutokenizer.Tokenizer()
        self.tokenizer.load()
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()
        self.stopwords = stopwords.words("russian")
        # Single letters are treated as stopwords.  NOTE: the original
        # literal omitted "Рр"; added so the alphabet is complete.
        alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    else:
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = stopwords.words('english')
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    self.stopwords.extend(list(alphabet))
    if stopwordsList is not None:
        self.stopwords.extend(stopwordsList)
def vectorize_data(samples, vectorizer, params):
    """Vectorize (phrase1, phrase2, label) samples.

    When ``params['nlp_transform'] == 'lemmatize'``, each distinct phrase is
    lemmatized once (tagger + lemmatizer are loaded on demand) before
    vectorization.

    :returns: tuple ``(X, y)`` where ``X`` comes from :func:`vectorize_data2`
        and ``y`` is a numpy array of the labels.
    """
    y_data = np.asarray([s[2] for s in samples])
    phrases1 = [s[0] for s in samples]
    phrases2 = [s[1] for s in samples]

    if params['nlp_transform'] == 'lemmatize':
        tagger = rupostagger.RuPosTagger()
        tagger.load()
        lemmatizer = rulemma.Lemmatizer()
        lemmatizer.load()
        # Lemmatize each distinct phrase only once, then map back.
        all_phrases = set(phrases1) | set(phrases2)
        phrase2lemma = {phrase: lemmatize_phrase(phrase, tagger, lemmatizer)
                        for phrase in all_phrases}
        phrases1 = [phrase2lemma[f] for f in phrases1]
        phrases2 = [phrase2lemma[f] for f in phrases2]

    # Single exit point (the two branches previously duplicated this call).
    return vectorize_data2(phrases1, phrases2, vectorizer, params), y_data
import operator

import rutokenizer
import rupostagger
import rulemma


if __name__ == '__main__':
    print('Loading dictionaries and models...')
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load('../tmp/rulemma.dat')
    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()
    tagger = rupostagger.RuPosTagger()
    tagger.load()
    print('Loading finished')

    # Demo: lemmatize one sentence and print word / lemma / tags columns.
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))

    # (input phrase, expected lemmatized form) pairs.
    tests = [
        (u'я вижу хрюнделя', u'я видеть хрюндель'),
        (u'Мяукая, голодные кошки ловят жирненьких мышек',
         u'мяукать , голодный кошка ловить жирненький мышка'),
        (u'Мы спрашивали про уроки и оценки',
         u'я спрашивать про урок и оценка'),
        (u'Куда же улетели облачка?', u'куда же улететь облачко ?'),
    ]
def __init__(self):
    """Set up both morphology backends: rupostagger and RNNMorph."""
    # rupostagger needs an explicit load() before tagging.
    self.tagger = rupostagger.RuPosTagger()
    self.tagger.load()
    self.predictor = RNNMorphPredictor(language="ru")