def emb():
    """Build a tiny in-memory Navec pack for tests: 3 words, 6 dims, 2 quantized chunks.

    Decoded embedding rows (chunk0 | chunk1):
        1 0 0 | 1 0 0
        0 1 1 | 0 0 0
        0 0 0 | 0 1 0
    """
    meta = Meta(id='test_1B_3k_6d_2q')
    # vectors x qdim: which centroid each word uses per chunk
    index_table = np.array(
        [[0, 1],
         [1, 0],
         [2, 2]]
    ).astype(np.uint8)
    # qdim x centroids x chunk: the centroid values themselves
    code_table = np.array(
        [[[1, 0, 0], [0, 1, 1], [0, 0, 0]],
         [[0, 0, 0], [1, 0, 0], [0, 1, 0]]]
    ).astype(np.float32)
    pq = PQ(
        vectors=3, dim=6, qdim=2, centroids=3,
        indexes=index_table,
        codes=code_table,
    )
    vocab = Vocab(words=['a', 'b', 'c'], counts=[1, 2, 3])
    return Navec(meta, vocab, pq)
def __init__(self):
    """Load navec embeddings plus the slovnet syntax and morphology models."""
    deps = Loc.dependencies_path
    self.navec = Navec.load(deps / 'navec_news_v1_1B_250K_300d_100q.tar')
    self.syntax = Syntax.load(deps / 'slovnet_syntax_news_v1.tar')
    self.morph = Morph.load(deps / 'slovnet_morph_news_v1.tar')
    # Both slovnet models share the same navec embeddings.
    self.syntax.navec(self.navec)
    self.morph.navec(self.navec)
def quantize_(emb, output, subdim, sample, iterations):
    """Read a GloVe text embedding file, product-quantize it, dump a Navec pack.

    Args:
        emb: path to the GloVe-format input file.
        output: destination path for the dumped pack.
        subdim, sample, iterations: PQ hyperparameters forwarded to quantize__.
    """
    with open(emb) as file:
        log_info('Load %s', emb)
        words, weights = parse_glove_emb(file)
    log_info(
        'PQ, subdim: %d, sample: %d, iterations: %d',
        subdim, sample, iterations
    )
    quantized = quantize__(weights, subdim, sample, iterations)
    log_info('Dump %s', output)
    Navec(Vocab(words), quantized).dump(output)
def pack(args):
    """CLI entry point: combine vocab and PQ dumps into a single navec tar."""
    with open_bin(args.vocab) as stream:
        vocab = Vocab.from_file(stream)
    with open_bin(args.pq) as stream:
        pq = PQ.from_file(stream)
    meta = Meta(args.id)
    target = 'navec_%s.tar' % args.id
    log_info('Dumping %s', target)
    Navec(meta, vocab, pq).dump(target)
def pack_(vocab, pq, id):
    """Build navec_<id>.tar from a vocab dump path and a PQ dump path.

    NOTE: the parameter name `id` shadows the builtin; kept for interface
    compatibility with existing callers.
    """
    meta = Meta(id)
    with open_bin(vocab) as stream:
        vocab_obj = Vocab.from_file(stream)
    with open_bin(pq) as stream:
        pq_obj = PQ.from_file(stream)
    target = 'navec_%s.tar' % id
    log_info('Dumping %s', target)
    Navec(meta, vocab_obj, pq_obj).dump(target)
def build_annoy_forest(self, navec_path, index_path, dim=300, tree_count=100):
    """Build and persist an Annoy index over the full navec vocabulary.

    Generalized: the previously hard-coded dimensionality and tree count are
    now keyword parameters with the same defaults, so existing callers are
    unaffected while other pack sizes become usable.

    Args:
        navec_path: path to a navec .tar pack.
        index_path: destination file for the serialized Annoy index.
        dim: embedding dimensionality (300 matches the standard navec packs).
        tree_count: number of Annoy trees; more trees = better recall, larger index.
    """
    # load navec vector file and extract vocabulary
    navec = Navec.load(navec_path)
    vocabulary = navec.vocab.words
    # build annoy forest: one item per vocabulary word, angular distance
    forest = AnnoyIndex(dim, 'angular')
    for i, word in enumerate(vocabulary):
        forest.add_item(i, navec[word])
    forest.build(tree_count)
    forest.save(index_path)
def _download_model(models_path: str, filename: str, url: str) -> str:
    """Download *url* into models_path/filename unless already present; return the local path."""
    path = os.path.join(models_path, filename)
    if not os.path.isfile(path):
        wget.download(url, path)
    return path


def load_ner(models_path: str) -> NER:
    """Load and initialize the slovnet NER model, downloading weights on first use.

    Args:
        models_path (str): Directory that holds (or will receive) the model files.

    Returns:
        slovnet.NER: NER model with navec embeddings attached.
    """
    os.makedirs(models_path, exist_ok=True)
    # Each long filename/URL appears exactly once now, via the helper.
    navec_path = _download_model(
        models_path,
        'navec_news_v1_1B_250K_300d_100q.tar',
        'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar',
    )
    ner_path = _download_model(
        models_path,
        'slovnet_ner_news_v1.tar',
        'https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_ner_news_v1.tar',
    )
    navec = Navec.load(navec_path)
    ner = NER.load(ner_path)
    ner.navec(navec)
    return ner
def shop_name(self) -> str:
    """Extract the first ORG entity from self.text, stripped of surrounding punctuation.

    Returns:
        The ORG span text, or "" when NER fails or no ORG span is found.

    Fix: the original duplicated `del navec; del ner` on every exit path.
    Locals are released when the function returns, so the explicit deletes
    added nothing but four copies of the same cleanup; they are removed.
    """
    navec = Navec.load(constants.navec_file)
    ner = NER.load(constants.ner_file)
    ner.navec(navec)
    try:
        markup = ner(self.text)
    except IndexError:
        # slovnet occasionally raises IndexError on odd input;
        # treat it as "no shop name found" rather than crashing
        return ""
    for span in markup.spans:
        if span.type == 'ORG':
            return self.text[span.start:span.stop].strip(".,;!:-–—/ ")
    return ""
def __init__(self, is_elmo_used=False):
    """Set up parser, lemmatizer and text-cleaning resources.

    When is_elmo_used is true, additionally load the ELMo model weights.
    """
    self.config = get_config('config.yml')
    self.parser = ConsultantPlusParser(config=self.config)
    self.model = ElmoModel()
    self.mystem = Mystem()
    self.spec_chars = string.punctuation + '\n\xa0«»\t—…'
    # Domain-specific noise tokens appended to the NLTK russian stop list.
    extra_stop_words = [
        'и', 'в', 'на', 'n', 'рф', 'гк', 'юридического', ' ', '1', 'ред',
        '2', 'ст', 'также', 'свой', 'либо', 'это', 'текст', 'закон',
        'который', 'иной', 'год', 'мочь',
    ]
    self.stop_words = stopwords.words("russian")
    self.stop_words.extend(extra_stop_words)
    if is_elmo_used:
        self.model.load(self.config['model_info_file'])
    # navec embeddings + slovnet syntax parser, wired together
    self.navec = Navec.load(self.config['navec_news_v1_1B_250K_300d_100q'])
    self.syntax = Syntax.load(self.config['slovnet_syntax_news_v1'])
    self.syntax.navec(self.navec)
def test_dump_load(emb):
    """Round-trip: a pack dumped to a temp file must load back without error."""
    with NamedTemporaryFile() as tmp:
        emb.dump(tmp.name)
        Navec.load(tmp.name)
from aiohttp import web
from navec import Navec
from slovnet import Morph

# Service configuration via environment variables (with local-dev defaults).
NAVEC = getenv('NAVEC', 'navec.tar')
PACK = getenv('PACK', 'pack.tar')
BATCH_SIZE = int(getenv('BATCH_SIZE', 8))
HOST = getenv('HOST', '0.0.0.0')
PORT = int(getenv('PORT', 8080))
MB = 1024 * 1024
MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB))  # request size cap, bytes

# Load models once at startup; morph shares the navec embeddings.
log('Load navec: %r' % NAVEC)
navec = Navec.load(NAVEC)
log('Load pack: %r' % PACK)
log('Batch size: %r' % BATCH_SIZE)
morph = Morph.load(PACK)
morph.navec(navec)


async def handle(request):
    # Expects a JSON array of token chunks; runs morphological tagging on them.
    chunk = await request.json()
    log('Post chunk size: %r' % len(chunk))
    markups = list(morph.map(chunk))
    tags = sum(len(_.tags) for _ in markups)
    # NOTE(review): every other log call uses %-interpolation, but this one
    # passes `tags` as a second positional argument — confirm that `log`
    # accepts printf-style args, otherwise the count is silently dropped.
    log('Infer tags: %r', tags)
def __init__(self, path):
    """Load a navec pack from *path* and initialize this instance from its parts.

    NOTE(review): this unpacks the result of Navec.load(path) as a 3-tuple
    (meta, vocab, pq) — confirm that Navec is iterable in the installed navec
    version; a plain object here would raise TypeError at runtime.
    """
    meta, vocab, pq = Navec.load(path)
    Navec.__init__(self, meta, vocab, pq)
def navec():
    """Fixture: fetch the 250K-word news navec pack and load it into memory."""
    return Navec.load(download(
        'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar'
    ))
from pprint import pprint
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import gensim.downloader as api
import pandas as pd

from games import RED, BLUE, GREY, game
from navec import Navec

# Load the hudlit navec pack and expose it through the gensim API.
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
model = Navec.load(path).as_gensim
# model = api.load("word2vec-ruscorpora-300")


def _(model, name, noun=True):
    """Resolve *name* to its unique vocabulary key in *model*.

    With noun=True only bare keys or explicit `<word>_NOUN` keys qualify;
    otherwise any key whose first `_`-separated part matches name (lowercased).

    Raises:
        KeyError: with (name, candidates) unless exactly one key matches.

    Fix: the original split each word up to three times and duplicated the
    raise in two identical branches; one pass and one `!= 1` check suffice.
    """
    target = name.lower()
    candidates = []
    for key in model.index2word:
        parts = key.split('_')
        if parts[0] != target:
            continue
        if noun and len(parts) > 1 and parts[1] != 'NOUN':
            continue
        candidates.append(key)
    # Zero or multiple matches are both ambiguous — fail loudly either way.
    if len(candidates) != 1:
        raise KeyError(name, candidates)
    return candidates[0]
def _load(self):
    """Load the navec pack at self.path and wrap it together with self.stats."""
    return NavecModel(Navec.load(self.path), self.stats)
    # --- continuation of a function whose `def` line precedes this chunk ---
    # (reads `records` from the enclosing scope; shows markup for one record)
    record, k = get_random_record(records)
    markup = ner(record.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(markup.text, markup.spans)


def test_on_k_random_records(K):
    # Sample K record indices (with replacement) from the first N lenta
    # records and print NER markup for each sampled record.
    records = load_lenta(lenta_path)
    records_num = [i for i in range(N)]
    chosen_records_num = random.choices(records_num, k=K)
    my_records = []
    for i in chosen_records_num:
        my_records.append(get_k_record(records, i))
    print(f'This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END + ' records\n')
    for i in range(K):
        print(tp.BOLD + tp.RED + f'{chosen_records_num[i]}' + tp.END + '\t')
        markup = ner(my_records[i].text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')


if __name__ == '__main__':
    print()
    # Load models once, attach embeddings, then run both demos.
    navec = Navec.load(navec_path)
    ner = NER.load(ner_path)
    ner.navec(navec)
    test_on_random_record()
    test_on_k_random_records(5)
def test_integration(tmpdir):
    """End-to-end NER pipeline: build, briefly train, score, pack to disk,
    reload, and assert the reloaded tagger reproduces the live tagger's markup."""
    torch.manual_seed(1)  # deterministic weights for reproducible scores
    device = get_device()

    # Embeddings and vocabularies.
    navec = Navec.load(NAVEC)
    words_vocab = WordsVocab(navec.vocab.words)
    shapes_vocab = ShapesVocab(SHAPES)
    tags_vocab = TagsVocab([PER, LOC, ORG])
    word_emb = NavecEmbedding.from_navec(navec)
    shape_emb = ShapeEmbedding(
        vocab_size=len(shapes_vocab),
        dim=10,
        pad_id=shapes_vocab.pad_id
    )

    # Model stack: word features -> CNN context -> CRF tag decoding.
    word_model = WordModel(
        word_emb,
        shape_emb
    )
    context_model = CNNContextModel(
        input_dim=word_model.dim,
        layer_dims=[64, 32],
        kernel_size=3,
    )
    tag_model = CRFTagModel(
        input_dim=context_model.dim,
        tags_num=len(tags_vocab)
    )
    ner_model = NERModel(
        word_model,
        context_model,
        tag_model
    ).to(device)

    # Tiny train/test split out of the Nerus corpus.
    dataset = NerusDataset(NERUS)
    test_dataset = dataset.slice(0, 10)
    train_dataset = dataset.slice(10, 30)

    # Encoding pipeline: tokens -> ids, markup -> tag ids, then batching.
    token_encoder = StackEncoder([
        WordEncoder(words_vocab),
        ShapeEncoder(shapes_vocab)
    ])
    markup_encoder = MarkupEncoder(
        token_encoder,
        TagEncoder(tags_vocab)
    )
    tokenizer = Tokenizer()
    batch_encoder = BatchEncoder(
        tokenizer,
        markup_encoder,
        seq_len=100,
        batch_size=32,
        shuffle_buffer_size=256,
    )
    test_batches = [_.to(device) for _ in batch_encoder.map(test_dataset)]
    train_batches = [_.to(device) for _ in batch_encoder.map(train_dataset)]

    # Score boards for train/test metrics.
    path = str(tmpdir.mkdir('root'))
    board = Board('01', path)
    train_board = board.prefixed('01_train')
    test_board = board.prefixed('02_test')

    optimizer = optim.Adam(
        ner_model.parameters(),
        lr=0.001
    )

    # One training pass, then evaluation on the training batches.
    proced = train_model(
        ner_model,
        optimizer,
        train_batches
    )
    scores = eval_batches(tags_vocab, proced)
    score = avg_batch_scores(scores)
    train_board.add_batch_score(score)

    # Inference pass on the held-out batches.
    proced = infer_model(
        ner_model,
        test_batches
    )
    scores = eval_batches(tags_vocab, proced)
    score = avg_batch_scores(scores)
    test_board.add_batch_score(score)

    # Tag TEXT with the live (in-memory) model...
    ner_model.eval()
    tagger = NERTagger(
        tokenizer,
        token_encoder,
        tags_vocab,
        ner_model,
        device
    )
    markup1 = tagger(TEXT)

    # ...then round-trip the model through a pack file on disk.
    pack = ner_model.as_infer.pack('slovnet_ner_v1')
    path = str(tmpdir.join('slovnet_ner_v1.tar'))
    pack.dump(path)
    pack = Pack.load(path)
    # Embeddings are not stored in the pack; re-attach them after loading.
    pack.context.navec = InferNavecEmbedding.from_navec(navec)
    ner_model = pack.scheme.to_impl(pack.context)
    tagger = InferNERTagger(
        tokenizer,
        token_encoder,
        tags_vocab,
        ner_model
    )
    markup2 = tagger(TEXT)

    # The packed/reloaded tagger must agree with the original exactly.
    assert markup1 == markup2
                    # continuation of a TEXT_HELP string started before this chunk
                    '\nКогда вам надоест со мной говорить, скажите "выход".')
TEXT_FAREWELL = 'Всего доброго! Если захотите повторить, скажите "Алиса, включи навык тест tgalice".'

if __name__ == '__main__':
    # Use a real MongoDB when MONGODB_URI is configured; otherwise fall back
    # to an in-memory mongomock instance (local development / tests).
    mongo_url = os.environ.get('MONGODB_URI')
    if mongo_url:
        mongo_client = MongoClient(mongo_url)
        mongo_db = mongo_client.get_default_database()
    else:
        mongo_client = mongomock.MongoClient()
        mongo_db = mongo_client.db
    mongo_logs = mongo_db.get_collection('message_logs')

    # Fetch the navec pack on first run; used for word2vec FAQ matching below.
    prerelease.download_if_not_exists(prerelease.navec_url, prerelease.navec_file)
    w2v = Navec.load(prerelease.navec_file)

    # FAQ manager tried first; greeting/help manager acts as the fallback.
    manager = tgalice.dialog_manager.CascadeDialogManager(
        tgalice.dialog_manager.FAQDialogManager(
            'faq.yaml',
            matcher=tgalice.nlu.matchers.W2VMatcher(w2v=w2v)),
        tgalice.dialog_manager.GreetAndHelpDialogManager(
            greeting_message=TEXT_HELP,
            help_message=TEXT_HELP,
            default_message='Я вас не понимаю.',
            exit_message='Всего доброго! Было приятно с вами пообщаться!'))
    # NOTE(review): this call is truncated in the visible chunk — it continues
    # (and is closed) beyond this view.
    connector = tgalice.dialog_connector.DialogConnector(
        dialog_manager=manager,
        storage=tgalice.session_storage.MongoBasedStorage(
            database=mongo_db,
            collection_name='sessions'),
        log_storage=tgalice.storage.message_logging.MongoMessageLogger(
            database=mongo_db,
            detect_pings=True)
def __init__(self, metricType: str = "tf"):
    """Remember the metric type; preload navec embeddings for the emb/embm metrics."""
    self._metric = metricType
    # Embedding-based metrics need the pretrained vectors up front.
    if self._metric in ('emb', 'embm'):
        self._embModel = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
import pandas as pd
from navec import Navec
import aiohttp
from aiogram import Bot, Dispatcher, executor, types
# import from other modules
from functions import get_groups, sort_list, check_value
from DB import read_list, insert_into_list, drop, delete_from_list
from config import token, ids

logging.basicConfig(level=logging.INFO)

bot = Bot(token=token)
dp = Dispatcher(bot)
# Word embeddings loaded once at startup for similarity lookups elsewhere.
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')


def auth(func):  # authentication wrapper
    # Only user ids whitelisted in config.ids may invoke the wrapped handler;
    # everyone else gets an access-denied reply.
    async def wrapper(message):
        if not (message['from']['id'] in ids):
            return await message.reply("Доступ закрыт", reply=False)
        return await func(message)
    return wrapper


@dp.message_handler(commands=['start'])  # Diplays the message with the user id.
async def welcome(message: types.Message):
    # NOTE(review): truncated in this chunk — the reply expression continues
    # past the visible source.
    await message.answer('Ваш id ' + str(message['from']['id']) + '\n' +
# import numpy as np
import re
import pymorphy2
from razdel import tokenize
from navec import Navec
from pyaspeller import Word
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from engine import *
import config
import argparse

if __name__ == '__main__':
    # Parse the message to classify from the command line.
    arg_parser = argparse.ArgumentParser(description="Run topic prediction script")
    arg_parser.add_argument(
        "-m", "--message",
        type=str,
        help="Past your message to classify",
        default="мне нужно посетить врача",
    )
    args = arg_parser.parse_args()

    # Build lemmatizer, embeddings and the keyword matrix, then classify.
    morph = pymorphy2.MorphAnalyzer()
    w2v = Navec.load(config.path)
    keyword_matrix = prepare_keyword_vectors(config.keywords, w2v)
    processed_text = preprocess(args.message, morph, config.normalized)
    print(config.inv_mapping[predict_label(processed_text, keyword_matrix, w2v)])
from navec import Navec
from numpy import linalg as ln
import numpy as np
import time
import json
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from random import random
from slovnet.model.emb import NavecEmbedding
import torch

# Load the pretrained news navec pack (250K words, 300 dims, quantized).
path = 'navec_news_v1_1B_250K_300d_100q.tar'
navec = Navec.load(path)
# NOTE(review): everything below is commented-out experimentation kept by the
# original author; candidates for deletion in a real cleanup.
# emb = NavecEmbedding(navec)
# print(ln.norm(navec['каспийский'] - navec['море']))
# navec.vocab
# path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# navec = Navec.load(path)
# sentences1 = [
# [[0.1, 0.3,0.7,0.8,0.9], [0.1, 0.12,0.15,0.21,0.24],
# [0.1, 0.3,0.7,0.8,0.9], [0.1, 0.3,0.7,0.8,0.9],
# [0.1, 0.3,0.7,0.8,0.9]],
# [[0.01, 0.08,0.12,0.13,0.13], [0.58, 0.59,0.63,0.63,0.64],
# [0.1, 0.3,0.7,0.8,0.9],
# [0.2, 0.22,0.23,0.25,0.28]],
def __init__(self, model_path, vector_model_path):
    """Load the slovnet NER model and attach navec embeddings to it."""
    embeddings = Navec.load(vector_model_path)
    self.model = NER.load(model_path)
    self.model.navec(embeddings)
'''
import spacy
import pandas as pd
from razdel import sentenize, tokenize
from navec import Navec
from slovnet import Syntax
from slovnet import Morph
from pymystem3 import Mystem


class Tokens:
    # Minimal spaCy-like token record: surface text, lemma and part of speech.
    def __init__(self, text, lemma_, pos_):
        self.text = text
        self.lemma_ = lemma_
        self.pos_=pos_


# Load the morphology model once at import time, sharing navec embeddings.
m = Mystem()
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
morph = Morph.load('slovnet_morph_news_v1.tar', batch_size=4)
morph.navec(navec)

# Sample news text used for demonstration below.
text="Европейский союз добавил в санкционный список девять политических деятелей из самопровозглашенных республик Донбасса — Донецкой народной республики (ДНР) и Луганской народной республики (ЛНР) — в связи с прошедшими там выборами. Об этом говорится в документе, опубликованном в официальном журнале Евросоюза."


def nlp(text):
    # Split into sentences, tokenize each, then run morphological tagging.
    # NOTE(review): truncated in this chunk — the loop body continues beyond view.
    chunks=[]
    lemmaSent=[]
    Doc=[]
    for sent in sentenize(text):
        tokens = [_.text for _ in tokenize(sent.text)]
        chunks.append(tokens)
    for chunk in chunks:
        filteredChunk=list(filter(lambda a: a != ' ', chunk))
        markup = next(morph.map([filteredChunk]))
def __init__(self, path_to_navec_data, path_to_syntax_data):
    """Load navec embeddings and a slovnet syntax parser wired to them.

    Args:
        path_to_navec_data: path to the navec .tar pack.
        path_to_syntax_data: path to the slovnet syntax .tar pack.
    """
    self.navec = Navec.load(path_to_navec_data)
    self.syntax = Syntax.load(path_to_syntax_data)
    # Attach embeddings as a separate statement instead of assigning the
    # return value of .navec(...): self.syntax is now unconditionally the
    # loaded model, independent of whether the API returns self or None
    # (slovnet's own examples call .navec() without using the result).
    self.syntax.navec(self.navec)
import time
import os.path

from django.conf import settings

from navec import Navec
from numpy import dot
from numpy.linalg import norm
from annoy import AnnoyIndex

# Locations of the pretrained navec pack and the prebuilt Annoy index file.
NAVEC_PATH = os.path.join(settings.ROOT_DIR, "parser_tool", "data", "navec_hudlit_v1_12B_500K_300d_100q.tar")
ANNOY_INDEX_PATH = os.path.join(settings.ROOT_DIR, "parser_tool", "data", "ANNOY_tree.ann")

# Load embeddings once at import time; map each vocabulary word to its
# Annoy item index (the same enumeration order used when the tree was built).
navec = Navec.load(NAVEC_PATH)
vocabulary = navec.vocab.words
word_to_index = dict()
for i, word in enumerate(vocabulary):
    word_to_index[word] = i

lsh = None  # global that holds the Annoy tree


def load_annoy_index():
    '''
    Lazy load the Annoy LSH tree.  This is wrapped in a function
    to avoid import errors if the index file is not present at that time.
    '''
    # NOTE(review): truncated in this chunk — the body continues beyond view.
    global lsh
    if lsh is None:
def test_ner_tagger():
    """The packaged NER tagger must reproduce the etalon spans on TEXT."""
    tagger = NERTagger.load(SLOVNET, Navec.load(NAVEC))
    markup = tagger(TEXT)
    extracted = [TEXT[span.start:span.stop] for span in markup.spans]
    assert extracted == ETALON