Пример #1
0
def emb():
    """Build a tiny hand-crafted Navec fixture: 3 words, 6 dims, 2 quantized chunks."""
    # Quantized code table, shape qdim x centroids x chunk; decoded rows:
    # 1 0 0 | 1 0 0
    # 0 1 1 | 0 0 0
    # 0 0 0 | 0 1 0
    codes = np.array([
        [[1, 0, 0], [0, 1, 1], [0, 0, 0]],
        [[0, 0, 0], [1, 0, 0], [0, 1, 0]],
    ], dtype=np.float32)
    # Per-vector centroid ids, shape vectors x qdim.
    indexes = np.array([
        [0, 1],
        [1, 0],
        [2, 2],
    ], dtype=np.uint8)
    pq = PQ(
        vectors=3,
        dim=6,
        qdim=2,
        centroids=3,
        indexes=indexes,
        codes=codes,
    )
    vocab = Vocab(words=['a', 'b', 'c'], counts=[1, 2, 3])
    meta = Meta(id='test_1B_3k_6d_2q')
    return Navec(meta, vocab, pq)
Пример #2
0
 def __init__(self):
     """Load navec embeddings plus slovnet syntax and morph models and wire them up."""
     # Shared word embeddings required by both slovnet models below.
     self.navec = Navec.load(Loc.dependencies_path /
                             'navec_news_v1_1B_250K_300d_100q.tar')
     self.syntax = Syntax.load(Loc.dependencies_path /
                               'slovnet_syntax_news_v1.tar')
     # slovnet models need the navec embeddings attached before inference.
     self.syntax.navec(self.navec)
     self.morph = Morph.load(Loc.dependencies_path /
                             'slovnet_morph_news_v1.tar')
     self.morph.navec(self.navec)
Пример #3
0
def quantize_(emb, output, subdim, sample, iterations):
    """Read a GloVe text embedding from *emb*, product-quantize it, and dump a Navec tar to *output*."""
    with open(emb) as file:
        log_info('Load %s', emb)
        words, weights = parse_glove_emb(file)
        log_info(
            'PQ, subdim: %d, sample: %d, iterations: %d',
            subdim, sample, iterations
        )
        quantized = quantize__(weights, subdim, sample, iterations)
        vocabulary = Vocab(words)
        log_info('Dump %s', output)
        Navec(vocabulary, quantized).dump(output)
Пример #4
0
def pack(args):
    """Combine a vocab file and a PQ file into a single navec tar archive."""
    meta = Meta(args.id)

    # Both inputs are binary dumps produced by earlier pipeline steps.
    with open_bin(args.vocab) as vocab_file:
        vocab = Vocab.from_file(vocab_file)
    with open_bin(args.pq) as pq_file:
        pq = PQ.from_file(pq_file)

    path = 'navec_%s.tar' % args.id
    log_info('Dumping %s', path)
    Navec(meta, vocab, pq).dump(path)
Пример #5
0
def pack_(vocab, pq, id):
    """Like pack(), but takes the vocab path, pq path and archive id directly."""
    meta = Meta(id)

    # Rebind the path arguments to the loaded objects once read.
    with open_bin(vocab) as source:
        vocab = Vocab.from_file(source)
    with open_bin(pq) as source:
        pq = PQ.from_file(source)

    path = 'navec_%s.tar' % id
    log_info('Dumping %s', path)
    Navec(meta, vocab, pq).dump(path)
    def build_annoy_forest(self, navec_path, index_path):
        """Index every navec vector in an Annoy forest and save it to *index_path*."""
        navec = Navec.load(navec_path)

        # 300-dim vectors, angular distance, 100 trees — more trees, better recall.
        dim = 300
        tree_count = 100
        forest = AnnoyIndex(dim, 'angular')

        for item_id, word in enumerate(navec.vocab.words):
            forest.add_item(item_id, navec[word])

        forest.build(tree_count)
        forest.save(index_path)
Пример #7
0
def load_ner(models_path: str) -> NER:
    """Load and initialize the NER model.

    Downloads the navec embeddings and the slovnet NER pack into
    *models_path* on first use, then loads them and attaches the
    embeddings to the model.

    Args:
        models_path (str): Folder holding the model files (created if missing).

    Returns:
        slovnet.NER: ready-to-use NER object.
    """
    navec_name = 'navec_news_v1_1B_250K_300d_100q.tar'
    ner_name = 'slovnet_ner_news_v1.tar'
    # Single source of truth for file names and their download URLs.
    downloads = {
        navec_name: 'https://storage.yandexcloud.net/natasha-navec/packs/' + navec_name,
        ner_name: 'https://storage.yandexcloud.net/natasha-slovnet/packs/' + ner_name,
    }

    os.makedirs(models_path, exist_ok=True)
    for name, url in downloads.items():
        target = os.path.join(models_path, name)
        if not os.path.isfile(target):
            wget.download(url, target)

    navec = Navec.load(os.path.join(models_path, navec_name))
    ner = NER.load(os.path.join(models_path, ner_name))
    # The NER model needs the navec embeddings attached before inference.
    ner.navec(navec)
    return ner
Пример #8
0
    def shop_name(self) -> str:
        """Extract the first ORG span from self.text via slovnet NER, or '' if none.

        The heavy navec/NER objects are explicitly deleted in all exit paths
        (the original duplicated the `del` statements before every return and
        missed the path where an unexpected exception escapes).
        """
        navec = Navec.load(constants.navec_file)
        ner = NER.load(constants.ner_file)
        ner.navec(navec)

        try:
            try:
                markup = ner(self.text)
            except IndexError:
                # ner() occasionally raises IndexError on odd input;
                # treat it as "no organization found".
                return ""

            for span in markup.spans:
                if span.type == 'ORG':
                    return self.text[span.start:span.stop].strip(".,;!:-–—/ ")

            return ""
        finally:
            # Release the (large) models regardless of how we exit.
            del navec
            del ner
Пример #9
0
 def __init__(self, is_elmo_used=False):
     """Set up the parser, models, stop-word list and navec/slovnet syntax pipeline."""
     self.config = get_config('config.yml')
     self.parser = ConsultantPlusParser(config=self.config)
     self.model = ElmoModel()
     self.mystem = Mystem()
     # Punctuation plus common non-breaking/typographic characters to strip.
     self.spec_chars = string.punctuation + '\n\xa0«»\t—…'
     self.stop_words = stopwords.words("russian")
     # Domain-specific noise words added on top of the NLTK russian stop list.
     self.stop_words.extend([
         'и',
         'в',
         'на',
         'n',
         'рф',
         'гк',
         'юридического',
         ' ',
         '1',
         'ред',
         '2',
         'ст',
         'также',
         'свой',
         'либо',
         'это',
         'текст',
         'закон',
         'который',
         'иной',
         'год',
         'мочь',
     ])
     # ELMo weights are heavy, so load them only when explicitly requested.
     if is_elmo_used:
         self.model.load(self.config['model_info_file'])
     self.navec = Navec.load(self.config['navec_news_v1_1B_250K_300d_100q'])
     self.syntax = Syntax.load(self.config['slovnet_syntax_news_v1'])
     # Attach the navec embeddings to the syntax model before inference.
     self.syntax.navec(self.navec)
Пример #10
0
def test_dump_load(emb):
    """Round-trip smoke test: dumping an embedding and loading it back must not raise."""
    with NamedTemporaryFile() as tmp:
        emb.dump(tmp.name)
        Navec.load(tmp.name)
Пример #11
0
from aiohttp import web

from navec import Navec
from slovnet import Morph

# Service configuration, all overridable via environment variables.
NAVEC = getenv('NAVEC', 'navec.tar')
PACK = getenv('PACK', 'pack.tar')
BATCH_SIZE = int(getenv('BATCH_SIZE', 8))

HOST = getenv('HOST', '0.0.0.0')
PORT = int(getenv('PORT', 8080))
MB = 1024 * 1024
# Maximum allowed request body size (default 100 MB).
MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB))

log('Load navec: %r' % NAVEC)
navec = Navec.load(NAVEC)

log('Load pack: %r' % PACK)
log('Batch size: %r' % BATCH_SIZE)
morph = Morph.load(PACK)
# Attach the shared navec embeddings to the morphology model.
morph.navec(navec)


async def handle(request):
    # Expects a JSON-encoded chunk of tokenized sentences; tags them in batches.
    chunk = await request.json()
    log('Post chunk size: %r' % len(chunk))
    markups = list(morph.map(chunk))

    tags = sum(len(_.tags) for _ in markups)
    # NOTE(review): other log() calls in this module use %-formatting, but this
    # one passes `tags` as a second positional argument — confirm log's signature.
    log('Infer tags: %r', tags)
Пример #12
0
 def __init__(self, path):
     """Load a navec pack from *path* and initialize this subclass with its parts."""
     # NOTE(review): this relies on Navec.load(path) being unpackable into
     # (meta, vocab, pq) — i.e. Navec behaving as an iterable record; verify.
     meta, vocab, pq = Navec.load(path)
     Navec.__init__(self, meta, vocab, pq)
Пример #13
0
def navec():
    """Fetch (if not cached) and load the news navec embedding pack."""
    url = 'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar'
    local_path = download(url)
    return Navec.load(local_path)
Пример #14
0
from pprint import pprint

from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import gensim.downloader as api
import pandas as pd

from games import RED, BLUE, GREY, game

from navec import Navec
# Load the hudlit navec pack and expose it through the gensim-style interface.
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
model = Navec.load(path).as_gensim
# model = api.load("word2vec-ruscorpora-300")


def _(model, name, noun=True):
    words = model.index2word
    if noun:
        ws = [
            w for w in words if w.split('_')[0] == name.lower() and (
                len(w.split('_')) == 1 or w.split('_')[1] == 'NOUN')
        ]
    else:
        ws = [w for w in words if w.split('_')[0] == name.lower()]
    if len(ws) == 0:
        raise KeyError(name, ws)
    if len(ws) > 1:
        raise KeyError(name, ws)
    key = ws[0]
    return key
Пример #15
0
 def _load(self):
     raw = Navec.load(self.path)
     return NavecModel(raw, self.stats)
Пример #16
0
    record, k = get_random_record(records)
    markup = ner(record.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(markup.text, markup.spans)


def test_on_k_random_records(K):
    """Run NER on K randomly chosen lenta records and print their markups.

    Note: ``random.choices`` samples WITH replacement, so the same record
    index may appear more than once.
    """
    records = load_lenta(lenta_path)
    # random.choices accepts any sequence; no need to materialize list(range(N)).
    chosen_records_num = random.choices(range(N), k=K)
    my_records = [get_k_record(records, i) for i in chosen_records_num]

    print('This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END +
          ' records\n')

    for i in range(K):
        print(tp.BOLD + tp.RED + f'{chosen_records_num[i]}' + tp.END + '\t')
        markup = ner(my_records[i].text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')


if __name__ == '__main__':
    print()
    # Load embeddings + NER model and attach the embeddings before tagging.
    navec = Navec.load(navec_path)
    ner = NER.load(ner_path)
    ner.navec(navec)
    test_on_random_record()
    test_on_k_random_records(5)
Пример #17
0
def test_integration(tmpdir):
    """End-to-end check: train a small NER model, pack it, reload it, and verify
    the packed/inference tagger reproduces the training tagger's markup."""
    # Fixed seed so the tiny training run is reproducible.
    torch.manual_seed(1)
    device = get_device()

    navec = Navec.load(NAVEC)

    # Vocabularies for words, word shapes, and the PER/LOC/ORG tag set.
    words_vocab = WordsVocab(navec.vocab.words)
    shapes_vocab = ShapesVocab(SHAPES)
    tags_vocab = TagsVocab([PER, LOC, ORG])

    # Model: navec word + shape embeddings -> CNN context -> CRF tag layer.
    word_emb = NavecEmbedding.from_navec(navec)
    shape_emb = ShapeEmbedding(
        vocab_size=len(shapes_vocab),
        dim=10,
        pad_id=shapes_vocab.pad_id
    )
    word_model = WordModel(
        word_emb,
        shape_emb
    )
    context_model = CNNContextModel(
        input_dim=word_model.dim,
        layer_dims=[64, 32],
        kernel_size=3,
    )
    tag_model = CRFTagModel(
        input_dim=context_model.dim,
        tags_num=len(tags_vocab)
    )
    ner_model = NERModel(
        word_model,
        context_model,
        tag_model
    ).to(device)

    # Tiny train/test split from the Nerus corpus.
    dataset = NerusDataset(NERUS)
    test_dataset = dataset.slice(0, 10)
    train_dataset = dataset.slice(10, 30)

    # Encoders turning markups into model-ready id tensors.
    token_encoder = StackEncoder([
        WordEncoder(words_vocab),
        ShapeEncoder(shapes_vocab)
    ])
    markup_encoder = MarkupEncoder(
        token_encoder,
        TagEncoder(tags_vocab)
    )

    tokenizer = Tokenizer()
    batch_encoder = BatchEncoder(
        tokenizer,
        markup_encoder,
        seq_len=100,
        batch_size=32,
        shuffle_buffer_size=256,
    )

    # Materialize all batches on the target device up front.
    test_batches = [_.to(device) for _ in batch_encoder.map(test_dataset)]
    train_batches = [_.to(device) for _ in batch_encoder.map(train_dataset)]

    # Score boards under a temp dir: one prefix for train, one for test metrics.
    path = str(tmpdir.mkdir('root'))
    board = Board('01', path)
    train_board = board.prefixed('01_train')
    test_board = board.prefixed('02_test')

    optimizer = optim.Adam(
        ner_model.parameters(),
        lr=0.001
    )
    # One training pass, then record the averaged batch scores.
    proced = train_model(
        ner_model, optimizer,
        train_batches
    )
    scores = eval_batches(tags_vocab, proced)
    score = avg_batch_scores(scores)
    train_board.add_batch_score(score)

    # Evaluate on the held-out batches.
    proced = infer_model(
        ner_model,
        test_batches
    )
    scores = eval_batches(tags_vocab, proced)
    score = avg_batch_scores(scores)
    test_board.add_batch_score(score)

    # Tag the reference text with the freshly trained model.
    ner_model.eval()
    tagger = NERTagger(
        tokenizer,
        token_encoder,
        tags_vocab,
        ner_model,
        device
    )
    markup1 = tagger(TEXT)

    # Serialize the model to a pack archive, then load it back.
    pack = ner_model.as_infer.pack('slovnet_ner_v1')
    path = str(tmpdir.join('slovnet_ner_v1.tar'))
    pack.dump(path)

    pack = Pack.load(path)
    # Navec weights are not stored in the pack; re-attach them after loading.
    pack.context.navec = InferNavecEmbedding.from_navec(navec)
    ner_model = pack.scheme.to_impl(pack.context)

    # The inference tagger must reproduce the original markup exactly.
    tagger = InferNERTagger(
        tokenizer,
        token_encoder,
        tags_vocab,
        ner_model
    )
    markup2 = tagger(TEXT)

    assert markup1 == markup2
Пример #18
0
    '\nКогда вам надоест со мной говорить, скажите "выход".')
TEXT_FAREWELL = 'Всего доброго! Если захотите повторить, скажите "Алиса, включи навык тест tgalice".'

if __name__ == '__main__':
    # Use a real MongoDB when MONGODB_URI is set, otherwise an in-memory mock.
    mongo_url = os.environ.get('MONGODB_URI')
    if mongo_url:
        mongo_client = MongoClient(mongo_url)
        mongo_db = mongo_client.get_default_database()
    else:
        mongo_client = mongomock.MongoClient()
        mongo_db = mongo_client.db
    mongo_logs = mongo_db.get_collection('message_logs')

    # Fetch the navec embeddings on first run and load them for w2v matching.
    prerelease.download_if_not_exists(prerelease.navec_url,
                                      prerelease.navec_file)
    w2v = Navec.load(prerelease.navec_file)

    # FAQ matching backed by navec vectors, with a greet/help fallback manager.
    manager = tgalice.dialog_manager.CascadeDialogManager(
        tgalice.dialog_manager.FAQDialogManager(
            'faq.yaml', matcher=tgalice.nlu.matchers.W2VMatcher(w2v=w2v)),
        tgalice.dialog_manager.GreetAndHelpDialogManager(
            greeting_message=TEXT_HELP,
            help_message=TEXT_HELP,
            default_message='Я вас не понимаю.',
            exit_message='Всего доброго! Было приятно с вами пообщаться!'))
    # NOTE(review): this statement appears truncated in this snippet — the
    # closing paren of DialogConnector(...) is not visible here.
    connector = tgalice.dialog_connector.DialogConnector(
        dialog_manager=manager,
        storage=tgalice.session_storage.MongoBasedStorage(
            database=mongo_db, collection_name='sessions'),
        log_storage=tgalice.storage.message_logging.MongoMessageLogger(
            database=mongo_db, detect_pings=True)
Пример #19
0
 def __init__(self, metricType: str = "tf"):
     """Remember the metric type; preload navec embeddings for the 'emb'/'embm' metrics."""
     self._metric = metricType
     if self._metric in ('emb', 'embm'):
         self._embModel = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
Пример #20
0
import pandas as pd
from navec import Navec
import aiohttp
from aiogram import Bot, Dispatcher, executor, types

# import from other modules
from functions import get_groups, sort_list, check_value
from DB import read_list, insert_into_list, drop, delete_from_list
from config import token, ids

logging.basicConfig(level=logging.INFO)

# aiogram bot + dispatcher wired to the configured token.
bot = Bot(token=token)
dp = Dispatcher(bot)

# Large hudlit embeddings used for word-similarity features in the handlers.
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')


def auth(func):
    """Decorator: only let whitelisted user ids (config ``ids``) reach the handler."""
    async def wrapper(message):
        if message['from']['id'] not in ids:
            # Unknown user: reply with an access-denied message instead.
            return await message.reply("Доступ закрыт", reply=False)
        return await func(message)

    return wrapper


@dp.message_handler(commands=['start']
                    )  # Diplays the message with the user id.
async def welcome(message: types.Message):
    await message.answer('Ваш id ' + str(message['from']['id']) + '\n' +
Пример #21
0
#
import numpy as np
import re
import pymorphy2
from razdel import tokenize
from navec import Navec
from pyaspeller import Word
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from engine import *
import config
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run topic prediction script")
    parser.add_argument("-m",
                        "--message",
                        type=str,
                        help="Past your message to classify",
                        default="мне нужно посетить врача")
    args = parser.parse_args()
    # Morphological analyzer used during text preprocessing.
    morph = pymorphy2.MorphAnalyzer()
    # Pretrained navec word embeddings (path comes from config).
    w2v = Navec.load(config.path)
    keyword_matrix = prepare_keyword_vectors(config.keywords, w2v)
    processed_text = preprocess(args.message, morph, config.normalized)
    # Map the predicted label id back to its human-readable topic name.
    print(config.inv_mapping[predict_label(processed_text, keyword_matrix,
                                           w2v)])
Пример #22
0
from navec import Navec
from numpy import linalg as ln
import numpy as np
import time
import json
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from random import random
from slovnet.model.emb import NavecEmbedding
import torch

# News navec pack: 250K-word vocabulary, 300-dim quantized vectors.
path = 'navec_news_v1_1B_250K_300d_100q.tar'
navec = Navec.load(path)
# emb = NavecEmbedding(navec)

# print(ln.norm(navec['каспийский'] - navec['море']))
# navec.vocab

# path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# navec = Navec.load(path)

# sentences1 = [
#                 [[0.1, 0.3,0.7,0.8,0.9], [0.1, 0.12,0.15,0.21,0.24],
#                 [0.1, 0.3,0.7,0.8,0.9], [0.1, 0.3,0.7,0.8,0.9],
#                 [0.1, 0.3,0.7,0.8,0.9]],

#                 [[0.01, 0.08,0.12,0.13,0.13], [0.58, 0.59,0.63,0.63,0.64],
#                 [0.1, 0.3,0.7,0.8,0.9],
#                 [0.2, 0.22,0.23,0.25,0.28]],
Пример #23
0
 def __init__(self, model_path, vector_model_path):
     """Load the NER model and attach its navec word embeddings."""
     embeddings = Navec.load(vector_model_path)
     self.model = NER.load(model_path)
     self.model.navec(embeddings)
'''
import spacy
import pandas as pd
from razdel import sentenize, tokenize
from navec import Navec
from slovnet import Syntax
from slovnet import Morph
from pymystem3 import Mystem
class Tokens:
    """Lightweight token record mirroring spaCy's token attribute names."""

    def __init__(self, text, lemma_, pos_):
        # Surface form, lemma and part-of-speech tag, stored verbatim.
        self.text = text
        self.lemma_ = lemma_
        self.pos_ = pos_

m = Mystem()
# Morphology pipeline: navec embeddings feed the slovnet morph tagger.
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
morph = Morph.load('slovnet_morph_news_v1.tar', batch_size=4)
morph.navec(navec)

# Sample text used by nlp() below.
text="Европейский союз добавил в санкционный список девять политических деятелей из самопровозглашенных республик Донбасса — Донецкой народной республики (ДНР) и Луганской народной республики (ЛНР) — в связи с прошедшими там выборами. Об этом говорится в документе, опубликованном в официальном журнале Евросоюза."
def nlp(text):
    """Split *text* into tokenized sentences and run slovnet morph tagging on each.

    NOTE(review): this snippet appears truncated — the function body ends
    mid-loop and never returns.
    """
    chunks=[]
    lemmaSent=[]
    Doc=[]
    # Sentence-split, then word-tokenize each sentence.
    for sent in sentenize(text):
        tokens = [_.text for _ in tokenize(sent.text)]
        chunks.append(tokens)

    for chunk in chunks:
        # Drop bare-space tokens before tagging.
        filteredChunk=list(filter(lambda a: a != ' ', chunk))
        markup = next(morph.map([filteredChunk]))
Пример #25
0
 def __init__(self, path_to_navec_data, path_to_syntax_data):
     """Load navec embeddings and a slovnet syntax model wired to them."""
     self.navec = Navec.load(path_to_navec_data)
     # NOTE(review): assumes Syntax.navec() returns self (fluent API) — verify,
     # otherwise self.syntax would end up holding the method's real return value.
     self.syntax = Syntax.load(path_to_syntax_data).navec(self.navec)
Пример #26
0
import time
import os.path

from django.conf import settings

from navec import Navec
from numpy import dot
from numpy.linalg import norm
from annoy import AnnoyIndex


# On-disk locations of the navec pack and the prebuilt Annoy index.
NAVEC_PATH = os.path.join(settings.ROOT_DIR, "parser_tool", "data", "navec_hudlit_v1_12B_500K_300d_100q.tar")
ANNOY_INDEX_PATH = os.path.join(settings.ROOT_DIR, "parser_tool", "data", "ANNOY_tree.ann")

# Load embeddings once at import time and build a word -> row-index lookup.
navec = Navec.load(NAVEC_PATH)
vocabulary = navec.vocab.words
word_to_index = dict()
for i, word in enumerate(vocabulary):
    word_to_index[word] = i

lsh = None # global that holds the Annoy tree

def load_annoy_index():
    ''' 
    Lazy load the Annoy LSH tree.

    This is wrapped in a function to avoid import errors if the index file
    is not present at that time.
    '''
    # Populate the module-level cache only on the first call.
    # NOTE(review): the function body appears truncated in this snippet.
    global lsh
    if lsh is None:
Пример #27
0
def test_ner_tagger():
    """The tagger must reproduce the etalon span texts on the reference text."""
    embeddings = Navec.load(NAVEC)
    tagger = NERTagger.load(SLOVNET, embeddings)
    markup = tagger(TEXT)
    extracted = [TEXT[span.start:span.stop] for span in markup.spans]
    assert extracted == ETALON