def init_agent(self):
    """Build the NER agent described by this KPI's settings and store it in self.agent."""
    cfg = self.config['kpis'][self.kpi_name]['settings_agent']
    base_dir = cfg['model_dir']
    model_file = cfg['model_file']
    dict_file = cfg['dict_file']
    embedding_file = cfg['embedding_file']
    update_model = cfg['update_model']
    if update_model:
        # Refresh the model archive before loading.
        # NOTE: the config key really is spelled 'model_dowload_url'.
        download_untar(cfg['model_dowload_url'], base_dir)
    with open(os.path.join(base_dir, 'params.json')) as params_file:
        network_params = json.load(params_file)
    corpus = Corpus(
        dicts_filepath=os.path.join(base_dir, dict_file),
        embeddings_file_path=os.path.join(base_dir, embedding_file),
    )
    self.agent = NER(
        corpus,
        pretrained_model_filepath=os.path.join(base_dir, model_file),
        **network_params)
class Extractor:
    """Extracts named entities from raw text with a pretrained NER network.

    Downloads the model archive lazily on first use; calling an instance
    on a text yields Match objects for every tagged entity span.
    """

    def __init__(self, model_path=None, tokenizer=None,
                 model_url='http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz'):
        # Fall back to the packaged ../model directory when no path is given.
        self.model_path = (
            model_path
            or pkg_resources.resource_filename(__name__, "../model")
        )
        self.model_url = model_url
        self._lazy_download()
        with open(self._get_path('params.json')) as params_file:
            self.network_params = json.load(params_file)
        self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
        # NOTE: 'verbouse' is the keyword actually accepted by NER.
        self.network = NER(
            self.corpus,
            verbouse=False,
            pretrained_model_filepath=self._get_path('ner_model'),
            **self.network_params)
        self.tokenizer = tokenizer or Tokenizer()
        self._morph = pymorphy2.MorphAnalyzer()

    def _lazy_download(self):
        # Create the model directory if needed; download the archive only
        # when the directory is empty.
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        if not os.listdir(self.model_path):
            download_untar(self.model_url, self.model_path)

    def _get_path(self, filename):
        # Resolve *filename* inside the model directory.
        return os.path.join(self.model_path, filename)

    def __call__(self, text):
        """Yield a Match for every B/I-tagged entity found in *text*."""
        tokens = list(self.tokenizer(text))
        tokens_lemmas = lemmatize([t.text for t in tokens], self._morph)
        tags = self.network.predict_for_token_batch([tokens_lemmas])[0]

        null_tag = 'O'
        prev_tag = null_tag
        run = []  # tokens of the entity currently being collected
        # Pad both streams with one sentinel so the final entity is flushed.
        padded = zip(itertools.chain(tokens, [None]),
                     itertools.chain(tags, [null_tag]))
        for token, tag in padded:
            if tag.startswith('I'):
                run.append(token)
            elif prev_tag != null_tag:
                # The previous entity just ended: emit it with its overall
                # span and the three-letter type suffix of its tag.
                yield Match(
                    run,
                    Span(run[0].span[0], run[-1].span[1]),
                    prev_tag[-3:],
                )
            if tag.startswith('B'):
                run = [token]
            prev_tag = tag
def __init__(self, model_path=None, tokenizer=None,
             model_url='http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz'):
    """Load the pretrained NER model, downloading the archive first if absent."""
    # Default to the packaged ../model directory.
    self.model_path = model_path or pkg_resources.resource_filename(__name__, "../model")
    self.model_url = model_url
    # Fetch the model archive only when no local copy exists.
    self._lazy_download()
    with open(self._get_path('params.json')) as params_file:
        self.network_params = json.load(params_file)
    self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
    # NOTE: 'verbouse' is the keyword actually accepted by NER.
    self.network = NER(
        self.corpus,
        verbouse=False,
        pretrained_model_filepath=self._get_path('ner_model'),
        **self.network_params)
    self.tokenizer = tokenizer or Tokenizer()
    self._morph = pymorphy2.MorphAnalyzer()
def init_agent(self):
    """Load (and optionally refresh) the Russian NER model and store it in self.agent.

    Fix: the dictionary and pretrained-model paths previously hard-coded the
    'model/' directory while params.json and the download target used the
    configured model_dir; all paths now consistently live under model_dir.
    Also drops the pointless single-argument os.path.join around the glob
    pattern.
    """
    settings = self.config['kpis'][self.kpi_name]['settings_agent']
    model_dir = settings['model_dir']
    update_model = bool(settings['update_model'])
    if update_model:
        # Re-download only when the directory contents no longer match the
        # released archive's checksum.
        glob_arg = os.path.join(model_dir, '*')
        if md5_hashsum(glob(glob_arg)) != 'f25fe8e1297154077fc4d3bf65ed888e':
            download_url = 'http://lnsigo.mipt.ru/export/ner/ner_model_total_rus.tar.gz'
            download_untar(download_url, model_dir)
    with open(os.path.join(model_dir, 'params.json')) as f:
        network_params = json.load(f)
    corpus = Corpus(dicts_filepath=os.path.join(model_dir, 'dict.txt'))
    # NOTE: 'verbouse' is the keyword actually accepted by NER.
    network = NER(corpus,
                  verbouse=False,
                  pretrained_model_filepath=os.path.join(model_dir, 'ner_model'),
                  **network_params)
    self.agent = network
def __init__(self, model_path=None, tokenizer=None,
             model_url='http://lnsigo.mipt.ru/export/ner/ner_model_total_rus.tar.gz'):
    """Set up the pretrained NER pipeline, downloading the model archive if missing."""
    # Use the packaged ../model directory unless an explicit path is given.
    self.model_path = (
        model_path
        or pkg_resources.resource_filename(__name__, "../model")
    )
    self.model_url = model_url
    self._lazy_download()
    # Network hyper-parameters ship alongside the weights.
    with open(self._get_path('params.json')) as params_file:
        self.network_params = json.load(params_file)
    self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
    # NOTE: 'verbouse' is the keyword actually accepted by NER.
    self.network = NER(
        self.corpus,
        verbouse=False,
        pretrained_model_filepath=self._get_path('ner_model'),
        **self.network_params,
    )
    self.tokenizer = tokenizer or Tokenizer()
    self._morph = pymorphy2.MorphAnalyzer()
# Check existence of the model by hashsum.
# BUG FIX: the pattern used to be glob('model_/*') (stray underscore), which
# never matches the 'model/' directory every other path below uses, so the
# checksum test always failed.
if md5_hashsum(glob('model/*')) != 'f25fe8e1297154077fc4d3bf65ed888e':
    # Download and extract model
    download_url = 'http://lnsigo.mipt.ru/export/models/ner/conll_ner.tar.gz'
    download_path = 'model/'
    # NOTE(review): the download is disabled, so a failed hash check falls
    # through to whatever is already in model/ — confirm this is intended.
    # download_untar(download_url, download_path)

# Load network params
with open('model/params.json') as f:
    network_params = json.load(f)

corpus = Corpus(dicts_filepath='model/dict.txt',
                embeddings_file_path='model/glove.6B.100d.txt')
network = NER(corpus,
              pretrained_model_filepath='model/model.ckpt',
              **network_params)


def print_predict(sentence):
    """Tokenize *sentence*, tag it with the NER network and print token/tag pairs."""
    # Split sentence into tokens
    tokens = tokenize(sentence)
    # Lemmatization (был -> быть, его -> он) is deliberately skipped here;
    # the raw tokens are fed to the network.
    # tokens_lemmas = lemmatize(tokens)
    tags = network.predict_for_token_batch([tokens])[0]
    for token, tag in zip(tokens, tags):
        print(token, tag)
        data_fn=args.corpus)
    # NOTE(review): this fragment is the tail of a larger script; the call
    # completed above and the if/else it belongs to begin outside this view —
    # indentation here is reconstructed and should be confirmed.
    # Keep only the frames that contain at least one key of eval_mapping.
    selected = [
        fr for fr in annotated_corpus.selected_frames
        if any([el in fr for el in eval_mapping.keys()])
    ]
    print(selected)
    # NOTE(review): the filtered list above is immediately overwritten with
    # the full frame list — confirm whether the override is intentional.
    selected = annotated_corpus.selected_frames
    print('Creating corpus in', args.data_dir)
    annotated_corpus.get_corpus_srl_iob(args.data_dir, train_set,
                                        args.train_size, selected=selected)
    dataset_dict = prepare_data_dict(args.data_dir)
    corpus = Corpus(dataset_dict, embeddings_file_path=None)
    print_dataset(dataset_dict)
    net = NER(corpus, **model_params)
    # Training schedule passed straight to NER.fit.
    learning_params = {
        'dropout_rate': args.dropout,
        'epochs': 10,
        'learning_rate': 0.005,
        'batch_size': 8,
        'learning_rate_decay': 0.707,
        'model_file_path': args.model_dir
    }
    results = net.fit(**learning_params)
else:
    # Evaluation path: reload a saved dialogue dataset and the pickled splits.
    dialogue_dataset = Dataset(saved_dialogues=args.dataset_path)
    total = 0
    with open(os.path.join(args.exp_dir, 'test_set'), 'rb') as f:
        test_set = pickle.load(f)
    # NOTE(review): the body of this final `with` is cut off in this view.
    with open(os.path.join(args.exp_dir, 'train_set'), 'rb') as f:
# Re-download the model when the directory's content hash no longer matches
# the released archive's checksum.
if md5_hashsum(sorted(glob('model/*'))) != 'fd50a27b96b24cdabdda13795a3baae7':
    # Download and extract model
    download_url = 'http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz'
    download_path = 'model/'
    download_untar(download_url, download_path)

# Network hyper-parameters ship alongside the weights.
with open('model/params.json') as params_file:
    network_params = json.load(params_file)

corpus = Corpus(dicts_filepath='model/dict.txt')
# NOTE: 'verbouse' is the keyword actually accepted by NER.
network = NER(corpus,
              verbouse=False,
              pretrained_model_filepath='model/ner_model',
              **network_params)


def print_predict(sentence):
    """Tag *sentence* with the shared NER network, printing one 'token tag' pair per line."""
    tokens = tokenize(sentence)
    # Lemmatize every token (был -> быть, его -> он) before prediction.
    lemmas = lemmatize(tokens)
    predictions = network.predict_for_token_batch([lemmas])
    for pair in zip(tokens, predictions[0]):
        print(*pair)
parser.add_argument('--threshold', type=float, default=0.2)
parser.add_argument('--type', type=str, default='cnn')
parser.add_argument('--model_dir', type=str)
args = parser.parse_args()

# Network hyper-parameters for the interactive tagging session.
model_params = {
    "filter_width": 5,
    "embeddings_dropout": True,
    "n_filters": [args.hidden_size],
    "token_embeddings_dim": 100,
    "char_embeddings_dim": 25,
    "use_batch_norm": True,
    "use_crf": False,
    "net_type": args.type,
    "cell_type": 'gru',
    "use_capitalization": True,
}

dataset_dict = prepare_data_dict(args.model_dir)
corpus = Corpus(dataset_dict, embeddings_file_path=None)
# NOTE: 'verbouse' is the keyword actually accepted by NER.
network = NER(corpus, verbouse=False, **model_params)

# Restore the trained weights straight into the network's TF session.
saver = tf.train.Saver()
saver.restore(network._sess, os.path.join(args.model_dir, 'ner_model.ckpt'))

# Read-predict-print loop.
while True:
    utt = input('>')
    by_model = print_predict(utt, network, threshold=args.threshold)
    print(by_model)
from ner.network import NER

# Hyper-parameters of the drug-NER convolutional network.
model_params = {
    "filter_width": 7,
    "embeddings_dropout": True,
    "n_filters": [128, 128],
    "token_embeddings_dim": 4,
    "char_embeddings_dim": 50,
    "use_batch_norm": True,
    "use_crf": True,
    "net_type": 'cnn',
    "use_capitalization": True,
}

net = NER(corp, **model_params)
# Load the previously saved weights for this model.
NER.load(net, "C:\\Users\\BioQwer\\Documents\\Development\\ner\\drug_model\\drug_model")

from ner.utils import tokenize, lemmatize


def print_predict(sentence, network):
    """Lower-case and tokenize *sentence*, then print each token with its predicted tag."""
    tokens = tokenize(sentence.lower())
    print(tokens)
    predicted = network.predict_for_token_batch([tokens])
    for token, tag in zip(tokens, predicted[0]):
        print(token, tag)
class Extractor:
    """Callable entity extractor built on a pretrained NER network.

    The model archive is fetched on first use; each call yields Match
    objects describing the entity spans found in the given text.
    """

    def __init__(self, model_path=None, tokenizer=None,
                 model_url='http://lnsigo.mipt.ru/export/ner/ner_model_total_rus.tar.gz'):
        # Resolve the model directory (packaged ../model by default).
        self.model_path = (
            model_path
            or pkg_resources.resource_filename(__name__, "../model")
        )
        self.model_url = model_url
        self._lazy_download()
        with open(self._get_path('params.json')) as params_fh:
            self.network_params = json.load(params_fh)
        self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
        # NOTE: 'verbouse' is the keyword actually accepted by NER.
        self.network = NER(
            self.corpus,
            verbouse=False,
            pretrained_model_filepath=self._get_path('ner_model'),
            **self.network_params,
        )
        self.tokenizer = tokenizer or Tokenizer()
        self._morph = pymorphy2.MorphAnalyzer()

    def _lazy_download(self):
        # Ensure the model directory exists and is populated; only an empty
        # directory triggers a download.
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        if not os.listdir(self.model_path):
            download_untar(self.model_url, self.model_path)

    def _get_path(self, filename):
        # Path of *filename* inside the model directory.
        return os.path.join(self.model_path, filename)

    def __call__(self, text):
        """Yield a Match per B/I-tagged entity detected in *text*."""
        tokens = list(self.tokenizer(text))
        tokens_lemmas = lemmatize([t.text for t in tokens], self._morph)
        tags = self.network.predict_for_token_batch([tokens_lemmas])[0]

        null_tag = 'O'
        last_tag = null_tag
        ent_tokens = []  # tokens of the entity currently being built
        # One trailing sentinel pair guarantees the last entity is emitted.
        for token, tag in zip(itertools.chain(tokens, [None]),
                              itertools.chain(tags, [null_tag])):
            if tag.startswith('I'):
                ent_tokens.append(token)
            elif last_tag != null_tag:
                # Flush the finished entity: covering span plus the
                # three-character type suffix of its tag.
                yield Match(
                    ent_tokens,
                    Span(ent_tokens[0].span[0], ent_tokens[-1].span[1]),
                    last_tag[-3:],
                )
            if tag.startswith('B'):
                ent_tokens = [token]
            last_tag = tag
from ner.network import NER

# Architecture of the drug-NER network (lower-cased corpus variant).
model_params = {
    "filter_width": 7,
    "embeddings_dropout": True,
    "n_filters": [128, 128],
    "token_embeddings_dim": 2,
    "char_embeddings_dim": 250,
    "use_batch_norm": True,
    "use_crf": True,
    "net_type": 'cnn',
    "use_capitalization": True,
}
net = NER(corp, **model_params)

# Training schedule passed straight to NER.fit.
learning_params = {
    'dropout_rate': 0.5,
    'epochs': 5,
    'learning_rate': 0.005,
    'batch_size': 8,
    'learning_rate_decay': 0.707,
}
results = net.fit(**learning_params)

# Persist the trained weights.
NER.save(
    net,
    "C:\\Users\\BioQwer\\Documents\\Development\\ner\\drug_model\\drug_model_lower"
)