Example #1
    def init_agent(self):
        agent_config = self.config['kpis'][self.kpi_name]['settings_agent']
        model_dir = agent_config['model_dir']
        model_file = agent_config['model_file']
        dict_file = agent_config['dict_file']
        embedding_file = agent_config['embedding_file']

        update_model = agent_config['update_model']

        if update_model:
            download_url = agent_config['model_dowload_url']
            download_path = model_dir
            download_untar(download_url, download_path)

        params_path = os.path.join(model_dir, 'params.json')
        with open(params_path) as f:
            network_params = json.load(f)

        model_path = os.path.join(model_dir, model_file)
        dict_path = os.path.join(model_dir, dict_file)
        embedding_path = os.path.join(model_dir, embedding_file)

        corpus = Corpus(dicts_filepath=dict_path,
                        embeddings_file_path=embedding_path)
        network = NER(corpus,
                      pretrained_model_filepath=model_path,
                      **network_params)
        self.agent = network
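
# For reference, a minimal sketch of the config structure that
# init_agent() above expects. The key names come from the code; the
# KPI name and all values are hypothetical.
config = {
    'kpis': {
        'ner_kpi': {
            'settings_agent': {
                'model_dir': 'model',
                'model_file': 'ner_model',
                'dict_file': 'dict.txt',
                'embedding_file': 'embeddings.vec',
                'update_model': True,
                # note: the code reads this key with exactly this spelling
                'model_dowload_url': 'http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz',
            }
        }
    }
}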
Example #2
class Extractor:
    def __init__(self,
                 model_path=None,
                 tokenizer=None,
                 model_url='http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz'):
        self.model_path = (
            model_path
            or pkg_resources.resource_filename(__name__, "../model")
        )
        self.model_url = model_url
        self._lazy_download()

        with open(self._get_path('params.json')) as f:
            self.network_params = json.load(f)

        self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
        self.network = NER(
            self.corpus,
            verbouse=False,
            pretrained_model_filepath=self._get_path('ner_model'),
            **self.network_params)

        self.tokenizer = tokenizer or Tokenizer()
        self._morph = pymorphy2.MorphAnalyzer()

    def _lazy_download(self):
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        if not os.listdir(self.model_path):
            download_untar(self.model_url, self.model_path)

    def _get_path(self, filename):
        return os.path.join(self.model_path, filename)

    def __call__(self, text):
        tokens = list(self.tokenizer(text))
        tokens_lemmas = lemmatize([t.text for t in tokens], self._morph)
        tags = self.network.predict_for_token_batch([tokens_lemmas])[0]

        previous_tag = null_tag = 'O'
        previous_tokens = []

        # Walk the BIO tag sequence; both streams are padded with a
        # trailing sentinel so the final entity is flushed on the last
        # iteration.
        for token, current_tag in zip(
                itertools.chain(tokens, [None]),
                itertools.chain(tags, [null_tag])
        ):
            if current_tag.startswith('I'):
                # Continuation of the current entity.
                previous_tokens.append(token)
            elif previous_tag != null_tag:
                # The entity has just ended: emit it with its character
                # span and the three-letter type, e.g. 'B-PER' -> 'PER'.
                yield Match(
                    previous_tokens,
                    Span(
                        previous_tokens[0].span[0],
                        previous_tokens[-1].span[1],
                    ),
                    previous_tag[-3:]
                )
            if current_tag.startswith('B'):
                # Start of a new entity.
                previous_tokens = [token]
            previous_tag = current_tag
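
# A minimal usage sketch. Extractor.__call__ is a generator, so matches
# arrive lazily; Match is unpacked positionally here, assuming it is a
# namedtuple-like triple as constructed above (tokens, span, type).
# The sentence is arbitrary.
extractor = Extractor()
for tokens, span, entity_type in extractor('Иван Петров работает в Яндексе в Москве'):
    print(entity_type, span, [t.text for t in tokens])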
Example #3
    def init_agent(self):
        agent_settings = self.config['kpis'][self.kpi_name]['settings_agent']
        model_dir = agent_settings['model_dir']
        update_model = bool(agent_settings['update_model'])

        if update_model:
            glob_arg = os.path.join(model_dir, '*')
            if md5_hashsum(glob(glob_arg)) != 'f25fe8e1297154077fc4d3bf65ed888e':
                download_url = 'http://lnsigo.mipt.ru/export/ner/ner_model_total_rus.tar.gz'
                download_path = model_dir
                download_untar(download_url, download_path)

        params_path = os.path.join(model_dir, 'params.json')
        with open(params_path) as f:
            network_params = json.load(f)

        dicts_path = os.path.join(model_dir, 'dict.txt')
        corpus = Corpus(dicts_filepath=dicts_path)
        network = NER(corpus,
                      verbouse=False,
                      pretrained_model_filepath=os.path.join(model_dir, 'ner_model'),
                      **network_params)
        self.agent = network
Example #4
    def __init__(self,
                 model_path=None,
                 tokenizer=None,
                 model_url='http://lnsigo.mipt.ru/export/ner/ner_model_total_rus.tar.gz'):
        self.model_path = (
            model_path
            or pkg_resources.resource_filename(__name__, "../model")
        )
        self.model_url = model_url
        self._lazy_download()

        with open(self._get_path('params.json')) as f:
            self.network_params = json.load(f)

        self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
        self.network = NER(
            self.corpus,
            verbouse=False,
            pretrained_model_filepath=self._get_path('ner_model'),
            **self.network_params,
        )

        self.tokenizer = tokenizer or Tokenizer()
        self._morph = pymorphy2.MorphAnalyzer()
Example #5
import json
from glob import glob

from ner.corpus import Corpus
from ner.network import NER
from ner.utils import download_untar, md5_hashsum, tokenize

# Check existence of the model by hashsum
if md5_hashsum(glob('model/*')) != 'f25fe8e1297154077fc4d3bf65ed888e':
    # Download and extract model
    download_url = 'http://lnsigo.mipt.ru/export/models/ner/conll_ner.tar.gz'
    download_path = 'model/'
    download_untar(download_url, download_path)

# Load network params
with open('model/params.json') as f:
    network_params = json.load(f)

corpus = Corpus(dicts_filepath='model/dict.txt',
                embeddings_file_path='model/glove.6B.100d.txt')

network = NER(corpus,
              pretrained_model_filepath='model/model.ckpt',
              **network_params)


def print_predict(sentence):
    # Split sentence into tokens
    tokens = tokenize(sentence)

    # Lemmatization (e.g. был -> быть, его -> он) is skipped here:
    # this English CoNLL model is fed raw tokens.
    tags = network.predict_for_token_batch([tokens])[0]
    for token, tag in zip(tokens, tags):
        print(token, tag)
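
# A quick smoke test of the helper above; the sentence is arbitrary.
# It should print one "token tag" pair per line.
print_predict('John lives in Detroit and works for General Motors')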
Example #6
            data_fn=args.corpus)
        selected = [
            fr for fr in annotated_corpus.selected_frames
            if any(el in fr for el in eval_mapping)
        ]
        print(selected)
        selected = annotated_corpus.selected_frames
        print('Creating corpus in', args.data_dir)
        annotated_corpus.get_corpus_srl_iob(args.data_dir,
                                            train_set,
                                            args.train_size,
                                            selected=selected)
    dataset_dict = prepare_data_dict(args.data_dir)
    corpus = Corpus(dataset_dict, embeddings_file_path=None)
    print_dataset(dataset_dict)
    net = NER(corpus, **model_params)
    learning_params = {
        'dropout_rate': args.dropout,
        'epochs': 10,
        'learning_rate': 0.005,
        'batch_size': 8,
        'learning_rate_decay': 0.707,
        'model_file_path': args.model_dir
    }
    results = net.fit(**learning_params)
else:
    dialogue_dataset = Dataset(saved_dialogues=args.dataset_path)
    total = 0
    with open(os.path.join(args.exp_dir, 'test_set'), 'rb') as f:
        test_set = pickle.load(f)
    with open(os.path.join(args.exp_dir, 'train_set'), 'rb') as f:
Example #7
import json
from glob import glob

from ner.corpus import Corpus
from ner.network import NER
from ner.utils import download_untar, lemmatize, md5_hashsum, tokenize

# Check existence of the model by hashsum
if md5_hashsum(sorted(glob('model/*'))) != 'fd50a27b96b24cdabdda13795a3baae7':
    # Download and extract model
    download_url = 'http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz'
    download_path = 'model/'
    download_untar(download_url, download_path)

# Load network params
with open('model/params.json') as f:
    network_params = json.load(f)


corpus = Corpus(dicts_filepath='model/dict.txt')

network = NER(corpus,
              verbouse=False,
              pretrained_model_filepath='model/ner_model',
              **network_params)


def print_predict(sentence):
    # Split sentence into tokens
    tokens = tokenize(sentence)

    # Lemmatize every token
    # Example: был -> быть, его -> он
    tokens_lemmas = lemmatize(tokens)

    tags = network.predict_for_token_batch([tokens_lemmas])[0]
    for token, tag in zip(tokens, tags):
        print(token, tag)
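
# A quick check of the helper above; the sentence is arbitrary, and the
# model here is the Russian one downloaded above.
print_predict('Иван Петров живет в Санкт-Петербурге и работает в Яндексе')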

Example #8
    parser.add_argument('--threshold', type=float, default=0.2)
    parser.add_argument('--type', type=str, default='cnn')
    parser.add_argument('--model_dir', type=str)
    args = parser.parse_args()

    model_params = {
        "filter_width": 5,
        "embeddings_dropout": True,
        "n_filters": [args.hidden_size],
        "token_embeddings_dim": 100,
        "char_embeddings_dim": 25,
        "use_batch_norm": True,
        "use_crf": False,
        "net_type": args.type,
        "cell_type": 'gru',
        "use_capitalization": True,
    }

    dataset_dict = prepare_data_dict(args.model_dir)
    corpus = Corpus(dataset_dict, embeddings_file_path=None)
    network = NER(corpus, verbouse=False, **model_params)

    saver = tf.train.Saver()
    saver.restore(network._sess, os.path.join(args.model_dir,
                                              'ner_model.ckpt'))

    while True:
        utt = input('>')
        by_model = print_predict(utt, network, threshold=args.threshold)
        print(by_model)
Example #9
from ner.network import NER

model_params = {"filter_width": 7,
                "embeddings_dropout": True,
                "n_filters": [
                    128, 128,
                ],
                "token_embeddings_dim": 4,
                "char_embeddings_dim": 50,
                "use_batch_norm": True,
                "use_crf": True,
                "net_type": 'cnn',
                "use_capitalization": True,
               }

net = NER(corp, **model_params)

net.load("C:\\Users\\BioQwer\\Documents\\Development\\ner\\drug_model\\drug_model")


from ner.utils import tokenize, lemmatize


def print_predict(sentence, network):
    # Split sentence into tokens
    tokens = tokenize(sentence.lower())
    print(tokens)
    tags = network.predict_for_token_batch([tokens])[0]
    for token, tag in zip(tokens, tags):
        print(token, tag)
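
# A quick call of the helper above; the sentence is arbitrary, and the
# tags depend on what the drug model was trained to recognize.
print_predict('patient was given 500 mg of paracetamol daily', net)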
Example #10
class Extractor:
    def __init__(self,
                 model_path=None,
                 tokenizer=None,
                 model_url='http://lnsigo.mipt.ru/export/ner/ner_model_total_rus.tar.gz'):
        self.model_path = (
            model_path
            or pkg_resources.resource_filename(__name__, "../model")
        )
        self.model_url = model_url
        self._lazy_download()

        with open(self._get_path('params.json')) as f:
            self.network_params = json.load(f)

        self.corpus = Corpus(dicts_filepath=self._get_path('dict.txt'))
        self.network = NER(
            self.corpus,
            verbouse=False,
            pretrained_model_filepath=self._get_path('ner_model'),
            **self.network_params,
        )

        self.tokenizer = tokenizer or Tokenizer()
        self._morph = pymorphy2.MorphAnalyzer()

    def _lazy_download(self):
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        if not os.listdir(self.model_path):
            download_untar(self.model_url, self.model_path)

    def _get_path(self, filename):
        return os.path.join(self.model_path, filename)

    def __call__(self, text):
        tokens = list(self.tokenizer(text))
        tokens_lemmas = lemmatize([t.text for t in tokens], self._morph)
        tags = self.network.predict_for_token_batch([tokens_lemmas])[0]

        previous_tag = null_tag = 'O'
        previous_tokens = []

        for token, current_tag in zip(
                itertools.chain(tokens, [None]),
                itertools.chain(tags, [null_tag])
        ):
            if current_tag.startswith('I'):
                previous_tokens.append(token)
            elif previous_tag != null_tag:
                yield Match(
                    previous_tokens,
                    Span(
                        previous_tokens[0].span[0],
                        previous_tokens[-1].span[1],
                    ),
                    previous_tag[-3:]
                )
            if current_tag.startswith('B'):
                previous_tokens = [token]
            previous_tag = current_tag
Example #11
from ner.network import NER

model_params = {
    "filter_width": 7,
    "embeddings_dropout": True,
    "n_filters": [
        128,
        128,
    ],
    "token_embeddings_dim": 2,
    "char_embeddings_dim": 250,
    "use_batch_norm": True,
    "use_crf": True,
    "net_type": 'cnn',
    "use_capitalization": True,
}

net = NER(corp, **model_params)

learning_params = {
    'dropout_rate': 0.5,
    'epochs': 5,
    'learning_rate': 0.005,
    'batch_size': 8,
    'learning_rate_decay': 0.707
}
results = net.fit(**learning_params)
net.save("C:\\Users\\BioQwer\\Documents\\Development\\ner\\drug_model\\drug_model_lower")
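
# The saved checkpoint can later be restored into a freshly built
# network, as Example #9 does (a sketch; assumes the same corp and
# model_params used for training above).
restored = NER(corp, **model_params)
restored.load("C:\\Users\\BioQwer\\Documents\\Development\\ner\\drug_model\\drug_model_lower")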