async def task():
    """Consume NER requests from the Redis "ner" list and write results back.

    Loads the Turkish ConvBERT NER model once, then loops forever: atomically
    pops a batch of up to eight queued requests (LRANGE + LTRIM in a single
    Redis pipeline), runs the grouped-entity NER pipeline over each request's
    text, and stores the JSON-serialized entities under the request's id.

    NOTE(review): ner() is a blocking, CPU-heavy call executed inside the
    event loop; consider loop.run_in_executor for real concurrency — left
    as-is here to keep the change minimal.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        'Alaeddin/convbert-base-turkish-ner-cased')
    model = AutoModelForTokenClassification.from_pretrained(
        'Alaeddin/convbert-base-turkish-ner-cased')
    ner = TokenClassificationPipeline(model=model, tokenizer=tokenizer,
                                      grouped_entities=True)
    queue = await aioredis.create_redis_pool(
        "redis://redis:6379/0?encoding=utf-8")
    logging.warning("Connected to Redis")
    logging.warning("NER task is running asynchronously...")
    while True:
        # LRANGE + LTRIM in one pipeline so read-and-remove is atomic.
        pipe = queue.pipeline()
        pipe.lrange("ner", 0, 7)
        pipe.ltrim("ner", 8, -1)
        requests, _ = await pipe.execute()
        for raw in requests:
            request = ujson.loads(raw)
            results = ner(request["text"])
            # Offsets come back as numpy integers, which ujson cannot
            # serialize — coerce them to plain ints.
            for entity in results:
                entity['start'] = int(entity['start'])
                entity['end'] = int(entity['end'])
            await queue.set(request["id"], ujson.dumps(results))
        # BUG FIX: asyncio.sleep(0.1) was previously called without `await`,
        # so the sleep coroutine was never run (only a RuntimeWarning) and
        # the loop busy-polled Redis. Awaiting it yields control properly.
        await asyncio.sleep(0.1)
def pipeline(self, text: str):
    """Run token-classification NER over *text* and return the raw pipeline output.

    TODO: maybe this needs sentencizing before inference.
    """
    # -1 selects CPU; any other device type maps to accelerator 0
    # (extend with explicit device ids once multi-GPU is available).
    device_id = 0 if self.device.type != "cpu" else -1
    ner_pipe = TokenClassificationPipeline(
        model=self.model,
        tokenizer=self.tokenizer,
        task="ner",
        device=device_id,
    )
    return ner_pipe(inputs=text)
def __init__(self):
    """Load the Turkish ConvBERT NER checkpoint plus spaCy and both pipelines."""
    checkpoint = "Alaeddin/convbert-base-turkish-ner-cased"
    self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    self.model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    self.config = PretrainedConfig.from_pretrained(checkpoint)
    # Plain token-level pipeline (one entry per sub-token).
    self.pipeline = pipeline('ner', model=self.model,
                             tokenizer=self.tokenizer, config=self.config)
    # spaCy model — presumably used for auxiliary English processing
    # elsewhere in this class; verify against callers.
    self.nlp = spacy.load("en_core_web_sm")
    # Grouped variant merges B-/I- pieces into whole entity spans.
    self.nlp_grouped = TokenClassificationPipeline(
        model=self.model,
        tokenizer=self.tokenizer,
        grouped_entities=True,
    )
def run_pipeline_test(self, model, tokenizer, feature_extractor):
    """Smoke-test the token-classification pipeline on single and batched inputs,
    then delegate to the aggregation-strategy checks."""
    token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

    def expected_entries(count):
        # One type-only template dict per detected token.
        return [
            {
                "entity": ANY(str),
                "score": ANY(float),
                "start": ANY(int),
                "end": ANY(int),
                "index": ANY(int),
                "word": ANY(str),
            }
            for _ in range(count)
        ]

    outputs = token_classifier("A simple string")
    self.assertIsInstance(outputs, list)
    self.assertEqual(nested_simplify(outputs), expected_entries(len(outputs)))

    outputs = token_classifier(
        ["list of strings", "A simple string that is quite a bit longer"])
    self.assertIsInstance(outputs, list)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(
        nested_simplify(outputs),
        [expected_entries(len(outputs[0])), expected_entries(len(outputs[1]))],
    )

    self.run_aggregation_strategy(model, tokenizer)
def create_pipeline(model_name):
    """Build a PyTorch grouped-entity NER pipeline for *model_name*.

    Returns a TokenClassificationPipeline wired with the model's own config
    and a fast tokenizer.
    """
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
        return_offsets_mapping=True,
    )
    model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
    # grouped_entities merges B-/I- sub-tokens into whole entity spans.
    return TokenClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        framework='pt',
        task='ner',
        grouped_entities=True,
    )
def __init__(self, model_type: str = "BERT", model_name: str = "dslim/bert-base-NER"):
    """Set up a token-classification wrapper: adaptor, model, inference
    pipeline, and trainer.

    Args:
        model_type: architecture family key understood by get_adaptor.
        model_name: Hugging Face checkpoint to load.
    """
    self.adaptor = get_adaptor(model_type)
    loaded_model = AutoModelForTokenClassification.from_pretrained(model_name)
    # Parent initializer takes ownership of the model (exposed as self.model).
    super().__init__(model_type, model_name, loaded_model)
    device_number = detect_cuda_device_number()
    self._pipeline = TokenClassificationPipeline(
        model=self.model,
        tokenizer=self.tokenizer,
        device=device_number,
    )
    self._trainer = TOCTrainer(self.model, model_type, self.tokenizer,
                               self._device, self.logger)
from transformers import ElectraForTokenClassification, TokenClassificationPipeline
from tokenization_kocharelectra import KoCharElectraTokenizer
from pprint import pprint

# Korean character-level ELECTRA fine-tuned for NER (KMOU-NLP labels).
tokenizer = KoCharElectraTokenizer.from_pretrained(
    "monologg/kocharelectra-base-kmounlp-ner")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/kocharelectra-base-kmounlp-ner")

# CPU inference (device=-1); drop non-entity "O" tags and merge sub-tokens
# into grouped entity spans.
ner = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    ignore_labels=["O"],
    grouped_entities=True,
    device=-1,
)

pprint(ner(
    "문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다. 출처 : 미디어오늘 (http://www.mediatoday.co.kr)"
))
def run_aggregation_strategy(self, model, tokenizer):
    """Check every aggregation strategy produces grouped-entity output, and
    that the legacy grouped_entities/ignore_subwords kwargs still map onto
    strategies with a deprecation warning."""
    grouped_template = {
        "entity_group": ANY(str),
        "score": ANY(float),
        "start": ANY(int),
        "end": ANY(int),
        "word": ANY(str),
    }
    strategies = [
        ("simple", AggregationStrategy.SIMPLE),
        ("first", AggregationStrategy.FIRST),
        ("max", AggregationStrategy.MAX),
        ("average", AggregationStrategy.AVERAGE),
    ]
    for strategy_name, expected_strategy in strategies:
        token_classifier = TokenClassificationPipeline(
            model=model, tokenizer=tokenizer, aggregation_strategy=strategy_name)
        self.assertEqual(token_classifier.aggregation_strategy, expected_strategy)
        outputs = token_classifier("A simple string")
        self.assertIsInstance(outputs, list)
        self.assertEqual(
            nested_simplify(outputs),
            [dict(grouped_template) for _ in range(len(outputs))],
        )

    # Legacy kwargs are deprecated but must still resolve to strategies.
    with self.assertWarns(UserWarning):
        token_classifier = pipeline(task="ner", model=model,
                                    tokenizer=tokenizer, grouped_entities=True)
    self.assertEqual(token_classifier.aggregation_strategy,
                     AggregationStrategy.SIMPLE)
    with self.assertWarns(UserWarning):
        token_classifier = pipeline(task="ner", model=model,
                                    tokenizer=tokenizer, grouped_entities=True,
                                    ignore_subwords=True)
    self.assertEqual(token_classifier.aggregation_strategy,
                     AggregationStrategy.FIRST)
def get_test_pipeline(self, model, tokenizer, feature_extractor):
    """Return a fresh token-classification pipeline together with the
    canonical example inputs used by the shared test harness."""
    examples = [
        "A simple string",
        "A simple string that is quite a bit longer",
    ]
    return TokenClassificationPipeline(model=model, tokenizer=tokenizer), examples
# news['entities'] = news.Article.apply(get_entities) # huggingface NER pipeline from transformers import TokenClassificationPipeline, TFAutoModelForTokenClassification, AutoTokenizer # from transformers import pipeline # nlp = pipeline('ner') # print(pd.DataFrame(nlp(sentence))) # gives out tokens and labes sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \ 'infrastrucutre.' ## BERT tokenizer and token classification nlp = TokenClassificationPipeline(model=TFAutoModelForTokenClassification.from_pretrained( 'distilbert-base-cased'), tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased'), framework='tf') print(pd.DataFrame(nlp(sentence))) from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification import tensorflow as tf tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 print(model(input_ids)) import numpy as np from transformers import AutoTokenizer, pipeline, TFDistilBertModel tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')