def Extract(self, request, context):
    """Yield an ``ExtractedEntity`` for each multi-word entity in the text.

    ``request.text`` is parsed with ``Text`` once for every code in
    ``request.languages`` that is also listed in ``LANGUAGES``; any other
    code is skipped without logging. Nothing is yielded when the text is
    ``None`` or blank.

    An entity is emitted only when its cleaned label is 4 to 200 characters
    long, contains at least one space, and its tag has a mapping in
    ``TYPES``. A failure while handling one language is logged and the
    remaining languages are still processed.

    Args:
        request: Object exposing ``text`` and ``languages``.
        context: Not used by this method.

    Yields:
        ExtractedEntity: Built with ``label``, ``offset``, ``length`` and
        ``type`` keyword arguments.
    """
    text = request.text
    if text is None or not text.strip():
        return

    entity_count = 0
    for language in request.languages:
        if language not in LANGUAGES:
            continue
        try:
            parsed = Text(text, hint_language_code=language)
            for entity in parsed.entities:
                # The entity iterates over its parts; join them into one
                # label, then normalise it with CLEAN and collapse_spaces.
                label = ' '.join(entity)
                label = CLEAN.sub(' ', label)
                label = collapse_spaces(label)
                if len(label) < 4 or len(label) > 200:
                    continue
                # Single-word labels are dropped.
                if ' ' not in label:
                    continue
                # Skip just this entity when its tag has no mapping.
                # Indexing TYPES directly would raise KeyError into the
                # handler below and discard every remaining entity for
                # this language, after already bumping the counter.
                type_ = TYPES.get(entity.tag)
                if type_ is None:
                    continue
                # NOTE(review): offset/length come straight from
                # entity.start/entity.end as reported by Text; confirm
                # their unit (characters vs. words) with the consumer.
                length = entity.end - entity.start
                entity_count += 1
                yield ExtractedEntity(
                    label=label,
                    offset=entity.start,
                    length=length,
                    type=type_,
                )
        except Exception:
            # Best effort: one failing language must not end the stream.
            log.exception("Cannot extract. Language: %s", language)
    log.info("Extract: extracted %s entities.", entity_count)
def Extract(self, request, context):
    """Yield an ``ExtractedEntity`` for each multi-word spaCy entity.

    For every code in ``request.languages`` that is listed in
    ``LANGUAGES``, the matching model is taken from ``self.MODELS``
    (loaded with ``spacy.load`` and cached there on first use) and run
    over ``request.text``. Codes not in ``LANGUAGES`` are logged at debug
    level and skipped. Nothing is yielded when the text is ``None`` or
    blank.

    An entity is emitted only when its stripped text is 4 to 100
    characters long, contains at least one space, and its ``label_`` has
    a mapping in ``LABELS``. A failure while running a model is logged and
    the remaining languages are still processed; a failure while loading
    a model is not caught here.

    Args:
        request: Object exposing ``text`` and ``languages``.
        context: Not used by this method.

    Yields:
        ExtractedEntity: Built with ``label``, ``offset``, ``length`` and
        ``type`` keyword arguments; ``offset``/``length`` are derived from
        the entity's ``start_char``/``end_char``.
    """
    text = request.text
    if text is None or not text.strip():
        return

    for language in request.languages:
        if language not in LANGUAGES:
            log.debug("Language not supported: %s", language)
            continue
        if language not in self.MODELS:
            log.info("Loading spaCy model: %s", language)
            self.MODELS[language] = spacy.load(language)
        nlp = self.MODELS[language]
        try:
            doc = nlp(text)
            for entity in doc.ents:
                # Use a separate name so the request text is not rebound
                # while it is still the input of the outer loop.
                label = entity.text.strip()
                if len(label) < 4 or len(label) > 100:
                    continue
                # Single-word labels are dropped.
                if ' ' not in label:
                    continue
                type_ = LABELS.get(entity.label_)
                if type_ is None:
                    continue
                # The span is the unstripped entity extent.
                length = entity.end_char - entity.start_char
                yield ExtractedEntity(
                    label=label,
                    offset=entity.start_char,
                    length=length,
                    type=type_,
                )
        except Exception:
            # Best effort: one failing language must not end the stream.
            log.exception("Cannot extract. Language: %s", language)
def make_entity(self, text, type_, start, end):
    """Build an ``ExtractedEntity`` for a text span.

    Args:
        text: Raw entity text; surrounding whitespace is stripped.
        type_: Value stored in the entity's ``type`` field.
        start: Value stored in the entity's ``start`` field.
        end: Value stored in the entity's ``end`` field.

    Returns:
        The populated ``ExtractedEntity``, or ``None`` when the text is
        empty after stripping.
    """
    label = text.strip()
    if label:
        result = ExtractedEntity()
        result.text = label
        result.type = type_
        result.start = start
        result.end = end
        return result
    # Nothing left after stripping: there is no entity to report.
    return None
def extract_spacy(self, text):
    """Yield an ``ExtractedEntity`` for each usable entity ``self.spacy`` finds.

    ``self.spacy`` is called on ``text`` and the ``ents`` of its result
    are walked. An entity is emitted only when its ``label_`` has a
    mapping in ``SPACY_TYPES`` and its text is non-empty after stripping.
    Any exception raised along the way is logged and ends the iteration
    instead of propagating.

    Args:
        text: The text handed to ``self.spacy``.

    Yields:
        ExtractedEntity: With ``text``, ``type``, ``start`` and ``end`` set.
    """
    try:
        for span in self.spacy(text).ents:
            kind = SPACY_TYPES.get(span.label_)
            cleaned = span.text.strip()
            if kind is None or not cleaned:
                continue
            found = ExtractedEntity()
            found.text = cleaned
            found.type = kind
            # NOTE(review): if these are spaCy spans, start/end are token
            # indices rather than character offsets - confirm that is what
            # consumers of ExtractedEntity expect.
            found.start = span.start
            found.end = span.end
            yield found
    except Exception:
        log.exception("spaCy failed")
def Extract(self, request, context):
    """Yield an ``ExtractedEntity`` for each usable entity in ``request.text``.

    ``self.nlp`` is called on ``request.text`` and the ``ents`` of its
    result are walked. An entity is emitted only when its ``label_`` has a
    mapping in ``SPACY_TYPES`` and its text is non-empty after stripping.
    Once the entities are exhausted, the number emitted and the length of
    the request text are logged.

    Any exception is logged and then reported through ``context.abort``
    with ``grpc.StatusCode.INTERNAL`` and the exception's string form.

    Args:
        request: Object exposing ``text``.
        context: Object exposing ``abort(code, details)``.

    Yields:
        ExtractedEntity: With ``text``, ``type``, ``start`` and ``end`` set.
    """
    try:
        emitted = 0
        for span in self.nlp(request.text).ents:
            kind = SPACY_TYPES.get(span.label_)
            cleaned = span.text.strip()
            if kind is None or not cleaned:
                continue
            emitted += 1
            message = ExtractedEntity()
            message.text = cleaned
            message.type = kind
            # NOTE(review): if these are spaCy spans, start/end are token
            # indices rather than character offsets - confirm that is what
            # consumers of ExtractedEntity expect.
            message.start = span.start
            message.end = span.end
            yield message
        log.info("[NER]: %d entities from %d chars", emitted, len(request.text))
    except Exception as exc:
        log.exception("Failed to extract entities")
        context.abort(grpc.StatusCode.INTERNAL, str(exc))