Exemplo n.º 1
0
    def Extract(self, request, context):
        """Yield an ``ExtractedEntity`` for each multi-word entity in the text.

        The request text is parsed once for every entry of
        ``request.languages`` that is also in ``LANGUAGES``. Each entity is
        joined with spaces, passed through ``CLEAN`` and ``collapse_spaces``,
        and kept only when the resulting label is 4 to 200 characters long
        and contains a space. Nothing is yielded when the text is ``None``
        or consists only of whitespace.

        Any exception raised while handling one language is logged and
        processing continues with the next language. The total number of
        yielded entities is logged once all languages are done.
        """
        source = request.text
        if source is None or not source.strip():
            return

        found = 0
        candidates = (code for code in request.languages if code in LANGUAGES)
        for language in candidates:
            try:
                document = Text(source, hint_language_code=language)
                for chunk in document.entities:
                    name = collapse_spaces(CLEAN.sub(' ', ' '.join(chunk)))
                    # Drop labels that are too short, too long, or that
                    # consist of a single word.
                    if not 4 <= len(name) <= 200:
                        continue
                    if ' ' not in name:
                        continue
                    span = chunk.end - chunk.start
                    # Counted before the message is built, so a failing
                    # TYPES lookup still leaves the entity counted.
                    found += 1
                    yield ExtractedEntity(
                        label=name,
                        offset=chunk.start,
                        length=span,
                        type=TYPES[chunk.tag],
                    )
            except Exception:
                log.exception("Cannot extract. Language: %s", language)
        log.info("Extract: extracted %s entities.", found)
Exemplo n.º 2
0
    def Extract(self, request, context):
        """Yield an ``ExtractedEntity`` for each multi-word entity in the text.

        For every entry of ``request.languages`` that is in ``LANGUAGES``,
        the matching model is taken from ``self.MODELS`` (and loaded with
        ``spacy.load`` on first use), then run over ``request.text``.
        An entity is yielded only when its stripped text is 4 to 100
        characters long, contains a space, and its label maps to a type in
        ``LABELS``. Nothing is yielded when the text is ``None`` or consists
        only of whitespace.

        Exceptions raised while running a model or iterating its entities
        are logged and the next language is processed. Model loading happens
        outside that handler, so a loading failure propagates to the caller.
        """
        body = request.text
        if body is None or not body.strip():
            return

        for language in request.languages:
            if language not in LANGUAGES:
                log.debug("Language not suported: %s", language)
                continue

            # Models are loaded on first use and kept in self.MODELS.
            if language not in self.MODELS:
                log.info("Loading spaCy model: %s", language)
                self.MODELS[language] = spacy.load(language)
            model = self.MODELS.get(language)

            try:
                for span in model(request.text).ents:
                    name = span.text.strip()
                    if not 4 <= len(name) <= 100:
                        continue
                    if ' ' not in name:
                        continue
                    kind = LABELS.get(span.label_)
                    if kind is None:
                        continue
                    width = span.end_char - span.start_char
                    yield ExtractedEntity(
                        label=name,
                        offset=span.start_char,
                        length=width,
                        type=kind,
                    )
            except Exception:
                log.exception("Cannot extract. Language: %s", language)
Exemplo n.º 3
0
 def make_entity(self, text, type_, start, end):
     """Build an ``ExtractedEntity`` from a text span, or return ``None``.

     ``text`` is stripped of surrounding whitespace first; when nothing
     remains, no entity is created and ``None`` is returned. Otherwise the
     stripped text, ``type_``, ``start`` and ``end`` are copied onto a new
     ``ExtractedEntity``.
     """
     cleaned = text.strip()
     if not cleaned:
         return None

     result = ExtractedEntity()
     result.text = cleaned
     result.type = type_
     result.start = start
     result.end = end
     return result
Exemplo n.º 4
0
 def extract_spacy(self, text):
     """Yield an ``ExtractedEntity`` for each usable entity in ``text``.

     ``self.spacy`` is called on the text and its ``ents`` are iterated.
     An entity is skipped when its label has no entry in ``SPACY_TYPES``
     or when its text is empty after stripping whitespace. Any exception
     raised along the way is logged and ends the iteration.
     """
     try:
         for ent in self.spacy(text).ents:
             kind = SPACY_TYPES.get(ent.label_)
             name = ent.text.strip()
             if kind is None or not name:
                 continue
             found = ExtractedEntity()
             found.text = name
             found.type = kind
             # NOTE(review): if `ent` is a spaCy Span, `start`/`end` are
             # token indices rather than character offsets - confirm that
             # is what consumers of ExtractedEntity expect.
             found.start = ent.start
             found.end = ent.end
             yield found
     except Exception:
         log.exception("spaCy failed")
Exemplo n.º 5
0
 def Extract(self, request, context):
     """Yield an ``ExtractedEntity`` for each usable entity in the request.

     ``self.nlp`` is run over ``request.text`` and its ``ents`` are
     iterated. An entity is skipped when its label has no entry in
     ``SPACY_TYPES`` or when its text is empty after stripping whitespace.
     When iteration completes, the number of yielded entities and the
     length of the text are logged.

     On any exception the failure is logged and ``context.abort`` is
     called with ``grpc.StatusCode.INTERNAL`` and the exception message.
     """
     try:
         emitted = 0
         for ent in self.nlp(request.text).ents:
             kind = SPACY_TYPES.get(ent.label_)
             name = ent.text.strip()
             if kind is None or not name:
                 continue
             emitted += 1
             found = ExtractedEntity()
             found.text = name
             found.type = kind
             # NOTE(review): if `ent` is a spaCy Span, `start`/`end` are
             # token indices rather than character offsets - confirm that
             # is what consumers of ExtractedEntity expect.
             found.start = ent.start
             found.end = ent.end
             yield found
         log.info("[NER]: %d entities from %d chars", emitted,
                  len(request.text))
     except Exception as exc:
         log.exception("Failed to extract entities")
         context.abort(grpc.StatusCode.INTERNAL, str(exc))