Пример #1
0
 def annotate(self, text):
     """Yield a "Noun" Entity for every match of ``self.regex_string`` in *text*.

     Each entity carries ``self.rule`` as its single rule, the full input
     as ``source_text`` and a fixed confidence of 0.8.
     """
     matches = re.findall(self.regex_string, text)
     for match in matches:
         yield Entity(match, "Noun", rules=[self.rule],
                      source_text=text, confidence=0.8)
Пример #2
0
    def annotate(self, text):
        """Yield a "City" Entity for each word of *text* equal to a known city name.

        *text* is split on whitespace (after lower-casing when
        ``self.lowercase`` is set) and every word is compared against the
        ``"name"`` of each record in ``self.cities``.  The yielded entity
        keeps the city's original name and a data dict with the country
        code, coordinates and hemisphere.
        """
        tokens = (text.lower() if self.lowercase else text).split()

        # Case-insensitive matches get a lower confidence than exact ones.
        score = 0.75 if self.lowercase else 0.9

        for token in tokens:
            for city in self.cities:
                country = city["country"]
                city_name = city["name"]
                latitude = float(city["lat"])

                info = {
                    "name": city_name,
                    "country_code": country,
                    "latitude": latitude,
                    "longitude": float(city["lng"]),
                    # Negative latitude means south of the equator.
                    "hemisphere": "south" if latitude < 0 else "north"
                }

                candidate = city_name.lower() if self.lowercase else city_name
                if token != candidate:
                    continue

                yield Entity(info["name"],
                             "City",
                             source_text=text,
                             data=info,
                             confidence=score)
Пример #3
0
 def annotate(self, text):
     """Yield one Entity per DBpedia type of each resource spotlight finds in *text*.

     ``spotlight.annotate`` is called with this object's host, confidence
     and support settings.  For every returned resource, each
     comma-separated entry of its ``"types"`` field that starts with
     ``"DBpedia:"`` produces an Entity whose type is the text after the
     last colon and whose confidence is the resource's
     ``"similarityScore"``.
     """
     resources = spotlight.annotate(self.host,
                                    text,
                                    confidence=self.confidence,
                                    support=self.support)
     for resource in resources:
         for type_name in resource["types"].split(","):
             if not type_name.startswith("DBpedia:"):
                 continue
             surface = resource["surfaceForm"]
             short_type = type_name.split(":")[-1]
             # A fresh dict (and types list) per entity so entities never
             # share mutable state.
             details = {
                 "uri": resource["URI"],
                 "support": resource["support"],
                 "offset": resource["offset"],
                 "percentageOfSecondRank": resource["percentageOfSecondRank"],
                 "similarityScore": resource["similarityScore"],
                 "types": resource["types"].split(",")
             }
             yield Entity(surface,
                          short_type,
                          source_text=text,
                          data=details,
                          confidence=resource["similarityScore"])
Пример #4
0
 def extract_entities(self, text, as_json=False):
     """Yield an Entity for every item ``self.engine.parse`` returns for *text*.

     The entity value, type and data come from the item's ``"value"``,
     ``"entity_kind"`` and ``"resolved_value"`` keys.  ``as_json`` is
     accepted but not used by this method.
     """
     for item in self.engine.parse(text):
         value, kind = item["value"], item["entity_kind"]
         yield Entity(value, kind, source_text=text,
                      data=item["resolved_value"])
Пример #5
0
 def add_entity_examples(self, name, examples):
     """Register *examples* for entity *name* in the container and in ``self._examples``.

     A single string is treated as a one-element list.  The examples are
     first passed to ``self._container.add_entity`` and then appended, as
     Entity objects, to the list stored under *name*.
     """
     if isinstance(examples, str):
         examples = [examples]
     self._container.add_entity(name, examples)
     if name not in self._examples:
         self._examples[name] = []
     bucket = self._examples[name]
     for sample in examples:
         bucket.append(Entity(sample, name))
Пример #6
0
 def add_entity_examples(self, name, examples):
     """Append rule-backed Entity objects for *examples* under *name* in ``self._examples``.

     A single string is treated as a one-element list.  Each example gets a
     Rule whose pattern is the lower-cased example wrapped in ``\\b`` word
     boundaries.
     """
     if isinstance(examples, str):
         examples = [examples]
     if name not in self._examples:
         self._examples[name] = []
     bucket = self._examples[name]
     for sample in examples:
         pattern = r"\b" + sample.lower() + r"\b"
         bucket.append(Entity(sample, name, rules=Rule(name, pattern)))
Пример #7
0
 def annotate_duration(self, text):
     """Yield a single "duration" Entity if ``_annotate_duration_en`` finds one in *text*.

     Nothing is yielded when the returned delta is falsy.  The entity data
     exposes the delta's days, seconds, microseconds, total seconds and the
     ``nice_duration`` rendering.
     """
     delta, value = _annotate_duration_en(text)
     if not delta:
         return
     details = {
         "days": delta.days,
         "seconds": delta.seconds,
         "microseconds": delta.microseconds,
         "total_seconds": delta.total_seconds(),
         "spoken": nice_duration(delta)
     }
     yield Entity(value, "duration", source_text=text, data=details)
Пример #8
0
 def extract_entities(self, query):
     """Yield an Entity for every regex match of every registered rule in *query*.

     ``self._rules`` is walked key by key; each rule under a key contributes
     its patterns, which are turned into regexes by ``self._create_regex``.
     Every yielded entity is tagged with the rule's name and with the whole
     rule group stored under that key.
     """
     for group_key in self._rules:
         group = self._rules[group_key]
         for rule in group:
             for pattern in rule.rules:
                 compiled = self._create_regex(pattern)
                 for found in compiled.findall(query):
                     # Non-string results (e.g. tuples of groups) are
                     # reduced to their first element.
                     value = found if isinstance(found, str) else found[0]
                     yield Entity(value,
                                  rule.name,
                                  source_text=query,
                                  rules=group)
Пример #9
0
 def annotate(self, text, pretokenized=False):
     """Yield an Entity for each NER constituent of *text* from the configured view.

     The document is built with ``self.pipeline.doc``.  ``self.connl`` takes
     precedence over ``self.ontonotes`` when choosing the NER view; if
     neither is set, or the document is None, nothing is yielded.
     """
     doc = self.pipeline.doc(text, pretokenized=pretokenized)
     if doc is None:
         return
     if self.connl:
         view = doc.get_ner_conll
     elif self.ontonotes:
         view = doc.get_ner_ontonotes
     else:
         return
     constituents = []
     # The view's cons_list may be empty/None; only extend when truthy.
     if view.cons_list:
         constituents.extend(view.cons_list)
     for item in constituents:
         yield Entity(item["tokens"], item["label"],
                      source_text=text, data=item)
Пример #10
0
 def annotate(self, text):
     """Yield a "date" Entity for every date expression dateparser finds in *text*.

     Each entity's value is the matched substring; its data holds the
     timestamp, ISO string, ISO weekday number and the individual
     date/time fields of the parsed datetime.  Text without any date
     yields nothing.
     """
     # deprecated
     from dateparser.search import search_dates
     # search_dates returns None (not an empty list) when nothing is found,
     # which would make the loop below raise TypeError.
     matches = search_dates(text) or []
     for value, date in matches:
         data = {
             "timestamp": date.timestamp(),
             "isoformat": date.isoformat(),
             "weekday": date.isoweekday(),
             "month": date.month,
             "day": date.day,
             "hour": date.hour,
             "minute": date.minute,
             "year": date.year
         }
         yield Entity(value, "date", source_text=text, data=data)
Пример #11
0
    def annotate(self, text):
        """Yield an Entity for every named-entity subtree NLTK finds in *text*.

        The text is tokenized, POS-tagged and chunked with ``nltk.ne_chunk``.
        Each resulting ``Tree`` node becomes an Entity whose value is the
        space-joined tokens, whose type is the node label, and whose data
        holds the label, the POS tags and the tokens.
        """
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)

        for node in nltk.ne_chunk(tagged):
            # Only Tree nodes are named entities; other items are skipped.
            if not isinstance(node, nltk.tree.Tree):
                continue
            # Use the public accessor rather than reaching into __dict__.
            label = node.label()
            words = [leaf[0] for leaf in node]
            data = {
                "label": label,
                "pos_tag": [leaf[1] for leaf in node],
                "tokens": words
            }
            yield Entity(" ".join(words),
                         label,
                         source_text=text,
                         data=data)
Пример #12
0
 def _old_annotate(self, text):
     """Yield a "date" Entity for every date ``datefinder.find_dates`` locates in *text*.

     The entity value is the stripped slice of *text* covered by the
     reported span; the data holds the timestamp, ISO string, ISO weekday
     number and the individual date/time fields.
     """
     # deprecated
     import datefinder
     for found, span in datefinder.find_dates(text, index=True):
         snippet = text[span[0]:span[1]].strip()
         details = {
             "timestamp": found.timestamp(),
             "isoformat": found.isoformat(),
             "weekday": found.isoweekday(),
             "month": found.month,
             "day": found.day,
             "hour": found.hour,
             "minute": found.minute,
             "year": found.year
         }
         yield Entity(snippet, "date", source_text=text, data=details)
Пример #13
0
    def annotate(self, text):
        """Yield an Entity for every quantity ``parser.parse`` finds in *text*.

        The entity type is the unit's entity URI, suffixed with ``":" + unit
        URI`` when the two differ.  The data is a snapshot of the parsed
        object's attributes (without ``span`` and ``surface``) plus the
        spoken form, with the unit and its entity expanded to plain dicts.
        ``currency_code`` is dropped from the unit dict when it is None.
        """
        for quantity in parser.parse(text):
            spoken = quantity.to_spoken()

            # Work on shallow copies.  Aliasing the objects' __dict__ would
            # replace quantity.unit with a dict and rewrite/pop attributes on
            # the unit object itself.
            # NOTE(review): unit objects may be shared between parses, in
            # which case in-place edits would break later calls - confirm.
            unit_attrs = vars(quantity.unit)
            entity_data = dict(vars(unit_attrs["entity"]))
            unit_data = dict(unit_attrs)
            unit_data["entity"] = entity_data

            e_type = entity_data["uri"]
            if unit_data["uri"] != entity_data["uri"]:
                e_type = entity_data["uri"] + ":" + unit_data["uri"]

            if unit_data["currency_code"] is None:
                unit_data.pop("currency_code")

            data = dict(vars(quantity))
            data["unit"] = unit_data
            data["spoken"] = spoken
            data.pop("span")
            data.pop("surface")
            yield Entity(quantity.surface, e_type, source_text=text, data=data)
Пример #14
0
 def annotate(self, text):
     """Yield an Entity for every entity in the model's annotations for *text*.

     Each entry of ``annotations["entities"]`` is unpacked as
     ``(label, start, end, value)``; the entity is built from the value and
     label, with the original input as ``source_text``.
     """
     annotation = self.model.predict(text).__dict__["annotations"]
     for key in annotation["entities"]:
         # Unpack into `value`, not `text`: rebinding `text` would make
         # source_text the entity text instead of the original input.
         label, _start, _end, value = annotation["entities"][key]
         yield Entity(value, label, source_text=text)
Пример #15
0
 def annotate(self, text):
     """Yield an Entity for every known entity string found as a whole word in *text*.

     For each label in ``self.entities`` and each entity string under it,
     the string is removed (between ``\\b`` word boundaries) from the
     lower-cased text; if that changes the text, the entity is yielded
     with its label.
     """
     for label in self.entities:
         for candidate in self.entities[label]:
             lowered = text.lower()
             stripped = re.sub(r"\b" + candidate + r"\b", "", lowered)
             if stripped == lowered:
                 continue
             yield Entity(candidate, label, source_text=text)
Пример #16
0
 def annotate(self, text):
     """Yield an Entity for every item ``spacy_NER`` returns for *text*.

     The entity value is the slice of *text* between the item's ``"start"``
     and ``"end"`` offsets; its type is the item's ``"label"``.
     """
     for hit in spacy_NER(text):
         begin, stop = hit["start"], hit["end"]
         yield Entity(text[begin:stop], hit["label"], source_text=text)
Пример #17
0
 def annotate(self, text):
     """Yield a "keyword" Entity for every result of ``self.rake.run`` on *text*.

     The first element of each result is the entity value and the second
     is stored as ``"score"`` in the entity data.
     """
     for result in self.rake.run(text):
         details = {"score": result[1]}
         phrase = result[0]
         yield Entity(phrase, "keyword", source_text=text, data=details)
Пример #18
0
def extract_hitler(text):
    """Yield one "bad_guy" Entity when *text* contains "hitler" (case-insensitive).

    Nothing is yielded otherwise.
    """
    if "hitler" not in text.lower():
        return
    facts = {"known_for": ["killing jews", "world war 2"]}
    yield Entity("hitler", "bad_guy", source_text=text, data=facts)
Пример #19
0
 def annotate(self, text):
     """Yield an Entity for every item ``polyglot_NER`` returns for *text*.

     The first element of each item is the entity value and the second its
     type.
     """
     for hit in polyglot_NER(text):
         value, kind = hit[0], hit[1]
         yield Entity(value, kind, source_text=text)
Пример #20
0
 def annotate(self, text):
     """Yield an Entity for every entry in ``doc.ents`` after running ``self.nlp`` on *text*.

     The entity value and type come from each entry's ``text`` and
     ``label_`` attributes.
     """
     for span in self.nlp(text).ents:
         yield Entity(span.text, span.label_, source_text=text)