def annotate(self, text):
    """Yield a Noun entity for each match of this annotator's regex in *text*."""
    matches = re.findall(self.regex_string, text)
    for match in matches:
        yield Entity(match, "Noun", rules=[self.rule],
                     source_text=text, confidence=0.8)
def annotate(self, text):
    """Yield a City entity for every word of *text* that matches a known city name.

    Args:
        text: raw input string; split on whitespace (lowercased first when
            ``self.lowercase`` is set).

    Yields:
        Entity: one per (word, city) match, carrying country code,
        coordinates and hemisphere in ``data``.
    """
    # lowercase matching is fuzzier, so it gets a lower confidence
    words = text.lower().split() if self.lowercase else text.split()
    conf = 0.75 if self.lowercase else 0.9
    for word in words:
        for city in self.cities:
            name = city["name"]
            candidate = name.lower() if self.lowercase else name
            if word != candidate:
                continue
            # build the payload only on a match (the original built this
            # dict for every city on every word)
            lat = float(city["lat"])
            data = {
                "name": name,
                "country_code": city["country"],
                "latitude": lat,
                "longitude": float(city["lng"]),
                "hemisphere": "south" if lat < 0 else "north"
            }
            yield Entity(name, "City", source_text=text,
                         data=data, confidence=conf)
def annotate(self, text):
    """Yield DBpedia-typed entities from a DBpedia Spotlight endpoint."""
    annotations = spotlight.annotate(self.host, text,
                                     confidence=self.confidence,
                                     support=self.support)
    for ann in annotations:
        for type_name in ann["types"].split(","):
            if not type_name.startswith("DBpedia:"):
                continue
            payload = {
                "uri": ann["URI"],
                "support": ann["support"],
                "offset": ann["offset"],
                "percentageOfSecondRank": ann["percentageOfSecondRank"],
                "similarityScore": ann["similarityScore"],
                "types": ann["types"].split(",")
            }
            yield Entity(ann["surfaceForm"], type_name.split(":")[-1],
                         source_text=text, data=payload,
                         confidence=ann["similarityScore"])
def extract_entities(self, text, as_json=False):
    """Yield entities produced by the underlying parsing engine.

    NOTE(review): *as_json* is accepted but never used in this body —
    confirm whether callers expect JSON output for it.
    """
    for parsed in self.engine.parse(text):
        yield Entity(parsed["value"], parsed["entity_kind"],
                     source_text=text, data=parsed["resolved_value"])
def add_entity_examples(self, name, examples):
    """Register example utterances for entity *name* with the container and local cache."""
    if isinstance(examples, str):
        examples = [examples]
    self._container.add_entity(name, examples)
    self._examples.setdefault(name, [])
    for example in examples:
        self._examples[name].append(Entity(example, name))
def add_entity_examples(self, name, examples):
    """Register example utterances for entity *name* as word-boundary regex rules.

    Args:
        name: entity label.
        examples: one example string or a list of example strings.
    """
    if isinstance(examples, str):
        examples = [examples]
    if name not in self._examples:
        self._examples[name] = []
    for example in examples:
        # re.escape so examples containing regex metacharacters
        # ("C++", "3.5", "AT&T?") match literally instead of producing
        # a corrupted or invalid pattern
        pattern = r'\b' + re.escape(example.lower()) + r'\b'
        self._examples[name].append(
            Entity(example, name, rules=Rule(name, pattern)))
def annotate_duration(self, text):
    """Yield a single duration entity if *text* contains one (English parser)."""
    delta, matched = _annotate_duration_en(text)
    if not delta:
        return
    payload = {
        "days": delta.days,
        "seconds": delta.seconds,
        "microseconds": delta.microseconds,
        "total_seconds": delta.total_seconds(),
        "spoken": nice_duration(delta)
    }
    yield Entity(matched, "duration", source_text=text, data=payload)
def extract_entities(self, query):
    """Yield an entity for every rule pattern that matches *query*."""
    for key, rule_group in self._rules.items():
        for rule in rule_group:
            for pattern in rule.rules:
                compiled = self._create_regex(pattern)
                for found in compiled.findall(query):
                    # findall yields tuples when the pattern has groups;
                    # keep only the first group in that case
                    value = found if isinstance(found, str) else found[0]
                    yield Entity(value, rule.name, source_text=query,
                                 rules=rule_group)
def annotate(self, text, pretokenized=False):
    """Yield NER entities from the pipeline's CoNLL or OntoNotes view."""
    doc = self.pipeline.doc(text, pretokenized=pretokenized)
    if doc is None:
        return
    spans = []
    # NOTE: elif — when both flags are set only the CoNLL view is consulted
    if self.connl:
        view = doc.get_ner_conll
        if view.cons_list:
            spans.extend(view.cons_list)
    elif self.ontonotes:
        view = doc.get_ner_ontonotes
        if view.cons_list:
            spans.extend(view.cons_list)
    for span in spans:
        yield Entity(span["tokens"], span["label"], source_text=text, data=span)
def annotate(self, text):  # deprecated
    """Yield date entities found in *text* via dateparser's fuzzy search."""
    from dateparser.search import search_dates
    # search_dates returns None (not []) when no dates are found; guard so
    # the loop does not raise TypeError on date-free text
    matches = search_dates(text) or []
    for value, date in matches:
        data = {
            "timestamp": date.timestamp(),
            "isoformat": date.isoformat(),
            "weekday": date.isoweekday(),
            "month": date.month,
            "day": date.day,
            "hour": date.hour,
            "minute": date.minute,
            "year": date.year
        }
        yield Entity(value, "date", source_text=text, data=data)
def annotate(self, text):
    """Yield named entities from NLTK's chunker over POS-tagged tokens."""
    tokens = nltk.word_tokenize(text)
    chunked = nltk.ne_chunk(nltk.pos_tag(tokens))
    for chunk in chunked:
        # non-entity tokens come through as plain (word, tag) tuples
        if not isinstance(chunk, nltk.tree.Tree):
            continue
        label = chunk.__dict__["_label"]
        words = [leaf[0] for leaf in chunk]
        data = {
            "label": label,
            "pos_tag": [leaf[1] for leaf in chunk],
            "tokens": words
        }
        yield Entity(" ".join(words), label, source_text=text, data=data)
def _old_annotate(self, text):  # deprecated
    """Yield date entities found in *text* via datefinder."""
    import datefinder
    for date, span in datefinder.find_dates(text, index=True):
        surface = text[span[0]:span[1]].strip()
        payload = {
            "timestamp": date.timestamp(),
            "isoformat": date.isoformat(),
            "weekday": date.isoweekday(),
            "month": date.month,
            "day": date.day,
            "hour": date.hour,
            "minute": date.minute,
            "year": date.year
        }
        yield Entity(surface, "date", source_text=text, data=payload)
def annotate(self, text):
    """Yield quantity entities from the module-level quantity parser.

    WARNING(review): this body aliases and mutates the parsed objects in
    place — ``data`` IS ``e.__dict__`` until the late ``copy()``, and the
    nested assignments rewrite attributes on ``e`` and ``e.unit``. The
    statement order is load-bearing; do not reorder.
    """
    for e in parser.parse(text):
        spoken = e.to_spoken()
        # data aliases e.__dict__: every write below mutates e itself
        data = e.__dict__
        # replaces e.unit with its attribute dict inside e's __dict__
        data["unit"] = e.unit.__dict__
        # replaces the unit's 'entity' attribute with a plain dict
        data["unit"]["entity"] = e.unit["entity"].__dict__
        # NOTE(review): item access on unit/entity here presumably reads the
        # dicts installed above — confirm against the parser's __getitem__
        e_type = e.unit["entity"]["uri"]
        if e.unit["uri"] != e.unit["entity"]["uri"]:
            # qualify the type when the unit differs from its base entity
            e_type = e.unit["entity"]["uri"] + ":" + e.unit["uri"]
        data["spoken"] = spoken
        data.pop("span")
        if data["unit"]["currency_code"] is None:
            # drop the key entirely rather than carrying a null
            data["unit"].pop("currency_code")
        # shallow copy so the final pop("surface") does not hit e itself
        data = data.copy()
        data.pop("surface")
        yield Entity(e.surface, e_type, source_text=text, data=data)
def annotate(self, text):
    """Yield entities predicted by the wrapped model.

    Args:
        text: raw input string, passed whole to ``self.model.predict``.
    """
    annotation = self.model.predict(text).__dict__["annotations"]
    for entity in annotation["entities"]:
        # unpack into ent_text (not text!) — the original shadowed the
        # *text* parameter here, so source_text ended up being the entity's
        # own surface form instead of the full input
        label, _start, _end, ent_text = annotation["entities"][entity]
        yield Entity(ent_text, label, source_text=text)
def annotate(self, text):
    """Yield an entity for every registered keyword found in *text*.

    Matching is case-insensitive on the input side (keywords are used as
    stored) and anchored to word boundaries.
    """
    lowered = text.lower()
    for label, keywords in self.entities.items():
        for keyword in keywords:
            # re.escape keeps keywords containing regex metacharacters from
            # crashing (or silently altering) the word-boundary pattern;
            # re.search replaces the original sub-and-compare roundtrip
            if re.search(r'\b' + re.escape(keyword) + r'\b', lowered):
                yield Entity(keyword, label, source_text=text)
def annotate(self, text):
    """Yield entities detected by the spacy_NER helper, sliced from *text*."""
    for match in spacy_NER(text):
        surface = text[match["start"]:match["end"]]
        yield Entity(surface, match["label"], source_text=text)
def annotate(self, text):
    """Yield RAKE keyword phrases as entities, with the RAKE score in data."""
    for result in self.rake.run(text):
        score = {"score": result[1]}
        yield Entity(result[0], "keyword", source_text=text, data=score)
def extract_hitler(text):
    """Yield a hard-coded 'bad_guy' entity when 'hitler' appears in *text*."""
    if "hitler" not in text.lower():
        return
    yield Entity("hitler", "bad_guy", source_text=text,
                 data={"known_for": ["killing jews", "world war 2"]})
def annotate(self, text):
    """Yield entities from the polyglot NER backend."""
    for result in polyglot_NER(text):
        surface, label = result[0], result[1]
        yield Entity(surface, label, source_text=text)
def annotate(self, text):
    """Yield entities from a spaCy pipeline's document spans."""
    document = self.nlp(text)
    for span in document.ents:
        yield Entity(span.text, span.label_, source_text=text)