class DucklingExtractor(Component):
    """Adds entity normalization by analyzing found entities and
    transforming them into regular formats."""

    name = "ner_duckling"

    context_provides = {
        "process": ["entities"],
    }

    output_provides = ["entities"]

    def __init__(self, duckling_processing_mode, duckling=None):
        # type: (Text, Optional[DucklingWrapper]) -> None
        self.duckling_processing_mode = duckling_processing_mode
        self.duckling = duckling

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["duckling"]

    @classmethod
    def create(cls, duckling_processing_mode):
        # type: (Text) -> DucklingExtractor
        """Create the extractor, validating the requested processing mode."""

        if duckling_processing_mode not in DUCKLING_PROCESSING_MODES:
            raise ValueError(
                    "Invalid duckling processing mode. Got '{}'. Allowed: {}".
                    format(duckling_processing_mode,
                           ", ".join(DUCKLING_PROCESSING_MODES)))

        # use `cls` instead of the hard-coded class name so subclasses
        # get instances of their own type (consistent with `load`)
        return cls(duckling_processing_mode)

    @classmethod
    def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text

        # one cached instance per language, since duckling is
        # initialized per-language in `pipeline_init`
        return cls.name + "-" + model_metadata.language

    def pipeline_init(self, language):
        # type: (Text) -> None
        """Lazily instantiate the duckling wrapper for the pipeline language."""

        from duckling import DucklingWrapper

        if self.duckling is None:
            try:
                # languages in duckling are eg "de$core"
                self.duckling = DucklingWrapper(language=language)
            except ValueError as e:
                raise Exception("Duckling error. {}".format(e))

    def process(self, text, entities):
        # type: (Text, List[Dict[Text, Any]]) -> Dict[Text, Any]
        """Normalize values of entities that duckling also found; optionally
        append duckling-only matches when in "append" mode."""

        if self.duckling is not None:
            parsed = self.duckling.parse(text)
            for duckling_match in parsed:
                for entity in entities:
                    # a duckling match refines an existing entity only if
                    # the character spans line up exactly
                    if entity["start"] == duckling_match["start"] and \
                            entity["end"] == duckling_match["end"]:
                        entity["value"] = duckling_match["value"]["value"]
                        entity["duckling"] = duckling_match["dim"]
                        break
                else:
                    # no existing entity matched this duckling span
                    if self.duckling_processing_mode == "append":
                        # Duckling will retrieve multiple entities, even if they overlap..
                        # hence the append mode might add some noise to the found entities
                        entities.append({
                            "entity": duckling_match["dim"],
                            "duckling": duckling_match["dim"],
                            "value": duckling_match["value"]["value"],
                            "start": duckling_match["start"],
                            "end": duckling_match["end"],
                        })
        return {"entities": entities}

    @classmethod
    def load(cls, duckling_processing_mode):
        # type: (Text) -> DucklingExtractor
        return cls.create(duckling_processing_mode)
class DucklingExtractor(EntityExtractor):
    """Adds entity normalization by analyzing found entities and
    transforming them into regular formats."""

    name = "ner_duckling"

    context_provides = {
        "process": ["entities"],
    }

    output_provides = ["entities"]

    @staticmethod
    def available_dimensions():
        """Return every dimension name the installed duckling knows about."""

        from duckling.dim import Dim
        return [m[1]
                for m in getmembers(Dim)
                if not m[0].startswith("__") and not m[0].endswith("__")]

    def __init__(self, dimensions=None, duckling=None):
        # type: (Optional[List[Text]], Optional[DucklingWrapper]) -> None

        # an empty or missing dimension list means "extract everything"
        self.dimensions = dimensions if dimensions \
            else self.available_dimensions()
        self.duckling = duckling

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["duckling"]

    @classmethod
    def create(cls, duckling_dimensions):
        # type: (Optional[List[Text]]) -> DucklingExtractor
        """Create the extractor, validating the requested dimensions."""

        if duckling_dimensions is None:
            duckling_dimensions = cls.available_dimensions()

        # compute the known dimensions once instead of once per element
        known_dimensions = cls.available_dimensions()
        unknown_dimensions = [dim
                              for dim in duckling_dimensions
                              if dim not in known_dimensions]

        if len(unknown_dimensions) > 0:
            raise ValueError(
                    "Invalid duckling dimension. Got '{}'. Allowed: {}".format(
                            ", ".join(unknown_dimensions),
                            ", ".join(known_dimensions)))

        return DucklingExtractor(duckling_dimensions)

    @classmethod
    def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text

        # one cached instance per language, since duckling is
        # initialized per-language in `pipeline_init`
        return cls.name + "-" + model_metadata.language

    def pipeline_init(self, language):
        # type: (Text) -> None
        """Lazily instantiate the duckling wrapper for the pipeline language."""

        from duckling import DucklingWrapper

        if self.duckling is None:
            try:
                # languages in duckling are eg "de$core"
                self.duckling = DucklingWrapper(language=language)
            except ValueError as e:      # pragma: no cover
                raise Exception("Duckling error. {}".format(e))

    def process(self, text, entities):
        # type: (Text, List[Dict[Text, Any]]) -> Dict[Text, Any]
        """Run duckling over the message text and append all matches whose
        dimension is enabled for this extractor."""

        extracted = []
        if self.duckling is not None:
            matches = self.duckling.parse(text)
            relevant_matches = [match
                                for match in matches
                                if match["dim"] in self.dimensions]
            for match in relevant_matches:
                entity = {"start": match["start"],
                          "end": match["end"],
                          "text": match["text"],
                          "value": match["value"],
                          "entity": match["dim"]}
                extracted.append(entity)

        extracted = self.add_extractor_name(extracted)
        entities.extend(extracted)
        return {"entities": entities}

    def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]
        """Write the configured dimensions next to the model so `load` can
        restore an equivalent extractor."""

        file_name = self.name + ".json"
        full_name = os.path.join(model_dir, file_name)
        with io.open(full_name, 'w') as f:
            f.write(str(json.dumps({"dimensions": self.dimensions})))
        return {"ner_duckling_persisted": file_name}

    @classmethod
    def load(cls, model_dir, ner_duckling_persisted):
        # type: (Text, Text) -> DucklingExtractor
        """Restore a persisted extractor; falls back to all dimensions when
        the persisted file is missing."""

        persisted = os.path.join(model_dir, ner_duckling_persisted)
        if os.path.isfile(persisted):
            with io.open(persisted, encoding='utf-8') as f:
                persisted_data = json.loads(f.read())
                return cls.create(persisted_data["dimensions"])
        # fix: previously fell through and implicitly returned None,
        # contradicting the declared return type
        return cls.create(None)
# NOTE(review): this fragment begins mid-script -- the code that initialises
# `sentence`, `train_data`, `df`, `middle` and `end` (and any enclosing loop)
# is outside this view. Indentation below is reconstructed from the collapsed
# source; confirm against the original file before relying on it.

# Record the sampled product string as an entity annotation and append its
# surface text to the example sentence.
ent['value'] = txt
ent['entity'] = 'product'
sentence['entities'].append(ent)
sentence['text'] += txt + " "

# With probability 0.5 per iteration, chain another connector plus a freshly
# sampled product onto the sentence, annotating each as a 'product' entity.
while random.random() > .5:
    m = random.choice(middle)
    sentence['text'] += m
    # presumably `df` holds candidate product strings in its first column
    # -- TODO confirm
    txt = df.sample().iloc[0, 0]
    ent = dict()
    # entity span is measured in characters over the accumulated text
    ent['start'] = len(sentence['text'])
    ent['end'] = len(sentence['text'] + txt)
    ent['value'] = txt
    ent['entity'] = 'product'
    sentence['entities'].append(ent)
    sentence['text'] += txt + " "

sentence['text'] += random.choice(end)
train_data['rasa_nlu_data']["common_examples"].append(sentence)

# Dump the generated examples in rasa NLU training-data JSON format.
with open('result.json', 'w+') as fp:
    json.dump(train_data, fp)

container = IntentContainer('intent_cache')

# Ad-hoc smoke test of the duckling wrapper.
d = DucklingWrapper()
d.parse('Bring me 250 ml sugar')
d.parse_  # NOTE(review): dangling attribute access -- looks like leftover scratch
print(d.parse_time(u'Let\'s meet at 11:45am'))
print(d.parse_number(u'Bring me one conserve of ravioli'))
print(d.parse_quantity(u'Bring me 100 g of sugar'))