def __init__(self, model):
    self.nlp = model
    self.duckling_wrapper = DucklingWrapper(parse_datetime=True)
    self.stanford_ner = StanfordNERTagger(
        '/Users/mac/stanford-tools/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/mac/stanford-tools/stanford-ner-2018-10-16/stanford-ner.jar',
        encoding='utf-8')
def pipeline_init(self, language):
    # type: (Text, Text) -> None
    from duckling import DucklingWrapper

    if self.duckling is None:
        try:
            # languages in duckling are e.g. "de$core"
            self.duckling = DucklingWrapper(language=language)
        except ValueError as e:
            raise Exception("Duckling error. {}".format(e))
def test():
    ##
    # print(str(DucklingWrapper()))
    # result = time_extract(u'received claims on 28th and 27th ')
    # print(str(result).replace('[', '(').replace(']', ')'))
    result = time_extract(u'received claims in last two months')
    print(str(result).replace('[', '(').replace(']', ')'))
    #
    # print(str(type(result[0]['value']['value'])))
    ##
    result = DucklingWrapper().parse_time(u'received claims in last two months')
    print(str(result))


# test()
# print(str(time_extract("received claims in yesterday")))
# result = DucklingWrapper().parse_time(u'received claims on Jan,28')
# print(str(result))
# result = DucklingWrapper().parse_time(u'received claims on Jan 28')
# print(str(result))
# result = DucklingWrapper().parse_time(u'received claims on January 28')
# print(str(result))
def create_duckling_wrapper(cls, language):
    from duckling import DucklingWrapper

    try:
        # languages in duckling are e.g. "de$core"
        return DucklingWrapper(language=language)
    except ValueError as e:  # pragma: no cover
        raise Exception("Duckling error. {}".format(e))
def init_duck():
    """cache duckling parser"""
    if 'duck' not in d:
        logger.info("re/initializing parser, JVMStarted_state = %d" % jpype.isJVMStarted())
        duck = DucklingWrapper()
        d['duck'] = duck
        logger.info("duck ready to parse....")
def __init__(self, name: str, source_key: str = None, overwrite: bool = False,
             source_iter: Callable[[List[str]], Iterator[IO[AnyStr]]] = file_iter,
             output_handler: Callable[[str, Dict[str, Any]], None] = oh):
    super().__init__(name, source_key, overwrite)
    self.__source_iter = source_iter
    self.__output_handler = output_handler
    root_path = Path(__file__).parent.parent
    entities_path = str(root_path / 'config/entities.csv')
    self.entity_reverse_lookup, synonyms, self.regexprs = load_entities(entities_path)
    self.keyword_processor = prepare_keyword_processor(synonyms)
    duckling_entities = {ENTITY_DATE, ENTITY_NUMBER}
    tagger_entities = {ENTITY_PERSON}
    if len(duckling_entities.intersection(ENABLED_SYSTEM_ENTITIES)) > 0:
        self.d = DucklingWrapper()
    if len(tagger_entities.intersection(ENABLED_SYSTEM_ENTITIES)) > 0:
        self.tagger = SequenceTagger.load('ner')
class NERExtractorClass():

    def __init__(self, model='en'):
        self.nlp = spacy.load(model)
        self.duckling_wrapper = DucklingWrapper(parse_datetime=True)

    def spacy_parse(self, text):
        # https://spacy.io/api/annotation#named-entities
        doc = self.nlp(text)
        ner = []
        for e in doc.ents:
            tmp = {"text": e.text, "label": e.label_}
            ner.append(tmp)
        return ner

    def duckling_parse(self, text):
        weekend = 'by the end of the weekend'
        asap = 'the end of the day'
        text = text.lower()
        text += " "
        text = (text.replace("the end of the week ", weekend)
                    .replace("the end of week ", weekend)
                    .replace("end of week ", weekend)
                    .replace("end of the week ", weekend))
        text = text.replace("asap", asap).replace("as soon as possible", asap)
        result = self.duckling_wrapper.parse_time(text)
        return result

    def parse(self, text, method='spacy'):
        if method == 'spacy':
            return self.spacy_parse(text)
        if method == 'duckling':
            return self.duckling_parse(text)
        return {}
import re
import sys
import pickle

from nltk.classify import Senna
from nltk.stem.wordnet import WordNetLemmatizer
from duckling import DucklingWrapper

pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
lmtzr = WordNetLemmatizer()

# Stop words generation
stop_words = []
file_open_stopwords = open('stop_words.txt', 'r')
for each_word in file_open_stopwords:
    stop_words.append(each_word.strip())
file_open_stopwords.close()

d = DucklingWrapper()

leave_application_json = {}
context_stack = []
leave_application_previous = {}


def intent_extractor(message):
    file_Name = "intent_leave_tfidf.p"
    fileObject = open(file_Name, 'rb')
    passive_tfidf = pickle.load(fileObject)
    fileObject.close()
    file_Name_lsa = "intent_leave_lsa.p"
    fileObject_lsa = open(file_Name_lsa, 'rb')
    passive_lsa = pickle.load(fileObject_lsa)
    fileObject_lsa.close()
class DucklingExtractor(Component):
    """Adds entity normalization by analyzing found entities and
    transforming them into regular formats."""

    name = "ner_duckling"

    context_provides = {
        "process": ["entities"],
    }

    output_provides = ["entities"]

    def __init__(self, duckling_processing_mode, duckling=None):
        # type: (Text, Optional[DucklingWrapper]) -> None
        self.duckling_processing_mode = duckling_processing_mode
        self.duckling = duckling

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["duckling"]

    @classmethod
    def create(cls, duckling_processing_mode):
        if duckling_processing_mode not in DUCKLING_PROCESSING_MODES:
            raise ValueError(
                "Invalid duckling processing mode. Got '{}'. Allowed: {}".format(
                    duckling_processing_mode,
                    ", ".join(DUCKLING_PROCESSING_MODES)))
        return DucklingExtractor(duckling_processing_mode)

    @classmethod
    def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text
        return cls.name + "-" + model_metadata.language

    def pipeline_init(self, language):
        # type: (Text, Text) -> None
        from duckling import DucklingWrapper

        if self.duckling is None:
            try:
                # languages in duckling are e.g. "de$core"
                self.duckling = DucklingWrapper(language=language)
            except ValueError as e:
                raise Exception("Duckling error. {}".format(e))

    def process(self, text, entities):
        # type: (Text, List[Dict[Text, Any]], Text) -> Dict[Text, Any]
        if self.duckling is not None:
            parsed = self.duckling.parse(text)
            for duckling_match in parsed:
                for entity in entities:
                    if entity["start"] == duckling_match["start"] and \
                            entity["end"] == duckling_match["end"]:
                        entity["value"] = duckling_match["value"]["value"]
                        entity["duckling"] = duckling_match["dim"]
                        break
                else:
                    if self.duckling_processing_mode == "append":
                        # Duckling will retrieve multiple entities, even if they overlap,
                        # hence the append mode might add some noise to the found entities
                        entities.append({
                            "entity": duckling_match["dim"],
                            "duckling": duckling_match["dim"],
                            "value": duckling_match["value"]["value"],
                            "start": duckling_match["start"],
                            "end": duckling_match["end"],
                        })
        return {"entities": entities}

    @classmethod
    def load(cls, duckling_processing_mode):
        # type: (Text) -> DucklingExtractor
        return cls.create(duckling_processing_mode)
from flask import Flask, request, jsonify, make_response
from duckling import DucklingWrapper, Language
from dotenv import load_dotenv
import os
import unicodedata

load_dotenv()

app = Flask(__name__)
duck = DucklingWrapper(language=Language.SPANISH, maximum_heap_size='512m')

DUCKLING_HOST = os.getenv("DUCKLING_HOST")
DUCKLING_PORT = os.getenv("DUCKLING_PORT")


def normalize(string: str):
    res = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
    return res


def _parse_date(sent: str):
    if sent is None:
        return None
    ans = duck.parse_time(sent)
    precedence = ["year", "month", "day", "hour", "minute", "second"]
    if len(ans) > 0:
        text = ans[0]["text"]
        val = ans[0]["value"]["value"]
        if "grain" in ans[0]["value"]:
            if normalize(text.lower()) not in [
def duckling_wrapper():
    return DucklingWrapper()
def duckling_wrapper_with_datetime():
    return DucklingWrapper(parse_datetime=True)
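# Illustrative sketch, not part of the original sources: how the two helpers above
# differ. It assumes the duckling package behaviour that parse_datetime=True makes
# parse_time() return datetime.datetime objects in result['value']['value'], while
# the default wrapper returns ISO-8601 strings.
from datetime import datetime

plain = duckling_wrapper()
with_dt = duckling_wrapper_with_datetime()

iso_results = plain.parse_time(u'tomorrow at 9am')
dt_results = with_dt.parse_time(u'tomorrow at 9am')

if iso_results and dt_results:
    assert isinstance(iso_results[0]['value']['value'], str)
    assert isinstance(dt_results[0]['value']['value'], datetime)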
def __init__(self, model='en'):
    self.nlp = spacy.load(model)
    self.duckling_wrapper = DucklingWrapper(parse_datetime=True)
from duckling import DucklingWrapper
import json

d = DucklingWrapper()

while True:
    try:
        sentence = input("Enter: ")
        dic = {'out_put': d.parse_time(sentence)}
        print(d.parse_time(sentence))
        file_open = open('duckling_time_new.json', 'w')
        json.dump(dic, file_open)
        file_open.close()
    except Exception as e:
        print(e)
def __init__(self, entities_types):
    self.entities_types_list = entities_types
    self.ducklingInstance = DucklingWrapper()
def duckling(cls):
    return DucklingWrapper()
def test():
    print(str(DucklingWrapper()))
    result = time_extract(u'received claims on 28th and 27th ')
    print(str(result).replace('[', '(').replace(']', ')'))
class ExtractEntitiesStep(AbstractStep):
    """
    Extract entities from collected text.
    """

    def __init__(self, name: str, source_key: str = None, overwrite: bool = False,
                 source_iter: Callable[[List[str]], Iterator[IO[AnyStr]]] = file_iter,
                 output_handler: Callable[[str, Dict[str, Any]], None] = oh):
        super().__init__(name, source_key, overwrite)
        self.__source_iter = source_iter
        self.__output_handler = output_handler
        root_path = Path(__file__).parent.parent
        entities_path = str(root_path / 'config/entities.csv')
        self.entity_reverse_lookup, synonyms, self.regexprs = load_entities(entities_path)
        self.keyword_processor = prepare_keyword_processor(synonyms)
        duckling_entities = {ENTITY_DATE, ENTITY_NUMBER}
        tagger_entities = {ENTITY_PERSON}
        if len(duckling_entities.intersection(ENABLED_SYSTEM_ENTITIES)) > 0:
            self.d = DucklingWrapper()
        if len(tagger_entities.intersection(ENABLED_SYSTEM_ENTITIES)) > 0:
            self.tagger = SequenceTagger.load('ner')

    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> None:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        data = input_doc['data']
        text = data['text']
        nlp_text = []
        for t in text:
            entities = []
            keywords_found = self.keyword_processor.extract_keywords(t, span_info=True)
            for keyword in keywords_found:
                entities.append({
                    'entity': self.entity_reverse_lookup[keyword[0]],
                    'location': keyword[1:],
                    'value': keyword[0],
                    'confidence': 1.0
                })
            matches = match_regexprs(t, self.regexprs)
            for match in matches:
                match['entity'] = self.entity_reverse_lookup[match['value']]
            entities.extend(matches)
            entities.extend(self.match_system_entities(t))

            # is the span of an entity contained within the span
            # of another entity
            def is_contained(entity):
                start, end = entity['location']
                for ent in entities:
                    s, e = ent['location']
                    # exclude exact span matches
                    if (start == s and end < e) or (start > s and end == e) or (start > s and end < e):
                        return True
                return False

            def is_valid(entity):
                # remove spurious dates
                if entity['entity'] == 'sys-date':
                    start, end = entity['location']
                    if (end - start) < 8:
                        return False
                    value = entity['value']
                    if isinstance(value, str):
                        try:
                            date = parse(value)
                        except ValueError:
                            return False
                        year = date.year
                        if year < 1990 or year > 2025:
                            return False
                return True

            # keep the entity with the longest span where an entity
            # is contained within the span of another
            pruned_entities = [ent for ent in entities
                               if not is_contained(ent) and is_valid(ent)]
            nlp_text.append({'text': t, 'entities': pruned_entities})

        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        data = {}
        data['nlp_text'] = nlp_text
        content = {'metadata': metadata, 'data': data}
        accumulator['files_output'].append({
            'filename': output_filename,
            'input': path,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)

    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        step_name = convert_name_to_underscore(self.name)
        processed_file_paths = {}
        if step_name in control_data:
            for x in control_data[step_name]:
                if x['status'] == 'processed':
                    processed_file_paths[x['input']] = x
        for file, path in self.__source_iter(file_paths):
            if not self._overwrite and path in processed_file_paths.keys():
                accumulator['files_output'].append(processed_file_paths[path])
                continue
            self.process_file(file, path, control_data, logger, accumulator)

    def match_system_entities(self, utter):
        matches = []
        if ENTITY_DATE in ENABLED_SYSTEM_ENTITIES:
            results = self.d.parse_time(utter)
            for result in results:
                matches.append({
                    'entity': 'sys-date',
                    'location': [result['start'], result['end']],
                    'value': result['value']['value'],
                    'confidence': 1.0
                })
        if ENTITY_NUMBER in ENABLED_SYSTEM_ENTITIES:
            results = self.d.parse_number(utter)
            for result in results:
                matches.append({
                    'entity': 'sys-number',
                    'location': [result['start'], result['end']],
                    'value': result['value']['value'],
                    'confidence': 1.0
                })
        sentence = None
        if ENTITY_PERSON in ENABLED_SYSTEM_ENTITIES:
            if sentence is None:
                sentence = Sentence(utter)
            self.tagger.predict(sentence)
            for entity in sentence.get_spans('ner'):
                if entity.tag == 'PER':
                    matches.append({
                        'entity': 'sys-person',
                        'location': [entity.start_pos, entity.end_pos],
                        'value': entity.text,
                        'confidence': entity.score
                    })
        return matches
class DucklingExtractor(EntityExtractor):
    """Adds entity normalization by analyzing found entities and
    transforming them into regular formats."""

    name = "ner_duckling"

    context_provides = {
        "process": ["entities"],
    }

    output_provides = ["entities"]

    @staticmethod
    def available_dimensions():
        from duckling.dim import Dim
        return [m[1]
                for m in getmembers(Dim)
                if not m[0].startswith("__") and not m[0].endswith("__")]

    def __init__(self, dimensions=None, duckling=None):
        # type: (Text, Optional[DucklingWrapper]) -> None
        self.dimensions = dimensions if dimensions else self.available_dimensions()
        self.duckling = duckling

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["duckling"]

    @classmethod
    def create(cls, duckling_dimensions):
        if duckling_dimensions is None:
            duckling_dimensions = cls.available_dimensions()
        unknown_dimensions = [dim
                              for dim in duckling_dimensions
                              if dim not in cls.available_dimensions()]
        if len(unknown_dimensions) > 0:
            raise ValueError(
                "Invalid duckling dimension. Got '{}'. Allowed: {}".format(
                    ", ".join(unknown_dimensions),
                    ", ".join(cls.available_dimensions())))
        return DucklingExtractor(duckling_dimensions)

    @classmethod
    def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text
        return cls.name + "-" + model_metadata.language

    def pipeline_init(self, language):
        # type: (Text, Text) -> None
        from duckling import DucklingWrapper

        if self.duckling is None:
            try:
                # languages in duckling are e.g. "de$core"
                self.duckling = DucklingWrapper(language=language)
            except ValueError as e:  # pragma: no cover
                raise Exception("Duckling error. {}".format(e))

    def process(self, text, entities):
        # type: (Text, List[Dict[Text, Any]]) -> Dict[Text, Any]
        extracted = []
        if self.duckling is not None:
            matches = self.duckling.parse(text)
            relevant_matches = [match
                                for match in matches
                                if match["dim"] in self.dimensions]
            for match in relevant_matches:
                entity = {
                    "start": match["start"],
                    "end": match["end"],
                    "text": match["text"],
                    "value": match["value"],
                    "entity": match["dim"]
                }
                extracted.append(entity)
        extracted = self.add_extractor_name(extracted)
        entities.extend(extracted)
        return {"entities": entities}

    def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]
        file_name = self.name + ".json"
        full_name = os.path.join(model_dir, file_name)
        with io.open(full_name, 'w') as f:
            f.write(str(json.dumps({"dimensions": self.dimensions})))
        return {"ner_duckling_persisted": file_name}

    @classmethod
    def load(cls, model_dir, ner_duckling_persisted):
        # type: (Text) -> DucklingExtractor
        persisted = os.path.join(model_dir, ner_duckling_persisted)
        if os.path.isfile(persisted):
            with io.open(persisted, encoding='utf-8') as f:
                persisted_data = json.loads(f.read())
                return cls.create(persisted_data["dimensions"])
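# Illustrative usage sketch, not part of the original source: exercise the
# DucklingExtractor above on a single utterance. Assumes the rasa_nlu-style
# EntityExtractor base class is importable, the `duckling` package (and a JVM)
# is available, and that "en" is an accepted language id for DucklingWrapper
# (some versions expect the "en$core" form instead).
extractor = DucklingExtractor.create(duckling_dimensions=["time", "number"])
extractor.pipeline_init("en")

result = extractor.process(u"Remind me in two hours to transfer 50 dollars",
                           entities=[])
for ent in result["entities"]:
    print(ent["entity"], ent["text"], ent["value"])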
class NERExtractorClass():

    def __init__(self, model):
        self.nlp = model
        self.duckling_wrapper = DucklingWrapper(parse_datetime=True)
        self.stanford_ner = StanfordNERTagger(
            '/Users/mac/stanford-tools/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/Users/mac/stanford-tools/stanford-ner-2018-10-16/stanford-ner.jar',
            encoding='utf-8')

    def spacy_extract_persons_from_noun_chunks_and_NNP(self, doc):
        persons1 = []
        nsubjs = []
        for w in doc:
            if "nsubj" == w.dep_:
                nsubjs.append(w.text)
        for w in doc.noun_chunks:
            for sub in nsubjs:
                if sub in w.text:
                    persons1.append({"text": w.text, "label": "PERSON"})
        persons2 = [{"text": str(x), "label": "PERSON"}
                    for x in doc if x.tag_ == "NNP"]
        persons = persons1 + persons2
        # deduplicate the merged list of person candidates
        persons = [dict(t) for t in {tuple(d.items()) for d in persons}]
        return persons

    def cust_person_spacy_parse(self, text):
        # https://spacy.io/api/annotation#named-entities
        doc = self.nlp(text)
        ner = []
        for e in doc.ents:
            if e.label_ != "PERSON":
                tmp = {"text": e.text, "label": e.label_}
                ner.append(tmp)
        p_ner = self.spacy_extract_persons_from_noun_chunks_and_NNP(doc)
        ner += p_ner
        return ner

    def spacy_extract_nouns(self, text):
        doc = self.nlp(text)
        persons = []
        for w in doc:
            if w.pos_ == "PRON" and w.text.lower() != "it":
                tmp = {"text": w.text, "label": "PERSON"}
                persons.append(tmp)
        return persons

    def gazetteer_parse(self, text, ner):
        ner = gazetteer_tag(text, ner)
        return ner

    def cust_person_stanford_parse(self, text):
        # https://spacy.io/api/annotation#named-entities
        ner = self.stanford_parse(text)
        p_ner = self.spacy_extract_nouns(text)
        ner += p_ner
        g_ner = self.gazetteer_parse(text, ner)
        ner += g_ner
        tokenized_text = word_tokenize(text)
        new_ner = []
        last_index = 0
        print(ner)
        for n in ner:
            ner_text = " ".join([n['text'] for n in new_ner])
            # print(ner_text)
            if n['text'].lower() not in ner_text.lower() and n['label'] != 'O':
                # print(n['text'])
                if len(n['text'].split()) == 1:
                    current_index = tokenized_text.index(n['text'])
                    if abs(current_index - last_index) == 1 and new_ner[-1]["label"] == n['label']:
                        if current_index - last_index == 1:
                            new_ner[-1]["text"] += " " + n['text']
                        else:
                            new_ner[-1]["text"] = n['text'] + " " + new_ner[-1]["text"]
                    else:
                        tmp = {"text": n['text'], "label": n['label']}
                        new_ner.append(tmp)
                    last_index = current_index
                else:
                    tmp = {"text": n['text'], "label": n['label']}
                    new_ner.append(tmp)
        return new_ner

    def spacy_parse(self, text):
        # https://spacy.io/api/annotation#named-entities
        doc = self.nlp(text)
        ner = []
        for e in doc.ents:
            tmp = {"text": e.text, "label": e.label_}
            ner.append(tmp)
        return ner

    def stanford_parse(self, text):
        tokenized_text = word_tokenize(text)
        classified_text = self.stanford_ner.tag(tokenized_text)
        ner = []
        for w, t in classified_text:
            tmp = {"text": w, "label": t}
            ner.append(tmp)
        return ner

    def duckling_parse(self, text):
        weekend = 'by the end of the weekend'
        asap = 'the end of the day'
        text = text.lower()
        text += " "
        text = (text.replace("the end of the week ", weekend)
                    .replace("the end of week ", weekend)
                    .replace("end of week ", weekend)
                    .replace("end of the week ", weekend))
        text = text.replace("asap", asap).replace("as soon as possible", asap)
        result = self.duckling_wrapper.parse_time(text)
        return result

    def parse(self, text, method='spacy'):
        if method == 'spacy':
            return self.spacy_parse(text)
        elif method == 'stanford':
            return self.stanford_parse(text)
        elif method == 'gazetteer':
            return self.gazetteer_parse(text, [])
        elif method == 'cust_PERSON_spacy':
            return self.cust_person_spacy_parse(text)
        elif method == 'cust_PERSON_stanford':
            return self.cust_person_stanford_parse(text)
        elif method == 'duckling':
            return self.duckling_parse(text)
        return {}
# import pytest
# from datetime import time, date, timedelta, datetime
# from dateutil import parser
# from dateutil.tz import tzlocal
from datetime import datetime
from duckling import DucklingWrapper, Dim
# from autocorrect import spell
import calendar
import traceback

DW_obj = DucklingWrapper()


def is_future_date(s):
    # future date with respect to the year
    # print("entered future")
    s = s + " 00:00:00"
    # print(s)
    date_format = "%Y-%m-%d %H:%M:%S"
    start = datetime.strptime(s, date_format)
    now = datetime.now()
    dt = s
    if start > now:
        # if the parsed date lands in the future, swap its year for last year
        dt = dt.replace(str(int(dt.split('-')[0])),
                        str(int(str(datetime.now().year)) - 1))
        return str(dt).split(' ')[0]
    else:
        return s


def is_future_date_day(s):
    # future date with respect to the month
    # print("entered future")
    ent['value'] = txt
    ent['entity'] = 'product'
    sentence['entities'].append(ent)
    sentence['text'] += txt + " "
    while random.random() > .5:
        m = random.choice(middle)
        sentence['text'] += m
        txt = df.sample().iloc[0, 0]
        ent = dict()
        ent['start'] = len(sentence['text'])
        ent['end'] = len(sentence['text'] + txt)
        ent['value'] = txt
        ent['entity'] = 'product'
        sentence['entities'].append(ent)
        sentence['text'] += txt + " "
    sentence['text'] += random.choice(end)
    train_data['rasa_nlu_data']["common_examples"].append(sentence)

with open('result.json', 'w+') as fp:
    json.dump(train_data, fp)

container = IntentContainer('intent_cache')

d = DucklingWrapper()
d.parse('Bring me 250 ml sugar')
print(d.parse_time(u'Let\'s meet at 11:45am'))
print(d.parse_number(u'Bring me one conserve of ravioli'))
print(d.parse_quantity(u'Bring me 100 g of sugar'))
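# Illustrative only, not produced by running the code above: approximate shape of a
# single parse_time() match, inferred from the fields the snippets in this section
# access (['start'], ['end'], ['dim'], ['text'], ['value']['value'], and the optional
# 'grain' key). The concrete values below are made up.
example_match = {
    'dim': 'time',
    'text': "at 11:45am",
    'start': 11,
    'end': 21,
    'value': {
        'value': '2018-06-01T11:45:00.000-07:00',  # a datetime object when parse_datetime=True
        'grain': 'minute',
    },
}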