Example #1
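The example omits its import header. Below is a plausible reconstruction, assuming the third-party libraries the code visibly calls (dateutil, the duckling Python wrapper, flair); the project-local names listed in the trailing comment are assumptions about the surrounding package.

import json
import os
from datetime import datetime
from logging import Logger
from pathlib import Path
from typing import Any, AnyStr, Callable, Dict, IO, Iterator, List, Optional

from dateutil.parser import parse
from duckling import DucklingWrapper
from flair.data import Sentence
from flair.models import SequenceTagger

# project-local helpers assumed to be defined in the surrounding package:
# AbstractStep, file_iter, oh (default output handler), load_entities,
# prepare_keyword_processor, match_regexprs, convert_name_to_underscore,
# ENTITY_DATE, ENTITY_NUMBER, ENTITY_PERSON, ENABLED_SYSTEM_ENTITIES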
class ExtractEntitiesStep(AbstractStep):
    """
    Extract entities from collected text.
    """
    def __init__(self,
                 name: str,
                 source_key: Optional[str] = None,
                 overwrite: bool = False,
                 source_iter: Callable[[List[str]],
                                       Iterator[IO[AnyStr]]] = file_iter,
                 output_handler: Callable[[str, Dict[str, Any]], None] = oh):
        super().__init__(name, source_key, overwrite)
        self.__source_iter = source_iter
        self.__output_handler = output_handler
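        # load the entity dictionary, synonyms and regular expressions from the bundled config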
        root_path = Path(__file__).parent.parent
        entities_path = str(root_path / 'config/entities.csv')
        self.entity_reverse_lookup, synonyms, self.regexprs = load_entities(
            entities_path)
        self.keyword_processor = prepare_keyword_processor(synonyms)
        duckling_entities = {ENTITY_DATE, ENTITY_NUMBER}
        tagger_entities = {ENTITY_PERSON}
        if len(duckling_entities.intersection(ENABLED_SYSTEM_ENTITIES)) > 0:
            self.d = DucklingWrapper()

        if len(tagger_entities.intersection(ENABLED_SYSTEM_ENTITIES)) > 0:
            self.tagger = SequenceTagger.load('ner')

    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> None:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        data = input_doc['data']
        text = data['text']
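        # annotate each text fragment with dictionary (keyword), regex and system entities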
        nlp_text = []
        for t in text:
            entities = []
            keywords_found = self.keyword_processor.extract_keywords(
                t, span_info=True)
            for keyword in keywords_found:
                entities.append({
                    'entity': self.entity_reverse_lookup[keyword[0]],
                    'location': keyword[1:],
                    'value': keyword[0],
                    'confidence': 1.0
                })

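            # regex matches carry only the raw value; map it back to its entity name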
            matches = match_regexprs(t, self.regexprs)
            for match in matches:
                match['entity'] = self.entity_reverse_lookup[match['value']]

            entities.extend(matches)
            entities.extend(self.match_system_entities(t))

            # is the span of an entity contained within the span
            # of another entity
            def is_contained(entity):
                start, end = entity['location']
                for ent in entities:
                    s, e = ent['location']
                    # exclude exact span matches
                    if (start == s and end < e) or (
                            start > s and end == e) or (start > s and end < e):
                        return True

                return False

            def is_valid(entity):
                # remove spurious dates
                if entity['entity'] == 'sys-date':
                    start, end = entity['location']
                    if (end - start) < 8:
                        return False

                    value = entity['value']
                    if isinstance(value, str):
                        try:
                            date = parse(value)
                        except ValueError:
                            return False

                        year = date.year
                        if year < 1990 or year > 2025:
                            return False

                return True

            # keep the entity with the longest span where an entity
            # is contained within the span of another
            pruned_entities = [
                ent for ent in entities
                if not is_contained(ent) and is_valid(ent)
            ]
            nlp_text.append({'text': t, 'entities': pruned_entities})

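        # write the annotated text under this step's output directory and record it in the accumulator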
        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        data = {'nlp_text': nlp_text}
        content = {'metadata': metadata, 'data': data}
        accumulator['files_output'].append({
            'filename': output_filename,
            'input': path,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)

    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        step_name = convert_name_to_underscore(self.name)
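        # index files already processed by a previous run so they can be skipped unless overwrite is set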
        processed_file_paths = {}
        if step_name in control_data:
            for x in control_data[step_name]:
                if x['status'] == 'processed':
                    processed_file_paths[x['input']] = x

        for file, path in self.__source_iter(file_paths):
            if not self._overwrite and path in processed_file_paths.keys():
                accumulator['files_output'].append(processed_file_paths[path])
                continue

            self.process_file(file, path, control_data, logger, accumulator)

    def match_system_entities(self, utter):
        matches = []
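        # Duckling handles dates and numbers; results are normalised to the same dict shape as the keyword and regex matches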
        if ENTITY_DATE in ENABLED_SYSTEM_ENTITIES:
            results = self.d.parse_time(utter)
            for result in results:
                matches.append({
                    'entity': 'sys-date',
                    'location': [result['start'], result['end']],
                    'value': result['value']['value'],
                    'confidence': 1.0
                })

        if ENTITY_NUMBER in ENABLED_SYSTEM_ENTITIES:
            results = self.d.parse_number(utter)
            for result in results:
                matches.append({
                    'entity': 'sys-number',
                    'location': [result['start'], result['end']],
                    'value': result['value']['value'],
                    'confidence': 1.0
                })

        if ENTITY_PERSON in ENABLED_SYSTEM_ENTITIES:
            sentence = Sentence(utter)
            self.tagger.predict(sentence)

            for entity in sentence.get_spans('ner'):
                if entity.tag == 'PER':
                    matches.append({
                        'entity': 'sys-person',
                        'location': [entity.start_pos, entity.end_pos],
                        'value': entity.text,
                        'confidence': entity.score
                    })

        return matches
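For context, a minimal sketch of how this step might be driven by the surrounding pipeline. It only relies on the keys that run() and process_file() read above (job.write_root_dir, the source_key file list, and accumulator['files_output']); the step name, paths and the 'collect_text' upstream key are hypothetical, and AbstractStep is assumed to store name, source_key and the overwrite flag.

import logging

# hypothetical wiring; 'collect_text' stands in for whatever step produced the input files
step = ExtractEntitiesStep('Extract Entities', source_key='collect_text')

control_data = {
    'job': {'write_root_dir': '/tmp/nlp_job'},
    # file listing produced by the assumed upstream step
    'collect_text': [{'path': '/tmp/nlp_job/collect_text/collect_text_1.json'}],
}
accumulator = {'files_output': []}
logger = logging.getLogger('extract_entities')

step.run(control_data, logger, accumulator)
print(accumulator['files_output'])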
Example #2

            ent['value'] = txt
            ent['entity'] = 'product'
            sentence['entities'].append(ent)
            sentence['text'] += txt + " "

            while random.random() > .5:
                m = random.choice(middle)
                sentence['text'] += m
                txt = df.sample().iloc[0, 0]
                ent = dict()
                ent['start'] = len(sentence['text'])
                ent['end'] = len(sentence['text'] + txt)
                ent['value'] = txt
                ent['entity'] = 'product'
                sentence['entities'].append(ent)
                sentence['text'] += txt + " "

            sentence['text'] += random.choice(end)
            train_data['rasa_nlu_data']["common_examples"].append(sentence)

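    # persist the generated examples in Rasa NLU's JSON training-data format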
    with open('result.json', 'w+') as fp:
        json.dump(train_data, fp)

    container = IntentContainer('intent_cache')

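    # quick sanity check of DucklingWrapper's time, number and quantity parsers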
    d = DucklingWrapper()
    d.parse('Bring me 250 ml sugar')
    print(d.parse_time(u'Let\'s meet at 11:45am'))
    print(d.parse_number(u'Bring me one conserve of ravioli'))
    print(d.parse_quantity(u'Bring me 100 g of sugar'))