Пример #1
0
    def classify_sentences(self, sentences):
        """ Classify the given sentences

            :param list sentences: sentences to be classified. Each one
             should be a dict with a `text`, a source `url` and some `linked_entities`
            :return: Classified sentences with the recognized `fes`
            :rtype: generator of dicts
        """
        self.extractor.start()

        # First pass: tag every usable sentence so the extractor accumulates
        # the complete feature space before a single batched prediction.
        sentences_data = []
        for data in sentences:
            if 'url' not in data:
                # warning() instead of the deprecated warn(); the original
                # message carried a '%d' placeholder with no argument, so the
                # row number was never actually logged.
                logger.warning('found a sentence with no URL, skipping it')
                continue

            entities = dict(enumerate(e['chunk'] for e in data.get('linked_entities', [])))
            tagged = self.extractor.process_sentence(
                data['text'], entities, add_unknown=False, gazetteer=self.gazetteer
            )

            data['tagged'] = tagged
            sentences_data.append(data)

        features, _ = self.extractor.get_features()
        y = self.model.predict(features)

        # y holds one predicted role per tagged token, across all sentences;
        # token_offset walks through it in the same order the tokens were fed.
        token_offset = 0
        # NOTE(review): the original bound the uncalled method
        # `role_index.items` and later subscripted it with ['O'], which cannot
        # work on a mapping; use the index mapping itself for label -> index
        # lookups (mirrors how the sibling classifier uses `label_index`).
        role_label_to_index = self.extractor.role_index
        role_index_to_label = self.extractor.role_index.reverse_map()

        for data in sentences_data:
            fes = []
            for each in data['tagged']:
                chunk = each[0]
                predicted_role = y[token_offset]

                # 'O' is the null role (chunk belongs to no frame element)
                if predicted_role != role_label_to_index['O']:
                    label = role_index_to_label[predicted_role]
                    logger.debug('chunk "%s" classified as "%s"', chunk, label)
                    fes.append({
                        'chunk': chunk,
                        'fe': label,
                    })
                    # TODO
                    # do not group entities into a single chunk, and after classification
                    # check if the word is contained in an entity; if so, assign the label
                    # to the whole entity

                token_offset += 1

            logger.debug('found %d FEs in sentence "%s"', len(fes), data['text'])
            if fes:
                classified = {
                    'lu': data['lu'],
                    'name': data['name'],
                    'url': data['url'],
                    'text': data['text'],
                    'fes': fes,
                    'linked_entities': data.get('linked_entities', []),
                }

                # language-specific post-processing rules get the final word
                final = apply_custom_classification_rules(classified, self.language)
                yield final
Пример #2
0
    def label_sentence(self, sentence, normalize_numerical, score_type, core_weight):
        """ Labels a single sentence

            :param sentence: Sentence data to label
            :param normalize_numerical: Automatically normalize numerical FEs
            :param score_type: Which type of score (if any) to use to
             compute the classification confidence
            :param core_weight: Weight of the core FEs (used in the scoring)
            :return: Labeled data, or None when the sentence is unusable or
             no frame could be assigned
        """
        logger.debug('processing sentence "%s"', sentence['text'])
        if not sentence.get('url'):
            # warning() instead of the deprecated warn()
            logger.warning('a sentence is missing the url, skipping it')
            return None
        elif not sentence.get('text', '').strip():
            return None

        # reuse pre-computed POS tags when the caller already provides them
        tagged = sentence['tagged'] if 'tagged' in sentence else self.tagger.tag_one(sentence['text'])

        # Normalize + annotate numerical FEs (dates, quantities, ...)
        numerical_fes = []
        if normalize_numerical:
            # extend() consumes the generator directly; no list() needed
            numerical_fes.extend(normalize_numerical_fes(self.language, sentence['text']))

        # Try each lemma that triggers a known frame, in token order; the
        # first frame for which any FE can be assigned wins (see `break`).
        for token, pos, lemma in tagged:
            if lemma not in self.frame_data or not pos.startswith(self.frame_data[lemma]['pos']):
                continue

            frame = self.frame_data[lemma]
            if not frame['ontology_to_fe'].keys():
                logger.debug('missing FE types for frame %s, skipping',
                             frame['frame'])
                continue

            logger.debug('trying frame %s with FE of types %s', frame['frame'],
                         frame['ontology_to_fe'].keys())

            assigned_fes = self.assign_frame_elements(sentence['linked_entities'], frame)
            all_fes = numerical_fes + assigned_fes
            if assigned_fes or numerical_fes:
                logger.debug('assigning frame: %s and FEs %s', frame['frame'], all_fes)
                labeled = {
                    'name': sentence['name'],
                    'url': sentence['url'],
                    'text': sentence['text'],
                    'linked_entities': sentence['linked_entities'],
                    'frame': frame['frame'],
                    'fes': all_fes,
                    'lu': lemma,
                }
                break
            else:
                logger.debug('no FEs assigned for frame %s, trying another one', frame['frame'])
        else:
            # for/else: no lemma led to a frame assignment
            logger.debug('did not assign any frame to sentence "%s"', sentence['text'])
            return None

        if score_type:
            labeled['score'] = scoring.compute_score(labeled,
                                                     score_type,
                                                     core_weight)

        # internal sanity check: a labeled sentence always has an LU and FEs
        assert 'lu' in labeled and labeled['fes']

        final = apply_custom_classification_rules(labeled, self.language)
        return final
Пример #3
0
    def classify_sentences(self, sentences):
        """ Classify the given sentences

            :param list sentences: sentences to be classified. Each one
             should be a dict with a `text`, a source `url` and some `linked_entities`
            :return: Classified sentences with the recognized `fes`
            :rtype: generator of dicts
        """
        self.extractor.start()

        # First pass: tag every usable sentence so the extractor accumulates
        # the complete feature space before a single batched prediction.
        sentences_data = []
        for data in sentences:
            if 'url' not in data:
                # warning() instead of the deprecated warn(); the original
                # message carried a '%d' placeholder with no argument, so the
                # row number was never actually logged.
                logger.warning('found a sentence with no URL, skipping it')
                continue

            entities = dict(enumerate(e['chunk'] for e in data.get('linked_entities', [])))
            tagged = self.extractor.process_sentence(
                data['text'], data['lu'], entities, add_unknown=False, gazetteer=self.gazetteer
            )

            data['tagged'] = tagged
            sentences_data.append(data)

        features, _ = self.extractor.get_features(refit=False)
        y = self.model.predict(features)

        # y holds one predicted role per sampled token, across all sentences;
        # token_offset walks through it in the same order the tokens were fed.
        token_offset = 0
        role_label_to_index = self.extractor.label_index
        # items() instead of the Python-2-only iteritems(); works on 2.7 too
        role_index_to_label = {v: k for k, v in self.extractor.label_index.items()}

        for data in sentences_data:
            fes = []
            chunk_to_entity = {entity['chunk']: entity for entity in data.get('linked_entities', [])}
            for chunk, is_sample in data['tagged']:
                # non-sample chunks were not fed to the classifier, so they
                # consume no slot of y
                if not is_sample:
                    continue

                predicted_role = y[token_offset]
                # 'O' is the null role (chunk belongs to no frame element)
                if predicted_role != role_label_to_index['O']:
                    label = role_index_to_label[predicted_role]
                    logger.debug('chunk "%s" classified as "%s"', chunk, label)
                    fe = {
                        'chunk': chunk,
                        'fe': label,
                    }
                    # attach the linked entity when the chunk matches one
                    if chunk in chunk_to_entity:
                        fe['link'] = chunk_to_entity[chunk]

                    fes.append(fe)

                token_offset += 1

            logger.debug('found %d FEs in sentence "%s"', len(fes), data['text'])
            if fes:
                classified = {
                    'lu': data['lu'],
                    'name': data['name'],
                    'url': data['url'],
                    'text': data['text'],
                    'linked_entities': data.get('linked_entities', []),
                    'fes': fes,
                }

                # language-specific post-processing rules get the final word
                final = apply_custom_classification_rules(classified, self.language)
                yield final

        # every prediction must have been consumed, one per sampled token
        assert token_offset == len(y), 'processed %d tokens, classified %d' % (token_offset, len(y))