def classify_sentences(self, sentences):
    """ Classify the given sentences

    This is a generator function: nothing is executed until the result is
    iterated. Every accepted sentence dict is modified in place (a `tagged`
    key is added to it).

    :param list sentences: sentences to be classified. Each one
     should be a dict with a `text`, a source `url` and some `linked_entities`.
     The `lu` and `name` keys are also read when building the output.
     Sentences without a `url` key are skipped with a warning.
    :return: Classified sentences with the recognized `fes`. Sentences for
     which no frame element was recognized are not yielded.
    :rtype: generator of dicts
    """
    self.extractor.start()

    # First pass: feed every usable sentence to the feature extractor.
    sentences_data = []
    for row, data in enumerate(sentences):
        if 'url' not in data:
            # `row` is the zero-based position of the sentence in the input;
            # it fills the %d placeholder, which previously had no argument
            # and was therefore logged verbatim.
            logger.warning('found a sentence with no URL (row number %d), skipping it', row)
            continue

        # Map a running index to the chunk of each linked entity.
        entities = dict(enumerate(e['chunk'] for e in data.get('linked_entities', [])))

        data['tagged'] = self.extractor.process_sentence(
            data['text'], entities, add_unknown=False, gazetteer=self.gazetteer
        )
        sentences_data.append(data)

    # Second pass: classify all the collected features in a single call.
    features, _ = self.extractor.get_features()
    y = self.model.predict(features)

    # `y` is consumed through a running offset that advances once for every
    # tagged entry of every accepted sentence, in the order they were processed.
    token_offset = 0
    # NOTE(review): `role_index.items` is subscripted by label below, so it is
    # presumably a label -> index mapping attribute -- confirm against the
    # extractor's role index class.
    role_label_to_index = self.extractor.role_index.items
    role_index_to_label = self.extractor.role_index.reverse_map()

    for data in sentences_data:
        fes = []
        for each in data['tagged']:
            chunk = each[0]
            predicted_role = y[token_offset]

            # Entries predicted with the 'O' role are not reported.
            if predicted_role != role_label_to_index['O']:
                label = role_index_to_label[predicted_role]
                logger.debug('chunk "%s" classified as "%s"', chunk, label)
                fes.append({
                    'chunk': chunk,
                    'fe': label,
                })

                # TODO
                # do not group entities into a single chunk, and after classification
                # check if the word is contained in an entity; if so, assign the label
                # to the whole entity

            token_offset += 1

        logger.debug('found %d FEs in sentence "%s"', len(fes), data['text'])
        if fes:
            classified = {
                'lu': data['lu'],
                'name': data['name'],
                'url': data['url'],
                'text': data['text'],
                'fes': fes,
                'linked_entities': data.get('linked_entities', []),
            }
            yield apply_custom_classification_rules(classified, self.language)
def label_sentence(self, sentence, normalize_numerical, score_type, core_weight):
    """ Labels a single sentence

    The tagged tokens of the sentence are scanned in order; the first token
    whose lemma is a known frame (with a matching part of speech) and for
    which at least one frame element is available determines the result.

    :param sentence: Sentence data to label. A dict with a `url`, a `text`
     and a `name`; `linked_entities` is optional and defaults to an empty
     list. If a `tagged` key is present it is used instead of running
     the tagger on the text.
    :param normalize_numerical: Automatically normalize numerical FEs
    :param score_type: Which type of score (if any) to use to compute the
     classification confidence
    :param core_weight: Weight of the core FEs (used in the scoring)
    :return: Labeled data, or None when the sentence has no url, has an
     empty text, or no frame could be assigned to it
    """
    logger.debug('processing sentence "%s"', sentence['text'])
    if not sentence.get('url'):
        logger.warning('a sentence is missing the url, skipping it')
        return None
    elif not sentence.get('text', '').strip():
        return None

    if 'tagged' in sentence:
        tagged = sentence['tagged']
    else:
        tagged = self.tagger.tag_one(sentence['text'])

    # Optional like in the sibling classification methods: a sentence with
    # no linked entities can still be labeled through its numerical FEs.
    linked_entities = sentence.get('linked_entities', [])

    # Normalize + annotate numerical FEs
    numerical_fes = []
    if normalize_numerical:
        numerical_fes.extend(normalize_numerical_fes(self.language, sentence['text']))

    for _token, pos, lemma in tagged:
        # Only lemmas that are known frames with a compatible POS qualify.
        if lemma not in self.frame_data or not pos.startswith(self.frame_data[lemma]['pos']):
            continue

        frame = self.frame_data[lemma]
        if not frame['ontology_to_fe']:
            logger.debug('missing FE types for frame %s, skipping', frame['frame'])
            continue

        logger.debug('trying frame %s with FE of types %s',
                     frame['frame'], frame['ontology_to_fe'].keys())

        assigned_fes = self.assign_frame_elements(linked_entities, frame)
        all_fes = numerical_fes + assigned_fes

        if assigned_fes or numerical_fes:
            logger.debug('assigning frame: %s and FEs %s', frame['frame'], all_fes)
            labeled = {
                'name': sentence['name'],
                'url': sentence['url'],
                'text': sentence['text'],
                'linked_entities': linked_entities,
                'frame': frame['frame'],
                'fes': all_fes,
                'lu': lemma,
            }
            # The first frame that yields some FEs wins.
            break
        else:
            logger.debug('no FEs assigned for frame %s, trying another one', frame['frame'])
    else:
        # Reached only when the loop ended without `break`: nothing assigned.
        logger.debug('did not assign any frame to sentence "%s"', sentence['text'])
        return None

    if score_type:
        labeled['score'] = scoring.compute_score(labeled, score_type, core_weight)

    # Internal sanity check on the record built above.
    assert 'lu' in labeled and labeled['fes']

    return apply_custom_classification_rules(labeled, self.language)
def classify_sentences(self, sentences):
    """ Classify the given sentences

    This is a generator function: nothing is executed until the result is
    iterated. Every accepted sentence dict is modified in place (a `tagged`
    key is added to it).

    :param list sentences: sentences to be classified. Each one
     should be a dict with a `text`, a source `url`, a `lu` and some
     `linked_entities`. The `name` key is also read when building the output.
     Sentences without a `url` key are skipped with a warning.
    :return: Classified sentences with the recognized `fes`. Each FE whose
     chunk matches the chunk of a linked entity also carries that entity
     under the `link` key. Sentences with no recognized FE are not yielded.
    :rtype: generator of dicts
    """
    self.extractor.start()

    # First pass: feed every usable sentence to the feature extractor.
    sentences_data = []
    for row, data in enumerate(sentences):
        if 'url' not in data:
            # `row` is the zero-based position of the sentence in the input;
            # it fills the %d placeholder, which previously had no argument
            # and was therefore logged verbatim.
            logger.warning('found a sentence with no URL (row number %d), skipping it', row)
            continue

        # Map a running index to the chunk of each linked entity.
        entities = dict(enumerate(e['chunk'] for e in data.get('linked_entities', [])))

        data['tagged'] = self.extractor.process_sentence(
            data['text'], data['lu'], entities, add_unknown=False, gazetteer=self.gazetteer
        )
        sentences_data.append(data)

    # Second pass: classify all the collected features in a single call.
    features, _ = self.extractor.get_features(refit=False)
    y = self.model.predict(features)

    # `y` is consumed through a running offset that advances once for every
    # tagged entry flagged as a sample, in the order they were processed.
    token_offset = 0
    role_label_to_index = self.extractor.label_index
    # `items()` instead of `iteritems()`: available on both Python 2 and 3.
    role_index_to_label = {index: label for label, index in role_label_to_index.items()}

    for data in sentences_data:
        fes = []
        chunk_to_entity = {entity['chunk']: entity
                           for entity in data.get('linked_entities', [])}

        for chunk, is_sample in data['tagged']:
            # Non-sample entries are skipped without consuming a prediction.
            if not is_sample:
                continue

            predicted_role = y[token_offset]

            # Entries predicted with the 'O' role are not reported.
            if predicted_role != role_label_to_index['O']:
                label = role_index_to_label[predicted_role]
                logger.debug('chunk "%s" classified as "%s"', chunk, label)
                fe = {
                    'chunk': chunk,
                    'fe': label,
                }
                if chunk in chunk_to_entity:
                    fe['link'] = chunk_to_entity[chunk]
                fes.append(fe)

            token_offset += 1

        logger.debug('found %d FEs in sentence "%s"', len(fes), data['text'])
        if fes:
            classified = {
                'lu': data['lu'],
                'name': data['name'],
                'url': data['url'],
                'text': data['text'],
                'linked_entities': data.get('linked_entities', []),
                'fes': fes,
            }
            yield apply_custom_classification_rules(classified, self.language)

    # Internal sanity check: every prediction must have been consumed.
    assert token_offset == len(y), 'processed %d tokens, classified %d' % (token_offset, len(y))