def extract_features(self, sentence, i, window_size=3):
    """Build the feature dictionary for the word at position ``i``.

    Features are collected for every word in the window
    ``[i - window_size, i + window_size]`` that falls inside ``sentence``.
    Each feature key is the relative offset followed by the feature name
    (e.g. ``'-1lemma'``, ``'0tag'``); positions outside the sentence
    contribute no keys.

    Args:
        sentence: Indexable sequence of word objects exposing ``lemma``,
            ``segment`` and ``tag`` (and ``parse`` when the module-level
            ``use_parser`` flag is set). ``lemma`` and ``segment`` are
            decoded as UTF-8 here, so they are presumably Python 2 byte
            strings -- confirm against the caller.
        i: Index of the word the features are centred on.
        window_size: Number of neighbouring words considered on each side.

    Returns:
        dict mapping ``'<offset><feature name>'`` to the feature value.
        Boolean features are encoded as the strings ``'0'`` / ``'1'``.
    """
    def recent_year(word):
        # True only for strings that parse as an integer in 1990..2012.
        try:
            return 1990 <= int(word) <= 2012
        except ValueError:
            return False

    def flag(condition):
        # Encode a boolean the way the feature file expects: '0' or '1'.
        return str(int(condition))

    features = {}
    for j in xrange(-window_size, window_size + 1):
        if not 0 <= i + j < len(sentence):
            continue
        word = sentence[i + j]
        lemma = word.lemma
        if is_numeric(lemma):
            # Normalise numeric lemmas to a plain integer string.
            lemma = str(int(atof(lemma)))
        segment = word.segment
        tag = word.tag
        # Decode once; character-level tests must see characters, not bytes.
        lemma_text = lemma.decode('utf-8')
        segment_text = segment.decode('utf-8')
        word_features = {
            'segment': segment,
            'tag': lt.get_tag(tag),
            'case': lt.get_case(tag),
            'number': lt.get_number(tag),
            'gender': lt.get_gender(tag),
            'person': lt.get_person(tag),
            'aspect': lt.get_aspect(tag),
            'lemma': lemma,
            'recent_year': flag(recent_year(lemma)),
            'alldigits': flag(lemma.isdigit()),
            'allalpha': flag(lemma_text.isalpha()),
            # [:1] instead of [0]: an empty lemma/segment yields '0'
            # rather than raising IndexError; non-empty input is unaffected.
            'starts_with_capital': flag(lemma_text[:1].isupper()),
            'segm_starts_with_capital': flag(segment_text[:1].isupper()),
            'numeric': flag(is_numeric(lemma)),
            # Length of the (byte string) lemma, kept as in the original.
            'len': str(len(lemma)),
        }
        if use_parser:
            word_features['parse'] = word.parse
        if use_wordnet:
            word_features['synset'] = get_hypernym(word)
        for name, feature in word_features.iteritems():
            features['%d%s' % (j, name)] = feature
    return features
def extract_values(self, extracted_sentences, confidence_level=.8):
    """Tag sentences with the crfsuite model and pick one value per entity.

    All sentences are written to the features file, tagged by running
    ``crfsuite tag -i -m <model> <features>`` and the tagger output is
    read back. Consecutive words labelled ``'1'`` are joined with ``'_'``
    into a candidate value whose probability is the minimum probability
    of its words. Candidates with probability above ``confidence_level``
    are ranked by decreasing probability and one value is chosen per
    entity.

    Side effects: sets the process-wide ``LC_NUMERIC`` locale to
    ``pl_PL.UTF-8`` (presumably so ``atof`` parses Polish-formatted
    numbers -- confirm) and, unless the module-level ``save_to_cache``
    flag is set, removes the model file afterwards.

    Args:
        extracted_sentences: Mapping from entity name to a list of
            sentences; each sentence is a sequence of word objects with a
            ``lemma`` attribute. It is iterated twice and must yield the
            same order both times.
        confidence_level: Candidates whose probability is not strictly
            greater than this threshold are discarded.

    Returns:
        dict mapping entity name to the selected value (a string).
        Entities for which no acceptable value was found are absent.

    Raises:
        OSError: If the ``crfsuite`` executable cannot be started.
        IndexError: If the tagger produced fewer tag sequences than there
            are sentences (e.g. because it failed).
    """
    def capitalize_first(text):
        # Upper-case the first character of a UTF-8 encoded byte string.
        decoded = text.decode('utf-8')
        return (decoded[0].upper() + decoded[1:]).encode('utf-8')

    setlocale(LC_NUMERIC, 'pl_PL.UTF-8')
    all_sentences = [
        sentence
        for _, entity_sentences in extracted_sentences.iteritems()
        for sentence in entity_sentences
    ]
    self.save_features_to_file(self.features_tag_filename, all_sentences)

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to crfsuite verbatim.
    command = [
        'crfsuite', 'tag', '-i', '-m',
        models_cache_path % self.model_filename,
        models_cache_path % self.features_tag_filename,
    ]
    tagger = Popen(command, stdout=PIPE, stdin=PIPE, stderr=PIPE)
    out, _ = tagger.communicate()

    # One output line per word: the first character is taken as the label
    # and the text from index 2 onwards as its probability (presumably
    # "<label>:<probability>" -- confirm against crfsuite's -i format).
    # An empty line ends the tag sequence of a sentence.
    tags_list = []
    tags = []
    for line in out.split('\n')[:-1]:
        if not line:
            tags_list.append(tags)
            tags = []
        else:
            tags.append((line[0], float(line[2:])))

    extracted_values = {}
    sentence_index = 0
    for entity, entity_sentences in extracted_sentences.iteritems():
        values = []
        for sentence in entity_sentences:
            tags = tags_list[sentence_index]
            sentence_index += 1
            value = []
            value_prob = 1
            # Automatically join hyphenated words: a hyphen always counts
            # as part of a value and never lowers its probability.
            for j, word in enumerate(sentence):
                if word.lemma == '-':
                    tags[j] = ('1', 1)
            # The trailing sentinel (label '0') flushes a value that ends
            # at the last word of the sentence.
            sentinel = [('', ('0', 1))]
            for word, (tag, prob) in list(zip(sentence, tags)) + sentinel:
                if tag == '1':
                    if word.lemma == '-' and not value:
                        # A value never starts with a hyphen.
                        continue
                    value.append(word.lemma)
                    value_prob = min(value_prob, prob)
                elif value:
                    if value[-1] == '-':
                        value.pop()
                    if not value:
                        continue
                    v = '_'.join(value).replace('_-_', '-')
                    value = []
                    # Skip values that start like the entity itself.
                    # Gmina can have the same name as its main city (in
                    # fact, it very often does), so it is exempt.
                    if (v.decode('utf-8')[:4] != entity.decode('utf-8')[:4]
                            or self.predicate in ['gmina']):
                        values.append((v, value_prob))
                    value_prob = 1

        # Keep confident candidates only, sorted by decreasing probability
        # (stable, so ties keep their order of appearance).
        values = [(v, prob) for v, prob in values if prob > confidence_level]
        values.sort(key=lambda pair: -pair[1])
        # Numeric values are normalised to plain integer strings.
        values = [
            (str(int(atof(v))) if is_numeric(v) else v, prob)
            for v, prob in values
        ]
        # Multi-word values get their first letter upper-cased.
        values = [
            (capitalize_first(v) if '_' in v else v, prob)
            for v, prob in values
        ]
        if verbose:
            print('%s %s' % (entity, values))
        values = [v for v, _ in values]
        if not values:
            continue
        if self.predicate in numeric_predicates:
            extracted_values[entity] = values[0]
            continue

        # To increase precision of extraction (at the cost of recall) in
        # textual relations, only values that are geographic entities in
        # DBPedia are returned.
        values_identified_as_entities = [
            v for v in values if lt.is_entity(v)
        ]
        values_identified_as_entities_of_right_type = [
            v for v in values_identified_as_entities
            if any(
                entities_types.index(t) in lt.entities[v]
                for t in self.predominant_types
            )
        ]
        if verbose:
            print('%s %s %s' % (
                ' '.join(values),
                ' '.join(values_identified_as_entities),
                ' '.join(values_identified_as_entities_of_right_type),
            ))
        if values_identified_as_entities_of_right_type:
            extracted_values[entity] = \
                values_identified_as_entities_of_right_type[0]
        elif values_identified_as_entities:
            extracted_values[entity] = values_identified_as_entities[0]
        elif self.predicate in ['gmina', 'powiat', quote_plus('województwo'), 'hrabstwo', 'stan']:
            # For these predicates the best candidate is accepted even
            # when it is not a known entity.
            extracted_values[entity] = values[0]

    if not save_to_cache:
        try:
            os.remove(models_cache_path % self.model_filename)
        except (IOError, OSError):
            # Best-effort cleanup. os.remove reports failures (such as a
            # missing file) as OSError, which `except IOError` alone does
            # not catch on Python 2.
            pass
    return extracted_values