def collect_sentences(self, names):
    """Collect positive and negative training sentences from articles.

    For every (subject, value) pair, the subject's article is scanned:
    a sentence is a *positive* candidate if its lemmas contain the
    searched value (and, for administrative-division predicates, also a
    word of the predicate itself); only the best-matching sentence per
    article is kept.  All sentences not mentioning the value become
    *negative* examples.

    :param names: iterable of (subject, value) pairs.
    :returns: (positive, negative) where positive holds
        (sentence, prepared_value) tuples and negative holds sentences.
    :raises AssertionError: if fewer than 11 positive examples are found.
    """
    positive, negative = [], []
    # Administrative-division predicates: value alone is too ambiguous,
    # so we additionally require a predicate word in the sentence.
    types = [
        'hrabstwo', 'gmina', 'prowincja', quote_plus('województwo'),
        'powiat', 'region',
    ]
    # 'value' (was 'object' — shadowed the builtin) is the searched value.
    for subject, value in names:
        try:
            article = get_article(subject)
        except Exception:  # article missing or unparsable — skip entity
            continue
        value = lt.prepare_value(value, self.predicate)
        # Track the sentence sharing the most lemmas with the value.
        best_match = (0, '')
        for sentence in article:
            lemmas = [word.lemma for word in sentence]
            if any(v in lemmas for v in value):
                if self.predicate not in types or any(
                        p in lemmas for p in self.predicate_words):
                    num_matches = len(set(lemmas) & set(value))
                    if num_matches > best_match[0]:
                        best_match = (num_matches, (sentence, value))
            else:
                negative.append(sentence)
        if best_match[0]:
            positive.append(best_match[1])
    # NOTE: stripped under `python -O`; kept as assert so callers that
    # catch AssertionError keep working.
    assert len(positive) > 10, 'Too little training examples.'
    return positive, negative
def extract_sentences(self, entities): articles = prepare_articles(entities) extracted_sentences = defaultdict(list) if verbose: print 'Classifying sentences:' for entity in entities: try: article = get_article(entity) except: continue if not article: continue if verbose: print entity probabilities = [ prob[1] for prob in self.classifier.predict_proba( map(self.get_features, article)) ] #for each article return all sentences with scores > confidence_level for sentence, p in izip(article, probabilities): if p > self.confidence_level: extracted_sentences[entity].append(sentence) if verbose: print '***', '%.2f' % p, ' '.join( [w.segment for w in sentence]) elif verbose: print '%.2f' % p, ' '.join([w.segment for w in sentence]) if verbose: print return extracted_sentences
def collect_sentences(self, names):
    """Collect positive and negative training sentences from articles.

    For every (subject, value) pair, the subject's article is scanned:
    a sentence is a *positive* candidate if its lemmas contain the
    searched value (and, for administrative-division predicates, also a
    word of the predicate itself); only the best-matching sentence per
    article is kept.  All sentences not mentioning the value become
    *negative* examples.

    :param names: iterable of (subject, value) pairs.
    :returns: (positive, negative) where positive holds
        (sentence, prepared_value) tuples and negative holds sentences.
    :raises AssertionError: if fewer than 11 positive examples are found.
    """
    positive, negative = [], []
    # Administrative-division predicates: value alone is too ambiguous,
    # so we additionally require a predicate word in the sentence.
    types = ["hrabstwo", "gmina", "prowincja", quote_plus("województwo"), "powiat", "region"]
    # 'value' (was 'object' — shadowed the builtin) is the searched value.
    for subject, value in names:
        try:
            article = get_article(subject)
        except Exception:  # article missing or unparsable — skip entity
            continue
        value = lt.prepare_value(value, self.predicate)
        # Track the sentence sharing the most lemmas with the value.
        best_match = (0, "")
        for sentence in article:
            lemmas = [word.lemma for word in sentence]
            if any(v in lemmas for v in value):
                if self.predicate not in types or any(p in lemmas for p in self.predicate_words):
                    num_matches = len(set(lemmas) & set(value))
                    if num_matches > best_match[0]:
                        best_match = (num_matches, (sentence, value))
            else:
                negative.append(sentence)
        if best_match[0]:
            positive.append(best_match[1])
    # NOTE: stripped under `python -O`; kept as assert so callers that
    # catch AssertionError keep working.
    assert len(positive) > 10, "Too little training examples."
    return positive, negative
def extract_sentences(self, entities): articles = prepare_articles(entities) extracted_sentences = defaultdict(list) if verbose: print "Classifying sentences:" for entity in entities: try: article = get_article(entity) except: continue if not article: continue if verbose: print entity probabilities = [prob[1] for prob in self.classifier.predict_proba(map(self.get_features, article))] # for each article return all sentences with scores > confidence_level for sentence, p in izip(article, probabilities): if p > self.confidence_level: extracted_sentences[entity].append(sentence) if verbose: print "***", "%.2f" % p, " ".join([w.segment for w in sentence]) elif verbose: print "%.2f" % p, " ".join([w.segment for w in sentence]) if verbose: print return extracted_sentences
# NOTE(review): this fragment starts mid-statement — the bare `raise`
# below is the tail of an except clause whose header is not visible here.
raise
# Output files for one predicate's test set (entities, expected values,
# and the raw article text).
entities_f = open(tests_path + '%s/entities' % predicate, 'w')
values_f = open(tests_path + '%s/values' % predicate, 'w')
articles_f = open(tests_path + '%s/articles' % predicate, 'w')
# Restrict sampled entities by type when the predicate demands it.
if predicate in type_restrictions:
    names = select_entities_of_type_in_relation(
        type_restrictions[predicate], predicate
    )
else:
    names = select_all({'p': predicate})
# Random sample of at most test_data_limit (subject, value) pairs.
shuffle(names)
names = names[: test_data_limit]
subjects, objects = zip(*list(names))
# A subject may have several values for the same predicate — group them.
values = defaultdict(list)
for subject, value in names:
    values[subject].append(value)
prepare_articles(subjects)
for subject, value in values.iteritems():
    try:
        article = get_article(subject)
    except:
        # Article missing or unparsable — skip this subject.
        continue
    # Header line: subject plus the (first) expected value, normalized.
    print >>articles_f, subject, lt.prepare_value(value[0], predicate)
    for sentence in article:
        sentence = [word.segment for word in sentence]
        print >>articles_f, ' '.join(sentence)
    # Blank line separates articles.
    print >>articles_f
    print >>entities_f, subject
    # Values file uses underscores for spaces; UTF-8 encoded (Python 2).
    print >>values_f, subject, value[0].replace(' ', '_').encode('utf-8')
# NOTE(review): the matching `try:` for this except clause is not visible
# in this fragment — presumably it wraps a makedirs-style call.
except OSError as e:
    # Ignore "already exists"; re-raise any other OS error.
    if e.errno != errno.EEXIST:
        raise
# Output files for one predicate's test set (entities, expected values,
# and the raw article text).
entities_f = open(tests_path + '%s/entities' % predicate, 'w')
values_f = open(tests_path + '%s/values' % predicate, 'w')
articles_f = open(tests_path + '%s/articles' % predicate, 'w')
# Restrict sampled entities by type when the predicate demands it.
if predicate in type_restrictions:
    names = select_entities_of_type_in_relation(
        type_restrictions[predicate], predicate)
else:
    names = select_all({'p': predicate})
# Random sample of at most test_data_limit (subject, value) pairs.
shuffle(names)
names = names[:test_data_limit]
subjects, objects = zip(*list(names))
# A subject may have several values for the same predicate — group them.
values = defaultdict(list)
for subject, value in names:
    values[subject].append(value)
prepare_articles(subjects)
for subject, value in values.iteritems():
    try:
        article = get_article(subject)
    except:
        # Article missing or unparsable — skip this subject.
        continue
    # Header line: subject plus the (first) expected value, normalized.
    print >> articles_f, subject, lt.prepare_value(value[0], predicate)
    for sentence in article:
        sentence = [word.segment for word in sentence]
        print >> articles_f, ' '.join(sentence)
    # Blank line separates articles.
    print >> articles_f
    print >> entities_f, subject
    # Values file uses underscores for spaces; UTF-8 encoded (Python 2).
    print >> values_f, subject, value[0].replace(' ', '_').encode('utf-8')