# Standard-library and scikit-learn imports used by the excerpts below.
# Project-specific helpers (select_all, select_entities_of_type_in_relation,
# prepare_articles, get_article, lt) and settings (verbose, evaluation_mode,
# training_limit, test_data_limit, type_restrictions, tests_path) are
# defined elsewhere in the project.
import errno
import os
from collections import defaultdict
from itertools import izip
from random import shuffle
from urllib import quote_plus

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


def train(self):
    # Collect (entity, value) pairs standing in the given relation,
    # honouring the type restriction defined for the predicate, if any.
    if self.predicate in type_restrictions:
        names = select_entities_of_type_in_relation(
            type_restrictions[self.predicate], self.predicate)
    else:
        names = select_all({'p': self.predicate})
    if len(names) > training_limit and self.predicate not in [
            quote_plus('województwo')]:
        # Keep only the first pair for each distinct value, so that a few
        # frequent values do not dominate the training data.
        new_names = []
        values_added = set()
        for e, v in names:
            if v not in values_added:
                values_added.add(v)
                new_names.append((e, v))
        names = new_names
    names = names[:training_limit]
    if verbose:
        print '%d articles processed during training.' % len(names)
    if evaluation_mode:
        # Make sure that entities that will be used in evaluation are not
        # used in training.
        from evaluator import get_test_data
        names = filter(
            lambda (entity, _): entity not in get_test_data(self.predicate)[0],
            names)
    # Prepare articles about the subjects.
    prepare_articles(zip(*names)[0])
    positive, negative = self.collect_sentences(names)
    positive = positive[:self.sentence_limit]
    if verbose:
        print 'Sentences selected for training (%d total):' % len(positive)
        for s, v in positive:
            print ' '.join(v), ' ', ' '.join([w.lemma for w in s])
        print
    self.extractor_training_data = positive[:]
    positive = map(lambda (s, v): s, positive)
    # Trim the negative examples to the number of positive examples to
    # avoid unbalanced data.
    shuffle(negative)
    negative = negative[:len(positive)]
    sentences = positive + negative
    classes = [True] * len(positive) + [False] * len(negative)
    vocabulary = self.collect_features(positive)
    if verbose:
        print 'Words considered as features:'
        print vocabulary
        print
    # Binary bag-of-words features over the selected vocabulary, fed to a
    # linear SVM with probability estimates enabled.
    self.classifier = Pipeline([
        ('v', CountVectorizer(analyzer=lambda x: x, vocabulary=vocabulary,
                              binary=True)),
        ('c', SVC(kernel='linear', probability=True)),
    ])
    self.classifier.fit(map(self.get_features, sentences), classes)
    self.get_most_informative_features()
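# A minimal, self-contained sketch of the classifier built in train() above:
# a binary bag-of-words CountVectorizer over a fixed vocabulary feeding a
# linear SVC with probability estimates. The vocabulary and the
# pre-tokenized sentences are made-up toy data, not values from this
# project; the snippet runs on its own, outside this class:
#
#     from sklearn.feature_extraction.text import CountVectorizer
#     from sklearn.pipeline import Pipeline
#     from sklearn.svm import SVC
#
#     vocabulary = ['born', 'capital', 'river']
#     sentences = [['born', 'in', 'warsaw'],
#                  ['she', 'was', 'born', 'here'],
#                  ['born', 'and', 'raised'],
#                  ['born', 'in', '1920'],
#                  ['the', 'capital', 'of', 'france'],
#                  ['a', 'long', 'river'],
#                  ['the', 'river', 'is', 'wide'],
#                  ['visit', 'the', 'capital']]
#     classes = [True] * 4 + [False] * 4
#
#     clf = Pipeline([
#         # analyzer=lambda x: x treats each input as an already-tokenized
#         # list instead of a raw string
#         ('v', CountVectorizer(analyzer=lambda x: x, vocabulary=vocabulary,
#                               binary=True)),
#         ('c', SVC(kernel='linear', probability=True)),
#     ])
#     clf.fit(sentences, classes)
#     print clf.predict_proba([['born', 'in', 'krakow']])  # [[P(False), P(True)]]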
def extract_sentences(self, entities):
    articles = prepare_articles(entities)
    extracted_sentences = defaultdict(list)
    if verbose:
        print 'Classifying sentences:'
    for entity in entities:
        try:
            article = get_article(entity)
        except:
            continue
        if not article:
            continue
        if verbose:
            print entity
        # predict_proba returns [P(False), P(True)] for each sentence; keep
        # the probability of the positive class.
        probabilities = [
            prob[1] for prob in self.classifier.predict_proba(
                map(self.get_features, article))
        ]
        # For each article, return all sentences with scores above the
        # confidence level.
        for sentence, p in izip(article, probabilities):
            if p > self.confidence_level:
                extracted_sentences[entity].append(sentence)
                if verbose:
                    print '***', '%.2f' % p, ' '.join(
                        [w.segment for w in sentence])
            elif verbose:
                print '%.2f' % p, ' '.join([w.segment for w in sentence])
        if verbose:
            print
    return extracted_sentences
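# Hypothetical end-to-end usage of the two methods above. The class name
# SentenceClassifier and its constructor arguments are assumptions for
# illustration; only train() and extract_sentences() come from this file:
#
#     classifier = SentenceClassifier(predicate=quote_plus('stolica'))
#     classifier.train()
#     extracted = classifier.extract_sentences(['Polska', 'Francja'])
#     for entity, sentences in extracted.iteritems():
#         print entity, len(sentences)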
try:
    # Create the per-predicate test directory; the makedirs target is
    # inferred from the errno.EEXIST check and the file paths below, as the
    # body of this try block is truncated in the excerpt.
    os.makedirs(tests_path + predicate)
except OSError as e:
    # Ignore "directory already exists"; re-raise anything else.
    if e.errno != errno.EEXIST:
        raise
entities_f = open(tests_path + '%s/entities' % predicate, 'w')
values_f = open(tests_path + '%s/values' % predicate, 'w')
articles_f = open(tests_path + '%s/articles' % predicate, 'w')
if predicate in type_restrictions:
    names = select_entities_of_type_in_relation(
        type_restrictions[predicate], predicate)
else:
    names = select_all({'p': predicate})
shuffle(names)
names = names[:test_data_limit]
subjects, objects = zip(*list(names))
# Group all values by subject; only the first value per subject is written
# out below.
values = defaultdict(list)
for subject, value in names:
    values[subject].append(value)
prepare_articles(subjects)
for subject, value in values.iteritems():
    try:
        article = get_article(subject)
    except:
        continue
    print >> articles_f, subject, lt.prepare_value(value[0], predicate)
    for sentence in article:
        sentence = [word.segment for word in sentence]
        print >> articles_f, ' '.join(sentence)
    print >> articles_f
    print >> entities_f, subject
    print >> values_f, subject, value[0].replace(' ', '_').encode('utf-8')
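# Layout of the test files written above, as inferred from the print
# statements (shown for reference only):
#
#   tests_path/<predicate>/entities  one test subject per line
#   tests_path/<predicate>/values    "<subject> <value>", with spaces in the
#                                    value replaced by underscores
#   tests_path/<predicate>/articles  "<subject> <prepared value>", then the
#                                    article's sentences one per line
#                                    (segments joined with spaces), then a
#                                    blank separator line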