Example #1
def train(self):
    if self.predicate in type_restrictions:
        names = select_entities_of_type_in_relation(
            type_restrictions[self.predicate], self.predicate)
    else:
        names = select_all({'p': self.predicate})
    if len(names) > training_limit and self.predicate not in [
            quote_plus('województwo')]:
        new_names = []
        values_added = set()
        for e, v in names:
            if v not in values_added:
                values_added.add(v)
                new_names.append((e, v))
        names = new_names
    names = names[:training_limit]
    if verbose:
        print '%d articles processed during training.' % len(names)
    if evaluation_mode:
        # make sure that entities that will be used in evaluation are not used in training
        from evaluator import get_test_data
        names = filter(
            lambda (entity, _): entity not in get_test_data(self.predicate)[0],
            names)
    # prepare articles about subjects
    prepare_articles(zip(*names)[0])
    positive, negative = self.collect_sentences(names)
    positive = positive[:self.sentence_limit]
    if verbose:
        print 'Sentences selected for training (%d total):' % len(positive)
        for s, v in positive:
            print ' '.join(v), ' ', ' '.join([w.lemma for w in s])
        print
    self.extractor_training_data = positive[:]
    positive = map(lambda (s, v): s, positive)
    # reduce the number of negative examples to the number of positive examples to avoid unbalanced data
    shuffle(negative)
    negative = negative[:len(positive)]
    sentences = positive + negative
    classes = [True] * len(positive) + [False] * len(negative)
    vocabulary = self.collect_features(positive)
    if verbose:
        print 'Words considered as features:'
        print vocabulary
        print
    self.classifier = Pipeline([
        ('v', CountVectorizer(analyzer=lambda x: x,
                              vocabulary=vocabulary,
                              binary=True)),
        ('c', SVC(kernel='linear', probability=True)),
    ])
    self.classifier.fit(map(self.get_features, sentences), classes)
    self.get_most_informative_features()
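
The core of the training step is a scikit-learn Pipeline that feeds pre-tokenized sentences through a binary bag-of-words CountVectorizer (the identity analyzer skips re-tokenization) into a linear SVM with probability estimates. A minimal stand-alone sketch of the same construction; the toy sentences, labels, and vocabulary below are made up for illustration:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# each training example is already a list of tokens, so the identity
# analyzer tells CountVectorizer to use the list as-is
sentences = [
    ['born', 'in', 'warsaw'], ['born', 'in', 'krakow'],
    ['a', 'city', 'in', 'poland'], ['the', 'capital', 'of', 'poland'],
    ['likes', 'music'], ['plays', 'football'],
    ['a', 'footballer'], ['a', 'musician'],
]
classes = [True, True, True, True, False, False, False, False]
vocabulary = ['born', 'capital', 'city', 'football', 'footballer',
              'in', 'likes', 'music', 'musician', 'plays', 'poland', 'warsaw']

classifier = Pipeline([
    ('v', CountVectorizer(analyzer=lambda x: x, vocabulary=vocabulary, binary=True)),
    ('c', SVC(kernel='linear', probability=True)),
])
classifier.fit(sentences, classes)
# positive-class probability for a new tokenized sentence
print classifier.predict_proba([['born', 'in', 'poland']])[0][1]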
Example #2
    def train(self):
        if self.predicate in type_restrictions:
            names = select_entities_of_type_in_relation(type_restrictions[self.predicate], self.predicate)
        else:
            names = select_all({"p": self.predicate})
        if len(names) > training_limit and self.predicate not in [quote_plus("województwo")]:
            new_names = []
            values_added = set()
            for e, v in names:
                if v not in values_added:
                    values_added.add(v)
                    new_names.append((e, v))
            names = new_names
        names = names[:training_limit]
        if verbose:
            print "%d articles processed during training." % len(names)
        if evaluation_mode:
            # make sure that entities that will be used in evaluation are not used in training
            from evaluator import get_test_data

            names = filter(lambda (entity, _): entity not in get_test_data(self.predicate)[0], names)
        # prepare articles about subjects
        prepare_articles(zip(*names)[0])
        positive, negative = self.collect_sentences(names)
        positive = positive[: self.sentence_limit]
        if verbose:
            print "Sentences selected for training (%d total):" % len(positive)
            for s, v in positive:
                print " ".join(v), " ", " ".join([w.lemma for w in s])
            print
        self.extractor_training_data = positive[:]
        positive = map(lambda (s, v): s, positive)
        # decreases number of negative examples to the number of positive examples to avoid unbalanced data
        shuffle(negative)
        negative = negative[: len(positive)]
        sentences = positive + negative
        classes = [True] * len(positive) + [False] * len(negative)
        vocabulary = self.collect_features(positive)
        if verbose:
            print "Words considered as features:"
            print vocabulary
            print
        self.classifier = Pipeline(
            [
                ("v", CountVectorizer(analyzer=lambda x: x, vocabulary=vocabulary, binary=True)),
                ("c", SVC(kernel="linear", probability=True)),
            ]
        )
        self.classifier.fit(map(self.get_features, sentences), classes)
        self.get_most_informative_features()
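
When there are more candidate pairs than training_limit, both versions keep only the first (entity, value) pair per distinct value, so no single value dominates the training set. A stand-alone sketch of that filtering step; the helper name and toy data are illustrative, not part of the original:

def unique_by_value(pairs):
    # keep the first (entity, value) pair for each distinct value
    seen = set()
    unique = []
    for entity, value in pairs:
        if value not in seen:
            seen.add(value)
            unique.append((entity, value))
    return unique

print unique_by_value([('e1', 'v1'), ('e2', 'v1'), ('e3', 'v2')])
# -> [('e1', 'v1'), ('e3', 'v2')]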
Example #3
def extract_sentences(self, entities):
    articles = prepare_articles(entities)
    extracted_sentences = defaultdict(list)
    if verbose:
        print 'Classifying sentences:'
    for entity in entities:
        try:
            article = get_article(entity)
        except:
            continue
        if not article:
            continue
        if verbose:
            print entity
        probabilities = [
            prob[1] for prob in self.classifier.predict_proba(
                map(self.get_features, article))
        ]
        # for each article return all sentences with scores > confidence_level
        for sentence, p in izip(article, probabilities):
            if p > self.confidence_level:
                extracted_sentences[entity].append(sentence)
                if verbose:
                    print '***', '%.2f' % p, ' '.join(
                        [w.segment for w in sentence])
            elif verbose:
                print '%.2f' % p, ' '.join([w.segment for w in sentence])
        if verbose:
            print
    return extracted_sentences
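
predict_proba returns one [P(False), P(True)] row per input, so prob[1] is the positive-class score; a sentence is kept only when that score clears confidence_level. A stand-alone sketch of the thresholding step, with made-up probabilities and sentences standing in for real classifier output:

from itertools import izip

confidence_level = 0.5
sentences = [['born', 'in', 'warsaw'], ['likes', 'music'], ['capital', 'of', 'poland']]
probabilities = [0.91, 0.12, 0.67]
for sentence, p in izip(sentences, probabilities):
    if p > confidence_level:
        # accepted: marked with *** as in the example above
        print '***', '%.2f' % p, ' '.join(sentence)
    else:
        print '%.2f' % p, ' '.join(sentence)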
Example #4
 def extract_sentences(self, entities):
     articles = prepare_articles(entities)
     extracted_sentences = defaultdict(list)
     if verbose:
         print "Classifying sentences:"
     for entity in entities:
         try:
             article = get_article(entity)
         except:
             continue
         if not article:
             continue
         if verbose:
             print entity
         probabilities = [prob[1] for prob in self.classifier.predict_proba(map(self.get_features, article))]
         # for each article return all sentences with scores > confidence_level
         for sentence, p in izip(article, probabilities):
             if p > self.confidence_level:
                 extracted_sentences[entity].append(sentence)
                 if verbose:
                     print "***", "%.2f" % p, " ".join([w.segment for w in sentence])
             elif verbose:
                 print "%.2f" % p, " ".join([w.segment for w in sentence])
         if verbose:
             print
     return extracted_sentences
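
Both versions accumulate accepted sentences per entity in a defaultdict(list), which avoids explicit key-existence checks when an entity yields several sentences. A stand-alone illustration with toy data:

from collections import defaultdict

extracted_sentences = defaultdict(list)
pairs = [('Warszawa', ['stolica', 'Polski']),
         ('Warszawa', ['duze', 'miasto']),
         ('Krakow', ['dawna', 'stolica'])]
for entity, sentence in pairs:
    # missing keys are created as empty lists on first access
    extracted_sentences[entity].append(sentence)
for entity, sentences in extracted_sentences.iteritems():
    print entity, len(sentences)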
Example #5

def generate_test_data(predicate):
    # NOTE: the source excerpt starts mid-function; the function name above and
    # the directory-creation try block below are reconstructed (assumed) from
    # the dangling `raise` and the errno.EEXIST check that survive in the excerpt.
    try:
        os.makedirs(tests_path + predicate)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    entities_f = open(tests_path + '%s/entities' % predicate, 'w')
    values_f = open(tests_path + '%s/values' % predicate, 'w')
    articles_f = open(tests_path + '%s/articles' % predicate, 'w')
    if predicate in type_restrictions:
        names = select_entities_of_type_in_relation(
            type_restrictions[predicate], predicate)
    else:
        names = select_all({'p': predicate})
    shuffle(names)
    names = names[:test_data_limit]
    subjects, objects = zip(*list(names))
    values = defaultdict(list)
    for subject, value in names:
        values[subject].append(value)
    prepare_articles(subjects)
    for subject, value in values.iteritems():
        try:
            article = get_article(subject)
        except:
            continue
        print >> articles_f, subject, lt.prepare_value(value[0], predicate)
        for sentence in article:
            sentence = [word.segment for word in sentence]
            print >> articles_f, ' '.join(sentence)
        print >> articles_f
        print >> entities_f, subject
        print >> values_f, subject, value[0].replace(' ', '_').encode('utf-8')

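
The reconstructed opening of Example #5 uses the classic Python 2 "create the directory unless it already exists" idiom: attempt os.makedirs and re-raise anything other than errno.EEXIST. A stand-alone version, with an illustrative path:

import errno
import os

def ensure_dir(path):
    # create path (and intermediate directories); tolerate it already existing
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

ensure_dir('tests/some_predicate')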