Example #1
def train(self):
    # collect (entity, value) pairs that hold for this predicate
    if self.predicate in type_restrictions:
        names = select_entities_of_type_in_relation(
            type_restrictions[self.predicate], self.predicate)
    else:
        names = select_all({'p': self.predicate})
    # keep at most one pair per distinct value, so that frequent values
    # do not dominate the training data
    if len(names) > training_limit and self.predicate not in [
            quote_plus('województwo')
    ]:
        new_names = []
        values_added = set()
        for e, v in names:
            if v not in values_added:
                values_added.add(v)
                new_names.append((e, v))
        names = new_names
    names = names[:training_limit]
    if verbose:
        print '%d articles processed during training.' % len(names)
    if evaluation_mode:
        # make sure that entities used in evaluation are not used in training
        from evaluator import get_test_data
        names = filter(
            lambda (entity, _): entity not in get_test_data(self.predicate)[0],
            names)
    # prepare articles about the subjects
    prepare_articles(zip(*names)[0])
    positive, negative = self.collect_sentences(names)
    positive = positive[:self.sentence_limit]
    if verbose:
        print 'Sentences selected for training (%d total):' % len(positive)
        for s, v in positive:
            print ' '.join(v), ' ', ' '.join([w.lemma for w in s])
        print
    self.extractor_training_data = positive[:]
    positive = map(lambda (s, v): s, positive)
    # reduce the number of negative examples to the number of positive
    # examples to avoid unbalanced data
    shuffle(negative)
    negative = negative[:len(positive)]
    sentences = positive + negative
    classes = [True] * len(positive) + [False] * len(negative)
    vocabulary = self.collect_features(positive)
    if verbose:
        print 'Words considered as features:'
        print vocabulary
        print
    # binary bag-of-words features classified with a linear SVM
    self.classifier = Pipeline([
        ('v', CountVectorizer(analyzer=lambda x: x,
                              vocabulary=vocabulary,
                              binary=True)),
        ('c', SVC(kernel='linear', probability=True)),
    ])
    self.classifier.fit(map(self.get_features, sentences), classes)
    self.get_most_informative_features()
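The classifier above is a standard scikit-learn pipeline: CountVectorizer is given a callable analyzer (lambda x: x), so each input must already be a list of tokens, and binary=True records only word presence rather than counts. A minimal self-contained sketch of the same setup, written for Python 3, with invented sentences, labels, and vocabulary:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

# toy pre-tokenized sentences and labels, invented for illustration
sentences = [
    ['stolica', 'kraju'],
    ['stolica', 'miasta'],
    ['miasto', 'stolica'],
    ['rzeka', 'plynie'],
    ['nad', 'rzeka'],
    ['plynie', 'nad'],
]
classes = [True, True, True, False, False, False]
vocabulary = ['stolica', 'kraju', 'miasta', 'miasto', 'rzeka', 'plynie', 'nad']

clf = Pipeline([
    # analyzer=lambda x: x skips tokenization: inputs are already token lists
    ('v', CountVectorizer(analyzer=lambda x: x, vocabulary=vocabulary,
                          binary=True)),
    ('c', SVC(kernel='linear', probability=True)),
])
clf.fit(sentences, classes)
# one probability column per class, ordered as in clf.classes_
print(clf.predict_proba([['stolica', 'nad']]))

Because probability=True is set, predict_proba is available on the fitted pipeline; its column order follows clf.classes_.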
Example #2
import errno
import os
from collections import defaultdict
from random import shuffle

lt = LanguageToolsFactory.get_language_tools()
predicate = ''
test_data_limit = 100
# create the output directory for this predicate if it does not exist yet
try:
    os.makedirs(tests_path + '%s' % predicate)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
entities_f = open(tests_path + '%s/entities' % predicate, 'w')
values_f = open(tests_path + '%s/values' % predicate, 'w')
articles_f = open(tests_path + '%s/articles' % predicate, 'w')
if predicate in type_restrictions:
    names = select_entities_of_type_in_relation(
        type_restrictions[predicate], predicate)
else:
    names = select_all({'p': predicate})
# draw a random sample of entity-value pairs for testing
shuffle(names)
names = names[:test_data_limit]
subjects, objects = zip(*list(names))
# group all values belonging to the same subject
values = defaultdict(list)
for subject, value in names:
    values[subject].append(value)
prepare_articles(subjects)
for subject, value in values.iteritems():
    try:
        article = get_article(subject)
    except:
        # skip subjects whose article cannot be retrieved
        continue
    print >> articles_f, subject, lt.prepare_value(value[0], predicate)
    for sentence in article:
        # work with the surface forms (segments) of the words
        sentence = [word.segment for word in sentence]
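Each line written to the articles file above pairs a subject with the prepared value of its first object, separated by a space. A small sketch of reading such a file back under that assumption (read_articles_file is a hypothetical helper; the file layout beyond what the print statement shows is not confirmed by this snippet):

from collections import defaultdict

def read_articles_file(path):
    # hypothetical helper: parse lines of the form "<subject> <prepared value>"
    expected = defaultdict(list)
    with open(path) as f:
        for line in f:
            subject, _, value = line.partition(' ')
            expected[subject].append(value.strip())
    return expected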