Exemplo n.º 1
0
 def generate_labels(self, verbose=0):
     """Train a classifier on the training split and label the test split.

     The data comes from ``self.generate_test_train()``. The classifier is
     an ``SKClassifier`` wrapping ``LogisticRegression()``, configured with
     ``self.label_function`` and ``self.feature_functions``.

     Args:
         verbose: 0 is silent; >= 1 prints progress messages; >= 2 also
             prints how often each label occurs in the training data.

     Returns:
         A ``(pred, test_data)`` tuple: the result of ``clf.classify`` on
         the test extents (presumably keyed by the extent key strings built
         below -- confirm against ``SKClassifier.classify``) and the test
         extents themselves.
     """
     def extent_key(extent):
         # Key format: "<basename>,<begin of first lex>,<end of last lex>".
         return "{a},{b},{c}".format(a=extent.basename,
                                     b=extent.lex[0].begin,
                                     c=extent.lex[-1].end)

     # load data
     train_data, test_data = self.generate_test_train()
     labels = [self.label_function(item) for item in train_data]

     # reporting
     # NOTE: single-argument print(...) emits the same text whether print is
     # a statement (Python 2) or a function.
     if verbose >= 1:
         print("data loaded")
     if verbose >= 2:
         # Counts start at 0 so the printed figure is the true number of
         # occurrences of each label (a default of 1 over-counts by one).
         label_counts = {}
         for label in labels:
             label_counts[label] = label_counts.get(label, 0) + 1
         print(label_counts)

     # train model
     clf = SKClassifier(LogisticRegression(),
                        self.label_function,
                        self.feature_functions)
     clf.add_labels(set(labels))
     clf.train(train_data)

     # reporting
     if verbose >= 1:
         print("model trained")

     # classify
     test_keys = [extent_key(extent) for extent in test_data]
     pred = clf.classify(test_data, keys=test_keys)
     return pred, test_data
Exemplo n.º 2
0
 def run_demo(self, verbose=0):
     """Load data, train a classifier, classify test data and evaluate it.

     Training extents are read from ``self.train_path``. If
     ``self.test_path`` is set, all of them are used for training and the
     test extents are read from that path; otherwise the training extents
     are split at ``int(len(extents) * self.split)`` into a training prefix
     and a test suffix. Predictions are evaluated against the extents read
     from ``self.gold_path`` when it is set, and against the test extents
     themselves otherwise.

     Args:
         verbose: 0 is silent; >= 1 prints progress messages; >= 2 also
             prints how often each label occurs among the loaded extents.

     Returns:
         None. The evaluation is delegated to ``clf.evaluate``.
     """
     def extent_key(extent):
         # Key format: "<basename>,<begin of first lex>,<end of last lex>".
         # The same key is used for predictions and for gold labels so the
         # two can be matched up by ``clf.evaluate``.
         return "{a},{b},{c}".format(a=extent.basename,
                                     b=extent.lex[0].begin,
                                     c=extent.lex[-1].end)

     # load training data
     train_corpus = Corpus(self.train_path)
     extents = list(train_corpus.extents(self.indices_function,
                                         self.extent_class))

     # load test data
     if self.test_path:
         train_data = extents
         test_corpus = HypotheticalCorpus(self.test_path)
         test_data = list(test_corpus.extents(self.indices_function,
                                              self.extent_class))
     else:
         split_at = int(len(extents) * self.split)
         train_data = extents[:split_at]
         test_data = extents[split_at:]

     # verbosity functionality
     # NOTE: single-argument print(...) emits the same text whether print is
     # a statement (Python 2) or a function.
     if verbose >= 1:
         print("data loaded")
     # Labels are collected from every extent of the training corpus, which
     # includes the held-out suffix when the data is split above.
     labels = [self.label_function(extent) for extent in extents]
     if verbose >= 2:
         # Counts start at 0 so the printed figure is the true number of
         # occurrences of each label (a default of 1 over-counts by one).
         label_counts = {}
         for label in labels:
             label_counts[label] = label_counts.get(label, 0) + 1
         print(label_counts)

     # train model
     clf = SKClassifier(LogisticRegression(),
                        self.label_function,
                        self.feature_functions)
     clf.add_labels(set(labels))
     clf.train(train_data)
     if verbose >= 1:
         print("model trained")

     # classify
     test_keys = [extent_key(extent) for extent in test_data]
     pred = clf.classify(test_data, keys=test_keys)

     # evaluate
     if self.gold_path:
         gold_corpus = Corpus(self.gold_path)
         gold_data = list(gold_corpus.extents(self.indices_function,
                                              self.extent_class))
     else:
         gold_data = test_data
     gold_labels = dict((extent_key(extent), self.label_function(extent))
                        for extent in gold_data)
     clf.evaluate(pred, gold_labels)