def generate_wrappers(self, url):
    """Train, prune and persist wrappers for every example set of ``url``.

    Examples are fetched through an ``ExampleGateway`` configured from this
    object's limits. One ``WrapperTrainer`` is then run per example set and
    its pruned output is stored through a ``WrapperGateway``.

    Args:
        url: Passed to ``ExampleGateway.get_examples`` and to
            ``WrapperGateway.persist_wrappers``.

    Any exception raised while training, pruning or persisting one set is
    logged and the loop moves on to the next set.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)

    # Named ``set_name`` rather than ``set`` so the builtin is not shadowed.
    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        if set_name in ('author', 'editor'):
            # These two sets get the multi-value / person ruler chain.
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Fall back to a match-anything guide when none is configured.
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # ``as`` form is valid on Python 2.6+ and Python 3 alike.
            log.error('Error training wrapper for set "%s": %s' % (set_name, e)) #@UndefinedVariable
 def setUp(self):
     """Build the trainer under test and the example fixtures."""
     self.wt = WrapperTrainer([MockRuler()], num_examples=3)
     self.nsets = 5

     mock_manager = MockExampleManager()
     self.example_sets = mock_manager._get_examples(self.nsets, 5, 10)

     # Three examples: ('v01', 'c01'), ('v02', 'c02'), ('v03', 'c03').
     self.example_set = [Example('v%02d' % idx, 'c%02d' % idx)
                         for idx in (1, 2, 3)]
 def setUp(self):
     """Build the trainer under test and the example fixtures."""
     self.wt = WrapperTrainer([MockRuler()], num_examples=3)
     self.nsets = 5

     mock_manager = MockExampleManager()
     self.example_sets = mock_manager._get_examples(self.nsets, 5, 10)

     # Three examples: ('v01', 'c01'), ('v02', 'c02'), ('v03', 'c03').
     self.example_set = [Example('v%02d' % idx, 'c%02d' % idx)
                         for idx in (1, 2, 3)]
示例#4
0
    def generate_wrappers(self, url):
        """Train, prune and persist wrappers for every example set of ``url``.

        Examples are fetched through an ``ExampleGateway`` configured from
        this object's limits. One ``WrapperTrainer`` is then run per example
        set and its pruned output is stored through a ``WrapperGateway``.

        Args:
            url: Passed to ``ExampleGateway.get_examples`` and to
                ``WrapperGateway.persist_wrappers``.

        Any exception raised while training, pruning or persisting one set
        is logged and the loop moves on to the next set.
        """
        wrapper_manager = WrapperGateway()
        example_manager = ExampleGateway(
            max_examples=self.max_examples,
            max_examples_from_db=self.max_examples_from_db,
            seconds_between_requests=self.secs_between_reqs)
        example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                    url, self.min_validity)

        # Named ``set_name`` rather than ``set`` so the builtin is not
        # shadowed.
        for set_name in example_sets:
            log.info('Starting wrapper training for set "%s"' %
                     set_name)  #@UndefinedVariable

            if set_name in ('author', 'editor'):
                # These two sets get the multi-value / person ruler chain.
                rulers = [
                    MultiValuePathRuler(),
                    SeparatorsRegexRuler(),
                    ElementsRegexRuler(),
                    PersonRuler()
                ]
            else:
                # Fall back to a match-anything guide when none is configured.
                try:
                    value_guide = self.value_guides[set_name]
                except KeyError:
                    value_guide = '.*'
                rulers = [PathRuler(value_guide), RegexRuler()]

            trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
            try:
                wrappers = trainer.train(example_sets[set_name])
                wrappers = self._prune_wrappers(wrappers)
                wrapper_manager.persist_wrappers(url, set_name, wrappers)
                log.info('Trainer generated %d wrappers' %
                         len(wrappers))  #@UndefinedVariable
            except Exception as e:
                # ``as`` form is valid on Python 2.6+ and Python 3 alike.
                log.error('Error training wrapper for set "%s": %s' %
                          (set_name, e))  #@UndefinedVariable
class TestWrapperTrainer(unittest.TestCase):
    """Unit tests for ``WrapperTrainer`` using mock rulers and rules.

    Assertions use ``assertTrue``/``assertEqual`` rather than the deprecated
    ``failUnless`` alias, which no longer exists on recent Python versions.
    """

    def setUp(self):
        """Create a trainer with one mock ruler plus shared example data."""
        self.wt = WrapperTrainer([MockRuler()], num_examples=3)
        self.nsets = 5
        self.example_sets = MockExampleManager()._get_examples(
            self.nsets, 5, 10)
        self.example_set = [
            Example('v01', 'c01'),
            Example('v02', 'c02'),
            Example('v03', 'c03'),
        ]

    def test_train_too_few_examples(self):
        """``train`` returns an empty result when 20 examples are required."""
        self.wt.set_num_examples(20)
        # NOTE(review): this passes the whole ``example_sets`` object, while
        # ``test_train`` passes a single set (``['a']``) -- confirm intended.
        result = self.wt.train(self.example_sets)
        self.assertEqual(len(result), 0)

    def test_train(self):
        """Two mock rulers yield four wrappers with two rules each."""
        self.wt.rulers = [MockRuler(), MockRuler()]
        wrappers = self.wt.train(self.example_sets['a'])
        self.assertEqual(len(wrappers), 4)
        self.assertEqual(len(wrappers[0].rules), 2)
        self.assertTrue(wrappers[0].downvotes > 0)

    def test_get_new_example_set(self):
        """Applying ``MockRule('xx')`` keeps three examples, content rewritten."""
        example_set = self.wt._get_new_example_set(MockRule('xx'),
                                                   self.example_set)
        self.assertEqual(len(example_set), 3)
        self.assertEqual(example_set[0].content, 'c01_rxx')

    def test_get_rule_sets(self):
        """Two mock rulers produce four rule sets, each of length two."""
        rules = self.wt._get_rule_sets([MockRuler(), MockRuler()],
                                       self.example_set)
        self.assertEqual(len(rules), 4)
        # ``all`` over a generator works whether ``map`` is a list (py2) or
        # an iterator (py3); the original relied on ``list.count``.
        self.assertTrue(all(len(rule_set) == 2 for rule_set in rules))

    def test_evaluate_single_value_wrapper(self):
        """'In Germany' vs 'Germany' evaluates True; swapped evaluates False."""
        info = 'In Germany'
        value = 'Germany'
        result = self.wt._evaluate_single_value_wrapper(info, value)
        self.assertEqual(result, True)

        info = 'Germany'
        value = 'In Germany'
        result = self.wt._evaluate_single_value_wrapper(info, value)
        self.assertEqual(result, False)

    def test_evaluate_multi_value_wrapper(self):
        """Three names against three patterns is True; one bad name is False."""
        info_list = ['Alberto Del Angel', 'Pierre Geurts', 'Damien Ernst']
        values = [
            '(Alberto.*Angel|Angel.*Alberto)',
            '(Geurts.*Pierre|Pierre.*Geurts)',
            '(Damien.*Ernst|Ernst.*Damien)',
        ]
        result = self.wt._evaluate_multi_value_wrapper(info_list, values)
        self.assertEqual(result, True)

        info_list = ['Alberto Del Ange']
        result = self.wt._evaluate_multi_value_wrapper(info_list, values)
        self.assertEqual(result, False)

    def test_evaluate_wrapper(self):
        """Expects two upvotes and one downvote for the three-example set."""
        self.example_set = [
            Example('c01_r22_r33', 'c01'),
            Example('c02_r22_r33', 'c02'),
            Example('incorrect_value', 'c03'),
        ]
        wrapper = Wrapper(rules=[MockRule(22), MockRule(33)])
        self.wt._evaluate_wrapper(wrapper, self.example_set)
        self.assertEqual(wrapper.upvotes, 2)
        self.assertEqual(wrapper.downvotes, 1)
class TestWrapperTrainer(unittest.TestCase):
    """Unit tests for ``WrapperTrainer`` using mock rulers and rules.

    Assertions use ``assertTrue``/``assertEqual`` rather than the deprecated
    ``failUnless`` alias, which no longer exists on recent Python versions.
    """

    def setUp(self):
        """Create a trainer with one mock ruler plus shared example data."""
        self.wt = WrapperTrainer([MockRuler()], num_examples=3)
        self.nsets = 5
        self.example_sets = MockExampleManager()._get_examples(self.nsets,
                                                               5, 10)
        self.example_set = [Example('v01', 'c01'),
                            Example('v02', 'c02'),
                            Example('v03', 'c03')]

    def test_train_too_few_examples(self):
        """``train`` returns an empty result when 20 examples are required."""
        self.wt.set_num_examples(20)
        # NOTE(review): this passes the whole ``example_sets`` object, while
        # ``test_train`` passes a single set (``['a']``) -- confirm intended.
        result = self.wt.train(self.example_sets)
        self.assertEqual(len(result), 0)

    def test_train(self):
        """Two mock rulers yield four wrappers with two rules each."""
        self.wt.rulers = [MockRuler(), MockRuler()]
        wrappers = self.wt.train(self.example_sets['a'])
        self.assertEqual(len(wrappers), 4)
        self.assertEqual(len(wrappers[0].rules), 2)
        self.assertTrue(wrappers[0].downvotes > 0)

    def test_get_new_example_set(self):
        """Applying ``MockRule('xx')`` keeps three examples, content rewritten."""
        example_set = self.wt._get_new_example_set(MockRule('xx'),
                                                   self.example_set)
        self.assertEqual(len(example_set), 3)
        self.assertEqual(example_set[0].content, 'c01_rxx')

    def test_get_rule_sets(self):
        """Two mock rulers produce four rule sets, each of length two."""
        rules = self.wt._get_rule_sets([MockRuler(), MockRuler()],
                                       self.example_set)
        self.assertEqual(len(rules), 4)
        # ``all`` over a generator works whether ``map`` is a list (py2) or
        # an iterator (py3); the original relied on ``list.count``.
        self.assertTrue(all(len(rule_set) == 2 for rule_set in rules))

    def test_evaluate_single_value_wrapper(self):
        """'In Germany' vs 'Germany' evaluates True; swapped evaluates False."""
        info = 'In Germany'
        value = 'Germany'
        result = self.wt._evaluate_single_value_wrapper(info, value)
        self.assertEqual(result, True)

        info = 'Germany'
        value = 'In Germany'
        result = self.wt._evaluate_single_value_wrapper(info, value)
        self.assertEqual(result, False)

    def test_evaluate_multi_value_wrapper(self):
        """Three names against three patterns is True; one bad name is False."""
        info_list = ['Alberto Del Angel',
                     'Pierre Geurts',
                     'Damien Ernst']
        values = ['(Alberto.*Angel|Angel.*Alberto)',
                  '(Geurts.*Pierre|Pierre.*Geurts)',
                  '(Damien.*Ernst|Ernst.*Damien)']
        result = self.wt._evaluate_multi_value_wrapper(info_list, values)
        self.assertEqual(result, True)

        info_list = ['Alberto Del Ange']
        result = self.wt._evaluate_multi_value_wrapper(info_list, values)
        self.assertEqual(result, False)

    def test_evaluate_wrapper(self):
        """Expects two upvotes and one downvote for the three-example set."""
        self.example_set = [Example('c01_r22_r33', 'c01'),
                            Example('c02_r22_r33', 'c02'),
                            Example('incorrect_value', 'c03')]
        wrapper = Wrapper(rules=[MockRule(22), MockRule(33)])
        self.wt._evaluate_wrapper(wrapper, self.example_set)
        self.assertEqual(wrapper.upvotes, 2)
        self.assertEqual(wrapper.downvotes, 1)