示例#1
0
    def generate_wrappers(self, url):
        """Train and persist wrappers for each example set obtained for *url*.

        Example sets are fetched through an ExampleGateway configured from
        this object's limits. For every set a WrapperTrainer is built with
        rulers chosen by the set name, the resulting wrappers are pruned with
        ``self._prune_wrappers`` and stored through a WrapperGateway.

        Any exception raised while training, pruning or persisting one set is
        logged and the loop continues with the next set.

        :param url: passed to the example lookup and used as the key under
            which the generated wrappers are persisted.
        :returns: None.
        """
        wrapper_manager = WrapperGateway()
        example_manager = ExampleGateway(
            max_examples=self.max_examples,
            max_examples_from_db=self.max_examples_from_db,
            seconds_between_requests=self.secs_between_reqs)
        example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                    url, self.min_validity)

        # `set_name` rather than `set`, so the builtin is not shadowed.
        for set_name in example_sets:
            log.info('Starting wrapper training for set "%s"' %
                     set_name)  #@UndefinedVariable

            if set_name in ('author', 'editor'):
                # These two sets get the multi-value and person rulers.
                rulers = [
                    MultiValuePathRuler(),
                    SeparatorsRegexRuler(),
                    ElementsRegexRuler(),
                    PersonRuler()
                ]
            else:
                # Sets without a configured value guide fall back to a
                # match-anything pattern.
                try:
                    value_guide = self.value_guides[set_name]
                except KeyError:
                    value_guide = '.*'
                rulers = [PathRuler(value_guide), RegexRuler()]

            trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
            try:
                wrappers = trainer.train(example_sets[set_name])
                wrappers = self._prune_wrappers(wrappers)
                wrapper_manager.persist_wrappers(url, set_name, wrappers)
                log.info('Trainer generated %d wrappers' %
                         len(wrappers))  #@UndefinedVariable
            except Exception as e:
                # Best effort: a failure in one set must not abort the others.
                log.error('Error training wrapper for set "%s": %s' %
                          (set_name, e))  #@UndefinedVariable
示例#2
0
 def setUp(self):
     """Create the RegexRuler under test, then run the parent setUp."""
     self.ruler = RegexRuler()
     super(TestRegexRuler, self).setUp()
示例#3
0
class TestRegexRuler(object):#(TestRuler):
    """Test cases for RegexRuler rule generation and pattern merging.

    The class header shows ``TestRuler`` commented out in favour of
    ``object``. With ``object`` as the base, the parent ``setUp`` and the
    assertion helpers used below are not available.
    NOTE(review): presumably TestRuler is a unittest.TestCase subclass that
    provides them -- confirm before re-enabling these tests.
    """

    def setUp(self):
        """Create the RegexRuler under test, then run the parent setUp."""
        self.ruler = RegexRuler()
        super(TestRegexRuler, self).setUp()

    def test_rule_example(self):
        """A single example yields an escaped pattern capturing the value."""
        example = Example('2007', 'Volume 31, Number 7 / July, 2007')
        rules = self.ruler._rule_example(example)
        expected = r'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)'
        self.assertEqual(rules[0].pattern, expected)

    def test_should_merge(self):
        """Unrelated patterns are not merged; similarly shaped ones are."""
        rule01 = Rule(r'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)')
        rule02 = Rule(r'Wednesday\,\ November\ 03\,\ (.*)')
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        rule01 = Rule(u'\\ Volume\\ 70\\ \\,\\&nbsp\\;\\ '
                      'Issue\\ 16\\-18\\ \\&nbsp\\;\\(October\\ (.*)\\)')
        rule02 = Rule(u'\\ Volume\\ 22\\ \\,\\&nbsp\\;\\ '
                      'Issue\\ 21\\-23\\ \\&nbsp\\;\\(January\\ (.*)\\)')
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, True)

    def test_merge_patterns(self):
        """Differing fragments of two patterns become ``(?:.*)`` groups."""
        general = r'aa3aaxxxx\(\)'
        pattern = r'aa1aaxxxx\(\)'
        result = self.ruler._merge_patterns(general, pattern)
        expected = r'aa(?:.*)aaxxxx\(\)'
        self.assertEqual(result, expected)

        general = (u'\\ Volume\\ 70\\ \\,\\&nbsp\\;\\ '
                   'Issue\\ 16\\-18\\ \\&nbsp\\;\\(October\\ (.*)\\)')
        pattern = (u'\\ Volume\\ 22\\ \\,\\&nbsp\\;\\ '
                   'Issue\\ 21\\-23\\ \\&nbsp\\;\\(January\\ (.*)\\)')
        result = self.ruler._merge_patterns(general, pattern)
        expected = (u'\\ Volume\\ (?:.*)\\ \\,\\&nbsp\\;\\ '
                    'Issue\\ (?:.*)\\-(?:.*)\\ \\&nb'
                    'sp\\;\\((?:.*)\\ (.*)\\)')
        self.assertEqual(result, expected)

        # Same `general` as above, merged with a different specific pattern.
        pattern = (u'\\ Volume\\ 22\\ \\,\\&nbsp\\;\\ '
                   'Issue\\ 22\\-23\\ \\&nbsp\\;\\(May\\ (.*)\\)')
        result = self.ruler._merge_patterns(general, pattern)
        expected = (u'\\ Volume\\ (?:.*)\\ \\,\\&nbsp\\;\\ '
                    'Issue\\ (?:.*)\\-(?:.*)\\ \\&nbsp\\;'
                    '\\((?:.*)\\ (.*)\\)')
        self.assertEqual(result, expected)

        # Smoke check only: no expected value is asserted for this merge,
        # the call just has to complete without raising.
        general = r'(.*)\ \/\ \(2007\)'
        pattern = r'(.*)\ \(2010\)'
        self.ruler._merge_patterns(general, pattern)

    def test_rule(self):
        """Ruling several examples yields the expected merged patterns."""
        example01 = Example(u'2007', u' Volume 22 ,  '
                    'Issue 22-23  (May 2007)')
        example02 = Example(u'2009', u' Volume 11 ,  '
                    'Issue 16-25  (May 2009)')
        example03 = Example(u'2008', u' Year of publication: 2008')

        results = self.ruler.rule([example01, example02])
        self.assertEqual(results[0].pattern,
                         u'Volume\\ (?:.*)\\ \\,\\&nbsp'
                         '\\;\\ Issue\\ (?:.*)\\-2(?:.*)\\ \\&nbsp\\;\\(May\\ '
                         '(.*)\\)')

        results = self.ruler.rule([example01, example02, example03])
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0].pattern,
                         u'Year\\ of\\ publication\\:'
                         '\\&nbsp\\;(.*)')

    def test_apply_heuristics(self):
        """Heuristics over the difflib matching blocks return three items."""
        sm = difflib.SequenceMatcher(None, 'The 3rd House', 'The 35th house')
        result = self.ruler._apply_heuristics('The 35th house',
                                              sm.get_matching_blocks())
        self.assertEqual(len(result), 3)
 def setUp(self):
     """Create the RegexRuler under test, then run the parent setUp."""
     self.ruler = RegexRuler()
     super(TestRegexRuler, self).setUp()
class TestRegexRuler(object):  #(TestRuler):
    """Test cases for RegexRuler rule generation and pattern merging.

    The class header shows ``TestRuler`` commented out in favour of
    ``object``. With ``object`` as the base, the parent ``setUp`` and the
    assertion helpers used below are not available.
    NOTE(review): presumably TestRuler is a unittest.TestCase subclass that
    provides them -- confirm before re-enabling these tests.
    """

    def setUp(self):
        """Create the RegexRuler under test, then run the parent setUp."""
        self.ruler = RegexRuler()
        super(TestRegexRuler, self).setUp()

    def test_rule_example(self):
        """A single example yields an escaped pattern capturing the value."""
        example = Example('2007', 'Volume 31, Number 7 / July, 2007')
        rules = self.ruler._rule_example(example)
        expected = r'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)'
        self.assertEqual(rules[0].pattern, expected)

    def test_should_merge(self):
        """Unrelated patterns are not merged; similarly shaped ones are."""
        rule01 = Rule(r'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)')
        rule02 = Rule(r'Wednesday\,\ November\ 03\,\ (.*)')
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, False)

        rule01 = Rule(u'\\ Volume\\ 70\\ \\,\\&nbsp\\;\\ '
                      'Issue\\ 16\\-18\\ \\&nbsp\\;\\(October\\ (.*)\\)')
        rule02 = Rule(u'\\ Volume\\ 22\\ \\,\\&nbsp\\;\\ '
                      'Issue\\ 21\\-23\\ \\&nbsp\\;\\(January\\ (.*)\\)')
        should_merge = self.ruler._should_merge(rule01, rule02)
        self.assertEqual(should_merge, True)

    def test_merge_patterns(self):
        """Differing fragments of two patterns become ``(?:.*)`` groups."""
        general = r'aa3aaxxxx\(\)'
        pattern = r'aa1aaxxxx\(\)'
        result = self.ruler._merge_patterns(general, pattern)
        expected = r'aa(?:.*)aaxxxx\(\)'
        self.assertEqual(result, expected)

        general = (u'\\ Volume\\ 70\\ \\,\\&nbsp\\;\\ '
                   'Issue\\ 16\\-18\\ \\&nbsp\\;\\(October\\ (.*)\\)')
        pattern = (u'\\ Volume\\ 22\\ \\,\\&nbsp\\;\\ '
                   'Issue\\ 21\\-23\\ \\&nbsp\\;\\(January\\ (.*)\\)')
        result = self.ruler._merge_patterns(general, pattern)
        expected = (u'\\ Volume\\ (?:.*)\\ \\,\\&nbsp\\;\\ '
                    'Issue\\ (?:.*)\\-(?:.*)\\ \\&nb'
                    'sp\\;\\((?:.*)\\ (.*)\\)')
        self.assertEqual(result, expected)

        # Same `general` as above, merged with a different specific pattern.
        pattern = (u'\\ Volume\\ 22\\ \\,\\&nbsp\\;\\ '
                   'Issue\\ 22\\-23\\ \\&nbsp\\;\\(May\\ (.*)\\)')
        result = self.ruler._merge_patterns(general, pattern)
        expected = (u'\\ Volume\\ (?:.*)\\ \\,\\&nbsp\\;\\ '
                    'Issue\\ (?:.*)\\-(?:.*)\\ \\&nbsp\\;'
                    '\\((?:.*)\\ (.*)\\)')
        self.assertEqual(result, expected)

        # Smoke check only: no expected value is asserted for this merge,
        # the call just has to complete without raising.
        general = r'(.*)\ \/\ \(2007\)'
        pattern = r'(.*)\ \(2010\)'
        self.ruler._merge_patterns(general, pattern)

    def test_rule(self):
        """Ruling several examples yields the expected merged patterns."""
        example01 = Example(
            u'2007', u' Volume 22 ,  '
            'Issue 22-23  (May 2007)')
        example02 = Example(
            u'2009', u' Volume 11 ,  '
            'Issue 16-25  (May 2009)')
        example03 = Example(u'2008', u' Year of publication: 2008')

        results = self.ruler.rule([example01, example02])
        self.assertEqual(
            results[0].pattern,
            u'Volume\\ (?:.*)\\ \\,\\&nbsp'
            '\\;\\ Issue\\ (?:.*)\\-2(?:.*)\\ \\&nbsp\\;\\(May\\ '
            '(.*)\\)')

        results = self.ruler.rule([example01, example02, example03])
        self.assertEqual(len(results), 2)
        self.assertEqual(
            results[0].pattern,
            u'Year\\ of\\ publication\\:'
            '\\&nbsp\\;(.*)')

    def test_apply_heuristics(self):
        """Heuristics over the difflib matching blocks return three items."""
        sm = difflib.SequenceMatcher(None, 'The 3rd House', 'The 35th house')
        result = self.ruler._apply_heuristics('The 35th house',
                                              sm.get_matching_blocks())
        self.assertEqual(len(result), 3)