class WrapperManagerWizard(QtGui.QWizard):
    """
    Single-page wizard used to manage wrappers.

    The wizard keeps a ``WrapperGateway`` in ``wrapper_gw`` and a
    ``WrapperManagerPage`` in ``page01``. Both are re-created every time
    the wizard is shown.
    """

    def __init__(self):
        super(WrapperManagerWizard, self).__init__()
        self.initialize()

    def initialize(self):
        """
        Sets the wizard options, creates the gateway and adds the page.
        """
        self.setOption(QtGui.QWizard.NoCancelButton, True)
        self.setOption(QtGui.QWizard.NoBackButtonOnStartPage, True)

        self.wrapper_gw = WrapperGateway()

        wizard_title = 'Manage Wrappers'
        self.page01 = WrapperManagerPage(wizard_title, self)
        self.addPage(self.page01)

    def show(self):
        """
        Rebuilds the gateway and the page, restarts the wizard and shows it.
        """
        # Remove the page with id 0 (presumably the one added by the
        # previous initialize() call) so the wizard never holds two pages
        self.removePage(0)
        self.initialize()
        self.restart()
        super(WrapperManagerWizard, self).show()

    def done(self, status):
        """
        Asks the page to update its collection and wrapper, flushes the
        gateway and then closes the wizard with the result code `status`.
        """
        self.page01._update_collection()
        self.page01._update_wrapper()
        self.wrapper_gw.flush()
        # QDialog.done() is what closes the dialog and stores the result
        # code; an override that does not delegate leaves the wizard open
        super(WrapperManagerWizard, self).done(status)
示例#2
0
class WrapperManagerWizard(QtGui.QWizard):
    """
    Wizard with a single ``WrapperManagerPage`` for managing wrappers.

    ``wrapper_gw`` holds the ``WrapperGateway`` used by the wizard and
    ``page01`` holds the page. Both are rebuilt on every call to show().
    """

    def __init__(self):
        super(WrapperManagerWizard, self).__init__()
        self.initialize()

    def initialize(self):
        """
        Configures the wizard buttons, creates a new gateway and adds a
        freshly created page.
        """
        self.setOption(QtGui.QWizard.NoCancelButton, True)
        self.setOption(QtGui.QWizard.NoBackButtonOnStartPage, True)

        self.wrapper_gw = WrapperGateway()

        wizard_title = 'Manage Wrappers'
        self.page01 = WrapperManagerPage(wizard_title, self)
        self.addPage(self.page01)

    def show(self):
        """
        Replaces the current page and gateway with new ones, restarts the
        wizard and displays it.
        """
        # Page id 0 is presumably the page added by the last initialize();
        # it is removed first because initialize() adds a new one
        self.removePage(0)
        self.initialize()
        self.restart()
        super(WrapperManagerWizard, self).show()

    def done(self, status):
        """
        Updates the page's collection and wrapper, flushes the gateway and
        finally lets Qt close the wizard with the result code `status`.
        """
        self.page01._update_collection()
        self.page01._update_wrapper()
        self.wrapper_gw.flush()
        # Delegate to the base class: QDialog.done() closes the dialog and
        # sets its result code, which this override would otherwise skip
        super(WrapperManagerWizard, self).done(status)
 def generate_wrappers(self, url):
     """
     Trains wrappers for every example set obtained for `url` and
     persists them.

     Examples are requested from an ExampleGateway. Each set is trained
     on its own, with rulers chosen from the set name. An error raised
     while training, pruning or persisting one set is logged and the
     remaining sets are still processed.
     """
     wrapper_manager = WrapperGateway()
     example_manager = ExampleGateway(
         max_examples=self.max_examples,
         max_examples_from_db=self.max_examples_from_db,
         seconds_between_requests=self.secs_between_reqs)
     example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                 url, self.min_validity)

     for field in example_sets:
         log.info('Starting wrapper training for set "%s"' % field) #@UndefinedVariable

         if field in ('author', 'editor'):
             # These two sets get the multi-value and person rulers
             rulers = [MultiValuePathRuler(),
                       SeparatorsRegexRuler(),
                       ElementsRegexRuler(),
                       PersonRuler()]
         else:
             # Sets without a configured value guide accept any value
             try:
                 value_guide = self.value_guides[field]
             except KeyError:
                 value_guide = '.*'
             rulers = [PathRuler(value_guide), RegexRuler()]

         trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
         try:
             wrappers = trainer.train(example_sets[field])
             wrappers = self._prune_wrappers(wrappers)
             wrapper_manager.persist_wrappers(url, field, wrappers)
             log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
         except Exception as e:
             # Best effort: a failing set must not stop the other sets
             log.error('Error training wrapper for set "%s": %s' % (field, e)) #@UndefinedVariable
示例#4
0
    def initialize(self):
        """
        Sets the wizard options, creates the WrapperGateway and adds the
        'Manage Wrappers' page.
        """
        for option in (QtGui.QWizard.NoCancelButton,
                       QtGui.QWizard.NoBackButtonOnStartPage):
            self.setOption(option, True)

        self.wrapper_gw = WrapperGateway()

        # The page is kept in page01 so other methods can reach it
        self.page01 = WrapperManagerPage('Manage Wrappers', self)
        self.addPage(self.page01)
示例#5
0
    def _sort_results(self, results):
        """
        Sorts the results depending on the available wrappers.
        Returns a list with the results that have a wrapper available on top
        of it, and those with no wrapper are discarded. Results whose URL is
        black listed are discarded as well.
        The list is ordered depending on the quality of the wrappers. Results
        with the same priority keep the relative order they had in `results`.
        """

        # Create a list with all the available wrappers ordered by priority
        # Reference wrapper will be at the very beginning of the priority queue
        available_wrappers = list(ReferenceWrapper().get_available_wrappers())
        available_wrappers.extend(WrapperGateway().get_available_wrappers())

        # Map every base URL to its first position in the priority list, so
        # each lookup is a dictionary access instead of a list scan
        priorities = {}
        for position, wrapper_url in enumerate(available_wrappers):
            priorities.setdefault(wrapper_url, position)

        ranked = []
        for result in results:
            base_url = result.base_url
            if self._in_black_list(result.url):
                continue
            if base_url not in priorities:
                continue
            # TODO: Remove this conditional
            if base_url.startswith('http://citeseerx'):
                rank = len(results) + 5
            else:
                rank = priorities[base_url]
            ranked.append((rank, result))

        # Sort on the rank only. Comparing whole (rank, result) tuples would
        # fall back to comparing the result objects whenever two ranks tie.
        ranked.sort(key=lambda entry: entry[0])
        return [result for _, result in ranked]
 def initialize(self):
     """
     Configures the wizard options, creates a WrapperGateway and adds a
     new WrapperManagerPage titled 'Manage Wrappers'.
     """
     wizard_options = (QtGui.QWizard.NoCancelButton,
                       QtGui.QWizard.NoBackButtonOnStartPage)
     for wizard_option in wizard_options:
         self.setOption(wizard_option, True)

     self.wrapper_gw = WrapperGateway()

     page = WrapperManagerPage('Manage Wrappers', self)
     self.page01 = page
     self.addPage(page)
 def _use_rule_wrappers(self, source, page, raw_text):
     """
     Look if there is any wrapper in the database for the given source and
     use it to extract a reference from `page`.

     For every wrapper collection found for `source`, the wrappers are
     tried in the order returned by the gateway until one of them yields
     valid information. Wrappers are upvoted or downvoted depending on the
     validation of what they extract against `raw_text`.

     Returns a list with the extracted reference, or an empty list if no
     field could be set.
     """
     log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
     reference = Reference()
     wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
     wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)

     for collection in wrapper_field_collections:
         # Get the wrappers for the current collection
         url, field = collection.url, collection.field
         wrappers = wrapper_manager.get_wrappers(url, field)
         log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                         len(wrappers)))

         # Get field validator. Fields without one are always accepted
         try:
             validator = self.field_validation[field][1]
         except KeyError:
             validator = None

         # Only these fields are allowed to hold a list of values
         multi_valued = field in ('author', 'editor')

         # Extract information using the wrappers we have
         for wrapper in wrappers:
             info = wrapper.extract_info(page)
             # we expect 'info' to be a string
             if isinstance(info, list) and not multi_valued:
                 continue
             log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable

             valid = validator.validate(info, raw_text) if validator else True
             # Save the extracted info even if it's not correct. It will
             # be overwritten afterwards if necessary
             reference.set_field(field, info, valid)

             if valid:
                 log.debug('The extracted information is valid. ' #@UndefinedVariable
                           'Upvoting wrapper')
                 wrapper.upvotes += 1
             else:
                 log.debug('The extracted information is not valid. ' #@UndefinedVariable
                           'Downvoting wrapper.')
                 wrapper.downvotes += 1
             wrapper_manager.update_wrapper(wrapper)

             if valid:
                 # The first valid extraction wins for this field
                 break

     if len(reference.fields) > 0:
         log.info('Extracted reference')  #@UndefinedVariable
         return [reference]
     else:
         log.info('Could not extract reference using ruled wrappers')  #@UndefinedVariable
         return []
示例#8
0
    def generate_wrappers(self, url):
        """
        Trains wrappers for every example set obtained for `url` and
        persists them.

        Examples are requested from an ExampleGateway. Each set is trained
        on its own, with rulers chosen from the set name. An error raised
        while training, pruning or persisting one set is logged and the
        remaining sets are still processed.
        """
        wrapper_manager = WrapperGateway()
        example_manager = ExampleGateway(
            max_examples=self.max_examples,
            max_examples_from_db=self.max_examples_from_db,
            seconds_between_requests=self.secs_between_reqs)
        example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                    url, self.min_validity)

        for field in example_sets:
            log.info('Starting wrapper training for set "%s"' %
                     field)  #@UndefinedVariable

            if field in ('author', 'editor'):
                # These two sets get the multi-value and person rulers
                rulers = [
                    MultiValuePathRuler(),
                    SeparatorsRegexRuler(),
                    ElementsRegexRuler(),
                    PersonRuler()
                ]
            else:
                # Sets without a configured value guide accept any value
                try:
                    value_guide = self.value_guides[field]
                except KeyError:
                    value_guide = '.*'
                rulers = [PathRuler(value_guide), RegexRuler()]

            trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
            try:
                wrappers = trainer.train(example_sets[field])
                wrappers = self._prune_wrappers(wrappers)
                wrapper_manager.persist_wrappers(url, field, wrappers)
                log.info('Trainer generated %d wrappers' %
                         len(wrappers))  #@UndefinedVariable
            except Exception as e:
                # Best effort: a failing set must not stop the other sets
                log.error('Error training wrapper for set "%s": %s' %
                          (field, e))  #@UndefinedVariable
    def initialize(self):
        """
        Registers the default properties of the custom fields, sets the
        wizard options, creates the WrapperGateway and adds the two pages.
        """
        # Default property and change signal for each widget class name
        path_changed = QtCore.SIGNAL('pathChanged()')
        self.setDefaultProperty('FileChooser', 'path', path_changed)
        value_changed = QtCore.SIGNAL('valueChanged(int)')
        self.setDefaultProperty('QProgressBar', 'value', value_changed)

        for option in (QtGui.QWizard.NoCancelButton,
                       QtGui.QWizard.NoBackButtonOnStartPage,
                       QtGui.QWizard.NoBackButtonOnLastPage):
            self.setOption(option, True)

        self.wrapper_gw = WrapperGateway()

        self.page01 = URLChoosePage(self)
        self.page02 = ProgressPage(self)
        #self.page03 = FinishedPage(self)
        for page in (self.page01, self.page02):
            self.addPage(page)
class TestWrapperManager(unittest.TestCase):
    """
    Tests the lookup of wrapper collections and the persistence of wrappers
    through a WrapperGateway.

    Methods prefixed with ``xtest_`` do not match the default ``test``
    method prefix, so the default unittest loader does not run them.
    """

    def setUp(self):
        # Every test gets a gateway on a new 'sqlite:///:memory:' session
        self.wm = WrapperGateway(
            create_session(sql_uri='sqlite:///:memory:', debug=True))

    def test_find_collection(self):
        # Do not create
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertFalse(collection1)

        # New collection
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field', True)
        self.assertTrue(collection1)
        self.assertEqual(type(collection1), mappers.WrapperCollection)

        # Existent collection
        collection2 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertTrue(collection2)
        self.assertTrue(collection1 is collection2)

    def test_find_collections(self):
        # Create the four url/field combinations. The returned collections
        # are not needed afterwards
        for url in (u'c01', u'c02'):
            for field in (u'f01', u'f02'):
                self.wm.find_wrapper_collection(url, field, True)

        collections = self.wm.find_wrapper_collections()
        self.assertTrue(collections.count() >= 4)

        collections = self.wm.find_wrapper_collections(url=u'c02')
        self.assertEqual(collections.count(), 2)

        collections = self.wm.find_wrapper_collections(field=u'f02')
        self.assertEqual(collections.count(), 2)

    def test_get_unavailable_wrappers(self):
        wrappers = self.wm.get_wrappers(u'non_existent_url', u'no_field')
        self.assertEqual(wrappers, [])

    def xtest_persist_wrapper_with_incorrect_rules(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(MockRule02(33)))
        self.assertRaises(TypeError, self.wm.persist_wrapper, u'some_url',
                          u'some_field', wrapper)

    def xtest_persist_and_get_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(66))
        wrapper.add_rule(MockRule01(77))
        wrapper.add_rule(MockRule02([[2, 3, 4, 5], 4]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(11))
        wrapper.add_rule(MockRule01(22))
        wrapper.add_rule(MockRule01(33))
        self.wm.persist_wrapper(u'some_url', u'some_other_field', wrapper)

        # Get non-existent wrapper
        wrappers = self.wm.get_wrappers(u'some_url', u'non_existent_field')
        self.assertFalse(wrappers, 'Get non-existent wrapper')

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'some_url', u'some_field')
        self.assertEqual(len(wrappers), 2)
        wrappers = self.wm.get_wrappers(u'some_url', u'some_other_field')
        self.assertEqual(len(wrappers), 1)

    def test_persist_and_update_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'concrete_url', u'concrete_field', wrapper)

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)

        # Update wrapper
        wrapper = wrappers[0]
        wrapper.upvotes += 1
        wrapper.downvotes -= 1
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].upvotes, 1)
        self.assertEqual(wrappers[0].downvotes, -1)

        # Update wrapper rules
        wrapper = wrappers[0]
        wrapper.rules[0].pattern = 223
        wrapper.rules[2].pattern = [1, 6]
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].rules[0].pattern, 223)
        self.assertEqual(wrappers[0].rules[2].pattern, [1, 6])

        # Add another wrapper rule
        wrapper = wrappers[0]
        wrapper.rules.append(MockRule02([2, 3, 4, 5]))
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(len(wrappers[0].rules), 4)
        self.assertEqual(wrappers[0].rules[3].pattern, [2, 3, 4, 5])
 def setUp(self):
     """
     Creates the gateway under test on a session built with the
     'sqlite:///:memory:' URI and debug enabled.
     """
     session = create_session(sql_uri='sqlite:///:memory:', debug=True)
     self.wm = WrapperGateway(session)
    def _populate_db(self):
        """
        Creates five wrapper collections and eight wrappers through a
        WrapperGateway built on self.session, and flushes the session.
        """
        wg = WrapperGateway(self.session)

        # (field, url) of every collection, in creation order
        collection_specs = (
            (u"a", u"url01"),
            (u"b", u"url01"),
            (u"a", u"url02"),
            (u"b", u"url02"),
            (u"a", u"url03"),
        )
        collections = []
        for field, url in collection_specs:
            collection = wg.new_wrapper_collection()
            collection.field = field
            collection.url = url
            collections.append(collection)

        # (position of the owning collection, downvotes, upvotes, score) of
        # every wrapper, in creation order. The third collection gets none
        wrapper_specs = (
            (0, 0, 3, 1.0),
            (0, 0, 2, 1.0),
            (1, 1, 1, 0.5),
            (3, 0, 3, 1.0),
            (4, 0, 2, 1.0),
            (4, 1, 1, 0.8),
            (4, 0, 3, 1.0),
            (4, 1, 1, 0.2),
        )
        for owner, downvotes, upvotes, score in wrapper_specs:
            wrapper = wg.new_wrapper()
            wrapper.downvotes = downvotes
            wrapper.upvotes = upvotes
            wrapper.score = score
            collections[owner].wrappers.append(wrapper)

        self.session.flush()
class TestWrapperManager(unittest.TestCase):
    """
    Checks how a WrapperGateway finds wrapper collections and how it
    persists, retrieves and updates wrappers.

    The ``xtest_`` methods are not picked up by the default unittest
    loader because their names do not start with ``test``.
    """

    def setUp(self):
        # A new gateway on a 'sqlite:///:memory:' session for every test
        self.wm = WrapperGateway(create_session(
            sql_uri='sqlite:///:memory:', debug=True))

    def test_find_collection(self):
        # Do not create
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                     u'some_field')
        self.assertFalse(collection1)

        # New collection
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                     u'some_field', True)
        self.assertTrue(collection1)
        self.assertEqual(type(collection1), mappers.WrapperCollection)

        # Existent collection
        collection2 = self.wm.find_wrapper_collection(u'some_url',
                                                     u'some_field')
        self.assertTrue(collection2)
        self.assertTrue(collection1 is collection2)

    def test_find_collections(self):
        # Only the side effect of creating the four collections matters
        # here, so the returned objects are not kept
        for url in (u'c01', u'c02'):
            for field in (u'f01', u'f02'):
                self.wm.find_wrapper_collection(url, field, True)

        collections = self.wm.find_wrapper_collections()
        self.assertTrue(collections.count() >= 4)

        collections = self.wm.find_wrapper_collections(url=u'c02')
        self.assertEqual(collections.count(), 2)

        collections = self.wm.find_wrapper_collections(field=u'f02')
        self.assertEqual(collections.count(), 2)

    def test_get_unavailable_wrappers(self):
        wrappers = self.wm.get_wrappers(u'non_existent_url', u'no_field')
        self.assertEqual(wrappers, [])

    def xtest_persist_wrapper_with_incorrect_rules(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(MockRule02(33)))
        self.assertRaises(TypeError, self.wm.persist_wrapper,
                          u'some_url', u'some_field', wrapper)

    def xtest_persist_and_get_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(66))
        wrapper.add_rule(MockRule01(77))
        wrapper.add_rule(MockRule02([[2, 3, 4, 5], 4]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(11))
        wrapper.add_rule(MockRule01(22))
        wrapper.add_rule(MockRule01(33))
        self.wm.persist_wrapper(u'some_url', u'some_other_field', wrapper)

        # Get non-existent wrapper
        wrappers = self.wm.get_wrappers(u'some_url', u'non_existent_field')
        self.assertFalse(wrappers, 'Get non-existent wrapper')

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'some_url', u'some_field')
        self.assertEqual(len(wrappers), 2)
        wrappers = self.wm.get_wrappers(u'some_url', u'some_other_field')
        self.assertEqual(len(wrappers), 1)

    def test_persist_and_update_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'concrete_url', u'concrete_field', wrapper)

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)

        # Update wrapper
        wrapper = wrappers[0]
        wrapper.upvotes += 1
        wrapper.downvotes -= 1
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].upvotes, 1)
        self.assertEqual(wrappers[0].downvotes, -1)

        # Update wrapper rules
        wrapper = wrappers[0]
        wrapper.rules[0].pattern = 223
        wrapper.rules[2].pattern = [1, 6]
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].rules[0].pattern, 223)
        self.assertEqual(wrappers[0].rules[2].pattern, [1, 6])

        # Add another wrapper rule
        wrapper = wrappers[0]
        wrapper.rules.append(MockRule02([2, 3, 4, 5]))
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(len(wrappers[0].rules), 4)
        self.assertEqual(wrappers[0].rules[3].pattern, [2, 3, 4, 5])
 def setUp(self):
     """
     Creates a session with the 'sqlite:///:memory:' URI and a
     WrapperGateway that uses it. Both are kept on the test case.
     """
     session = create_session('sqlite:///:memory:', True)
     self.session = session
     self.wg = WrapperGateway(session=session)
class TestWrapperGateway(unittest.TestCase):
    """
    Tests WrapperGateway.get_available_wrappers() against a database that
    holds five wrapper collections and eight wrappers.
    """

    # (field, url) of every collection created by _populate_db, in order
    _COLLECTIONS = (
        (u'a', u'url01'),
        (u'b', u'url01'),
        (u'a', u'url02'),
        (u'b', u'url02'),
        (u'a', u'url03'),
    )

    # (position in _COLLECTIONS, downvotes, upvotes, score) of every
    # wrapper created by _populate_db, in order
    _WRAPPERS = (
        (0, 0, 3, 1.0),
        (0, 0, 2, 1.0),
        (1, 1, 1, 0.5),
        (3, 0, 3, 1.0),
        (4, 0, 2, 1.0),
        (4, 1, 1, 0.8),
        (4, 0, 3, 1.0),
        (4, 1, 1, 0.2),
    )

    def setUp(self):
        self.session = create_session('sqlite:///:memory:', True)
        self.wg = WrapperGateway(session=self.session)

    def _populate_db(self):
        """
        Creates the collections and wrappers described by _COLLECTIONS and
        _WRAPPERS through the gateway and flushes the session.
        """
        collections = []
        for field, url in self._COLLECTIONS:
            collection = self.wg.new_wrapper_collection()
            collection.field = field
            collection.url = url
            collections.append(collection)

        for owner, downvotes, upvotes, score in self._WRAPPERS:
            wrapper = self.wg.new_wrapper()
            wrapper.downvotes = downvotes
            wrapper.upvotes = upvotes
            wrapper.score = score
            collections[owner].wrappers.append(wrapper)

        self.session.flush()

    def test_get_available_wrappers(self):
        self._populate_db()
        wrappers = self.wg.get_available_wrappers()
        self.assertEqual(wrappers, [u'url03', u'url01', u'url02'])
示例#16
0
 def setUp(self):
     """
     Builds the fixtures of each test: a session created with the
     'sqlite:///:memory:' URI and a WrapperGateway bound to that session.
     """
     new_session = create_session('sqlite:///:memory:', True)
     self.session = new_session
     self.wg = WrapperGateway(session=new_session)
示例#17
0
    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look if there is any wrapper in the database for the given source and
        use it to extract a reference from `page`.

        For every wrapper collection found for `source`, the wrappers are
        tried in the order returned by the gateway until one of them yields
        valid information. Wrappers are upvoted or downvoted depending on the
        validation of what they extract against `raw_text`.

        Returns a list with the extracted reference, or an empty list if no
        field could be set.
        """
        log.info('Attempting to extract reference with ruled wrappers'
                 )  #@UndefinedVariable
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)

        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (
                url,
                field,  #@UndefinedVariable
                len(wrappers)))

            # Get field validator. Fields without one are always accepted
            try:
                validator = self.field_validation[field][1]
            except KeyError:
                validator = None

            # Only these fields are allowed to hold a list of values
            multi_valued = field in ('author', 'editor')

            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # we expect 'info' to be a string
                if isinstance(info, list) and not multi_valued:
                    continue
                log.debug('Info extracted by wrapper: %s' %
                          info)  #@UndefinedVariable

                valid = validator.validate(info,
                                           raw_text) if validator else True
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)

                if valid:
                    log.debug(
                        'The extracted information is valid. '  #@UndefinedVariable
                        'Upvoting wrapper')
                    wrapper.upvotes += 1
                else:
                    log.debug(
                        'The extracted information is not valid. '  #@UndefinedVariable
                        'Downvoting wrapper.')
                    wrapper.downvotes += 1
                wrapper_manager.update_wrapper(wrapper)

                if valid:
                    # The first valid extraction wins for this field
                    break

        if len(reference.fields) > 0:
            log.info('Extracted reference')  #@UndefinedVariable
            return [reference]
        else:
            log.info('Could not extract reference using ruled wrappers'
                     )  #@UndefinedVariable
            return []
 def setUp(self):
     """
     Stores in self.wm a WrapperGateway built on a session created with
     the 'sqlite:///:memory:' URI and debug enabled.
     """
     memory_session = create_session(sql_uri='sqlite:///:memory:',
                                     debug=True)
     self.wm = WrapperGateway(memory_session)
示例#19
0
class TestWrapperGateway(unittest.TestCase):
    """
    Checks the list returned by WrapperGateway.get_available_wrappers()
    once the database holds five wrapper collections and eight wrappers.
    """

    # Collections created by _populate_db as (field, url), in order
    _COLLECTION_SPECS = (
        (u'a', u'url01'),
        (u'b', u'url01'),
        (u'a', u'url02'),
        (u'b', u'url02'),
        (u'a', u'url03'),
    )

    # Wrappers created by _populate_db, in order, as
    # (position in _COLLECTION_SPECS, downvotes, upvotes, score)
    _WRAPPER_SPECS = (
        (0, 0, 3, 1.0),
        (0, 0, 2, 1.0),
        (1, 1, 1, 0.5),
        (3, 0, 3, 1.0),
        (4, 0, 2, 1.0),
        (4, 1, 1, 0.8),
        (4, 0, 3, 1.0),
        (4, 1, 1, 0.2),
    )

    def setUp(self):
        self.session = create_session('sqlite:///:memory:', True)
        self.wg = WrapperGateway(session=self.session)

    def _populate_db(self):
        """
        Creates every collection and wrapper listed in _COLLECTION_SPECS
        and _WRAPPER_SPECS through the gateway, then flushes the session.
        """
        collections = []
        for field, url in self._COLLECTION_SPECS:
            collection = self.wg.new_wrapper_collection()
            collection.field = field
            collection.url = url
            collections.append(collection)

        for owner, downvotes, upvotes, score in self._WRAPPER_SPECS:
            wrapper = self.wg.new_wrapper()
            wrapper.downvotes = downvotes
            wrapper.upvotes = upvotes
            wrapper.score = score
            collections[owner].wrappers.append(wrapper)

        self.session.flush()

    def test_get_available_wrappers(self):
        self._populate_db()
        wrappers = self.wg.get_available_wrappers()
        self.assertEqual(wrappers, [u'url03', u'url01', u'url02'])