Exemplo n.º 1
0
    def __init__(self):
        """
        Sets up the benchmark configuration and builds the controllers used
        to extract and to parse references.
        """
        # Results container, empty until something fills it in
        self.info = {}

        # Number of examples handed to the controller for wrapper generation
        self.nexamples = 4
        self.base_path = '/home/rxuriguera/benchmark/pages/'
        self.fields = [
            'addres', 'author', 'isbn', 'issn', 'journal', 'number',
            'pages', 'publisher', 'title', 'volume', 'year',
        ]
        # Alternative library list kept for reference:
        # ['acm', 'citeulike', 'computerorg', 'econpapers', 'ideas',
        #  'informaworld', 'sciencedirect', 'scientificcommons', 'springer']
        self.libraries = ['informaworld']

        self.factory = UtilFactory()
        self.iec = IEController(
            self.factory,
            secs_between_reqs=0,
            wrapper_gen_examples=self.nexamples)
        self.rec = ReferencesController(self.factory)
Exemplo n.º 2
0
 def setUp(self):
     """
     Builds the controller under test together with the search results,
     pages and text used as fixtures by the tests.
     """
     self.iec = IEController(UtilFactory(), ReferenceFormat.BIBTEX)

     acm_result = SearchResult(
         'result01',
         'http://portal.acm.org/citation.cfm?id=507338.507355')
     springer_result = SearchResult(
         'result01',
         'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
     self.top_results = [acm_result, springer_result]

     # A page without any content and a page loaded from the fixtures
     self.empty_page = BeautifulSoup("<html><head/><body/></html>")
     self.page = self._get_soup('acm01.html')
     self.text = 'ss'
Exemplo n.º 3
0
    def make_reference(self, file, target_format):
        """
        Runs the whole pipeline for a file: extracts its content, derives
        query strings, retrieves results from a search engine, extracts the
        reference from those results and validates the extracted entries.

        Returns the Extraction object. When a stage yields nothing, the
        extraction is returned as filled in up to that point.
        """
        extraction = Extraction()
        extraction.file_path = file
        extraction.target_format = target_format

        log.info("Making reference for file: %s" % file) #@UndefinedVariable

        # Raw text of the file
        content_extractor = RCEController(self.factory)
        raw_text = content_extractor.extract_content(file, FileFormat.TXT)
        if not raw_text:
            return extraction

        # Query strings obtained from the raw text
        extraction.query_strings = (
            content_extractor.get_query_strings(raw_text))
        if not extraction.query_strings:
            log.error('No query strings extracted') #@UndefinedVariable
            return extraction
        log.debug("Query strings %s" % str(extraction.query_strings)) #@UndefinedVariable

        # Search results, and the query that produced them
        searcher = IRController(self.factory)
        results, query = searcher.get_top_results(extraction.query_strings)
        extraction.top_results = results
        extraction.used_query = query
        if not extraction.top_results:
            tried = len(extraction.query_strings)
            message = ('No top results to use with the available wrappers '
                       'after trying %d queries' % tried)
            log.error(message) #@UndefinedVariable
            return extraction
        # The query that has been used is taken out of the list
        extraction.query_strings.remove(extraction.used_query)

        log.debug("Used query %s" % str(extraction.used_query)) #@UndefinedVariable
        log.debug("Query returned %d top results" % len(extraction.top_results)) #@UndefinedVariable

        # Reference entries, and the result they were extracted from
        reference_extractor = IEController(self.factory, target_format)
        entries, used_result = reference_extractor.extract_reference(
            extraction.top_results, raw_text)
        extraction.entries = entries
        extraction.used_result = used_result
        # The result that has been used is taken out of the list
        extraction.top_results.remove(extraction.used_result)
        log.info("Used result: %s" % str(extraction.used_result)) #@UndefinedVariable

        # Validation of every entry against the raw text
        validator = ReferenceValidator(FIELD_WEIGHTS)
        for entry in extraction.entries:
            validator.validate(entry, raw_text)

        return extraction
Exemplo n.º 4
0
class WrapperGenerator(threading.Thread):
    """
    Thread that generates wrappers for a URL by means of an IEController.
    """

    def __init__(self, url):
        """
        Names the thread, stores the url and creates the factory and the
        controller the generation relies on.
        """
        super(WrapperGenerator, self).__init__()
        self.url = url
        self.name = 'WrapTrainer'
        factory = UtilFactory()
        self.factory = factory
        self.ie_controller = IEController(factory)

    def run(self):
        """
        Thread entry point. Generates the wrappers.
        """
        self.generate_wrappers()

    def set_wrapper_gen_examples(self, num_examples):
        """
        Sets the controller's wrapper_gen_examples attribute.
        """
        controller = self.ie_controller
        controller.wrapper_gen_examples = num_examples

    def generate_wrappers(self):
        """
        Asks the controller to generate the wrappers for the stored url.
        """
        controller = self.ie_controller
        controller.generate_wrappers(self.url)
Exemplo n.º 5
0
class WrapperGenerator(threading.Thread):
    """
    Thread that generates wrappers for a URL by means of an IEController.
    """

    def __init__(self, url):
        """
        Names the thread, stores the url and creates the factory and the
        controller the generation relies on.
        """
        super(WrapperGenerator, self).__init__()
        self.url = url
        self.name = 'WrapTrainer'
        factory = UtilFactory()
        self.factory = factory
        self.ie_controller = IEController(factory)

    def run(self):
        """
        Thread entry point. Generates the wrappers.
        """
        self.generate_wrappers()

    def set_wrapper_gen_examples(self, num_examples):
        """
        Sets the controller's wrapper_gen_examples attribute.
        """
        controller = self.ie_controller
        controller.wrapper_gen_examples = num_examples

    def generate_wrappers(self):
        """
        Asks the controller to generate the wrappers for the stored url.
        """
        controller = self.ie_controller
        controller.generate_wrappers(self.url)
 def setUp(self):
     """
     Builds the controller under test together with the search results,
     pages and text used as fixtures by the tests.
     """
     self.iec = IEController(UtilFactory(), ReferenceFormat.BIBTEX)

     acm_result = SearchResult(
         'result01',
         'http://portal.acm.org/citation.cfm?id=507338.507355')
     springer_result = SearchResult(
         'result01',
         'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
     self.top_results = [acm_result, springer_result]

     # A page without any content and a page loaded from the fixtures
     self.empty_page = BeautifulSoup("<html><head/><body/></html>")
     self.page = self._get_soup('acm01.html')
     self.text = 'ss'
class TestIEController(unittest.TestCase):
    """
    Tests for the IEController: use of wrappers, reference formatting,
    field validation and value guides.
    """

    def setUp(self):
        """
        Builds the controller under test and the fixtures shared by the
        tests.
        """
        factory = UtilFactory()
        self.iec = IEController(factory, ReferenceFormat.BIBTEX)
        self.top_results = [
            SearchResult(
                'result01',
                'http://portal.acm.org/citation.cfm?id=507338.507355'),
            SearchResult(
                'result01',
                'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf'),
        ]
        self.empty_page = BeautifulSoup("<html><head/><body/></html>")
        self.page = self._get_soup('acm01.html')
        self.text = 'ss'

    def tearDown(self):
        pass

    def test_use_reference_wrappers_page_with_no_wrapper(self):
        references = self.iec._use_reference_wrappers('some_source',
                                                      self.empty_page,
                                                      self.text)
        self.assertEqual(len(references), 0)

    # The 'x' prefix keeps this method from matching the default 'test'
    # method prefix, so the test loader does not pick it up.
    def xtest_use_reference_wrappers_page_with_wrapper(self):
        references = self.iec._use_reference_wrappers('http://portal.acm.org',
                                                      self.page,
                                                      self.text)
        self.assertEqual(len(references), 1)

    def test_format_reference_same_format(self):
        ref = Reference(format=ReferenceFormat.BIBTEX, entry='formatted entry')
        self.iec._format_reference(ref)
        self.assertEqual(ref.get_entry(), 'formatted entry')

    def test_format_reference_different_format(self):
        ref = Reference()
        ref.set_field('reference_id', 'Lmadsen99')
        ref.set_field('title', 'Some article title')

        self.iec._format_reference(ref)

        self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
        self.assertEqual(ref.get_format(), self.iec.format)

    # Not collected by the test loader (see the note on the 'x' prefix above
    # the other xtest method name: it does not start with 'test').
    def xtest_use_rule_wrappers(self):
        references = self.iec._use_rule_wrappers(u'some_source',
                                                 'test 2007 content', '')
        self.assertEqual(len(references), 1)
        self.assertEqual(len(references[0].fields), 3)

    def test_validate_reference_fields(self):
        ref = Reference()
        ref.set_field('title', 'Some article title')
        ref.set_field('year', '32')
        raw_text = "Some article title and something else"
        self.iec._validate_reference_fields(ref, raw_text)
        # Compared against the booleans explicitly, so that values such as
        # None are not accepted in place of False
        self.assertEqual(ref.get_field('title').valid, True)
        self.assertEqual(ref.get_field('year').valid, False)

    def _get_soup(self, file_name):
        """
        Parses the wrapper fixture called file_name and returns the soup.
        """
        file_path = normpath(join(dirname(__file__), ('../../../../tests/'
                                     'fixtures/wrappers/' + file_name)))
        # The context manager closes the fixture even if reading or parsing
        # raises
        with open(file_path) as fixture:
            return BeautifulSoup(fixture.read())

    def test_set_value_guides(self):
        value_guides = self.iec.value_guides
        self.assertEqual(len(value_guides), 5)
Exemplo n.º 8
0
 def __init__(self, url):
     """
     Names the thread, stores the url and creates the factory and the
     controller used to generate wrappers.
     """
     super(WrapperGenerator, self).__init__()
     self.url = url
     self.name = 'WrapTrainer'
     factory = UtilFactory()
     self.factory = factory
     self.ie_controller = IEController(factory)
Exemplo n.º 9
0
 def __init__(self, url):
     """
     Names the thread, stores the url and creates the factory and the
     controller used to generate wrappers.
     """
     super(WrapperGenerator, self).__init__()
     self.url = url
     self.name = 'WrapTrainer'
     factory = UtilFactory()
     self.factory = factory
     self.ie_controller = IEController(factory)
Exemplo n.º 10
0
class ExtractionStats(object):
    """
    Runs the reference extraction over the pages listed for each library
    and writes a field-by-field comparison of the extracted references
    against a control file of references.
    """

    # Control fields that are never compared against the extraction
    _SKIPPED_FIELDS = ('url', 'reference_type', 'reference_id')

    def __init__(self):
        """
        Sets up the benchmark configuration and builds the controllers used
        to extract and to parse references.
        """
        self.info = {}

        # Number of examples handed to the controller for wrapper
        # generation; it is also part of the output file names
        self.nexamples = 4
        self.base_path = '/home/rxuriguera/benchmark/pages/'
        self.fields = ['addres', 'author', 'isbn', 'issn', 'journal',
                       'number', 'pages', 'publisher', 'title', 'volume',
                       'year']
        # Alternative library list kept for reference:
        # ['acm', 'citeulike', 'computerorg', 'econpapers', 'ideas',
        #  'informaworld', 'sciencedirect', 'scientificcommons', 'springer']
        self.libraries = ['informaworld']

        self.factory = UtilFactory()
        self.iec = IEController(self.factory, secs_between_reqs=0,
                                wrapper_gen_examples=self.nexamples)
        self.rec = ReferencesController(self.factory)

    def save_msg(self, msg):
        """
        Prints msg and appends it, followed by a newline, to the results
        file that is currently open in self.file.
        """
        # Single-argument call form: valid both as a Python 2 print
        # statement and as a Python 3 function call
        print(msg)
        self.file.write(''.join([msg, '\n']))
        self.file.flush()

    def run(self):
        """
        Resets the collected info and runs the benchmark for every
        configured library.
        """
        self.info = {}

        for library in self.libraries:
            self.info.setdefault(library, [])
            self.run_library(library)

    def run_library(self, library):
        """
        Extracts a reference for every page listed in the library's
        filelist.txt and writes the comparison against the library's control
        references to the results file.

        The results file is closed when the run ends, also if it ends with
        an exception.
        """
        self.file = open(''.join([self.base_path, library, '/extraction-results-', str(self.nexamples), '-corrected.csv']), 'w')
        try:
            self.session = create_session(''.join(['sqlite:///', self.base_path, '/', library, '/extraction-stats-', library, '-', str(self.nexamples), '-corrected.db']), debug=True)
            self.wg = gateways.WrapperGateway(self.session)
            self.eg = gateways.ExtractionGateway(self.session)

            self.save_msg('Extraction results for library: %s' % library)

            references = self._extract_references(library)

            # Load control references
            control_file = ''.join([self.base_path, library, '/extraction-results-control.bib'])
            control_references = self.rec._parse_entries_file(control_file)

            # Control and extracted references are paired by position
            for control, extracted in zip(control_references, references):
                if not extracted:
                    continue
                self._report_comparison(control, extracted)
        finally:
            self.file.close()

    def _extract_references(self, library):
        """
        Returns one entry per non-blank line of the library's filelist.txt:
        the first reference extracted for that line, or None if nothing was
        extracted. Each line holds a page url and the path of a text file,
        separated by a space.
        """
        references = []
        list_path = ''.join([self.base_path, library, '/', 'filelist.txt'])
        with open(list_path, 'r') as files:
            for line in files:
                line = line.strip()
                if not line:
                    # A blank line (e.g. at the end of the file) holds no
                    # page and cannot be split into url and path
                    continue
                html_url, text_path = line.split(' ', 1)

                with open(text_path, 'r') as text_file:
                    text = text_file.read()

                top_results = [SearchResult('Some result', html_url)]
                print(html_url)

                refs, _ = self.iec.extract_reference(top_results, text)
                references.append(refs[0] if refs else None)
        return references

    def _report_comparison(self, control, extracted):
        """
        Writes the comparison of every field of an extracted reference with
        the same field of its control reference, followed by the totals.
        """
        self.save_msg(extracted.entry)
        self.save_msg('\n')

        total_control_fields = 0
        total_extracted_fields = 0

        correct = 0
        partial = 0
        error = 0

        valid = 0
        invalid = 0

        for field in control.fields:
            if field in self._SKIPPED_FIELDS:
                continue

            control_field = control.get_field(field)
            total_control_fields += 1

            extracted_field = extracted.get_field(field)
            if not extracted_field:
                continue

            if extracted_field.valid:
                valid += 1
            else:
                invalid += 1

            control_value = control_field.value
            extracted_value = extracted_field.value

            if isinstance(control_value, list):
                # List values are only dumped for manual review; they are
                # not classified nor counted as extracted fields
                self.save_msg('Comparing field %s values:\n\t%s\n\t%s' % (field, simplejson.dumps(control_value), simplejson.dumps(extracted_value)))
                self.save_msg('\tCHECK MANUALLY')
                continue

            control_value = control_value.strip()
            extracted_value = extracted_value.strip()

            self.save_msg('Comparing field %s values:\n\t%s\n\t%s' % (field, control_value, extracted_value))

            if control_value == extracted_value:
                correct += 1
                self.save_msg('\tCorrect')
            elif re.search(re.escape(control_value), extracted_value):
                # The control value appears inside the extracted one
                partial += 1
                self.save_msg('\tParcial')
            else:
                error += 1
                self.save_msg('\tIncorrect')

            total_extracted_fields += 1

        self.save_msg('')
        self.save_msg('Marked Valid;Marked invalid')
        self.save_msg('%d;%d' % (valid, invalid))

        self.save_msg('Total available;Total extracted;Incorrect;Parcial;Correct')
        self.save_msg('%d;%d;%d;%d;%d' % (total_control_fields, total_extracted_fields, error, partial, correct))

        self.save_msg('\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n')

    def import_generate(self, library, url):
        """
        Imports the library's references and generates wrappers for url.
        """
        # Import references
        importer = ReferenceImporter()
        importer.import_references(''.join([self.base_path, library, '/', library, '-', str(self.nexamples), '.bib']))

        # Generate wrappers
        generator = WrapperGenerator(url)
        generator.set_wrapper_gen_examples(self.nexamples)
        generator.generate_wrappers()
Exemplo n.º 11
0
class TestIEController(unittest.TestCase):
    """
    Tests for the IEController: use of wrappers, reference formatting,
    field validation and value guides.
    """

    def setUp(self):
        """
        Builds the controller under test and the fixtures shared by the
        tests.
        """
        factory = UtilFactory()
        self.iec = IEController(factory, ReferenceFormat.BIBTEX)
        self.top_results = [
            SearchResult(
                'result01',
                'http://portal.acm.org/citation.cfm?id=507338.507355'),
            SearchResult(
                'result01',
                'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf'),
        ]
        self.empty_page = BeautifulSoup("<html><head/><body/></html>")
        self.page = self._get_soup('acm01.html')
        self.text = 'ss'

    def tearDown(self):
        pass

    def test_use_reference_wrappers_page_with_no_wrapper(self):
        references = self.iec._use_reference_wrappers('some_source',
                                                      self.empty_page,
                                                      self.text)
        self.assertEqual(len(references), 0)

    # The 'x' prefix keeps this method from matching the default 'test'
    # method prefix, so the test loader does not pick it up.
    def xtest_use_reference_wrappers_page_with_wrapper(self):
        references = self.iec._use_reference_wrappers('http://portal.acm.org',
                                                      self.page, self.text)
        self.assertEqual(len(references), 1)

    def test_format_reference_same_format(self):
        ref = Reference(format=ReferenceFormat.BIBTEX, entry='formatted entry')
        self.iec._format_reference(ref)
        self.assertEqual(ref.get_entry(), 'formatted entry')

    def test_format_reference_different_format(self):
        ref = Reference()
        ref.set_field('reference_id', 'Lmadsen99')
        ref.set_field('title', 'Some article title')

        self.iec._format_reference(ref)

        self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
        self.assertEqual(ref.get_format(), self.iec.format)

    # Not collected by the test loader: the name does not start with 'test'.
    def xtest_use_rule_wrappers(self):
        references = self.iec._use_rule_wrappers(u'some_source',
                                                 'test 2007 content', '')
        self.assertEqual(len(references), 1)
        self.assertEqual(len(references[0].fields), 3)

    def test_validate_reference_fields(self):
        ref = Reference()
        ref.set_field('title', 'Some article title')
        ref.set_field('year', '32')
        raw_text = "Some article title and something else"
        self.iec._validate_reference_fields(ref, raw_text)
        # Compared against the booleans explicitly, so that values such as
        # None are not accepted in place of False
        self.assertEqual(ref.get_field('title').valid, True)
        self.assertEqual(ref.get_field('year').valid, False)

    def _get_soup(self, file_name):
        """
        Parses the wrapper fixture called file_name and returns the soup.
        """
        file_path = normpath(
            join(dirname(__file__), ('../../../../tests/'
                                     'fixtures/wrappers/' + file_name)))
        # The context manager closes the fixture even if reading or parsing
        # raises
        with open(file_path) as fixture:
            return BeautifulSoup(fixture.read())

    def test_set_value_guides(self):
        value_guides = self.iec.value_guides
        self.assertEqual(len(value_guides), 5)