Example #1
    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()

        if random_agent:
            self.browser.set_random_user_agent()
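
The constructor above just records the query, wires up a Browser, and optionally randomizes the user agent. A minimal usage sketch, assuming a hypothetical concrete subclass GoogleSearcher of the Searcher base class shown in Example #6:

# Hypothetical usage; GoogleSearcher is an assumed subclass, not shown here.
searcher = GoogleSearcher(query='reference extraction', random_agent=True)
results = searcher.get_results()  # get_results() comes from the Searcher base class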
Example #2
    def __init__(self,
                 factory,
                 target_format=ReferenceFormat.BIBTEX,
                 max_wrappers=MAX_WRAPPERS,
                 max_examples=MAX_EXAMPLES,
                 max_examples_from_db=MAX_EXAMPLES_FROM_DB,
                 min_validity=MIN_VALIDITY,
                 secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
                 wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
        super(IEController, self).__init__(factory)
        self.browser = Browser()
        self.format = target_format
        self.field_validation = {}
        self._set_field_validation()
        self.value_guides = configuration.wrapper_properties['value_guide']

        self.max_wrappers = max_wrappers
        self.max_examples = max_examples
        self.max_examples_from_db = max_examples_from_db
        self.min_validity = min_validity
        self.secs_between_reqs = secs_between_reqs
        self.wrapper_gen_examples = wrapper_gen_examples
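
Every tuning parameter defaults to a module-level constant, so callers normally pass only the factory. A hedged instantiation sketch; UtilFactory is an assumed factory type, and top_results and raw_text are assumed to come from an earlier search step:

factory = UtilFactory()  # assumption: some factory accepted by the base Controller
controller = IEController(factory, target_format=ReferenceFormat.BIBTEX,
                          secs_between_reqs=10)
references, result = controller.extract_reference(top_results, raw_text)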
Example #3
    def _get_content(self, url):
        """
        Fetches the content at an example's URL. In order not to overload
        the server, it sleeps for some time between consecutive calls.
        """
        time_to_sleep = (self.seconds_between_requests -
                         (datetime.now() - self.last_request).seconds)
        if time_to_sleep > 0:
            sleep(time_to_sleep)

        content = None
        try:
            content = Browser().get_page(url)
            content = ContentCleaner().clean_content(content)
        except BrowserError as e:
            log.error('Error retrieving page %s: %s' % (url,  #@UndefinedVariable
                                                        e.error))
        self.last_request = datetime.now()
        return content
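
The throttling above subtracts the time already elapsed since the last request from the configured delay and sleeps only for the remainder. The same pattern in a self-contained sketch:

from datetime import datetime
from time import sleep

SECONDS_BETWEEN_REQUESTS = 5
last_request = datetime.now()

def throttle():
    # Sleep only for whatever is left of the configured delay.
    global last_request
    elapsed = (datetime.now() - last_request).seconds
    if elapsed < SECONDS_BETWEEN_REQUESTS:
        sleep(SECONDS_BETWEEN_REQUESTS - elapsed)
    last_request = datetime.now()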
Example #4
class IEController(Controller):
    def __init__(self,
                 factory,
                 target_format=ReferenceFormat.BIBTEX,
                 max_wrappers=MAX_WRAPPERS,
                 max_examples=MAX_EXAMPLES,
                 max_examples_from_db=MAX_EXAMPLES_FROM_DB,
                 min_validity=MIN_VALIDITY,
                 secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
                 wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
        super(IEController, self).__init__(factory)
        self.browser = Browser()
        self.format = target_format
        self.field_validation = {}
        self._set_field_validation()
        self.value_guides = configuration.wrapper_properties['value_guide']

        self.max_wrappers = max_wrappers
        self.max_examples = max_examples
        self.max_examples_from_db = max_examples_from_db
        self.min_validity = min_validity
        self.secs_between_reqs = secs_between_reqs
        self.wrapper_gen_examples = wrapper_gen_examples

    def extract_reference(self, top_results, raw_text):
        """
        Returns a list of References if they can be extracted, or an empty
        list otherwise.
        A single publication may need more than one reference (e.g. an
        inproceedings entry and its proceedings).
        """

        log.info('Using %d top results' %
                 len(top_results))  #@UndefinedVariable
        page = None
        references = []
        result = None  # ensures the return below is safe if top_results is empty
        for result in top_results:
            try:
                log.debug('Retrieving page for result %s' %
                          result.url)  #@UndefinedVariable
                page = self.browser.get_page(result.url)
            except BrowserError as e:
                log.error('Error retrieving page %s: %s' % (
                    result.url,  #@UndefinedVariable
                    e.error))
                continue

            page = ContentCleaner().clean_content(page)

            references = self._use_reference_wrappers(result.base_url, page,
                                                      raw_text)
            if not references:
                references = self._use_rule_wrappers(result.base_url, page,
                                                     raw_text)

            if references:
                break

        # Convert to target format, if necessary
        for reference in references:
            self._format_reference(reference)

        # Return the extracted reference and the result that has been used
        return (references, result)

    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look if there is any wrapper in the database for the given source.
        """
        log.info('Attempting to extract reference with ruled wrappers'
                 )  #@UndefinedVariable
        fields = {}
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)

        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (
                url,
                field,  #@UndefinedVariable
                len(wrappers)))

            # Get field validator
            try:
                validator = self.field_validation[collection.field][1]
            except KeyError:
                validator = None

            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # we expect 'info' to be a string
                if isinstance(info, list) and not (collection.field == 'author' or
                                                   collection.field == 'editor'):
                    continue
                log.debug('Info extracted by wrapper: %s' %
                          info)  #@UndefinedVariable

                valid = validator.validate(info,
                                           raw_text) if validator else True
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)

                if not valid:
                    log.debug(
                        'The extracted information is not valid. '  #@UndefinedVariable
                        'Downvoting wrapper.')
                    wrapper.downvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                else:
                    log.debug(
                        'The extracted information is valid. '  #@UndefinedVariable
                        'Upvoting wrapper')
                    wrapper.upvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                    fields[field] = info
                    break

        if len(reference.fields) > 0:
            log.info('Extracted reference')  #@UndefinedVariable
            return [reference]
        else:
            log.info('Could not extract reference using ruled wrappers'
                     )  #@UndefinedVariable
            return []

    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.
        Returns a list of References with the full entry, the format, and a
        structure with the different fields.
        A single publication may need more than one reference (e.g. an
        inproceedings entry and its proceedings).
        """
        log.info('Attempting to extract reference with a reference wrapper'
                 )  #@UndefinedVariable
        references = []
        entry, format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper'
                      )  #@UndefinedVariable
            return references

        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (
                format,  #@UndefinedVariable
                e.args))
            return references

        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')  #@UndefinedVariable
        try:
            entries = parser.split_source(entry)
            for entry in entries:
                fields = parser.parse_entry(entry)
                reference = Reference(fields, format, entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            log.error('Error parsing extracted entry: %s ' %
                      e)  #@UndefinedVariable

        return references
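
extract_reference() tries the hand-written reference wrappers first and falls back to the learned rule wrappers only when they return nothing. The same fallback chain in isolation; the extractor callables are placeholders, not the project's API:

def extract_with_fallback(page, extractors):
    # Try each extractor in priority order; stop at the first non-empty result.
    for extract in extractors:
        references = extract(page)
        if references:
            return references
    return []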
Example #5
class ReferenceWrapper(Wrapper):
    """
    Offers methods to extract complete references from some webpages.
    """
    _available_wrappers = {'http://portal.acm.org':'portal_acm',
                           'http://citeseerx.ist.psu.edu':'citeseerx'}
    _browser = Browser()
    
    def extract_info(self, source, page):
        """
        Extracts a reference from the given page.
        """
        if source not in self._available_wrappers.keys():
            log.debug('No reference wrapper available for source %s' % source) #@UndefinedVariable
            return (None, None)
        
        wrapper_method = getattr(self,
                                 '_do_' + self._available_wrappers[source])
        return wrapper_method(source, page) 

    def get_available_wrappers(self):
        return self._available_wrappers.keys()

    def _do_portal_acm(self, source, page):
        """
        Searches the page for a link to the reference, and then retrieves the
        reference.
        Returns a tuple with the full reference and its format.
        """ 
        log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
        ref = (None, None)
        anchor = page.find('a', {'onclick':re.compile('popBibTex.cfm')})
        if not anchor:
            return ref
        jscript = anchor['onclick'].replace('window.open', '').strip('()')
        ref_url = jscript.split(',')[0].strip("'")
        ref_url = source + '/' + ref_url
        
        try:
            page = BeautifulSoup(self._browser.get_page(ref_url))
        except BrowserError:
            log.error('Browser error while retrieving entry page') #@UndefinedVariable
            return ref
        
        pre = page.find('pre')
        if not pre:
            return ref
        
        # As the wrapper has been hardcoded, we already know what will be the
        # format of the reference
        return (pre.find(text=True).strip(), ReferenceFormat.BIBTEX)
    
    def _do_citeseerx(self, source, page):
        """
        Searches the page for a link to the reference, and then retrieves the
        reference.
        Returns a tuple with the full reference and its format.
        """ 
        log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
        ref = (None, None)
        
        try:
            ref_element = page.find('div', {'class': 'content'},
                                    text=re.compile(r'@\w*{'))
            ref_element = ref_element.parent.findAll(text=True)
            reference = ''.join(ref_element)
        except Exception as e:
            log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
            return ref
        
        return (reference.strip(), ReferenceFormat.BIBTEX)
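
extract_info() dispatches through the _available_wrappers table, so supporting a new site only takes a new URL entry plus a matching _do_* method. A usage sketch, assuming page is an already-fetched BeautifulSoup document:

wrapper = ReferenceWrapper()
entry, format = wrapper.extract_info('http://portal.acm.org', page)
if entry:
    print('Extracted a %s entry' % format)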
Example #6
class Searcher(object):
    """
    Base class for searching with a search engine
    """
    GOOGLE = 0
    SCHOLAR = 1
    BING = 2
    YAHOO = 3

    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()

        if random_agent:
            self.browser.set_random_user_agent()

    def prepare(self):
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._results_per_page = 30
        self._last_from = 0

    def get_query(self):
        return self.__query

    def set_query(self, value):
        self.__query = value
        self.prepare()

    query = property(get_query, set_query)

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def search_engine_url(self):
        raise NotImplementedError()

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """ Gets a page of results """
        if self.eor:
            return []

        page = self._get_results_page()
        search_info = self._extract_info(page)
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_safe_url(self):
        return self.search_engine_url % {
            'query': urllib.quote_plus(self.query),
            'start': self._page * self._results_per_page,
            'num': self._results_per_page
        }

    def _get_results_page(self):
        safe_url = self._get_safe_url()

        # Wait a random time between 0.5 and 1.5 seconds before doing the
        # search
        #time_to_wait = random.randrange(5, 15, 2) / 10.0
        #log.debug('Waiting %g before searching %s' % (time_to_wait, safe_url))
        #time.sleep(time_to_wait)

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)
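
Searcher leaves search_engine_url, _extract_info() and _extract_results() to its subclasses. A minimal sketch of what a concrete engine might provide; the URL template and the stubbed parsing are illustrative assumptions, not the project's real Google implementation:

class GoogleSearcher(Searcher):

    @property
    def search_engine_url(self):
        # %(query)s, %(start)d and %(num)d are filled in by _get_safe_url()
        return ('http://www.google.com/search?'
                'q=%(query)s&start=%(start)d&num=%(num)d')

    def _extract_info(self, page):
        # Would parse the 'about N results' banner; stubbed for brevity.
        return {'from': 0, 'to': 0, 'total': 0}

    def _extract_results(self, page):
        # Would walk the result anchors in the soup; empty in this sketch.
        return []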