Exemplo n.º 1
0
    def __init__(self, query='', random_agent=False, debug=False):
        """
        Stores the given settings, creates the browser and calls `prepare`.

        :param query: value kept on `self.query` (empty string by default).
        :param random_agent: when true, the browser is asked to switch to a
            random user agent after `prepare` has run.
        :param debug: kept on `self.debug` and forwarded to `Browser`.
        """
        self.query, self.debug = query, debug
        self.browser = Browser(debug=debug)
        # prepare() runs before any random user agent is applied below.
        self.prepare()

        if random_agent:
            self.browser.set_random_user_agent()
Exemplo n.º 2
0
 def _get_content(self, url):
     """
     Retrieves the page at the given URL and returns its cleaned content.

     In order not to overload the server, the call sleeps until at least
     `self.seconds_between_requests` seconds have passed since
     `self.last_request`. `self.last_request` is refreshed after the
     attempt, whether or not the browser reported an error.

     If a BrowserError is raised it is logged and whatever was obtained so
     far is returned (None when the page itself could not be retrieved).
     """
     elapsed = datetime.now() - self.last_request
     # timedelta.seconds only holds the remainder below one day, so the days
     # have to be added back; otherwise a call made e.g. 1 day and 2 seconds
     # after the previous one would be throttled as if only 2 seconds had
     # passed.
     elapsed_seconds = elapsed.days * 86400 + elapsed.seconds
     # Never wait longer than one full interval, so that a negative elapsed
     # time (system clock set backwards) cannot cause a very long sleep.
     time_to_sleep = min(self.seconds_between_requests - elapsed_seconds,
                         self.seconds_between_requests)
     if time_to_sleep > 0:
         sleep(time_to_sleep)

     content = None
     try:
         content = Browser().get_page(url)
         content = ContentCleaner().clean_content(content)
     except BrowserError as e:
         log.error('Error retrieving page %s: %s' % (url, #@UndefinedVariable
                                                     e.error))
     self.last_request = datetime.now()
     return content
Exemplo n.º 3
0
    def __init__(self,
                 factory,
                 target_format=ReferenceFormat.BIBTEX,
                 max_wrappers=MAX_WRAPPERS,
                 max_examples=MAX_EXAMPLES,
                 max_examples_from_db=MAX_EXAMPLES_FROM_DB,
                 min_validity=MIN_VALIDITY,
                 secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
                 wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
        """
        Initializes the controller with its browser, target format, field
        validation data and tuning limits.

        `factory` is handed to the parent class initializer. Every other
        argument is stored unchanged on the instance; `target_format` is
        stored as `self.format`, the rest under their own names.
        """
        super(IEController, self).__init__(factory)

        self.browser = Browser()
        self.format = target_format

        # The validation mapping starts empty and is filled in right away by
        # _set_field_validation().
        self.field_validation = {}
        self._set_field_validation()

        wrapper_properties = configuration.wrapper_properties
        self.value_guides = wrapper_properties['value_guide']

        # Limits concerning wrappers.
        self.max_wrappers = max_wrappers
        self.wrapper_gen_examples = wrapper_gen_examples
        self.min_validity = min_validity

        # Limits concerning examples and request pacing.
        self.max_examples = max_examples
        self.max_examples_from_db = max_examples_from_db
        self.secs_between_reqs = secs_between_reqs
class ReferenceWrapper(Wrapper):
    """
    Offers methods to extract complete references from some webpages.

    Each supported source URL is mapped to the suffix of a `_do_<suffix>`
    method that knows how to obtain the reference for that source.
    """
    _available_wrappers = {'http://portal.acm.org':'portal_acm',
                           'http://citeseerx.ist.psu.edu':'citeseerx'}
    # Created once, when the class is defined, and shared by all instances.
    _browser = Browser()

    def extract_info(self, source, page):
        """
        Extracts a reference from the given page.

        Dispatches to the `_do_*` method registered for `source`. Returns a
        tuple with the full reference and its format, or (None, None) when
        there is no wrapper for the source.
        """
        if source not in self._available_wrappers:
            log.debug('No reference wrapper available for source %s' % source) #@UndefinedVariable
            return (None, None)

        wrapper_method = getattr(self,
                                 '_do_' + self._available_wrappers[source])
        return wrapper_method(source, page)

    def get_available_wrappers(self):
        """
        Returns a list with the source URLs that have a hardcoded wrapper.
        """
        # list() yields a plain list on every Python version.
        return list(self._available_wrappers)

    def _do_portal_acm(self, source, page):
        """
        Searches the page for a link to the reference, and then retrieves the
        reference.
        Returns a tuple with the full reference and its format, or
        (None, None) if the link, the entry page or the reference text cannot
        be obtained.
        """
        log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
        ref = (None, None)
        anchor = page.find('a', {'onclick':re.compile('popBibTex.cfm')})
        if not anchor:
            return ref

        # The URL is the first argument of the window.open(...) call stored
        # in the onclick attribute: drop the call name, the surrounding
        # parentheses (and backslashes) and the quotes around the argument.
        jscript = anchor['onclick'].replace('window.open', '').strip('\\()')
        ref_url = jscript.split(',')[0].strip('\'')
        ref_url = source + '/' + ref_url

        try:
            page = BeautifulSoup(self._browser.get_page(ref_url))
        except BrowserError:
            log.error('Browse error while retrieving entry page') #@UndefinedVariable
            return ref

        pre = page.find('pre')
        if not pre:
            return ref

        # An empty <pre> element has no text node to strip.
        text = pre.find(text=True)
        if text is None:
            return ref

        # As the wrapper has been hardcoded, we already know what will be the
        # format of the reference
        return (text.strip(), ReferenceFormat.BIBTEX)

    def _do_citeseerx(self, source, page):
        """
        Searches the page for a 'content' div whose text looks like the start
        of an entry ('@' followed by word characters and '{') and joins all
        the text found under the parent of that text.
        Returns a tuple with the full reference and its format, or
        (None, None) if the reference cannot be found.
        """
        log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
        ref = (None, None)

        # Best effort: any failure while walking the page (for instance no
        # matching element) is logged and reported as "no reference".
        try:
            ref_element = page.find('div', {'class':'content'},
                                    text=re.compile(r'@\w*{'))
            ref_element = ref_element.parent.findAll(text=True)
            reference = ''.join(ref_element)
        except Exception as e:
            log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
            return ref

        return (reference.strip(), ReferenceFormat.BIBTEX)