def __init__(self, query='', random_agent=False, debug=False):
    """
    Stores the query and debug flag and creates the browser for this
    instance.

    The `debug` flag is also handed to the Browser. Once prepare() has run,
    a random user agent is set on the browser when `random_agent` is true.
    """
    self.query, self.debug = query, debug
    self.browser = Browser(debug=debug)
    self.prepare()
    if not random_agent:
        return
    self.browser.set_random_user_agent()
def _get_content(self, url):
    """
    Retrieves and cleans the content of an example's URL.

    In order not to overload the server, it waits until at least
    `self.seconds_between_requests` seconds have passed since
    `self.last_request` before requesting the page.

    Returns the cleaned content, or None if a BrowserError was raised
    (the error is logged). `self.last_request` is updated in both cases.
    """
    # timedelta.seconds is only the seconds *component* of the difference
    # (whole seconds, days excluded), so the real elapsed time has to be
    # rebuilt from all three fields.
    elapsed = datetime.now() - self.last_request
    elapsed_seconds = (elapsed.days * 86400 + elapsed.seconds +
                       elapsed.microseconds / 1e6)
    # A clock moved backwards gives a negative difference; count it as no
    # time elapsed so the wait never exceeds the configured interval.
    time_to_sleep = self.seconds_between_requests - max(elapsed_seconds, 0)
    if time_to_sleep > 0:
        sleep(time_to_sleep)

    content = None
    try:
        content = Browser().get_page(url)
        content = ContentCleaner().clean_content(content)
    except BrowserError as e:
        log.error('Error retrieving page %s: %s' % (url, #@UndefinedVariable
                                                    e.error))
    self.last_request = datetime.now()
    return content
def __init__(self, factory, target_format=ReferenceFormat.BIBTEX,
             max_wrappers=MAX_WRAPPERS,
             max_examples=MAX_EXAMPLES,
             max_examples_from_db=MAX_EXAMPLES_FROM_DB,
             min_validity=MIN_VALIDITY,
             secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
             wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
    """
    Initializes the controller: passes `factory` to the parent class,
    creates a Browser, stores the target format, sets up the field
    validation and value guides, and keeps every tuning parameter as an
    attribute of the same name.
    """
    super(IEController, self).__init__(factory)
    self.browser = Browser()

    # The format and the empty validation dict are in place before
    # _set_field_validation() runs.
    self.format = target_format
    self.field_validation = {}
    self._set_field_validation()
    self.value_guides = configuration.wrapper_properties['value_guide']

    # Tuning parameters, stored as received.
    (self.max_wrappers,
     self.max_examples,
     self.max_examples_from_db) = (max_wrappers,
                                   max_examples,
                                   max_examples_from_db)
    self.min_validity, self.secs_between_reqs = (min_validity,
                                                 secs_between_reqs)
    self.wrapper_gen_examples = wrapper_gen_examples
class ReferenceWrapper(Wrapper):
    """
    Offers methods to extract complete references from some webpages
    """
    # Maps each supported source URL to the suffix of the `_do_<suffix>`
    # method that knows how to extract a reference from it.
    _available_wrappers = {'http://portal.acm.org':'portal_acm',
                           'http://citeseerx.ist.psu.edu':'citeseerx'}
    # Shared by all instances; created when the class is defined.
    _browser = Browser()

    def extract_info(self, source, page):
        """
        Extracts a reference from the given page.

        Dispatches to the hardcoded wrapper registered for `source`.
        Returns a (reference, format) tuple, or (None, None) when there is
        no wrapper for `source`.
        """
        if source not in self._available_wrappers:
            log.debug('No reference wrapper available for source %s' % source) #@UndefinedVariable
            return (None, None)

        wrapper_method = getattr(self, '_do_' +
                                 self._available_wrappers[source])
        return wrapper_method(source, page)

    def get_available_wrappers(self):
        """
        Returns a list with the source URLs that have a wrapper.
        """
        return list(self._available_wrappers)

    def _do_portal_acm(self, source, page):
        """
        Searches the page for a link to the reference, and then retrieves
        the reference.
        Returns a tuple with the full reference and its format, or
        (None, None) if any step fails.
        """
        log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
        ref = (None, None)
        anchor = page.find('a', {'onclick':re.compile('popBibTex.cfm')})
        if not anchor:
            return ref

        # The URL is the first argument of the window.open(...) call held
        # in the onclick attribute: drop the call name, the surrounding
        # parentheses (and any stray backslashes), then the quotes.
        jscript = anchor['onclick'].replace('window.open', '').strip('\\()')
        ref_url = jscript.split(',')[0].strip('\'')
        ref_url = source + '/' + ref_url

        try:
            page = BeautifulSoup(self._browser.get_page(ref_url))
        except BrowserError:
            log.error('Browse error while retrieving entry page') #@UndefinedVariable
            return ref

        pre = page.find('pre')
        if not pre:
            return ref

        # A <pre> element without any text has no reference to offer.
        text = pre.find(text=True)
        if text is None:
            return ref

        # As the wrapper has been hardcoded, we already know what will be the
        # format of the reference
        return (text.strip(), ReferenceFormat.BIBTEX)

    def _do_citeseerx(self, source, page):
        """
        Searches the page for the element that contains the reference and
        joins all of its text.
        Returns a tuple with the full reference and its format, or
        (None, None) if the reference cannot be found.
        """
        log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
        ref = (None, None)

        # Deliberately broad: any failure while navigating the page (for
        # instance, the element not being found) just means there is no
        # reference to return.
        try:
            ref_element = page.find('div', {'class':'content'},
                                    text=re.compile(r'@\w*{'))
            ref_element = ref_element.parent.findAll(text=True)
            reference = ''.join(ref_element)
        except Exception as e:
            log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
            return ref

        return (reference.strip(), ReferenceFormat.BIBTEX)