class IEController(Controller):

    def __init__(self, factory, target_format=ReferenceFormat.BIBTEX,
                 max_wrappers=MAX_WRAPPERS, max_examples=MAX_EXAMPLES,
                 max_examples_from_db=MAX_EXAMPLES_FROM_DB,
                 min_validity=MIN_VALIDITY,
                 secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
                 wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
        super(IEController, self).__init__(factory)
        self.browser = Browser()
        self.format = target_format
        self.field_validation = {}
        self._set_field_validation()
        self.value_guides = configuration.wrapper_properties['value_guide']
        self.max_wrappers = max_wrappers
        self.max_examples = max_examples
        self.max_examples_from_db = max_examples_from_db
        self.min_validity = min_validity
        self.secs_between_reqs = secs_between_reqs
        self.wrapper_gen_examples = wrapper_gen_examples

    def extract_reference(self, top_results, raw_text):
        """
        Returns a list of References if they can be extracted, or an empty
        list otherwise. A single publication may need more than one reference
        (e.g.: an inproceedings entry and its proceedings).
        """
        log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
        page = None
        references = []
        for result in top_results:
            try:
                log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
                page = self.browser.get_page(result.url)
            except BrowserError as e:
                log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                            e.error))
                continue
            page = ContentCleaner().clean_content(page)
            references = self._use_reference_wrappers(result.base_url, page,
                                                      raw_text)
            if not references:
                references = self._use_rule_wrappers(result.base_url, page,
                                                     raw_text)
            if references:
                break
        # Convert to the target format, if necessary
        for reference in references:
            self._format_reference(reference)
        # Return the extracted references and the result that has been used
        return (references, result)

    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look for wrappers in the database for the given source and use them
        to extract the reference fields.
        """
        log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
        fields = {}
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)
        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                            len(wrappers)))
            # Get the field validator, if one is configured
            try:
                validator = self.field_validation[collection.field][1]
            except KeyError:
                validator = None
            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # We expect 'info' to be a string, except for multi-valued
                # fields such as 'author' and 'editor'
                if isinstance(info, list) and collection.field not in ('author',
                                                                       'editor'):
                    continue
                log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable
                valid = validator.validate(info, raw_text) if validator else True
                # Save the extracted info even if it's not valid; it will be
                # overwritten afterwards if necessary
                reference.set_field(field, info, valid)
                if not valid:
                    log.debug('The extracted information is not valid. ' #@UndefinedVariable
                              'Downvoting wrapper.')
                    wrapper.downvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                else:
                    log.debug('The extracted information is valid. ' #@UndefinedVariable
                              'Upvoting wrapper.')
                    wrapper.upvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                    fields[field] = info
                    break
        if len(reference.fields) > 0:
            log.info('Extracted reference') #@UndefinedVariable
            return [reference]
        else:
            log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
            return []

    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.
        Returns a list of References with the full entry, format and a
        structure with the different fields. A single publication may need
        more than one reference (e.g.: an inproceedings entry and its
        proceedings).
        """
        log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
        references = []
        entry, format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
            return references
        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                                e.args))
            return references
        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % format) #@UndefinedVariable
            return references
        # There may be more than one entry for the same file
        log.debug('Parsing extracted entries') #@UndefinedVariable
        try:
            entries = parser.split_source(entry)
            for entry in entries:
                fields = parser.parse_entry(entry)
                reference = Reference(fields, format, entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            log.error('Error parsing extracted entry: %s' % e) #@UndefinedVariable
        return references
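# Usage sketch (illustrative only, not part of the original module). It
# assumes a concrete util factory and search-result objects exposing 'url'
# and 'base_url' attributes; 'UtilFactory' and 'top_results' below are
# hypothetical placeholders.
#
#   controller = IEController(UtilFactory(),
#                             target_format=ReferenceFormat.BIBTEX)
#   references, used_result = controller.extract_reference(top_results,
#                                                          raw_text)
#   for reference in references:
#       print reference.fields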
class Searcher(object):
    """
    Base class for searching with a search engine
    """

    GOOGLE = 0
    SCHOLAR = 1
    BING = 2
    YAHOO = 3

    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()
        if random_agent:
            self.browser.set_random_user_agent()

    def prepare(self):
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._results_per_page = 30
        self._last_from = 0

    def get_query(self):
        return self.__query

    def set_query(self, value):
        self.__query = value
        self.prepare()

    query = property(get_query, set_query)

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def search_engine_url(self):
        raise NotImplementedError()

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """ Gets a page of results """
        if self.eor:
            return []
        page = self._get_results_page()
        search_info = self._extract_info(page)
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_safe_url(self):
        return self.search_engine_url % {
            'query': urllib.quote_plus(self.query),
            'start': self._page * self._results_per_page,
            'num': self._results_per_page
        }

    def _get_results_page(self):
        safe_url = self._get_safe_url()
        # Wait a random time between 0.5 and 1.5 seconds before doing the
        # search
        #time_to_wait = random.randrange(5, 15, 2) / 10.0
        #log.debug('Waiting %g before searching %s' % (time_to_wait, safe_url))
        #time.sleep(time_to_wait)
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)
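# Subclassing sketch (illustrative only). A concrete engine supplies
# 'search_engine_url' plus the '_extract_info'/'_extract_results' hooks that
# 'get_results' relies on; 'ExampleSearcher' and the URL below are
# hypothetical.
#
#   class ExampleSearcher(Searcher):
#       @property
#       def search_engine_url(self):
#           # Must accept the 'query', 'start' and 'num' keys interpolated
#           # by _get_safe_url()
#           return 'http://www.example.com/search?q=%(query)s&start=%(start)d&n=%(num)d'
#
#       def _extract_info(self, page):
#           # Parse the BeautifulSoup page into a dict with the 'from',
#           # 'to' and 'total' keys expected by get_results()
#           pass
#
#       def _extract_results(self, page):
#           # Return the list of results found on the page
#           pass
#
#   searcher = ExampleSearcher('information extraction', random_agent=True)
#   while not searcher.eor:
#       for result in searcher.get_results():
#           print result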