def license_detect(self, record):
    """
    Detect licensing information for the given record by scraping its
    provider URLs.

    Builds a list of {statement: meaning} license mappings from the
    publisher configurations matching this plugin's base URLs (falling
    back to the plugin's own ``self._license_mappings`` when none match),
    then runs ``self.simple_extract`` over each supported provider URL.

    :param record: a request for the OAG status of an article; must expose
        ``provider_urls``.
    :return: (handler short name, handler version) tuple identifying this
        plugin as the processor of the record.
    """
    # collect every publisher configuration whose journal URL matches one
    # of this plugin's base URLs
    relevant_publishers = []
    for bu in self._base_urls:
        cburl = self.clean_url(bu)
        relevant_publishers += Publisher.find_by_journal_url('http://' + cburl)

    # flatten the publishers' license configs into {statement: meaning} maps,
    # the shape simple_extract expects
    lic_statements = []
    for pub in relevant_publishers:
        for l in pub['licenses']:
            lic_statement = {}
            lic_statement[l['license_statement']] = {'type': l['license_type'], 'version': l.get('version', '')}
            lic_statements.append(lic_statement)

    # no matching publisher config - fall back to the plugin's built-in mappings
    if not lic_statements:
        lic_statements = self._license_mappings

    for url in record.provider_urls:
        if self.supports_base_url(url):
            # TODO refactor self.simple_extract into several pieces
            # a downloader, a matcher, and a f() that records the license info
            # so the first two (and perhaps a general version of the third)
            # can be used here instead of this plugin having to do
            # all the work itself.
            r, content, content_length = http_stream_get(url)

            # look for an "accepted author manuscript" statement in the page;
            # the first one found supplies extra provenance for the license
            extra_provenance = {"accepted_author_manuscript": False}
            for amm in self._author_manuscript_mappings:
                # BUGFIX: keep the raw key for the dict lookup. The statement
                # is stripped for matching, but the mapping is keyed on the
                # raw (possibly whitespace-padded) string, so looking up the
                # stripped form could raise KeyError.
                raw_statement = amm.keys()[0]
                statement = raw_statement.strip()
                if statement in content:
                    extra_provenance = amm[raw_statement]
                    break

            self.simple_extract(lic_statements, record, url, content=content, extra_provenance=extra_provenance)

    return self._short_name, self.__version__
def simple_extract(self, lic_statements, record, url, first_match=False, content='', handler=''):
    """
    Generic code which looks for a particular string in a given web page
    (URL), determines the licence conditions of the article and populates
    the record['bibjson']['license'] (note the US spelling) field.

    The URL it analyses, the statements it looks for and the resulting
    licenses are passed in. This is not a plugin for a particular
    publisher - it just contains (allows re-use) the logic that any
    "dumb string matching" plugin would use.

    :param lic_statements: licensing statements to look for on this
        publisher's pages. Take the form of {statement: meaning}
        where meaning['type'] identifies the license (see licenses.py)
        and meaning['version'] identifies the license version (if
        available). See a publisher plugin for an example, e.g. bmc.py
    :param record: a request for the OAG status of an article, see OAG
        docs for more info.
    :param url: source url of the item to be fetched. This is where the
        HTML page that's going to be scraped is expected to reside.
    :param first_match: stop trying license statements if one of them is
        found at the target url. By default, this code will try out all
        supplied license statements and simply add multiple 'license'
        objects to the record it's been passed. If you want "first
        successful match only" behaviour, set this to True.
    :param content: page content to scan; when non-empty, no HTTP fetch
        is made and this text is used instead.
    :param handler: name recorded in the provenance as the plugin that
        processed this record; defaults to this plugin's short name.
    """
    if not handler:
        handler = self._short_name  # can't put it in the method signature above, self is unresolved
    if not content:
        # get content from the web unless it's being passed into this method
        r, content, source_size = util.http_stream_get(url)

        if r.status_code != requests.codes.ok:
            raise PluginException(PluginException.HTTP, "could not retrieve content from " + url + " - " + str(r.status_code))
    else:
        # content supplied by the caller - size it directly
        source_size = len(content)

    # normalise whitespace/case so statement matching is robust;
    # bail out if nothing is left to scan
    content = self.normalise_string(content)
    if not content:
        return

    # see if one of the licensing statements is in content
    # and populate record with appropriate license info
    for statement_mapping in lic_statements:
        # get the statement string itself - always the first key of the dict
        # mapping statements to licensing info
        statement = statement_mapping.keys()[0]

        # use a modified version of the license statement for
        # comparison - one which has been subjected to the same
        # normalisation as the incoming content (whitespace,
        # lowercasing etc.)
        cmp_statement = self.normalise_string(statement)

        # do not try to match empty statements, will always result in a match
        if not cmp_statement:
            continue

        # Python 2 str/unicode safeguard: force both sides to unicode so
        # the `in` containment test below cannot raise UnicodeDecodeError
        # on mixed-type comparison.
        if type(cmp_statement) == str:
            cmp_statement = cmp_statement.decode('utf-8', 'replace')
        if type(content) == str:
            content = content.decode('utf-8', 'replace')

        match = cmp_statement in content

        if not match:
            # retry with HTML markup stripped from both sides
            # NOTE(review): this rebinds `content` to its stripped form, so
            # every LATER statement in lic_statements is matched against the
            # stripped page - presumably intentional, but worth confirming.
            cmp_statement = self.strip_html(cmp_statement)
            content = self.strip_html(content)
            if cmp_statement:
                # if there's anything left of the statement after the html stripping...
                # otherwise '' in 'string' == True! so lots of false positives
                match = cmp_statement in content
            else:
                continue

        if match:
            # okay, statement found on the page -> get license type
            lic_type = statement_mapping[statement]['type']

            # license identified, now use that to construct the license object
            license = deepcopy(LICENSES[lic_type])
            license['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license.setdefault('version','')
            license.setdefault('description','')
            license.setdefault('jurisdiction','')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license.update(statement_mapping[statement])

            # add provenance information to the license object
            provenance = {
                'date': datetime.strftime(datetime.now(), config.date_format),
                'source': url,
                "source_size" : source_size,
                'agent': config.agent,
                'category': 'page_scrape',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
                'description': self.gen_provenance_description(url, statement),
                'handler': handler,  # the name of the plugin processing this record
                'handler_version': self.__version__  # version of the plugin processing this record
            }

            license['provenance'] = provenance
            record.add_license_object(license)

            if first_match:
                break
def test_06_http_stream_get_long_timeout(self):
    """A long server-side delay after the first byte means only that one byte is downloaded."""
    endpoint = self.app_url + "/stream/long_timeout"
    resp, body, nbytes = util.http_stream_get(endpoint)
    assert resp
    assert body is not None
    # the sleep happens after the first character/byte has been transmitted
    assert body == "H"
    assert nbytes == 1
def test_05_http_stream_get_timeout(self):
    """The timeout endpoint still yields the two bytes sent before the cut-off."""
    endpoint = self.app_url + "/stream/timeout"
    resp, body, nbytes = util.http_stream_get(endpoint)
    assert resp
    assert nbytes == 2
    assert body == "HH"
def test_04_http_stream_get_normal(self):
    """A normal streamed response is fully downloaded: two bytes, content 'HH'."""
    endpoint = self.app_url + "/stream/normal"
    resp, body, nbytes = util.http_stream_get(endpoint)
    assert resp
    assert nbytes == 2
    assert body == "HH"
def license_detect(self, record):
    """
    Detect licensing information for the record using the stored publisher
    configurations ("virtual plugins"), falling back to the flat list of
    known license statements when no configuration matches.

    :param record: a request for the OAG status of an article; must expose
        ``provider_urls`` and ``license``, plus ``add_license_object`` via
        ``simple_extract``.
    :return: (handler name, handler version). The handler is the matching
        config's publisher name, this plugin's short name when the flat
        statement list matched, or 'oag' when nothing matched at all.
    """
    # get all the URL-s from ES into a list
    # need some way of getting facets from the DAO, ideally
    # directly in list form as well as the raw form
    all_configs = Publisher.all(sort=[{'publisher_name': 'asc'}])  # always get them in the same order relative to each other
    url_index = self._generate_publisher_config_index_by_url(all_configs)
    url_index = OrderedDict(sorted(url_index.iteritems(), key=lambda x: len(x[0]), reverse=True))  # longest url-s first
    id_index = self._generate_publisher_config_index_by_id(all_configs)

    # get all the configs that match
    # NOTE(review): a config can be appended more than once here if several
    # incoming URL-s share its prefix - harmless (just redundant work with
    # first_match=True below) but worth deduplicating at some point.
    matching_configs = []
    work_on = record.provider_urls
    work_on = self.clean_urls(work_on, strip_leading_www=True)
    for config_url, config_id in url_index.items():
        for incoming_url in work_on:
            if incoming_url.startswith(config_url):
                matching_configs.append(id_index[config_id])

    # future:
    # use tries to prefix match them to the incoming URL
    # if the results of this could be ordered by URL length that
    # would be great, or stop at first match option

    # prefetch the content, we'll be reusing it a lot
    urls_contents = {}
    for incoming_url in record.provider_urls:
        unused_response, urls_contents[incoming_url], unused_content_length = util.http_stream_get(incoming_url)

    # order their license statements by whether they have a version,
    # and then by length
    successful_config = None
    current_licenses_count = len(record.license)
    new_licenses_count = 0
    for config in matching_configs:
        matching_config_licenses = config['licenses']
        matching_config_licenses = sorted(
            matching_config_licenses,
            key=lambda lic: (
                lic.get('version'),  # with reverse=True, this will actually sort licenses in REVERSE ALPHABETICAL order of their versions, blank versions go last
                len(lic['license_statement'])  # longest first with reverse=True
            ),
            reverse=True
        )

        # try matching like that
        lic_statements = []
        for l in matching_config_licenses:
            lic_statement = {}
            # BUGFIX: use .get('version', '') - the sort key above already
            # allows for licenses without a 'version' key, and the flat-list
            # branch below does the same, so l['version'] could KeyError here.
            lic_statement[l['license_statement']] = {'type': l['license_type'], 'version': l.get('version', '')}
            lic_statements.append(lic_statement)

        for incoming_url, content in urls_contents.iteritems():
            self.simple_extract(lic_statements, record, incoming_url, first_match=True, content=content, handler=config.publisher_name)
            new_licenses_count = len(record.license)
            # if we find a license, stop trying the different URL-s
            if new_licenses_count > current_licenses_count:
                break

        # if we find a license, stop trying the configs and record which config found it
        if new_licenses_count > current_licenses_count:
            # found it!
            successful_config = config
            break

    # if no config exists which can match the license, then try the flat list
    # do not try the flat list of statements if a matching config has been found
    # this keeps these "virtual" plugins, i.e. the configs, consistent with how
    # the rest of the system operates
    lic_statements = []
    flat_license_list_success = False
    if len(matching_configs) <= 0:
        all_statements = LicenseStatement.all()
        all_statements = sorted(
            all_statements,
            key=lambda lic: (
                lic.get('version', '') == '',  # does it NOT have a version? last!
                # see http://stackoverflow.com/questions/9386501/sorting-in-python-and-empty-strings
                len(lic['license_statement'])  # length of license statement
            )
        )

        for l in all_statements:
            lic_statement = {}
            lic_statement[l['license_statement']] = {'type': l['license_type'], 'version': l.get('version', '')}
            lic_statements.append(lic_statement)

        for incoming_url, content in urls_contents.iteritems():
            self.simple_extract(lic_statements, record, incoming_url, first_match=True, content=content)  # default handler - the plugin's name
            new_licenses_count = len(record.license)
            # if we find a license, stop trying the different URL-s
            if new_licenses_count > current_licenses_count:
                break

        if new_licenses_count > current_licenses_count:
            # one of the flat license index did it
            flat_license_list_success = True

    if successful_config:
        return successful_config.publisher_name, self.__version__
    elif flat_license_list_success:
        return self._short_name, self.__version__

    # in case everything fails, return 'oag' as the handler to
    # be consistent with the failure handler in the workflow module
    # so that way, all "completely failed" licenses will have 'oag'
    # on them, except that the GSM ones will have the GSM's current
    # version
    return 'oag', self.__version__