예제 #1
0
    def license_detect(self, record):
        """
        Scrape the record's provider URLs for known license statements.

        The license statements to look for are collected from the
        publisher configurations found for each of this plugin's base
        URLs (``self._base_urls``). If none are found, the plugin's own
        ``self._license_mappings`` are used instead.

        Every provider URL this plugin supports is downloaded, checked
        for one of the statements in ``self._author_manuscript_mappings``
        and then handed to ``self.simple_extract`` together with the
        resulting extra provenance.

        :param record: the record being processed. Its ``provider_urls``
        are read here; the record itself is passed on to
        ``self.simple_extract``.

        :return: tuple of (this plugin's short name, this plugin's version)
        """
        relevant_publishers = []
        for base_url in self._base_urls:
            cleaned_url = self.clean_url(base_url)
            relevant_publishers += Publisher.find_by_journal_url('http://' + cleaned_url)

        # build the [{statement: meaning}, ...] structure simple_extract expects
        lic_statements = []
        for pub in relevant_publishers:
            for lic in pub['licenses']:
                lic_statements.append({
                    lic['license_statement']: {
                        'type': lic['license_type'],
                        'version': lic.get('version', '')
                    }
                })

        if not lic_statements:
            lic_statements = self._license_mappings

        for url in record.provider_urls:
            if not self.supports_base_url(url):
                continue

            # TODO refactor self.simple_extract into several pieces
            # a downloader, a matcher, and a f() that records the license info
            # so the first two (and perhaps a general version of the third)
            # can be used here instead of this plugin having to do
            # all the work itself.
            _response, content, _content_length = http_stream_get(url)

            extra_provenance = {
                "accepted_author_manuscript": False
            }

            for amm in self._author_manuscript_mappings:
                if not amm:
                    continue
                # the statement is the first (only) key of the mapping;
                # next(iter(...)) works on both Python 2 and Python 3
                raw_statement = next(iter(amm))
                statement = raw_statement.strip()
                # an empty statement would match every page ('' in 'x' is True)
                if not statement:
                    continue
                if statement in content:
                    # look the value up with the ORIGINAL key - the stripped
                    # form is only for comparison and may not be in the dict
                    extra_provenance = amm[raw_statement]
                    break

            self.simple_extract(lic_statements, record, url, content=content,
                                extra_provenance=extra_provenance)

        return self._short_name, self.__version__
예제 #2
0
    def simple_extract(self, lic_statements, record, url, first_match=False, content='', handler=''):
        """
        Generic code which looks for a particular string in a given web
        page (URL), determines the licence conditions of the article and
        adds a license object to the record via
        record.add_license_object() for every statement that matches.

        The URL it analyses, the statements it looks for and the
        resulting licenses are passed in. This is not a plugin for a
        particular publisher - it just contains (allows re-use) the
        logic that any "dumb string matching" plugin would use.

        Each statement is first compared against the normalised page
        content as-is; if that fails, both the statement and the content
        are stripped of HTML and compared again.

        :param lic_statements: licensing statements to look for on this
        publisher's pages. Take the form of {statement: meaning} where
        meaning['type'] identifies the license (see licenses.py) and
        meaning['version'] identifies the license version (if available)
        See a publisher plugin for an example, e.g. bmc.py

        :param record: a request for the OAG status of an article, see
        OAG docs for more info.

        :param url: source url of the item to be fetched. This is where
        the HTML page that's going to be scraped is expected to reside.

        :param first_match: stop trying license statements if one of
        them is found at the target url. By default, this code will try
        out all supplied license statements and simply add multiple
        'license' objects to the record it's been passed. If you want
        "first successful match only" behaviour, set this to True.

        :param content: the page content, if the caller has already
        downloaded it. When empty, the content is fetched from url.

        :param handler: name recorded as the provenance 'handler'.
        Defaults to this plugin's short name when empty.

        :raises PluginException: if the content has to be fetched and
        the response status is not OK.

        :return: None
        """
        if not handler:
            handler = self._short_name  # can't put it in the method signature above, self is unresolved

        if not content:
            # get content from the web unless it's being passed into this method
            r, content, source_size = util.http_stream_get(url)
            if r.status_code != requests.codes.ok:
                raise PluginException(PluginException.HTTP, "could not retrieve content from " + url + " - " + str(r.status_code))
        else:
            source_size = len(content)

        content = self.normalise_string(content)

        if not content:
            return

        # Compare text with text: decode byte strings once, up front.
        # On Python 2 ``bytes`` is ``str``, so this is the same check the
        # code always made; on Python 3 it only touches real byte strings.
        if isinstance(content, bytes):
            content = content.decode('utf-8', 'replace')

        # HTML-stripped version of the content, computed on the first miss
        # and reused, so the markup version stays available for every
        # statement and the page is not re-stripped on each miss
        stripped_content = None

        # see if one of the licensing statements is in content
        # and populate record with appropriate license info
        for statement_mapping in lic_statements:
            if not statement_mapping:
                continue

            # get the statement string itself - always the first key of the dict
            # mapping statements to licensing info
            statement = next(iter(statement_mapping))

            # use a modified version of the license statement for
            # comparison - one which has been subjected to the same
            # normalisation as the incoming content (whitespace,
            # lowercasing etc.)
            cmp_statement = self.normalise_string(statement)
            # do not try to match empty statements, will always result in a match
            if not cmp_statement:
                continue

            if isinstance(cmp_statement, bytes):
                cmp_statement = cmp_statement.decode('utf-8', 'replace')

            match = cmp_statement in content

            if not match:
                cmp_statement = self.strip_html(cmp_statement)
                # if there's nothing left of the statement after the html
                # stripping, skip it - otherwise '' in 'string' == True!
                # so lots of false positives
                if not cmp_statement:
                    continue
                if stripped_content is None:
                    stripped_content = self.strip_html(content)
                match = cmp_statement in stripped_content

            if not match:
                continue

            # okay, statement found on the page -> get license type
            statement_info = statement_mapping[statement]
            lic_type = statement_info['type']

            # license identified, now use that to construct the license object
            license_obj = deepcopy(LICENSES[lic_type])
            license_obj['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license_obj.setdefault('version', '')
            license_obj.setdefault('description', '')
            license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license_obj.update(statement_info)

            # add provenance information to the license object
            license_obj['provenance'] = {
                'date': datetime.now().strftime(config.date_format),
                'source': url,
                "source_size": source_size,
                'agent': config.agent,
                'category': 'page_scrape',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': self.gen_provenance_description(url, statement),
                'handler': handler,  # the name of the plugin processing this record
                'handler_version': self.__version__  # version of the plugin processing this record
            }

            record.add_license_object(license_obj)

            if first_match:
                break
 def test_06_http_stream_get_long_timeout(self):
     """Streaming from /stream/long_timeout yields only the first byte, "H"."""
     target = self.app_url + "/stream/long_timeout"
     result = util.http_stream_get(target)
     response, body, byte_count = result
     assert response
     assert body is not None
     # the sleep happens after the first character/byte has been transmitted
     assert body == "H"
     assert byte_count == 1
 def test_05_http_stream_get_timeout(self):
     """Streaming from /stream/timeout yields two bytes, "HH"."""
     target = self.app_url + "/stream/timeout"
     result = util.http_stream_get(target)
     response, body, byte_count = result
     assert response
     assert byte_count == 2
     assert body == "HH"
 def test_04_http_stream_get_normal(self):
     """Streaming from /stream/normal yields two bytes, "HH"."""
     target = self.app_url + "/stream/normal"
     result = util.http_stream_get(target)
     response, body, byte_count = result
     assert response
     assert byte_count == 2
     assert body == "HH"
    def license_detect(self, record):
        """
        Try to detect a license using the stored publisher configurations,
        falling back to the flat list of all license statements.

        Steps:

        1. Load all publisher configs and select those whose URL is a
           prefix of one of the record's (cleaned) provider URLs, longest
           config URL first.
        2. Download the content of every provider URL once.
        3. For each matching config, run ``self.simple_extract`` with that
           config's license statements against each downloaded page and
           stop as soon as the record has gained a license.
        4. Only if NO config matched the URLs at all, do the same with
           every statement returned by ``LicenseStatement.all()``.

        :param record: the record being processed. ``provider_urls`` and
        ``license`` are read here; licenses are added by
        ``self.simple_extract``.

        :return: tuple of (handler name, this plugin's version). The
        handler name is the successful config's ``publisher_name``, this
        plugin's short name if the flat list matched, or 'oag' if nothing
        matched.
        """
        # get all the URL-s from ES into a list
        #     need some way of getting facets from the DAO, ideally
        #     directly in list form as well as the raw form
        # always get them in the same order relative to each other
        all_configs = Publisher.all(sort=[{'publisher_name': 'asc'}])
        url_index = self._generate_publisher_config_index_by_url(all_configs)
        # longest url-s first; .items() works on both Python 2 and Python 3
        url_index = OrderedDict(sorted(url_index.items(), key=lambda x: len(x[0]), reverse=True))
        id_index = self._generate_publisher_config_index_by_id(all_configs)

        # get all the configs that match
        work_on = self.clean_urls(record.provider_urls, strip_leading_www=True)

        matching_configs = []
        seen_config_ids = set()  # a config is tried once, however many URLs it matches
        for config_url, config_id in url_index.items():
            if config_id in seen_config_ids:
                continue
            if any(incoming_url.startswith(config_url) for incoming_url in work_on):
                seen_config_ids.add(config_id)
                matching_configs.append(id_index[config_id])
        # future:
        # use tries to prefix match them to the incoming URL
        #     if the results of this could be ordered by URL length that
        #     would be great, or stop at first match option

        # prefetch the content, we'll be reusing it a lot
        # (ordered, so URLs are tried in the order the record lists them)
        urls_contents = OrderedDict()
        for incoming_url in record.provider_urls:
            _response, fetched_content, _content_length = util.http_stream_get(incoming_url)
            urls_contents[incoming_url] = fetched_content

        current_licenses_count = len(record.license)

        def found_license_with(statements, **extract_kwargs):
            # Run simple_extract on each prefetched page; stop trying the
            # different URL-s as soon as the record has gained a license.
            for page_url, page_content in urls_contents.items():
                self.simple_extract(statements, record, page_url, first_match=True,
                                    content=page_content, **extract_kwargs)
                if len(record.license) > current_licenses_count:
                    return True
            return False

        successful_config = None
        for config in matching_configs:
            # order their license statements by whether they have a version,
            # and then by length
            config_licenses = sorted(
                config['licenses'],
                key=lambda lic: (
                    # with reverse=True, this will actually sort licenses in
                    # REVERSE ALPHABETICAL order of their versions, blank
                    # versions go last. A missing version counts as blank so
                    # the key never mixes None with strings.
                    lic.get('version') or '',
                    len(lic['license_statement'])  # longest first with reverse=True
                ),
                reverse=True
            )

            # a missing version is treated as '' here, same as in the sort
            # key above and in the flat-list path below
            lic_statements = [
                {lic['license_statement']: {'type': lic['license_type'], 'version': lic.get('version', '')}}
                for lic in config_licenses
            ]

            # if we find a license, stop trying the configs and record which config found it
            if found_license_with(lic_statements, handler=config.publisher_name):
                successful_config = config
                break

        # if no config exists which can match the license, then try the flat list
        # do not try the flat list of statements if a matching config has been found
        # this keeps these "virtual" plugins, i.e. the configs, consistent with how
        # the rest of the system operates
        flat_license_list_success = False
        if not matching_configs:
            # NOTE(review): unlike the config path above, this sorts the
            # SHORTEST statement first within each group - confirm intended.
            all_statements = sorted(
                LicenseStatement.all(),
                key=lambda lic: (
                    lic.get('version', '') == '',  # does it NOT have a version? last!
                    # see http://stackoverflow.com/questions/9386501/sorting-in-python-and-empty-strings

                    len(lic['license_statement'])  # length of license statement
                )
            )

            lic_statements = [
                {lic['license_statement']: {'type': lic['license_type'], 'version': lic.get('version', '')}}
                for lic in all_statements
            ]

            # default handler - the plugin's name
            flat_license_list_success = found_license_with(lic_statements)

        if successful_config is not None:
            return successful_config.publisher_name, self.__version__
        if flat_license_list_success:
            return self._short_name, self.__version__

        # in case everything fails, return 'oag' as the handler to
        # be consistent with the failure handler in the workflow module
        # so that way, all "completely failed" licenses will have 'oag'
        # on them, except that the GSM ones will have the GSM's current
        # version
        return 'oag', self.__version__