示例#1
0
def licenses(filename):
    '''Feed the licenses from a JSON file.

    ``filename`` is either an ``http://``/``https://`` URL (fetched with
    ``requests``) or the path of a local JSON file. The decoded payload is
    iterated as a sequence of license mappings providing the ``id``,
    ``title``, ``url`` and ``maintainer`` keys.

    When the payload is not empty the existing ``License`` collection is
    dropped before the new licenses are created. The default license
    (``DEFAULT_LICENSE``) is created afterwards if it is still missing.

    Raises whatever ``requests`` raises on a failed download, including
    an HTTP error status.
    '''
    if filename.startswith(('http://', 'https://')):
        response = requests.get(filename)
        # Fail before touching the database: an error page with a JSON body
        # would otherwise be mistaken for license data and trigger the drop.
        response.raise_for_status()
        json_licenses = response.json()
    else:
        with open(filename) as fp:
            json_licenses = json.load(fp)

    if json_licenses:
        log.info('Dropping existing licenses')
        License.drop_collection()

    for json_license in json_licenses:
        # Keep the flag of every FLAGS_MAP field that is truthy in the payload
        flags = [
            flag for field, flag in FLAGS_MAP.items()
            if json_license.get(field, False)
        ]

        license = License.objects.create(
            id=json_license['id'],
            title=json_license['title'],
            # Empty values are stored as None
            url=json_license['url'] or None,
            maintainer=json_license['maintainer'] or None,
            flags=flags,
            active=json_license.get('active', False),
        )
        log.info('Added license "%s"', license.title)

    # Ensure the default license exists whatever the payload contained
    try:
        License.objects.get(id=DEFAULT_LICENSE['id'])
    except License.DoesNotExist:
        License.objects.create(**DEFAULT_LICENSE)
        log.info('Added license "%s"', DEFAULT_LICENSE['title'])
    log.info('Done')
示例#2
0
 def test_prioritize_title_over_alternate_title(self):
     '''The license matching by title wins over one matching by alternate title.'''
     shared_title = faker.sentence()
     by_title = LicenseFactory(title=shared_title)
     # Competing license carrying the same text only as an alternate title
     LicenseFactory(alternate_titles=[shared_title])
     guessed = License.guess(shared_title)
     assert isinstance(guessed, License)
     assert by_title.id == guessed.id
示例#3
0
文件: models.py 项目: odtvince/udata
 def license(self):
     '''Look up the License referenced by ``self.license_id``.

     Returns the result of ``first()`` on the ``id``-filtered query.
     '''
     candidates = License.objects(id=self.license_id)
     return candidates.first()
 def test_match_by_alternate_title_with_low_edit_distance(self):
     '''A one-letter difference from an alternate title still matches.'''
     expected = LicenseFactory(alternate_titles=['License'])
     guessed = License.guess('Licence')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_match_by_alternate_title_with_mismatching_case(self):
     '''An alternate title matches even when the letter case differs.'''
     expected = LicenseFactory(alternate_titles=['License ODBl'])
     guessed = License.guess('License ODBL')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_imatch_by_alternate_url(self):
     '''An alternate URL containing upper-case characters is matched.'''
     caps_url = '%s/CAPS.php' % faker.uri()
     expected = LicenseFactory(alternate_urls=[caps_url])
     guessed = License.guess(caps_url)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_exact_match_by_title_with_spaces(self):
     '''A title surrounded by spaces still matches its license.'''
     expected = LicenseFactory()
     padded_title = ' {0} '.format(expected.title)
     guessed = License.guess(padded_title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#8
0
 def test_exact_match_by_title_with_spaces(self):
     # A title surrounded by spaces must still match its license.
     expected = LicenseFactory()
     padded_title = ' {0} '.format(expected.title)
     guessed = License.guess(padded_title)
     self.assertIsInstance(guessed, License)
     self.assertEqual(expected.id, guessed.id)
示例#9
0
 def test_empty_string(self):
     '''Guessing from an empty string yields no license.'''
     assert License.guess('') is None
示例#10
0
 def test_match_by_alternate_title_with_extra_inner_space(self):
     '''An alternate title matches despite a doubled inner space.'''
     expected = LicenseFactory(alternate_titles=['License ODBl'])
     # The guessed text has 2 spaces between the words instead of 1
     guessed = License.guess('License  ODBl')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#11
0
 def test_match_by_alternate_title_with_mismatching_case(self):
     '''An alternate title matches even when the letter case differs.'''
     expected = LicenseFactory(alternate_titles=['License ODBl'])
     guessed = License.guess('License ODBL')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#12
0
 def test_exact_match_by_alternate_title_with_spaces(self):
     '''An alternate title surrounded by spaces still matches.'''
     alt_title = faker.sentence()
     expected = LicenseFactory(alternate_titles=[alt_title])
     padded_title = ' {0} '.format(alt_title)
     guessed = License.guess(padded_title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#13
0
 def test_match_by_alternate_title_with_low_edit_distance(self):
     '''A one-letter difference from an alternate title still matches.'''
     expected = LicenseFactory(alternate_titles=['License'])
     guessed = License.guess('Licence')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#14
0
 def test_exact_match_by_title_with_spaces(self):
     '''A title surrounded by spaces still matches its license.'''
     expected = LicenseFactory()
     padded_title = ' {0} '.format(expected.title)
     guessed = License.guess(padded_title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#15
0
 def test_exact_match_by_title(self):
     '''Guessing with the exact title returns that license.'''
     expected = LicenseFactory()
     guessed = License.guess(expected.title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#16
0
 def test_exact_match_by_alternate_url(self):
     '''Guessing with an exact alternate URL returns its license.'''
     alt_url = faker.uri()
     expected = LicenseFactory(alternate_urls=[alt_url])
     guessed = License.guess(alt_url)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#17
0
 def test_not_found_with_default(self):
     # When nothing matches, the license passed as `default` is returned.
     fallback = LicenseFactory()
     guessed = License.guess('should not be found', default=fallback)
     self.assertEqual(guessed.id, fallback.id)
示例#18
0
 def test_multiple_strings(self):
     # With several candidate strings, a later one may provide the match.
     expected = LicenseFactory()
     candidates = ('should not match', expected.id)
     guessed = License.guess(*candidates)
     self.assertIsInstance(guessed, License)
     self.assertEqual(expected.id, guessed.id)
示例#19
0
 def test_empty_string(self):
     # Guessing from an empty string yields no license.
     self.assertIsNone(License.guess(''))
 def test_imatch_by_id(self):
     '''A license whose identifier contains upper-case characters is matched.'''
     expected = LicenseFactory(id='CAPS-ID')
     guessed = License.guess(expected.id)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#21
0
 def test_match_by_title_with_extra_inner_space(self):
     # The title must match despite a doubled inner space.
     expected = LicenseFactory(title='License ODBl')
     # The guessed text has 2 spaces between the words instead of 1
     guessed = License.guess('License  ODBl')
     self.assertIsInstance(guessed, License)
     self.assertEqual(expected.id, guessed.id)
示例#22
0
 def test_none(self):
     '''Guessing from ``None`` yields no license.'''
     assert License.guess(None) is None
示例#23
0
 def test_not_found_with_default(self):
     '''When nothing matches, the license passed as ``default`` is returned.'''
     fallback = LicenseFactory()
     guessed = License.guess('should not be found', default=fallback)
     assert guessed.id == fallback.id
示例#24
0
 def test_multiple_strings(self):
     '''With several candidate strings, a later one may provide the match.'''
     expected = LicenseFactory()
     candidates = ('should not match', expected.id)
     guessed = License.guess(*candidates)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_imatch_by_url(self):
     '''A license URL containing upper-case characters is matched.'''
     caps_url = '%s/CAPS.php' % faker.uri()
     expected = LicenseFactory(url=caps_url)
     guessed = License.guess(expected.url)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#26
0
    def remote_datasets(self):
        '''Yield a dataset for every package listed by the remote API.

        For each name returned by ``package_list`` the package details are
        fetched with ``package_show`` and mapped onto the object returned by
        ``self.get_harvested(Dataset, ...)``: core attributes, territorial
        and temporal coverage, organization or owner, supplier, remote URL,
        extras and resources.

        This is a generator: the dataset is yielded once populated. When the
        consumer resumes the generator and the dataset has an ``id``
        (presumably because the consumer saved it -- verify against the
        caller), the remote followers are mirrored as ``FollowDataset``
        entries.
        '''
        response = self.get('package_list')
        for name in response['result']:
            details = self.get('package_show', {'id': name})['result']
            dataset = self.get_harvested(Dataset, details['id'])

            # Core attributes
            dataset.slug = details['name']
            dataset.title = details['title']
            dataset.description = details.get('notes', 'No description')
            # Fall back on the 'notspecified' license when no license matches the remote id
            dataset.license = (
                License.objects(id=details['license_id']).first()
                or License.objects.get(id='notspecified')
            )
            dataset.tags = [tag['name'].lower() for tag in details['tags']]

            dataset.frequency = self.map('frequency', details) or 'unknown'
            dataset.created_at = parse(details['metadata_created'])
            dataset.last_modified = parse(details['metadata_modified'])

            if any_field(details, 'territorial_coverage', 'territorial_coverage_granularity'):
                coverage = TerritorialCoverage(
                    # Comma-separated codes, blank entries dropped
                    codes=[code.strip() for code in details.get('territorial_coverage', '').split(',') if code.strip()],
                    granularity=self.map('territorial_coverage_granularity', details),
                )
                dataset.extras['territorial_coverage'] = coverage
                try:
                    dataset.spatial = territorial_to_spatial(dataset)
                except Exception as e:
                    # Best effort: a failing spatial conversion must not abort the harvest
                    log.error('Error while processing spatial coverage for %s: %s', dataset.title, e)

            if all_field(details, 'temporal_coverage_from', 'temporal_coverage_to'):
                try:
                    dataset.temporal_coverage = db.DateRange(
                        start=daterange_start(details.get('temporal_coverage_from')),
                        end=daterange_end(details.get('temporal_coverage_to')),
                    )
                except Exception:
                    log.error('Unable to parse temporal coverage for dataset %s', details['id'])

            # Organization
            if details.get('organization'):
                dataset.organization = self.get_harvested(Organization, details['organization']['id'], False)
            else:
                # Need to fetch user from roles
                roles = self.get('roles_show', {'domain_object': name})['result']['roles']
                for role in roles:
                    if role['role'] == 'admin' and role['context'] == 'Package':
                        dataset.owner = self.get_harvested(User, role['user_id'])
                        break

            # Supplier
            if details.get('supplier_id'):
                dataset.supplier = self.get_harvested(Organization, details['supplier_id'], False)

            # Remote URL
            if details.get('url'):
                dataset.extras['remote_url'] = details['url']

            # Extras
            if 'extras' in details:
                extra_mapping = self.harvester.mapping.get('from_extras', {})
                for extra in details['extras']:
                    # Translate the value when a mapping exists for this key
                    if extra['key'] in self.harvester.mapping:
                        value = self.harvester.mapping[extra['key']].get(extra['value'])
                    else:
                        value = extra['value']
                    # Keys listed in 'from_extras' become dataset attributes,
                    # the others are kept as plain extras
                    if extra['key'] in extra_mapping:
                        setattr(dataset, extra_mapping[extra['key']], value)
                    else:
                        dataset.extras[extra['key']] = value

            # Resources
            for res in details['resources']:
                try:
                    resource = get_by(dataset.resources, 'id', UUID(res['id']))
                except Exception:
                    log.error('Unable to parse resource %s', res['id'])
                    continue
                if not resource:
                    resource = Resource(id=res['id'])
                    dataset.resources.append(resource)
                resource.title = res.get('name', '') or ''
                resource.url = res['url']
                resource.description = res.get('description')
                resource.format = res.get('format')
                resource.hash = res.get('hash')
                resource.created = parse(res['created'])
                resource.modified = parse(res['revision_timestamp'])
                # Keep an already known publication date
                resource.published = resource.published or resource.created
            yield dataset

            if dataset.id:
                followers = self.get('dataset_follower_list', {'id': name})['result']
                for follower in followers:
                    user = self.get_harvested(User, follower['id'], False)
                    if user:
                        FollowDataset.objects.get_or_create(follower=user, following=dataset)
 def test_exact_match_by_title(self):
     '''Guessing with the exact title returns that license.'''
     expected = LicenseFactory()
     guessed = License.guess(expected.title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#28
0
 def test_match_by_url_scheme_mismatch(self):
     '''An ``http`` URL matches a license registered with the ``https`` one.'''
     expected = LicenseFactory(url='https://example.com/license')
     guessed = License.guess('http://example.com/license')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_exact_match_by_alternate_title_with_spaces(self):
     '''An alternate title surrounded by spaces still matches.'''
     alt_title = faker.sentence()
     expected = LicenseFactory(alternate_titles=[alt_title])
     padded_title = ' {0} '.format(alt_title)
     guessed = License.guess(padded_title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#30
0
 def test_match_by_alternate_url_scheme_slash_mismatch(self):
     '''An alternate URL matches despite another scheme and a trailing slash.'''
     alt_url = 'https://example.com/license'
     expected = LicenseFactory(alternate_urls=[alt_url])
     guessed = License.guess('http://example.com/license/')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_match_by_alternate_title_with_extra_inner_space(self):
     '''An alternate title matches despite a doubled inner space.'''
     expected = LicenseFactory(alternate_titles=['License ODBl'])
     # The guessed text has 2 spaces between the words instead of 1
     guessed = License.guess('License  ODBl')
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#32
0
 def test_exact_match_by_title_with_mismatch_slug(self):
     '''The exact title matches even when the slug is unrelated to the title.'''
     expected = LicenseFactory(title="Licence Ouverte v2", slug="licence-2")
     guessed = License.guess(expected.title)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
 def test_multiple_strings(self):
     '''With several candidate strings, a later one may provide the match.'''
     expected = LicenseFactory()
     candidates = ('should not match', expected.id)
     guessed = License.guess(*candidates)
     assert isinstance(guessed, License)
     assert expected.id == guessed.id
示例#34
0
 def test_no_with_multiple_alternate_titles_from_different_licences(self):
     '''No license is returned when two licenses carry such close alternate titles.'''
     for alt_title in ('Licence Ouverte v2', 'Licence Ouverte v2.0'):
         LicenseFactory(alternate_titles=[alt_title])
     assert License.guess('Licence Ouverte v2.0') is None
示例#35
0
 def test_not_found(self):
     # A string matching no license yields None.
     self.assertIsNone(License.guess('should not be found'))
示例#36
0
    def process(self, item):
        '''Map one harvested ODS dataset onto a local dataset and return it.

        The remote payload is read from ``item.kwargs['dataset']``. Raises
        ``HarvestSkipException`` when the dataset has no record, or when it
        carries INSPIRE metadata while the 'inspire' feature is not enabled.
        '''
        ods_dataset = item.kwargs['dataset']
        dataset_id = ods_dataset['datasetid']
        meta = ods_dataset['metas']
        interop_meta = ods_dataset.get('interop_metas', {})

        if not ods_dataset.get('has_records'):
            raise HarvestSkipException(
                'Dataset {datasetid} has no record'.format(**ods_dataset))

        if 'inspire' in interop_meta and not self.has_feature('inspire'):
            raise HarvestSkipException(
                'Dataset {datasetid} has INSPIRE metadata'.format(**ods_dataset))

        dataset = self.get_dataset(item.remote_id)

        dataset.title = meta['title']
        dataset.frequency = 'unknown'
        raw_description = meta.get('description', '').strip()
        dataset.description = parse_html(raw_description)
        dataset.private = False

        # Attach the publisher's organization, creating it when unknown
        if 'publisher' in meta:
            acronym = meta['publisher']
            organization = Organization.objects(acronym=acronym).first()
            if not organization:
                # The acronym is the only information available: reuse it
                # for the name and the description too
                organization = Organization()
                organization.acronym = acronym
                organization.name = acronym
                organization.description = acronym
                organization.save()
            dataset.organization = organization.id

        # Tags: keywords as-is plus lower-cased, comma-separated themes
        tags = set()
        if 'keyword' in meta:
            keywords = meta['keyword']
            if isinstance(keywords, list):
                tags.update(keywords)
            else:
                tags.add(keywords)

        if 'theme' in meta:
            themes = meta['theme']
            if not isinstance(themes, list):
                themes = [themes]
            for theme in themes:
                tags.update(part.strip().lower() for part in theme.split(','))

        dataset.tags = list(tags)
        # The hostname of the harvested source is added as a tag too
        dataset.tags.append(urlparse(self.source.url).hostname)

        # License: guess from the remote value and its LICENSES mapping,
        # keeping the current (or default) license when nothing matches
        fallback_license = dataset.license or License.default()
        license_id = meta.get('license')
        dataset.license = License.guess(
            license_id,
            self.LICENSES.get(license_id),
            default=fallback_license,
        )

        self.process_resources(dataset, ods_dataset, ('csv', 'json'))

        if 'geo' in ods_dataset['features']:
            # Shapefile export is only offered up to SHAPEFILE_RECORDS_LIMIT records
            geo_exports = ['geojson']
            if meta['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
                geo_exports.append('shp')
            self.process_resources(dataset, ods_dataset, geo_exports)

        for extra_kind in ('alternative_export', 'attachment'):
            self.process_extra_files(dataset, ods_dataset, extra_kind)

        extras = dataset.extras
        extras['ods:url'] = self.explore_url(dataset_id)
        extras['harvest:name'] = self.source.name

        if 'references' in meta:
            extras['ods:references'] = meta['references']
        extras['ods:has_records'] = ods_dataset['has_records']
        extras['ods:geo'] = 'geo' in ods_dataset['features']

        return dataset
示例#37
0
 def test_none(self):
     # Guessing from None yields no license.
     self.assertIsNone(License.guess(None))
 def test_not_found(self):
     '''A string matching no license yields ``None``.'''
     assert License.guess('should not be found') is None
示例#39
0
 def test_exact_match_by_title(self):
     # Guessing with the exact title must return that license.
     expected = LicenseFactory()
     guessed = License.guess(expected.title)
     self.assertIsInstance(guessed, License)
     self.assertEqual(expected.id, guessed.id)
 def test_not_found_with_default(self):
     '''When nothing matches, the license passed as ``default`` is returned.'''
     fallback = LicenseFactory()
     guessed = License.guess('should not be found', default=fallback)
     assert guessed.id == fallback.id
示例#41
0
 def test_match_by_title_with_low_edit_distance(self):
     # A one-letter difference from the title must still match.
     expected = LicenseFactory(title='License')
     guessed = License.guess('Licence')
     self.assertIsInstance(guessed, License)
     self.assertEqual(expected.id, guessed.id)
 def test_none(self):
     '''Guessing from ``None`` yields no license.'''
     assert License.guess(None) is None
示例#43
0
 def test_match_by_title_with_mismatching_case(self):
     # The title must match even when the letter case differs.
     expected = LicenseFactory(title='License ODBl')
     guessed = License.guess('License ODBL')
     self.assertIsInstance(guessed, License)
     self.assertEqual(expected.id, guessed.id)
 def test_empty_string(self):
     '''Guessing from an empty string yields no license.'''
     assert License.guess('') is None
示例#45
0
    def process(self, item):
        '''Fetch one remote package and map it onto a local dataset.

        The package is retrieved with the ``package_show`` action for
        ``item.remote_id`` and validated against ``schema``. Core
        attributes, extras, spatial and temporal coverage, remote URL and
        resources are then copied onto the dataset returned by
        ``self.get_dataset``, which is returned.

        Raises ``HarvestSkipException`` when the package has no resource and
        ``HarvestException`` when its spatial extra is neither a Polygon nor
        a MultiPolygon.
        '''
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], schema)

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource
        if not data.get('resources'):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = data['notes']
        dataset.license = License.objects(id=data['license_id']).first()
        dataset.tags = [t['name'] for t in data['tags']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            # GeoJSON representation (only Polygon and MultiPolygon are handled below)
            if extra['key'] == 'spatial':
                spatial_geom = json.loads(extra['value'])
            #  Textual representation of the extent / location
            elif extra['key'] == 'spatial-text':
                log.debug('spatial-text value not handled')
                log.debug('spatial-text %s', extra['value'])
            # Linked Data URI representing the place name
            elif extra['key'] == 'spatial-uri':
                log.debug('spatial-uri value not handled')
                log.debug('spatial-uri %s', extra['value'])
            # Update frequency
            elif extra['key'] == 'frequency':
                log.debug('frequency %s', extra['value'])
            # Temporal coverage start
            elif extra['key'] == 'temporal_start':
                log.debug('temporal_start %s', extra['value'])
                temporal_start = daterange_start(extra['value'])
                # Consumed as temporal coverage: not stored as a raw extra
                continue
            # Temporal coverage end
            elif extra['key'] == 'temporal_end':
                log.debug('temporal_end %s', extra['value'])
                temporal_end = daterange_end(extra['value'])
                continue
            dataset.extras[extra['key']] = extra['value']

        if spatial_geom:
            dataset.spatial = SpatialCoverage()
            # The geometry is always stored as a MultiPolygon, so a single
            # Polygon is wrapped into a one-item list
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                # Must be raised: falling through would hit an unbound
                # `coordinates` below
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        # Temporal coverage needs both bounds
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            dataset.extras['remote_url'] = data['url']

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = res.get('description')
            resource.url = res['url']
            resource.filetype = ('api' if res['resource_type'] == 'api'
                                 else 'remote')
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            # Keep an already known publication date
            resource.published = resource.published or resource.created

        return dataset
示例#46
0
 def test_not_found(self):
     '''A string matching no license yields ``None``.'''
     assert License.guess('should not be found') is None