def test_match_by_alternate_title_with_multiple_candidates_from_one_licence(self):
    """A licence exposing several close alternate titles is still matched."""
    lic = LicenseFactory(
        alternate_titles=['Licence Ouverte v2', 'Licence Ouverte v2.0'])
    guessed = License.guess('Licence Ouverte v2.0')
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_prioritize_title_over_alternate_title(self):
    """When a title equals another license's alternate title, the title wins."""
    shared = faker.sentence()
    titled = LicenseFactory(title=shared)
    LicenseFactory(alternate_titles=[shared])
    guessed = License.guess(shared)
    assert isinstance(guessed, License)
    assert guessed.id == titled.id
def process(self, item):
    """Build a udata dataset from a harvested SNIG catalog item.

    Maps title/description/tags from ``item.kwargs['item']``, guesses the
    license, and recreates the resource list from scratch on every run.
    """
    dataset = self.get_dataset(item.remote_id)
    # Bind the remote payload under its own name instead of rebinding
    # `item`, which hid the harvest item after this point.
    payload = item.kwargs['item']
    dataset.title = payload['title']
    dataset.license = License.guess('cc-by')
    dataset.tags = ["snig.dgterritorio.gov.pt"]
    dataset.description = payload['description']
    if payload.get('date'):
        dataset.created_at = payload['date']
    # `keywords`/`resources` may be absent: default to an empty list instead
    # of iterating over None (the original raised TypeError in that case).
    for keyword in payload.get('keywords') or []:
        dataset.tags.append(keyword)
    # Force recreation of all resources
    dataset.resources = []
    for resource in payload.get('resources') or []:
        parsed = urlparse.urlparse(resource['url'])
        try:
            # OGC endpoints advertise their protocol in the `service`
            # query parameter (e.g. WMS/WFS); use it as the format.
            fmt = str(urlparse.parse_qs(parsed.query)['service'][0])
        except KeyError:
            # Otherwise fall back to the URL's file extension.
            fmt = resource['url'].split('.')[-1]
        new_resource = Resource(title=payload['title'],
                                url=resource['url'],
                                filetype='remote',
                                format=fmt)
        dataset.resources.append(new_resource)
    dataset.extras['harvest:name'] = self.source.name
    return dataset
def process(self, item):
    """Build a udata dataset from a harvested APA (apambiente.pt) item.

    ``details`` links are appended to the description as an HTML anchor;
    ``open`` links become remote resources whose format is derived from
    the URL extension (defaulting to WMS when there is none).
    """
    dataset = self.get_dataset(item.remote_id)
    kwargs = item.kwargs
    dataset.title = kwargs['title']
    dataset.license = License.guess('cc-by')
    dataset.tags = ["apambiente.pt"]
    entry = kwargs['item']
    dataset.description = entry['summary']
    if kwargs['date']:
        dataset.created_at = kwargs['date']
    # Force recreation of all resources
    dataset.resources = []
    for link in entry['links']:
        # Strip stray backslashes and percent-encode spaces so the URL is usable.
        url = link['href'].replace('\\', '').replace(' ', '%20')
        link_type = link['type']  # renamed from `type` to avoid shadowing the builtin
        if link_type == 'details':
            dataset.description += "<br>"
            dataset.description += "<br>Mais detalhes : <a href=\"%s\" target=\"_blank\">%s</a>" % (
                url, dataset.title)
        if link_type == 'open':
            url_parts = list(urlparse.urlparse(url))
            parts = url_parts[2].split('.')  # url_parts[2] is the URL path
            # No extension in the path means a service endpoint: assume WMS.
            fmt = parts[-1] if len(parts) > 1 else 'wms'
            new_resource = Resource(title=dataset.title,
                                    url=url,
                                    filetype='remote',
                                    format=fmt.lower())
            dataset.resources.append(new_resource)
    return dataset
def process(self, item):
    """Build a udata dataset from a harvested APA item exposing a single URL.

    ``liveData`` entries are treated as WMS services; otherwise the format
    is guessed from the URL extension, falling back to WMS when the last
    dot-separated segment is longer than 3 characters (i.e. not a real
    file extension).
    """
    dataset = self.get_dataset(item.remote_id)
    kwargs = item.kwargs
    dataset.title = kwargs['title']
    dataset.license = License.guess('cc-by')
    dataset.tags = ["apambiente.pt"]
    entry = kwargs['item']
    dataset.description = entry.get('description')
    if kwargs['date']:
        dataset.created_at = kwargs['date']
    # Force recreation of all resources
    dataset.resources = []
    url = entry.get('url')
    if entry.get('type') == "liveData":
        fmt = "wms"  # renamed from `type` to avoid shadowing the builtin
    else:
        fmt = url.split('.')[-1].lower()
        # Anything longer than 3 chars is a path segment, not an extension:
        # treat the URL as a WMS endpoint.
        if len(fmt) > 3:
            fmt = "wms"
    new_resource = Resource(title=dataset.title,
                            url=url,
                            filetype='remote',
                            format=fmt)
    dataset.resources.append(new_resource)
    return dataset
def process(self, item):
    """Map a pre-fetched OpenDataSoft dataset onto a udata dataset.

    Expects the full ODS payload in ``item.kwargs['dataset']``. Skips
    record-less datasets and INSPIRE-flagged ones (unless the ``inspire``
    feature is enabled), then maps metadata, organization, tags, license
    and resources.

    :raises HarvestSkipException: when the dataset has no record, or is
        INSPIRE-flagged while the feature is disabled.
    """
    ods_dataset = item.kwargs['dataset']
    dataset_id = ods_dataset['datasetid']
    ods_metadata = ods_dataset['metas']
    ods_interopmetas = ods_dataset.get('interop_metas', {})
    # Nothing to harvest when the remote dataset holds no record at all.
    if not ods_dataset.get('has_records'):
        msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
        raise HarvestSkipException(msg)
    # INSPIRE datasets are only harvested when the feature is turned on.
    if 'inspire' in ods_interopmetas and not self.has_feature('inspire'):
        msg = 'Dataset {datasetid} has INSPIRE metadata'
        raise HarvestSkipException(msg.format(**ods_dataset))
    dataset = self.get_dataset(item.remote_id)
    dataset.title = ods_metadata['title']
    dataset.frequency = 'unknown'
    description = ods_metadata.get('description', '').strip()
    dataset.description = parse_html(description)
    dataset.private = False
    # Detect Organization: reuse an existing org matching the publisher
    # acronym, or create (and persist) a minimal one on the fly.
    try:
        organization_acronym = ods_metadata['publisher']
    except KeyError:
        pass
    else:
        orgObj = Organization.objects(acronym=organization_acronym).first()
        if orgObj:
            dataset.organization = orgObj
        else:
            orgObj = Organization()
            orgObj.acronym = organization_acronym
            orgObj.name = organization_acronym
            orgObj.description = organization_acronym
            orgObj.save()
            dataset.organization = orgObj
    # Collect tags: `keyword` values are used as-is, `theme` entries are
    # split on commas and lowercased. Both fields may be a string or a list.
    tags = set()
    if 'keyword' in ods_metadata:
        if isinstance(ods_metadata['keyword'], list):
            tags |= set(ods_metadata['keyword'])
        else:
            tags.add(ods_metadata['keyword'])
    if 'theme' in ods_metadata:
        if isinstance(ods_metadata['theme'], list):
            for theme in ods_metadata['theme']:
                tags.update([t.strip().lower() for t in theme.split(',')])
        else:
            themes = ods_metadata['theme'].split(',')
            tags.update([t.strip().lower() for t in themes])
    dataset.tags = list(tags)
    # Also tag the dataset with the harvest source's hostname.
    dataset.tags.append(urlparse(self.source.url).hostname)
    # Detect license: try the ODS license id, then its mapping in
    # self.LICENSES, keeping the current (or platform default) license
    # as a fallback.
    default_license = dataset.license or License.default()
    license_id = ods_metadata.get('license')
    dataset.license = License.guess(license_id,
                                    self.LICENSES.get(license_id),
                                    default=default_license)
    self.process_resources(dataset, ods_dataset, ('csv', 'json'))
    if 'geo' in ods_dataset['features']:
        exports = ['geojson']
        # Shapefile export is only offered below the record limit.
        if ods_metadata['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
            exports.append('shp')
        self.process_resources(dataset, ods_dataset, exports)
    self.process_extra_files(dataset, ods_dataset, 'alternative_export')
    self.process_extra_files(dataset, ods_dataset, 'attachment')
    dataset.extras['ods:url'] = self.explore_url(dataset_id)
    dataset.extras['harvest:name'] = self.source.name
    if 'references' in ods_metadata:
        dataset.extras['ods:references'] = ods_metadata['references']
    dataset.extras['ods:has_records'] = ods_dataset['has_records']
    dataset.extras['ods:geo'] = 'geo' in ods_dataset['features']
    return dataset
def test_exact_match_by_title(self):
    """Guessing by the exact title returns the matching license."""
    lic = LicenseFactory()
    guessed = License.guess(lic.title)
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_not_found(self):
    """Guessing an unknown text yields None."""
    self.assertIsNone(License.guess('should not be found'))
def test_not_found_with_default(self):
    """An unmatched guess falls back to the provided default license."""
    fallback = LicenseFactory()
    guessed = License.guess('should not be found', default=fallback)
    assert guessed.id == fallback.id
def test_match_by_alternate_title_with_extra_inner_space(self):
    """Extra inner whitespace in the guessed text still matches.

    The guessed string must actually contain the duplicated space the
    inline comment announces, otherwise this test is a plain exact match.
    """
    license = LicenseFactory(alternate_titles=['License ODBl'])
    found = License.guess('License  ODBl')  # 2 spaces instead of 1
    assert isinstance(found, License)
    assert license.id == found.id
def test_exact_match_by_alternate_title_with_spaces(self):
    """Surrounding whitespace is ignored when matching an alternate title."""
    alt = faker.sentence()
    lic = LicenseFactory(alternate_titles=[alt])
    guessed = License.guess(' {0} '.format(alt))
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_multiple_strings(self):
    """The first matching candidate among several guesses wins."""
    lic = LicenseFactory()
    guessed = License.guess('should not match', lic.id)
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_empty_string(self):
    """An empty guess yields None."""
    assert License.guess('') is None
def test_match_by_alternate_title_with_mismatching_case(self):
    """Alternate-title matching is case-insensitive."""
    lic = LicenseFactory(alternate_titles=['License ODBl'])
    guessed = License.guess('License ODBL')
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_match_by_alternate_title_with_low_edit_distance(self):
    """A near-miss (small edit distance) on an alternate title still matches."""
    lic = LicenseFactory(alternate_titles=['License'])
    guessed = License.guess('Licence')
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_exact_match_by_title_with_spaces(self):
    """Surrounding whitespace is ignored when matching the title."""
    lic = LicenseFactory()
    guessed = License.guess(' {0} '.format(lic.title))
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_not_found(self):
    """Guessing an unknown text yields None."""
    assert License.guess('should not be found') is None
def test_imatch_by_alternate_url(self):
    """Alternate-URL matching works for a mixed-case URL."""
    alt_url = '%s/CAPS.php' % faker.uri()
    lic = LicenseFactory(alternate_urls=[alt_url])
    guessed = License.guess(alt_url)
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_none(self):
    """Guessing None yields None."""
    assert License.guess(None) is None
def test_none(self):
    """Guessing None yields None."""
    self.assertIsNone(License.guess(None))
def test_exact_match_by_title(self):
    """Guessing by the exact title returns the matching license."""
    lic = LicenseFactory()
    guessed = License.guess(lic.title)
    self.assertIsInstance(guessed, License)
    self.assertEqual(guessed.id, lic.id)
def test_match_by_title_with_low_edit_distance(self):
    """A near-miss (small edit distance) on the title still matches."""
    lic = LicenseFactory(title='License')
    guessed = License.guess('Licence')
    self.assertIsInstance(guessed, License)
    self.assertEqual(guessed.id, lic.id)
def test_match_by_title_with_mismatching_case(self):
    """Title matching is case-insensitive."""
    lic = LicenseFactory(title='License ODBl')
    guessed = License.guess('License ODBL')
    self.assertIsInstance(guessed, License)
    self.assertEqual(guessed.id, lic.id)
def test_not_found_with_default(self):
    """An unmatched guess falls back to the provided default license."""
    fallback = LicenseFactory()
    guessed = License.guess('should not be found', default=fallback)
    self.assertEqual(guessed.id, fallback.id)
def test_exact_match_by_alternate_url(self):
    """Guessing by an exact alternate URL returns the matching license."""
    alt_url = faker.uri()
    lic = LicenseFactory(alternate_urls=[alt_url])
    guessed = License.guess(alt_url)
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def test_empty_string(self):
    """An empty guess yields None."""
    self.assertIsNone(License.guess(''))
def test_exact_match_by_title_with_spaces(self):
    """Surrounding whitespace is ignored when matching the title."""
    lic = LicenseFactory()
    guessed = License.guess(' {0} '.format(lic.title))
    self.assertIsInstance(guessed, License)
    self.assertEqual(guessed.id, lic.id)
def test_match_by_title_with_extra_inner_space(self):
    """Extra inner whitespace in the guessed text still matches.

    The guessed string must actually contain the duplicated space the
    inline comment announces, otherwise this test is a plain exact match.
    """
    license = LicenseFactory(title='License ODBl')
    found = License.guess('License  ODBl')  # 2 spaces instead of 1
    self.assertIsInstance(found, License)
    self.assertEqual(license.id, found.id)
def test_multiple_strings(self):
    """The first matching candidate among several guesses wins."""
    lic = LicenseFactory()
    guessed = License.guess('should not match', lic.id)
    self.assertIsInstance(guessed, License)
    self.assertEqual(guessed.id, lic.id)
def test_imatch_by_id(self):
    """Guessing an upper-case license id returns the matching license."""
    lic = LicenseFactory(id='CAPS-ID')
    guessed = License.guess(lic.id)
    assert isinstance(guessed, License)
    assert guessed.id == lic.id
def process(self, item):
    """Fetch an OpenDataSoft dataset over the API and map it to udata.

    Unlike harvesters that receive a pre-fetched payload, this one
    performs the HTTP request itself (requesting interoperability
    metadata), then maps metadata, tags, license and resources.

    :raises HarvestSkipException: when the dataset exposes no records,
        attachments or alternative exports, or is INSPIRE-flagged while
        the ``inspire`` feature is disabled.
    """
    dataset_id = item.remote_id
    response = self.get(self.api_dataset_url(dataset_id),
                        params={'interopmetas': 'true'})
    # Fail fast on HTTP errors before trying to decode the body.
    response.raise_for_status()
    ods_dataset = response.json()
    ods_metadata = ods_dataset['metas']
    ods_interopmetas = ods_dataset.get('interop_metas', {})
    # Skip datasets that expose no downloadable content at all.
    if not any((ods_dataset.get(attr)
                for attr in ('has_records', 'attachments',
                             'alternative_exports'))):
        msg = 'Dataset {datasetid} has no record'.format(**ods_dataset)
        raise HarvestSkipException(msg)
    # INSPIRE datasets are only harvested when the feature is turned on.
    if 'inspire' in ods_interopmetas and not self.has_feature('inspire'):
        msg = 'Dataset {datasetid} has INSPIRE metadata'
        raise HarvestSkipException(msg.format(**ods_dataset))
    dataset = self.get_dataset(item.remote_id)
    dataset.title = ods_metadata['title']
    dataset.frequency = 'unknown'
    description = ods_metadata.get('description', '').strip()
    dataset.description = parse_html(description)
    dataset.private = False
    dataset.last_modified = ods_metadata['modified']
    # Collect tags: `keyword` values are used as-is, `theme` entries are
    # split on commas and lowercased. Both fields may be a string or a list.
    tags = set()
    if 'keyword' in ods_metadata:
        if isinstance(ods_metadata['keyword'], list):
            tags |= set(ods_metadata['keyword'])
        else:
            tags.add(ods_metadata['keyword'])
    if 'theme' in ods_metadata:
        if isinstance(ods_metadata['theme'], list):
            for theme in ods_metadata['theme']:
                tags.update([t.strip().lower() for t in theme.split(',')])
        else:
            themes = ods_metadata['theme'].split(',')
            tags.update([t.strip().lower() for t in themes])
    dataset.tags = list(tags)
    # Detect license: try the ODS license id, then its mapping in
    # self.LICENSES, keeping the current (or platform default) license
    # as a fallback.
    default_license = dataset.license or License.default()
    license_id = ods_metadata.get('license')
    dataset.license = License.guess(license_id,
                                    self.LICENSES.get(license_id),
                                    default=default_license)
    self.process_resources(dataset, ods_dataset, ('csv', 'json'))
    if 'geo' in ods_dataset['features']:
        exports = ['geojson']
        # Shapefile export is only offered below the record limit.
        if ods_metadata['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
            exports.append('shp')
        self.process_resources(dataset, ods_dataset, exports)
    self.process_extra_files(dataset, ods_dataset, 'alternative_export')
    self.process_extra_files(dataset, ods_dataset, 'attachment')
    dataset.extras['ods:url'] = self.explore_url(dataset_id)
    dataset.extras['remote_url'] = self.explore_url(dataset_id)
    if 'references' in ods_metadata:
        dataset.extras['ods:references'] = ods_metadata['references']
    dataset.extras['ods:has_records'] = ods_dataset['has_records']
    dataset.extras['ods:geo'] = 'geo' in ods_dataset['features']
    return dataset
def test_imatch_by_url(self):
    """Guessing by a mixed-case URL returns the matching license."""
    target = '%s/CAPS.php' % faker.uri()
    lic = LicenseFactory(url=target)
    guessed = License.guess(lic.url)
    assert isinstance(guessed, License)
    assert guessed.id == lic.id