def fetch_resource(dataset, ignore_hashes):
    '''
    Gets the resource and sets the times of last successful update
    based on the status code.

    If `ignore_hashes` is set to True, `last_parsed` will be set to None
    and an update will be triggered.

    :param dataset: Dataset whose first resource is fetched.
    :param ignore_hashes: If True, re-parse even when the content is unchanged.
    :return: the updated Resource
    '''
    d = iatikit.data().datasets.get(dataset.name)
    last_updated = iatikit.data().last_updated
    resource = dataset.resources[0]
    resource.last_fetch = last_updated
    try:
        content = d.raw_xml
        resource.last_status_code = 200
        resource.last_succ = last_updated
        # Store the document (and schedule a re-parse by clearing last_parsed)
        # if it is new, its content has changed, or hashes are being ignored.
        if (not resource.document) or \
                (hash(resource.document) != hash(content)) or \
                ignore_hashes:
            resource.document = content
            resource.last_parsed = None
            resource.last_parse_error = None
    except IOError:
        # TODO: this isn't true
        resource.last_status_code = 404
    db.session.add(resource)
    return resource
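
# Hedged usage sketch, not part of the original module: one way a caller might
# refresh a single dataset's resource and persist the result. The helper name
# and the force_reparse flag are illustrative assumptions, not existing API.
def _example_refresh_resource(dataset, force_reparse=False):
    # ignore_hashes=True clears last_parsed even when the XML hash is
    # unchanged, which signals the parser to re-process the document.
    resource = fetch_resource(dataset, ignore_hashes=force_reparse)
    db.session.commit()
    return resource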
def fetch_dataset_list():
    '''
    Fetches datasets from iatikit and stores them in the DB.

    Used in update() to update the Flask job queue.

    Uses CKAN metadata to determine if an activity is active or deleted.

    :return: a Dataset query covering every dataset now in the DB
    '''
    existing_datasets = Dataset.query.all()
    existing_ds_names = set(ds.name for ds in existing_datasets)
    package_list = [d.name for d in iatikit.data().datasets]
    incoming_ds_names = set(package_list)

    new_datasets = [Dataset(name=n) for n
                    in incoming_ds_names - existing_ds_names]
    all_datasets = existing_datasets + new_datasets
    last_seen = iatikit.data().last_updated
    for dataset in all_datasets:
        dataset.last_seen = last_seen
    db.session.add_all(all_datasets)
    db.session.commit()

    deleted_ds_names = existing_ds_names - incoming_ds_names
    if deleted_ds_names:
        delete_datasets(deleted_ds_names)
    all_datasets = Dataset.query

    return all_datasets
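
# Illustrative sketch (an assumption, not the project's actual scheduler):
# shows how the query returned by fetch_dataset_list could drive per-dataset
# work. update_dataset here is a hypothetical callable supplied by the caller.
def _example_sync_and_enqueue(update_dataset):
    datasets = fetch_dataset_list()
    # fetch_dataset_list returns a Dataset query, so this iterates over the
    # freshly synced rows, including any newly added datasets.
    for dataset in datasets:
        update_dataset(dataset)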
def get_registry(refresh=False):
    '''Return the iatikit registry data, downloading it first if the local
    cache is missing or `refresh` is True.'''
    if not (pathlib.Path() / "__iatikitcache__").is_dir() or refresh:
        print("getting registry data")
        iatikit.download.data()
    return iatikit.data()
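
# Minimal usage sketch, assuming only the behaviour shown above: the cached
# __iatikitcache__ directory is reused when present, and refresh=True forces a
# fresh download of the registry snapshot. The helper name is illustrative.
def _example_force_registry_refresh():
    registry = get_registry(refresh=True)
    return registry.last_updated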
def fetch_dataset_metadata(dataset):
    d = iatikit.data().datasets.get(dataset.name)
    dataset.publisher = d.metadata['organization']['name']
    dataset.last_modified = date_parser(d.metadata.get(
        'metadata_modified',
        datetime.datetime.now().date().isoformat()))

    # Add any resource URLs from the CKAN metadata that we haven't seen before.
    new_urls = [resource['url'] for resource
                in d.metadata.get('resources', [])
                if resource['url'] not in dataset.resource_urls]
    dataset.resource_urls.extend(new_urls)

    # Drop stored URLs that no longer appear in the CKAN metadata.
    urls = [resource['url'] for resource
            in d.metadata.get('resources', [])]
    for deleted in set(dataset.resource_urls) - set(urls):
        dataset.resource_urls.remove(deleted)

    dataset.license = d.metadata.get('license_id')
    dataset.is_open = d.metadata.get('isopen', False)
    db.session.add(dataset)
    return dataset
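
# For illustration only: a hedged sketch of the CKAN metadata fields that
# fetch_dataset_metadata reads. The values are invented examples, not real
# registry data.
_EXAMPLE_CKAN_METADATA = {
    'organization': {'name': 'example-org'},
    'metadata_modified': '2020-01-01T00:00:00',
    'resources': [{'url': 'http://example.org/activities.xml'}],
    'license_id': 'other-open',
    'isopen': True,
}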
from os.path import dirname, join
from collections import namedtuple
import datetime

import mock
import iatikit

from . import AppTestCase, fixture_filename
from . import factories as fac
from iatilib import crawler, db, parse
from iatilib.model import Dataset, Log, Resource, Activity, DeletedActivity


registry = iatikit.data(join(dirname(__file__), 'fixtures', 'registry'))


class TestCrawler(AppTestCase):
    @mock.patch('iatikit.data')
    def test_fetch_package_list(self, iatikit_mock):
        data_mock = iatikit_mock.return_value
        data_mock.last_updated = datetime.datetime.utcnow()
        data_mock.datasets = [
            iatikit.Dataset("tst-a.xml"),
            iatikit.Dataset("tst-b.xml")
        ]
        datasets = crawler.fetch_dataset_list()
        self.assertIn("tst-a", [ds.name for ds in datasets])
        self.assertIn("tst-b", [ds.name for ds in datasets])

    @mock.patch('iatikit.data')
    def test_update_adds_datasets(self, iatikit_mock):