Example #1
    def test_good_get_data_json(self):
        url = f'{base_url}/usda.gov.data.json'
        total = 0
        for dataset in get_data_json_from_url(url=url):
            self.assertIsInstance(dataset, dict)
            total += 1

        self.assertEqual(total, 1580)
Example #2
    def test_goodwitherrors_get_data_json(self):
        url = f'{base_url}/healthdata.gov.data.json'
        total = 0
        for dataset in get_data_json_from_url(url=url):
            self.assertIsInstance(dataset, dict)
            total += 1

        self.assertEqual(total, 1762)
Example #3
    def test_bad_get_data_json(self, build_validation_mock,
                               send_validation_mock):
        url = f'{base_url}/bad.json'
        with self.assertRaises(Exception) as context:
            for dataset in get_data_json_from_url(url=url):
                print(dataset)
        self.assertTrue(
            'Expecting property name enclosed in double quotes: line 3 column 5 (char 25)'
            in str(context.exception))
Example #4
    def test_limit(self):
        url = f'{base_url}/healthdata.gov.data.json'
        total = 0
        from harvester import config
        config.LIMIT_DATASETS = 15
        for dataset in get_data_json_from_url(url=url):
            self.assertIsInstance(dataset, dict)
            total += 1

        self.assertEqual(total, 15)
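The test above relies on config.LIMIT_DATASETS capping the generator. As a rough, hypothetical illustration only (the harvester's real reader may differ), a generator can honor such a limit like this:

# Rough sketch (assumption): a data.json reader that stops after `limit`
# datasets, where 0 means "no limit", mirroring config.LIMIT_DATASETS above.
import json
import urllib.request


def iter_datasets(url, limit=0):
    """Yield each dataset dict from a data.json catalog, up to `limit`."""
    with urllib.request.urlopen(url) as response:
        catalog = json.load(response)

    for count, dataset in enumerate(catalog.get('dataset', []), start=1):
        yield dataset
        if limit and count >= limit:
            break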
Example #5
    def test_404_get_data_json(self):
        url = f'{base_url}/DO-NOT-EXISTS.json'
        with self.assertRaises(Exception) as context:
            for dataset in get_data_json_from_url(url=url):
                print(dataset)
        self.assertTrue('HTTP error: 404' in str(context.exception))
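The 'HTTP error: 404' text asserted here is produced by the harvester when the source URL cannot be fetched. A minimal sketch of how a fetcher could raise it, assuming requests as the HTTP client (not confirmed by the source):

# Minimal sketch (assumption): turn a failed HTTP status into the
# "HTTP error: <code>" exception message asserted in the test above.
import requests


def fetch_data_json(url):
    response = requests.get(url, timeout=30)
    if response.status_code >= 400:
        raise Exception(f'HTTP error: {response.status_code} at {url}')
    return response.json()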
Example #6
parser.add_argument(
    "--limit_dataset",
    type=int,
    default=0,
    help="Limit datasets to harvest from each source. Default=0 => no limit")

args = parser.parse_args()

config.SOURCE_NAME = args.name  # Nice name of the source
config.SOURCE_URL = args.url  # data.json final URL
config.LIMIT_DATASETS = args.limit_dataset

res = Flow(
    # get data.json and yield all datasets
    # validate headers and save the validation errors
    get_data_json_from_url(url=config.SOURCE_URL),
    update_resource('res_1', name='datajson', path='datajson.csv'),

    # remove duplicates
    clean_duplicated_identifiers,

    # validate each dataset
    validate_datasets,

    # save each dataset as data package
    save_as_data_packages,
).results()

logger.info('Continue to next step with: python3 flow2.py '
            f'--name {config.SOURCE_NAME} ')
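For context, Flow and update_resource come from the dataflows library: the first step is a row-yielding generator and the remaining steps are processors applied in order, with 'res_1' being the default name of the resource created from the generator. A minimal, self-contained sketch of that structure (the sample rows and the duplicate filter below are illustrative stand-ins, not the harvester's own code):

# Minimal dataflows sketch mirroring the pipeline shape above.
from dataflows import Flow, update_resource


def fake_datasets():
    # Stand-in for get_data_json_from_url: yield dataset rows
    yield {'identifier': 'a', 'title': 'Dataset A'}
    yield {'identifier': 'a', 'title': 'Duplicate of A'}
    yield {'identifier': 'b', 'title': 'Dataset B'}


def drop_duplicated_identifiers(rows):
    # Rows processor: skip rows whose identifier was already seen
    seen = set()
    for row in rows:
        if row['identifier'] not in seen:
            seen.add(row['identifier'])
            yield row


data, package, stats = Flow(
    fake_datasets(),
    update_resource('res_1', name='datajson', path='datajson.csv'),
    drop_duplicated_identifiers,
).results()

print(data)  # [[{'identifier': 'a', ...}, {'identifier': 'b', ...}]]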
Example #7
    def test_compare_resources(self):
        config.SOURCE_NAME = 'usada-test'
        url = f'{base_url}/usda.gov.data.json'
        config.SOURCE_URL = url
        total = 0

        config.LIMIT_DATASETS = 0
        for dataset in get_data_json_from_url(url=url):
            self.assertIsInstance(dataset, dict)
            total += 1
            save_as_data_packages(dataset)

        self.assertEqual(total, 1580)

        # compare with fake results
        fake_rows = [
            # "extras" does not exist
            {'id': '0001',
             'metadata_modified': '2019-05-02T21:36:22.693792',
             'NO-extras': [{'key': 'id', 'value': '000'}]},
            # key "identifier" does not exist inside extras
            {'id': '0002',
             'metadata_modified': '2019-05-02T21:36:22.693792',
             'extras': [{'key': 'id', 'value': '000'}]},
            # must be marked for update
            {'id': '0003',
             'metadata_modified': '2019-05-02T21:36:22.693792',
             'extras': [{'key': 'identifier', 'value': 'usda-ocio-15-01'}]},
            # NOT MODIFIED (by date)
            {'id': '0004',
             'metadata_modified': '2014-10-03T14:36:22.693792',
             'extras': [{'key': 'identifier', 'value': 'USDA-DM-003'}]},
            # NEW unknown identifier. It must be deleted if it is not in data.json
            {'id': '0005',
             'metadata_modified': '2019-05-02T21:36:22.693792',
             'extras': [{'key': 'identifier', 'value': 'New unexpected identifier'}]},
        ]

        for row in compare_resources(rows=fake_rows):
            # I expect the first results

            cr = row['comparison_results']
            ckan_id = cr.get('ckan_id', None)

            if ckan_id == '0001':
                self.assertEqual(cr['action'], 'error')
                self.assertEqual(cr['reason'], 'The CKAN dataset does not '
                                              'have the "extras" property')
            elif ckan_id == '0002':
                self.assertEqual(cr['action'], 'error')
                self.assertEqual(cr['reason'], 'The CKAN dataset does not have an "identifier"')

            elif ckan_id == '0003':
                self.assertEqual(cr['action'], 'update')
                self.assertIsInstance(cr['new_data'], dict)

            elif ckan_id == '0004':
                self.assertEqual(cr['action'], 'ignore')
                self.assertIsNone(cr['new_data'])

            elif ckan_id == '0005':
                self.assertEqual(cr['action'], 'delete')
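The fake rows above cover the five outcomes the comparison can report. A compact, hypothetical sketch of that branching follows; only the structure is implied by the test, and the date-based "update vs ignore" rule is left abstract because the test does not pin it down:

# Illustrative sketch (assumption): the branching exercised by the fake
# CKAN rows above. The real compare_resources also builds new_data and
# the reason strings checked in the test.
def classify_ckan_row(ckan_row, harvested_identifiers, is_modified):
    """Return the expected action for one CKAN row.

    harvested_identifiers: identifiers present in the harvested data.json.
    is_modified: callable(ckan_row) -> bool, the harvester's date check.
    """
    extras = ckan_row.get('extras')
    if extras is None:
        return 'error'   # row '0001': no "extras" property

    identifiers = [e['value'] for e in extras if e['key'] == 'identifier']
    if not identifiers:
        return 'error'   # row '0002': no "identifier" inside extras

    if identifiers[0] not in harvested_identifiers:
        return 'delete'  # row '0005': identifier no longer in data.json

    # rows '0003' and '0004': update when modified, otherwise ignore
    return 'update' if is_modified(ckan_row) else 'ignore'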
Example #8
base_data_folder = 'data'
local_folder = os.path.join(base_data_folder, args.name)
packages_folder_path = os.path.join(local_folder, 'datapackages')
if not os.path.isdir(packages_folder_path):
    os.makedirs(packages_folder_path)

data_json_path = os.path.join(local_folder, 'data.json')
data_json_errors_path = os.path.join(local_folder, 'data_json_errors.json')
duplicates_path = os.path.join(local_folder, 'duplicates.json')

# ----------------------------------------------------
# Get data.json if not present locally (or force re-download)
# ----------------------------------------------------
if not os.path.isfile(data_json_path) or args.force_download:
    logger.info(f'Downloading {url}')
    datajson = get_data_json_from_url(url)
    datajson.save_data_json(data_json_path)
else:
    logger.info(f'Using data.json previously downloaded: {data_json_path}')
    datajson = get_data_json_from_file(data_json_path=data_json_path)

datajson.save_validation_errors(path=data_json_errors_path)
total_datasets = len(datajson.datasets)
total_resources = datajson.count_resources()
logger.info('cleaning datasets')
duplicates = datajson.remove_duplicated_identifiers()
total_duplicates = len(duplicates)
datajson.save_duplicates(path=duplicates_path)

logger.info(
    f'Read {total_datasets} datasets including {total_resources} resources. '
    f'{total_duplicates} duplicated identifiers removed')