def test_dataset_is_updated_correctly(self):
    catalog = self.full_catalog
    catalog_id = title_to_name(catalog['title'])
    dataset_id = catalog.datasets[0]['identifier']
    push_dataset_to_ckan(
        catalog,
        "oficina-de-muestra",
        dataset_id,
        self.portal_url,
        self.apikey,
        catalog_id=catalog_id,
    )
    catalog.datasets[0]['description'] = 'updated description'
    return_id = push_dataset_to_ckan(
        catalog,
        "oficina-de-muestra",
        dataset_id,
        self.portal_url,
        self.apikey,
        catalog_id=catalog_id,
    )
    data_dict = {'id': catalog_id + '_' + dataset_id}
    package = self.portal.call_action('package_show', data_dict=data_dict)
    self.assertEqual(return_id, catalog_id + '_' + dataset_id)
    self.assertEqual('updated description', package['notes'])
def test_dataset_array_attributes_are_correct(self):
    package = map_dataset_to_package(self.catalog, self.dataset, 'owner',
                                     catalog_id=self.catalog_id)
    groups = [group['name'] for group in package.get('groups', [])]
    super_themes = [
        title_to_name(s_theme.lower())
        for s_theme in self.dataset.get('superTheme')
    ]
    try:
        # assertItemsEqual is the Python 2 name; fall back to the
        # Python 3 equivalent below
        self.assertItemsEqual(super_themes, groups)
    except AttributeError:
        self.assertCountEqual(super_themes, groups)

    tags = [tag['name'] for tag in package['tags']]
    keywords = self.dataset.get('keyword', [])
    themes = self.dataset.get('theme', [])
    theme_labels = []
    for theme in themes:
        label = self.catalog.get_theme(identifier=theme)['label']
        # keep only characters valid in CKAN tag names
        label = re.sub(r'[^\w .-]+', '', label, flags=re.UNICODE)
        theme_labels.append(label)
    try:
        self.assertItemsEqual(keywords + theme_labels, tags)
    except AttributeError:
        self.assertCountEqual(keywords + theme_labels, tags)
@classmethod
def setUpClass(cls):
    cls.catalog = pydatajson.DataJson(cls.get_sample('full_data.json'))
    cls.catalog_id = cls.catalog.get('identifier',
                                     title_to_name(cls.catalog['title']))
    cls.dataset = cls.catalog.datasets[0]
    cls.dataset_id = cls.dataset.get('identifier')
    cls.distributions = cls.dataset['distribution']
def tearDown(self):
    full_dataset = self.full_catalog.datasets[0]
    full_name = title_to_name(full_dataset['title'])
    justice_dataset = self.justice_catalog.datasets[0]
    justice_name = title_to_name(justice_dataset['title'])

    # purge both test datasets; ignore the ones that were never created
    try:
        self.portal.call_action('dataset_purge',
                                data_dict={'id': full_name})
    except NotFound:
        pass
    try:
        self.portal.call_action('dataset_purge',
                                data_dict={'id': justice_name})
    except NotFound:
        pass
    self.portal.close()
def test_catalog_id_is_prepended_to_dataset_id_and_name_if_passed(self):
    package = map_dataset_to_package(
        self.catalog, self.dataset, 'owner', catalog_id=self.catalog_id)
    self.assertEqual(self.catalog_id + '_' + self.dataset_id,
                     package['id'])
    self.assertEqual(
        title_to_name(self.catalog_id + '-' + self.dataset['title']),
        package['name'])
def test_dataset_is_created_correctly(self):
    catalog = self.full_catalog
    catalog_id = title_to_name(catalog['title'])
    dataset = catalog.datasets[0]
    dataset_id = dataset['identifier']
    return_id = push_dataset_to_ckan(
        catalog,
        "oficina-de-muestra",
        dataset_id,
        self.portal_url,
        self.apikey,
        catalog_id=catalog_id,
    )
    self.assertEqual(return_id, catalog_id + '_' + dataset_id)
def test_themes_are_preserved_if_not_demoted(self):
    package = map_dataset_to_package(self.catalog, self.dataset, 'owner',
                                     catalog_id=self.catalog_id,
                                     demote_themes=False)
    groups = [group['name'] for group in package.get('groups', [])]
    super_themes = [
        title_to_name(s_theme.lower())
        for s_theme in self.dataset.get('superTheme')
    ]
    themes = self.dataset.get('theme', [])
    tags = [tag['name'] for tag in package['tags']]
    keywords = self.dataset.get('keyword', [])
    try:
        self.assertItemsEqual(super_themes + themes, groups)
    except AttributeError:
        self.assertCountEqual(super_themes + themes, groups)
    try:
        self.assertItemsEqual(keywords, tags)
    except AttributeError:
        self.assertCountEqual(keywords, tags)
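# The try/except around assertItemsEqual/assertCountEqual repeats across
# several tests above: assertItemsEqual is the Python 2 unittest name and
# assertCountEqual its Python 3 replacement. A minimal sketch of a mixin
# that could factor out the repetition (CompatAssertionsMixin and
# assert_items_equal are hypothetical names, not part of the original suite):
class CompatAssertionsMixin(object):
    def assert_items_equal(self, first, second):
        # compare two sequences ignoring order, on both Python 2 and 3
        try:
            self.assertItemsEqual(first, second)
        except AttributeError:
            self.assertCountEqual(first, second)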
def get_distribution_download_urls(distributions, catalog_id):
    # collect every downloadURL found, prefixed with its catalog id
    urls = []
    for distribution in [
            dist for dist in distributions
            if 'downloadURL' in dist and dist['downloadURL']
    ]:
        if "fileName" in distribution:
            distribution_file_name = distribution["fileName"]
        else:
            # build a file name from the title and the format's extension
            distribution_file_name = "{}.{}".format(
                title_to_name(distribution["title"]),
                str(distribution["format"]).split("/")[-1].lower())
        urls.append("{} {} {} {} {}".format(
            catalog_id, distribution["dataset_identifier"],
            distribution["identifier"], distribution_file_name,
            distribution["downloadURL"]))
    return urls
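# A minimal usage sketch for get_distribution_download_urls. The sample
# distribution below is hypothetical; each output line carries five
# space-separated fields: catalog id, dataset id, distribution id, file name
# and download URL. The expected file name assumes title_to_name slugifies
# "Serie de ejemplo" to "serie-de-ejemplo".
sample_distributions = [{
    "dataset_identifier": "1",
    "identifier": "1.1",
    "title": "Serie de ejemplo",
    "format": "text/csv",
    "downloadURL": "http://example.com/serie.csv",
}]
for line in get_distribution_download_urls(sample_distributions, "ejemplo"):
    # "ejemplo 1 1.1 serie-de-ejemplo.csv http://example.com/serie.csv"
    print(line)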
def test_dataset_id_and_name_are_preserved_if_catalog_id_is_not_passed(
        self):
    package = map_dataset_to_package(self.catalog, self.dataset, 'owner')
    self.assertEqual(self.dataset_id, package['id'])
    self.assertEqual(title_to_name(self.dataset['title']),
                     package['name'])
def analyze_dataset(catalog_id, catalog, dataset_identifier,
                    datasets_output_dir, debug_mode=False, replace=True,
                    debug_distribution_ids=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }
    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_output_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, the ids to process can be narrowed down
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribution {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)

            # use fileName if the distribution specifies one, otherwise
            # build one from the title
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_path = os.path.join(dataset_dir, "distribution",
                                     distribution_identifier, "download",
                                     distribution_file_name)
            dist_url = get_distribution_url(dist_path)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, either replace it or skip it
            if not os.path.exists(dist_path) or replace:
                status = ("Replaced" if os.path.exists(dist_path)
                          else "Created")
                origin_dist_path, _ = analyze_distribution(
                    catalog_id, catalog, dataset_identifier,
                    distribution_identifier)
                helpers.ensure_dir_exists(os.path.dirname(dist_path))
                shutil.copyfile(origin_dist_path, dist_path)
            else:
                status = "Skipped"
            res["distributions_ok"].append(
                (distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))
        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            res["distributions_error"].append(
                (distribution_identifier, repr(e).encode("utf8")))
            trace_string = traceback.format_exc()
            logger.error(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            for line in trace_string.splitlines():
                logger.error(line)
            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

    return res
def scrape_dataset(xl, catalog, dataset_identifier, datasets_dir,
                   debug_mode=False, replace=True,
                   debug_distribution_ids=None, catalog_id=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }
    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    # filter the parameters down to a single dataset
    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, the ids to process can be narrowed down
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribution {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)

            # use fileName if the distribution specifies one, otherwise
            # build one from the title
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_download_dir = os.path.join(dataset_dir, "distribution",
                                             distribution_identifier,
                                             "download")
            dist_path = os.path.join(dist_download_dir,
                                     distribution_file_name)
            dist_url = get_distribution_url(dist_path)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, either replace it or skip it
            if not os.path.exists(dist_path) or replace:
                status = ("Replaced" if os.path.exists(dist_path)
                          else "Created")
                distribution = scrape_distribution(xl, catalog,
                                                   distribution_identifier)

                # a scrape may return one DataFrame or a list of them
                if isinstance(distribution, list):
                    distribution_complete = pd.concat(distribution)
                else:
                    distribution_complete = distribution

                helpers.remove_other_files(os.path.dirname(dist_path))
                distribution_complete.to_csv(
                    dist_path, encoding="utf-8",
                    index_label="indice_tiempo")
            else:
                status = "Skipped"
            res["distributions_ok"].append(
                (distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))
        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            res["distributions_error"].append(
                (distribution_identifier, repr(e).encode("utf8")))
            trace_string = traceback.format_exc()
            logger.error(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            for line in trace_string.splitlines():
                logger.error(line)
            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

            # if there is no previous version of the distribution,
            # drop it from the catalog
            try:
                get_distribution_path(catalog_id, dataset_identifier,
                                      distribution_identifier)
            except Exception:
                catalog.remove_distribution(distribution_identifier,
                                            dataset_identifier)

    return res
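# A minimal sketch of how the result dict returned by scrape_dataset (and
# analyze_dataset) could be consumed. report_dataset_scrape is a hypothetical
# helper, not part of the original module; it assumes the module-level logger
# used above.
def report_dataset_scrape(res):
    # log the overall status, then one line per distribution by outcome
    if res["dataset_status"] != "OK":
        logger.error("dataset finished with status: {}".format(
            res["dataset_status"]))
    for distribution_id, status in res["distributions_ok"]:
        logger.info("{}: {}".format(distribution_id, status))
    for distribution_id, error in res["distributions_error"]:
        logger.error("{}: {}".format(distribution_id, error))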