def update_config(self, config):
    """Read plugin settings from the CKAN config and cache them as
    DataJsonPlugin class attributes.

    Must use IConfigurer rather than IConfigurable because only IConfigurer
    is called before after_map, in which we need the configuration directives
    to know how to set the paths.
    """
    # TODO commenting out enterprise data inventory for right now
    # DataJsonPlugin.route_edata_path = config.get("ckanext.enterprisedatajson.path", "/enterprisedata.json")
    DataJsonPlugin.route_enabled = config.get("ckanext.datajson.url_enabled", "True") == 'True'
    DataJsonPlugin.route_path = config.get("ckanext.datajson.path", "/data.json")
    # BUG FIX: the key used to be " ckanext.datajsonld.path" (leading space),
    # so the option could never match a config entry and the fallback
    # (.json -> .jsonld substitution) was always applied.
    DataJsonPlugin.route_ld_path = config.get("ckanext.datajsonld.path",
                                              re.sub(r"\.json$", ".jsonld", DataJsonPlugin.route_path))
    DataJsonPlugin.ld_id = config.get("ckanext.datajsonld.id", config.get("ckan.site_url"))
    DataJsonPlugin.ld_title = config.get("ckan.site_title", "Catalog")
    DataJsonPlugin.site_url = config.get("ckan.site_url")
    DataJsonPlugin.absolute_route_path = DataJsonPlugin.site_url + DataJsonPlugin.route_path
    DataJsonPlugin.xlsx_file_name = config.get("ckanext.datajson.xlsx_file_name", "catalog.xlsx")
    DataJsonPlugin.xlsx_route_path = config.get("ckanext.datajson.xlsx_path",
                                                "/%s" % DataJsonPlugin.xlsx_file_name)
    DataJsonPlugin.inventory_links_enabled = \
        config.get("ckanext.datajson.inventory_links_enabled", "False") == 'True'
    # Adds our local templates directory. It's smart. It knows it's
    # relative to the path of *this* file. Wow.
    p.toolkit.add_template_directory(config, "templates")
def __init__(self):
    """Initialize the main controller of the NeedUpdate plugin.

    Reads the extensions source folder plus the directory-name prefix and
    suffix used to recognise extensions from the CKAN configuration.
    """
    self.ext_folder = ckan_config.get('ckanext.needupdate.ext_folder', '/usr/lib/ckan/default/src')
    # BUG FIX: both settings below previously read 'ckanext.needupdate.ext_folder'
    # (copy-paste error), so the prefix/suffix could never be configured and the
    # folder path would silently be used as a prefix if the option was set.
    # Keys inferred from the attribute names — confirm against the plugin docs.
    self.ext_prefix = ckan_config.get('ckanext.needupdate.ext_prefix', 'ckanext-')
    self.ext_sufix = ckan_config.get('ckanext.needupdate.ext_sufix', '')
def command(self):
    '''
    Parse command line arguments and call appropriate method.
    '''
    # No args or an explicit help flag: print the class docstring and bail.
    if not self.args or self.args[0] in ['--help', '-h', 'help']:
        print self.__doc__
        return

    cmd = self.args[0]  # NOTE(review): assigned but never used below
    self._load_config()

    # Base URL of the remote geo data repository all resources are fetched from.
    resource_base_url = config.get('ottawa.geo_url')
    dirty = False  # set True when at least one resource was actually replaced
    writelog("running geo update...")
    model.repo.new_revision()

    # self.mapping: dataset name -> {format: remote location} (declared elsewhere).
    for dataset, resources in self.mapping.iteritems():
        package = model.Package.get(dataset)
        if package is None:
            writelog("no such package: %s" % dataset)
            continue
        writelog("%s" % package.name)
        for existing_resource in package.resources:
            # Only touch resources whose format appears in the mapping.
            if existing_resource.format in resources:
                # Shape files map to a nested dict; use its 'shp' entry as
                # the representative file to download and checksum.
                if existing_resource.format == 'shp':
                    resource_path = resource_base_url + resources[existing_resource.format]['shp']
                else:
                    resource_path = resource_base_url + resources[existing_resource.format]
                file_name = 'temp_data/' + existing_resource.name + '.' + existing_resource.format
                resource_exists = self.download_temp_file(resource_path, file_name)
                if not resource_exists:
                    writelog("resource cannot be found in data repository: %s" % resource_path)
                    continue
                # Checksum comparison (see update_required) decides whether
                # the remote copy differs from what we stored last time.
                if self.update_required(existing_resource, file_name):
                    writelog("Updating resource: %s" % resource_path)
                    if existing_resource.format == 'shp':
                        self.replace_shape_files(existing_resource, resources['shp'])
                    else:
                        self.replace_resource(existing_resource, file_name)
                    self.update_checksum(existing_resource, file_name)
                    self.update_dates(existing_resource)
                    dirty = True
                else:
                    writelog("update not required for: %s" % resource_path)

    # Commit once at the end, and only if something changed.
    if dirty:
        model.Session.commit()
        writelog("geo update commited")
    else:
        writelog("no new resources detected")
def replace_resource(self, existing_resource, temp_file):
    """Archive *temp_file* into a fresh timestamped storage directory and
    repoint the resource URL at the new public location."""
    storage_root = config.get('ottawa.geo_storage_dir')
    stamp = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    target_dir = os.path.join(storage_root, stamp)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Shape files are stored as a single zip bundle; everything else keeps
    # its native extension.
    if existing_resource.format == 'shp':
        stored_name = existing_resource.name + '.shp.zip'
    else:
        stored_name = existing_resource.name + '.' + existing_resource.format

    destination = os.path.join(target_dir, stored_name)
    shutil.copyfile(temp_file, destination)

    base_url = config.get('ottawa.geo_storage_url')
    existing_resource.url = "%s%s/%s" % (base_url, stamp, stored_name)
    writelog("saved new resource for %s" % existing_resource.id)
    return ""
def replace_shape_files(self, existing_resource, shape_file_locations): resource_base_url = config.get('ottawa.geo_url') shape_destination_dir = os.path.join('temp_data', existing_resource.name + '_shp') if not os.path.exists(shape_destination_dir): os.makedirs(shape_destination_dir) for shape_format, shape_location in shape_file_locations.iteritems(): resource_location = resource_base_url + shape_location file_name = existing_resource.name + '.' + shape_format download_location = os.path.join(shape_destination_dir, file_name) self.download_temp_file(resource_location, download_location) zip_filename = os.path.join('temp_data', existing_resource.name + '.shp.zip') zip = zipfile.ZipFile(zip_filename, 'w') for root, dirs, files in os.walk(shape_destination_dir): for file in files: print 'writing file %s to %s' % (os.path.join(root, file), zip) zip.write(os.path.join(root, file), file) zip.close() self.replace_resource(existing_resource, zip_filename)
def make_json(self, export_type='datajson', owner_org=None):
    """Build the POD-style data.json catalog.

    :param export_type: catalog flavour — 'datajson' (public), 'unredacted',
        'redacted' or 'draft'.
    :param owner_org: when set, restrict the export to that organization.
    :returns: the raw catalog for 'datajson'; otherwise the result of
        self.write_zip() bundling catalog, error log and per-dataset errors.
    """
    # Error handler for creating error log: capture WARN+ records emitted
    # while building the catalog into an in-memory stream.
    stream = StringIO.StringIO()
    eh = logging.StreamHandler(stream)
    eh.setLevel(logging.WARN)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    eh.setFormatter(formatter)
    logger.addHandler(eh)

    data = ''
    output = []
    errors_json = []
    Package2Pod.seen_identifiers = set()

    try:
        # Build the data.json file.
        if owner_org:
            if 'datajson' == export_type:
                # we didn't check ownership for this type of export, so never load private datasets here
                packages = DataJsonController._get_ckan_datasets(org=owner_org)
                if not packages:
                    packages = self.get_packages(owner_org=owner_org, with_private=False)
            else:
                packages = self.get_packages(owner_org=owner_org, with_private=True)
        else:
            # TODO: load data by pages
            # packages = p.toolkit.get_action("current_package_list_with_resources")(
            #     None, {'limit': 50, 'page': 300})
            packages = DataJsonController._get_ckan_datasets()
            # packages = p.toolkit.get_action("current_package_list_with_resources")(None, {})

        import re
        # Per-package normalisation pass: language extra, resource fields,
        # groups/tags flattening, author dict, landing-page URL.
        for i in range(0, len(packages)):
            j = 0
            for extra in packages[i]['extras']:
                if extra.get('key') == 'language':
                    print 'Key: {}, Value: {}'.format(extra.get('key'), extra.get('value'))
                    if not isinstance(extra.get('value'), (unicode, str)):
                        # Can only operate when value is a unicode or str instance.
                        logger.warn('No fue posible renderizar el campo: \"Language\".')
                    else:
                        language = []
                        try:
                            # Try to convert the Language value directly
                            # into a list.
                            language = json.loads(extra['value'])
                        except ValueError:
                            # Conversion failed: clean the value and retry.
                            # NOTE(review): '"{" or "}" in ...' is always truthy
                            # (it evaluates the literal "{"), so the else branch
                            # is unreachable; likely meant
                            # '"{" in value or "}" in value'.
                            if "{" or "}" in extra.get('value'):
                                lang = extra['value'].replace('{', '').replace('}', '').split(',')
                            else:
                                lang = extra.get('value')
                            if ',' in lang:
                                lang = lang.split(',')
                            else:
                                lang = [lang]
                            # NOTE(review): json.loads() on a list raises
                            # TypeError, which would bubble to the outer
                            # handler below — verify intent.
                            language = json.loads(lang)
                        packages[i]['extras'][j]['value'] = language
                j += 1

            # Re-render 'attributesDescription' on each resource from its
            # JSON-string form, if present.
            try:
                for index, resource in enumerate(packages[i]['resources']):
                    try:
                        fixed_attrDesc = json.loads(resource['attributesDescription'])
                        packages[i]['resources'][index]['attributesDescription'] = fixed_attrDesc
                    except ValueError:
                        logger.error('Fallo render de \'attributesDescription\'.')
            except KeyError:
                pass

            # Fetch ckan.site_url to check resource ownership.
            ckan_site_url = config.get('ckan.site_url')
            # Fill in a default accessURL for resources missing one.
            try:
                for index, resource in enumerate(packages[i]['resources']):
                    resource = packages[i]['resources'][index]
                    if not resource.get("accessURL", None):
                        accessURL = os.path.join(ckan_site_url, 'dataset', packages[i]['id'],
                                                 'resource', resource['id'])
                        resource.update({'accessURL': accessURL})
            except KeyError:
                pass

            # Derive the host (scheme + authority) from the first resource URL;
            # best-effort only.
            ckan_host = ''
            try:
                ckan_host = re.match(
                    r'(?:http)s?:\/\/([\w][^\/=\s]+)\/?|(^w{3}[\.\w][^\/\=\s]{2,})\/?',
                    packages[i]['resources'][0]['url']).group(0)
            except Exception:
                pass

            # Flatten groups to a list of names ("themes").
            themes = self.safely_map(dict.get, packages[i]['groups'], 'name')
            packages[i]['groups'] = themes

            # Author as {name, mbox} per POD schema.
            try:
                packages[i]['author'] = {
                    'name': packages[i]['author'],
                    'mbox': packages[i]['author_email']
                }
            except KeyError:
                pass

            # Flatten tags to their display names.
            tags = self.safely_map(dict.get, packages[i]['tags'], 'display_name')
            packages[i]['tags'] = tags

            # packages[i] = json.loads(packages[i][0]['extras']['language'])

            # Autogenerate a landing page URL when the dataset has none.
            try:
                if len(packages[i]['url']) < 1:
                    packages[i]['url'] = '{host}/dataset/{dataset_id}'.format(
                        host=ckan_host[:-1], dataset_id=packages[i]['name'])
                    logger.info("landingPage generado para el dataset_id: %s.", packages[i]['name'])
            except TypeError:
                # url was None (harvested datasets): derive it from the first
                # resource URL instead.
                prepare_url = 'unknow'
                try:
                    prepare_url = packages[i]['resources'][0]['url']
                    prepare_url = prepare_url.split('resource')[0]
                    logger.info("landingPage generado para el dataset_id: %s, Tipo de datos: \" harvest\".",
                                packages[i]['name'])
                except IndexError:
                    logger.error("autogen \"landingpage\" fails.")
                packages[i].update({'url': prepare_url})

        # Map CKAN fields to POD fields using the export map file.
        json_export_map = get_export_map_json('export.map.json')

        if json_export_map:
            for pkg in packages:
                if json_export_map.get('debug'):
                    output.append(pkg)
                extras = dict([(x['key'], x['value']) for x in pkg.get('extras', {})])
                # Filter by publishing status depending on the export flavour:
                # drafts are excluded from (un)redacted exports and are the
                # only thing included in a 'draft' export.
                if export_type in ['unredacted', 'redacted']:
                    if 'Draft' == extras.get('publishing_status'):
                        continue
                elif 'draft' == export_type:
                    if 'publishing_status' not in extras.keys() or extras.get('publishing_status') != 'Draft':
                        continue
                redaction_enabled = ('redacted' == export_type)
                datajson_entry = Package2Pod.convert_package(pkg, json_export_map, redaction_enabled)
                errors = None
                if 'errors' in datajson_entry.keys():
                    errors_json.append(datajson_entry)
                    errors = datajson_entry.get('errors')
                    datajson_entry = None
                if datajson_entry and \
                        (not json_export_map.get('validation_enabled') or self.is_valid(datajson_entry)):
                    # logger.debug("writing to json: %s" % (pkg.get('title')))
                    output.append(datajson_entry)
                else:
                    publisher = detect_publisher(extras)
                    if errors:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason below:\n\t%s\n",
                                    pkg.get('id', None), pkg.get('title', None), publisher, errors)
                    else:
                        logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted, reason above.\n",
                                    pkg.get('id', None), pkg.get('title', None), publisher)
            # CLEAN: drop the non-required '@type' field from every entry.
            # NOTE(review): the try wraps the whole loop, so the first entry
            # missing '@type' stops the cleanup for the remaining entries.
            try:
                for d in output:
                    del d["@type"]
            except Exception:
                pass
            data = Package2Pod.wrap_json_catalog(output, json_export_map)
    except Exception as e:
        # Top-level guard: log where the failure happened and fall through so
        # the handler teardown below always runs.
        exc_type, exc_obj, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("%s : %s : %s : %s", exc_type, filename, exc_tb.tb_lineno, unicode(e))

    # Get the error log
    eh.flush()
    error = stream.getvalue()
    eh.close()
    logger.removeHandler(eh)
    stream.close()

    # Skip compression if we export whole /data.json catalog
    if 'datajson' == export_type:
        return data
    return self.write_zip(data, error, errors_json, zip_name=export_type)