def sanitize_keyword(s, strict=True):
    """Make a string usable as a CKAN keyword/tag.

    CKAN keyword rules are not very clearly documented: at first only
    lowercase characters and ``_-`` seemed to be supported, but spaces,
    uppercase and accented characters now appear to be tolerated as
    well.  This is an alternative to setting ``{"clean_tags": true}`` in
    the harvesting configuration, which is a little bit more destructive
    (although maybe quite similar to the strict option).

    :param s: candidate keyword string (may be empty or ``None``)
    :param strict: when ``True``, transliterate to the closest ASCII,
        replace anything outside ``[\\w-]`` with ``-`` and lowercase the
        result (as announced by CKAN validators); when ``False``, keep
        case and common accented characters and only replace other
        characters with ``-``.
    :return: the sanitized keyword, or ``''`` for falsy input
    """
    if not s:
        return ''
    # BUGFIX: was r'\'\s+' (quote followed by whitespace), which
    # contradicted its own "remove duplicate spaces" comment.
    s = re.sub(r'\s+', ' ', s)  # collapse runs of whitespace
    s = s.strip()               # remove leading/trailing spaces
    s = re.sub(u'\'', ' ', s)   # change single quote to space
    if strict:
        # Should ensure compliance with CKAN validator requirements (as
        # announced): transliterate accents to closest ASCII match first.
        s = substitute_ascii_equivalents(s)
        pattern = u'[^\\w\\-]'  # stricter match pattern
        # BUGFIX: flags must be passed by keyword -- the 4th positional
        # argument of re.sub is ``count``, not ``flags``; the original
        # silently limited substitutions to re.UNICODE (== 32).
        s = re.sub(pattern, '-', s, flags=re.UNICODE).lower()
    else:
        # Seems sufficient in most cases; accept accented characters.
        # BUGFIX: the uppercase class contained a Greek Omicron where
        # U+00CE LATIN CAPITAL LETTER I WITH CIRCUMFLEX belongs.
        pattern = u'[^a-zA-Z0-9_àâäôéèëêïîçùûüÿæœÀÂÄÔÉÈËÊÏÎÇÙÛÜÆŒ \\-]'
        s = re.sub(pattern, '-', s, flags=re.UNICODE)  # keep original case
    return s
def migrate(self):
    '''
    Create a ckanext-showcase Showcase from every CKAN Related Item,
    then link each new Showcase to the Related Item's dataset (when
    one exists).

    Aborts without changing anything when two Related Items share a
    title, since titles are used to derive showcase names.
    '''
    related_items = get_action('related_list')(data_dict={})

    # preflight:
    # related items must have unique titles before migration
    related_titles = [i['title'] for i in related_items]
    # make a list of duplicate titles
    duplicate_titles = self._find_duplicates(related_titles)
    if duplicate_titles:
        print(
            """All Related Items must have unique titles before migration. The following
Related Item titles are used more than once and need to be corrected before
migration can continue. Please correct and try again:"""
        )
        for i in duplicate_titles:
            print(i)
        return

    for related in related_items:
        # Skip related items that were already migrated: showcases carry
        # the source id in their 'original_related_item_id' field.
        existing_showcase = get_action('package_search')(
            data_dict={'fq': '+dataset_type:showcase original_related_item_id:{0}'.format(related['id'])})
        # ASCII-fold the title only for console messages below.
        normalized_title = substitute_ascii_equivalents(related['title'])
        if existing_showcase['count'] > 0:
            print('Showcase for Related Item "{0}" already exists.'.format(
                normalized_title))
        else:
            data_dict = {
                'original_related_item_id': related.get('id'),
                'title': related.get('title'),
                'name': munge_title_to_name(related.get('title')),
                'notes': related.get('description'),
                'image_url': related.get('image_url'),
                'url': related.get('url'),
                # related item 'type' becomes a lowercased showcase tag
                'tags': [{"name": related.get('type').lower()}]
            }
            # make the showcase
            try:
                new_showcase = get_action('ckanext_showcase_create')(
                    data_dict=data_dict)
            except Exception as e:
                # best-effort migration: report and continue with the rest
                print('There was a problem migrating "{0}": {1}'.format(
                    normalized_title, e))
            else:
                print('Created Showcase from the Related Item "{0}"'.format(normalized_title))

                # make the showcase_package_association, if needed
                try:
                    related_pkg_id = self._get_related_dataset(
                        related['id'])
                    if related_pkg_id:
                        get_action('ckanext_showcase_package_association_create')(
                            data_dict={'showcase_id': new_showcase['id'],
                                       'package_id': related_pkg_id})
                except Exception as e:
                    print('There was a problem creating the showcase_package_association for "{0}": {1}'.format(
                        normalized_title, e))
def munge_tag(tag):
    """Normalize a tag into CKAN's accepted form.

    Transliterates accents to ASCII, lowercases and trims the tag,
    drops any character outside letters/digits/dash/space, turns
    spaces into dashes, and finally clamps the result to the model's
    minimum/maximum tag length.
    """
    folded = substitute_ascii_equivalents(tag)
    cleaned = re.sub(r'[^a-zA-Z0-9\- ]', '', folded.lower().strip())
    dashed = cleaned.replace(' ', '-')
    return _munge_to_length(dashed, model.MIN_TAG_LENGTH, model.MAX_TAG_LENGTH)
def munge_tag(tag):
    """Return *tag* as a CKAN-safe tag string.

    Accents are folded to ASCII, the result is lowercased and trimmed,
    characters other than letters, digits, spaces and dashes are
    removed, and remaining spaces become dashes.
    """
    normalized = substitute_ascii_equivalents(tag).lower().strip()
    allowed_only = re.sub(r'[^a-zA-Z0-9 -]', '', normalized)
    return allowed_only.replace(' ', '-')
def munge_tag(tag):
    """Sanitize a tag for CKAN.

    Steps: ASCII-fold accented characters, lowercase, strip surrounding
    whitespace, delete everything except letters/digits/space/dash, then
    replace spaces with dashes.
    """
    lowered = substitute_ascii_equivalents(tag).lower()
    trimmed = lowered.strip()
    kept = re.sub(r"[^a-zA-Z0-9 -]", "", trimmed)
    return kept.replace(" ", "-")
except ContentFetchError, e: self._save_gather_error('%r' % e.message, harvest_job) return False except KeyError, e: self._save_gather_error('Failed to parse response: %r' % e, harvest_job) return False #members = self.get_xroad_catalog("http://localhost:9090/rest-gateway-0.0.8-SNAPSHOT/Consumer/catalog", "2011-01-01") #file = open(os.path.join(os.path.dirname(__file__), '../tests/response.json')) #members = json.load(file) object_ids = [] for member in members: log.info(json.dumps(member)) #log.info(type(member['subsystems']['subsystem'])) # Create organization id org_id = substitute_ascii_equivalents(unicode(member.get('xRoadInstance', '')) + '.' + unicode(member.get('memberClass', '')) + '.' + unicode(member.get('memberCode', ''))) if member['subsystems'] and (type(member['subsystems']['subsystem']) is list): org = self._create_or_update_organization({'id': org_id, 'name': member['name'], 'created': member['created'], 'changed': member['changed'], 'removed': member.get('removed', None)}, harvest_job) for subsystem in member['subsystems']['subsystem']: # Generate GUID guid = substitute_ascii_equivalents(unicode(member.get('xRoadInstance', '')) + '.' + unicode(member.get('memberClass', '')) + '.' + unicode(member.get('memberCode', '')) + '.' + unicode(subsystem.get('subsystemCode', ''))) # Create harvest object obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps({ 'owner': org,
# NOTE(review): this span appears to be the interior of a harvester's
# gather stage (an enclosing ``def`` is not visible here, and
# ``object_ids`` is initialized but never appended within this excerpt
# -- the loop presumably continues beyond it; confirm against the full
# source).
object_ids = []
for member in members:
    # Skip plain-string entries in the member list.
    if isinstance(member, basestring):
        continue

    # if there is only 1 subsystem, wrap it with list
    if member['subsystems'] and (type(
            member['subsystems']['subsystem']) is dict):
        member['subsystems']['subsystem'] = [
            member['subsystems']['subsystem']
        ]

    # Create organization id: "<xRoadInstance>.<memberClass>.<memberCode>",
    # transliterated to closest ASCII so it is safe as a CKAN id.
    org_id = substitute_ascii_equivalents(u'.'.join(
        unicode(member.get(p, ''))
        for p in ('xRoadInstance', 'memberClass', 'memberCode')))
    org = self._create_or_update_organization(
        {
            'id': org_id,
            'name': member['name'],
            'created': member['created'],
            'changed': member['changed'],
            'removed': member.get('removed', None)
        }, harvest_job)

    if org is None:
        # Record the failure so the harvest report shows which member
        # could not be mapped to an organization.
        self._save_gather_error(
            'Failed to create organization with id: %s and name: %s' %
            (org_id, member['name']), harvest_job)
def read_data(self, id, resource_id):
    """
    Mine keyword statistics from a resource's locally stored file and
    render the data-mining view.

    Plain-text resources are fed directly to Orange text mining
    (``orngText``); office/HTML formats are first converted to text via
    ``convert_to_text``.  High-scoring word n-grams are exposed to the
    template as ``c.data_tags`` and, on first run, stored on the package
    as an ``autoextracted_description`` extra.

    NOTE(review): indentation below was reconstructed from a collapsed
    source; the nesting of the statistics loops is best-effort -- verify
    against the original file.

    :param id: package id or name
    :param resource_id: id of the resource to mine
    """
    res = Resource.get(resource_id)
    pkg = Package.get(id)
    c.pkg_dict = pkg.as_dict()
    c.package = pkg
    c.resource = get_action('resource_show')({'model': model},
                                             {'id': resource_id})
    # Derive the storage label from the URL part after
    # "<site_url>/storage/f/".
    label = res.url.split(config.get('ckan.site_url') + '/storage/f/')[-1]
    label = urllib2.unquote(label)
    ofs = get_ofs()
    try:
        # Resolve a local filesystem path for the stored file.
        furl = ofs.get_url(BUCKET, label).split('file://')[-1]
    except FileNotFoundException:
        # Mining only works on files held in local storage.
        h.flash_error(_('Cannot do data mining on remote resource!'))
        url = h.url_for(controller='package',
                        action='resource_read',
                        id=id,
                        resource_id=resource_id)
        return redirect(url)
    wordstats = {}
    ret = {}
    if res.format in ('TXT', 'txt'):
        # Write a one-entry corpus file: the file path plus a line of
        # category words, in the format orngText expects.
        wdsf, wdspath = tempfile.mkstemp()
        os.write(wdsf, "%s\nmetadata description title information" % furl)
        with os.fdopen(wdsf, 'r') as wordfile:
            preproc = orngText.Preprocess()
            table = orngText.loadFromListWithCategories(wdspath)
            data = orngText.bagOfWords(table, preprocessor=preproc)
            words = orngText.extractWordNGram(data,
                                              threshold=10.0,
                                              measure='MI')
        for i in range(len(words)):
            d = words[i]
            wordstats = d.get_metas(str)
        for k, v in wordstats.items():
            # keep only n-grams whose score clears the threshold
            if v.value > 10.0:
                ret[unicode(k, 'utf8')] = v.value
        from operator import itemgetter
        # top 30 n-grams by score, for display in the template
        c.data_tags = sorted(ret.iteritems(),
                             key=itemgetter(1),
                             reverse=True)[:30]
        os.remove(wdspath)
        for i in range(len(data)):
            d = words[i]
            wordstats = d.get_metas(str)
        words = []
        for k, v in wordstats.items():
            words.append(k)
        model.repo.new_revision()
        # only set the extra once; never overwrite an existing one
        if not 'autoextracted_description' in pkg.extras:
            pkg.extras['autoextracted_description'] = ' '.join(words)
        pkg.save()
        return render('datamining/read.html')
    elif res.format in ('odt', 'doc', 'xls', 'ods', 'odp', 'ppt', 'doc',
                        'html'):
        # Office/HTML documents: convert to plain text first.
        textfd, textpath = convert_to_text(res, furl)
        if not textpath:
            h.flash_error(_('This file could not be mined for any data!'))
            os.close(textfd)
            return render('datamining/read.html')
        else:
            wdsf, wdspath = tempfile.mkstemp()
            os.write(wdsf,
                     "%s\nmetadata description title information" % textpath)
            preproc = orngText.Preprocess()
            table = orngText.loadFromListWithCategories(wdspath)
            data = orngText.bagOfWords(table, preprocessor=preproc)
            words = orngText.extractWordNGram(data,
                                              threshold=10.0,
                                              measure='MI')
            for i in range(len(words)):
                d = words[i]
                wordstats = d.get_metas(str)
            for k, v in wordstats.items():
                if v.value > 10.0:
                    ret[unicode(k, 'utf8')] = v.value
            from operator import itemgetter
            c.data_tags = sorted(ret.iteritems(),
                                 key=itemgetter(1),
                                 reverse=True)[:30]
            # clean up both the corpus file and the converted text file
            os.close(textfd)
            os.close(wdsf)
            os.remove(wdspath)
            os.remove(textpath)
            for i in range(len(data)):
                d = words[i]
                wordstats = d.get_metas(str)
            words = []
            for k, v in wordstats.items():
                log.debug(k)
                words.append(substitute_ascii_equivalents(k))
            model.repo.new_revision()
            if not 'autoextracted_description' in pkg.extras:
                pkg.extras['autoextracted_description'] = ' '.join(words)
            pkg.save()
            return render('datamining/read.html')
    else:
        # unsupported format: bounce back to the resource page
        h.flash_error(_('This metadata document is not in proper format for data mining!'))
        url = h.url_for(controller='package',
                        action='resource_read',
                        id=id,
                        resource_id=resource_id)
        return redirect(url)
# Member = organization # Subsystem = package = API # Service = resource = WSDL object_ids = [] for member in members: if isinstance(member, basestring): continue # if there is only 1 subsystem, wrap it with list if member['subsystems'] and (type(member['subsystems']['subsystem']) is dict): member['subsystems']['subsystem'] = [member['subsystems']['subsystem']] # Create organization id org_id = substitute_ascii_equivalents(u'.'.join(unicode(member.get(p, '')) for p in ('xRoadInstance', 'memberClass', 'memberCode'))) org = self._create_or_update_organization({ 'id': org_id, 'name': member['name'], 'created': member['created'], 'changed': member['changed'], 'removed': member.get('removed', None) }, harvest_job) if org is None: continue if self._organization_has_wsdls(member): for subsystem in member['subsystems']['subsystem']: