def import_data(rec):
    rec['originating_portal'] = portalname
    rec['city'] = city
    rec['source'] = 'd'
    rec['publisher'] = ''
    rec['description'] = None
    rec['costs'] = None
    rec['metadata_xml'] = None
    rec['spatial'] = False
    rec['categories'] = [category_to_odm_map[rec['categories']]]
    rec['filelist'] = []
    rec['metadata'] = ''
    # According to http://www.arnsberg.de/open-data/nutzungsbedingungen.php
    # nothing seems to be marked differently
    rec['licenseshort'] = 'dl-de-zero-2.0'
    rec['open'] = metautils.isopen(rec['licenseshort'])
    # If a year of the 21st century appears in the title, use it as the temporal extent
    # instead of the date the file was added.
    # This is inconsistent, but still better?
    t = re.search(r'20\d\d', rec['title'])
    if t:
        rec['temporalextent'] = t.group(0)
    return rec
def recordToDB(rec):
    db = {}
    db['city'] = 'braunschweig'
    db['source'] = 'd'
    db['costs'] = None
    db['url'] = rec['url']
    db['title'] = rec['title']
    db['description'] = rec['abstract']
    db['temporalextent'] = rec['created']
    db['publisher'] = rec['organisation']
    db['filelist'] = rec['filelist']
    db['formats'] = formatsToODM(rec['formats'])
    db['categories'] = categoriesToODM(rec['topic category'])
    db['licenseshort'] = licenseToODM(rec['rights'])
    db['open'] = metautils.isopen(db['licenseshort'])
    db['spatial'] = isSpatialFormat(db['formats'])
    additionalMetadata = [
        'accessRights', 'modified', 'spatials', 'type', 'subjects', 'categoriesB'
    ]
    db['metadata'] = dict(db.items() + {key: rec[key] for key in additionalMetadata}.items())
    # XML metadata only includes data from the catalog API?!
    db['metadata_xml'] = rec['xml']
    return db
def import_data(self, d):
    d = metautils.gerToEngKeys(d)
    d['originating_portal'] = portalname
    d['accepted'] = True
    d['source'] = 'd'
    d['metadata_xml'] = None
    d['costs'] = None
    d['spatial'] = None
    d['open'] = metautils.isopen(d.get('licenseshort', '').strip())
    d['temporalextent'] = ''  # TODO: check whether the source actually provides this
    return d
def import_data(self, rec):
    d = imp_rec(rec)
    d = metautils.gerToEngKeys(d)
    d['open'] = metautils.isopen(d.get('licenseshort', '').strip())
    d['json'] = ''
    d['publisher'] = ''
    d['originating_portal'] = 'daten.ulm.de'
    d['accepted'] = True
    d['source'] = 'd'
    d[u'metadata_xml'] = rec.get('metadata_xml', '')
    return d
def import_data(self, d):
    d = import_package(d)
    d = metautils.gerToEngKeys(d)
    d = dict(d)
    d['originating_portal'] = portalname
    d['accepted'] = True
    d['source'] = 'd'
    d['metadata_xml'] = None
    d['costs'] = None
    d['spatial'] = None
    d['open'] = metautils.isopen(d['licenseshort'].strip())
    d['publisher'] = ''  # actually it's in the data
    d['filelist'] = d['files']
    return d
def import_data(self, d):
    d = import_package(d)
    d = metautils.gerToEngKeys(d)
    d['originating_portal'] = portalname
    d['accepted'] = True
    d['costs'] = None
    d['open'] = metautils.isopen(d['licenseshort'])
    d['publisher'] = None
    d['spatial'] = None
    d['source'] = 'd'
    d['metadata_xml'] = None
    d['filelist'] = d['files']
    return d
def import_data(self, d):
    d = importCity(self.city, self.url, d)
    if d != {}:
        d = metautils.gerToEngKeys(d)
        d = dict(d)
        d['originating_portal'] = self.portalname
        d['accepted'] = True
        d['costs'] = None
        d['spatial'] = None
        d['source'] = 'd'
        d['metadata_xml'] = None
        d['formats'] = list(d['formats'])
        d['open'] = metautils.isopen(d['licenseshort'].strip())
        if 'categories' not in d:
            d['categories'] = []
        d['filelist'] = d['files']
    return d
def import_data(self, d):
    d = importCity(self.city, self.url, d)
    if d != {}:
        d = metautils.gerToEngKeys(d)
        d = dict(d)
        d['city'] = self.city
        d['originating_portal'] = self.portalname
        d['accepted'] = True
        d['costs'] = None
        d['spatial'] = None
        d['source'] = 'd'
        d['metadata_xml'] = None
        d['formats'] = list(d['formats'])
        d['open'] = metautils.isopen(d['licenseshort'].strip())
        if 'categories' not in d:
            d['categories'] = []
        d['filelist'] = d['files']
    return d
def toDB(rec):
    db = {}
    db['city'] = 'badenwuerttemberg'  # Baden-Württemberg is not a city?!
    db['source'] = 'd'
    db['costs'] = None
    db['categories'] = categoryToODM(rec['category'])
    db['url'] = rec['url']
    db['title'] = rec['title']
    db['description'] = rec['description']
    db['publisher'] = rec['herausgeber']
    db['filelist'] = [extractUrl(rec['file-url'])]
    db['formats'] = formatToODM(rec['format'])
    db['licenseshort'] = licenseToODM(rec['nutzungsbedingungen'])
    temps = filter(lambda x: x != "", [rec['zeitraum'], rec['stichtag'], rec['publiziert am']])
    db['temporalextent'] = temps[0] if temps else None
    db['open'] = metautils.isopen(db['licenseshort'])
    db['spatial'] = False
    db['metadata'] = ''
    db['metadata_xml'] = None
    return db
def importCity(cityname, url, package):
    if cityname == 'hamburg':
        # Only take 'open data'
        if package['type'] != 'dataset' or 'forward-reference' in package['title']:
            return {}

    resources = []
    formats = set()
    files = []
    # Key for the file link in the resource
    urlkeys = ['url']
    formatkey = 'format'

    if 'resources' in package:
        resources = package['resources']
    for file in resources:
        for urlkey in urlkeys:
            if file[urlkey] not in [None, '']:
                if '://' not in file[urlkey]:
                    files.append(url + file[urlkey])
                else:
                    files.append(file[urlkey])
                break
        if formatkey in file and file[formatkey] not in [None, '']:
            format = file[formatkey]
            formats.add(format.upper())

    row = {}
    row[u'Stadt'] = cityname
    row[u'Dateibezeichnung'] = package['title']
    row[u'URL PARENT'] = url + '/dataset/' + package['name']

    if cityname in ('hamburg', 'koeln', 'frankfurt', 'aachen', 'berlin', 'muenchen'):
        if cityname in ('hamburg', 'frankfurt', 'aachen'):
            licensekey = 'license_id'
            vstellekey = 'author'
            catskey = 'groups'
            catssubkey = 'title'
        elif cityname == 'muenchen':
            licensekey = 'license_id'
            vstellekey = 'maintainer'
            catskey = 'groups'
            catssubkey = 'title'
        elif cityname in ('koeln', 'berlin'):
            licensekey = 'license_title'
            vstellekey = 'maintainer'
            if cityname == 'koeln':
                catskey = 'tags'
            elif cityname == 'berlin':
                catskey = 'groups'
            catssubkey = 'name'

        # Generate URL for the catalog page
        if 'notes' in package and package['notes'] is not None:
            row[u'Beschreibung'] = package['notes']
            if cityname == 'koeln':
                soup = BeautifulSoup(row[u'Beschreibung'])
                row[u'Beschreibung'] = soup.getText('\n')
        else:
            row[u'Beschreibung'] = ''

        row[u'Zeitlicher Bezug'] = ''
        if licensekey in package and package[licensekey] is not None:
            row[u'Lizenz'] = package[licensekey]
            # If not already short, try to convert
            if metautils.isopen(row[u'Lizenz']) == 'Unbekannt':
                row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
        else:
            row[u'Lizenz'] = 'nicht bekannt'

        if vstellekey in package and package[vstellekey] is not None:
            row[u'Veröffentlichende Stelle'] = package[vstellekey]
        else:
            row[u'Veröffentlichende Stelle'] = ''
            if 'extras' in package:
                print 'WARNING: No author/maintainer/publisher, checking extras'
                for extra in package['extras']:
                    if extra['key'] == 'contacts':
                        print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra['value']

        for group in metautils.setofvaluesasarray(package[catskey], catssubkey):
            if cityname != 'berlin':
                odm_cats = metautils.govDataLongToODM(group)
            else:
                odm_cats = berlin_to_odm(group)
        row[u'categories'] = odm_cats

    # Bonn is just different enough to do it separately. TODO: Consider combining into the above.
    elif cityname == 'bonn':
        row[u'Beschreibung'] = package.get('description', '')
        for timeattempt in ['temporal', 'modified']:
            if timeattempt in package and package[timeattempt] not in [None, '']:
                row[u'Zeitlicher Bezug'] = package[timeattempt]
                break
        row[u'Zeitlicher Bezug'] = row.get(u'Zeitlicher Bezug', '')
        row[u'Lizenz'] = package.get('license', False)
        if not row[u'Lizenz']:
            row[u'Lizenz'] = package['license_title']
        row[u'Veröffentlichende Stelle'] = package.get('publisher', '')
        cats = package.get('keyword', [])
        odm_cats = map(lambda x: metautils.govDataLongToODM(x, checkAll=True), cats)
        row[u'categories'] = odm_cats  # assumed: mirrors the assignment in the branch above
        resources = package.get(u'distribution', [])
        for r in resources:
            files.append(r[u'accessURL'])
            formats.add(r[u'format'])  # formats is a set, so add() rather than append()

    row[u'Format'] = formats
    row[u'files'] = files
    row['metadata'] = package
    return row
def get_categorie_content(category_link):
    # Get the page
    allrecords = []
    parser = etree.HTMLParser(encoding='utf-8')
    data = etree.parse(rooturl + category_link, parser)
    # Get the category
    category = data.xpath('/html/body/div/div[5]/div/div[1]//h1/text()')[0].strip()
    # category = urllib.unquote(category).decode('utf8')
    if verbose:
        print 'Category: ' + ascii_only(category)
    datasets = get_datasets(data)
    numdatasets = len(datasets)
    if verbose:
        print 'There are ' + str(numdatasets) + ' datasets'
    # Now get the HTML for each one. This is painful.
    # The bit of HTML concerning the datasets:
    corehtml = data.xpath('//div[@id=\'ContentBlock\']')[0]
    # First try to split by the horizontal rules. This usually works, but not always.
    datasetparts = etree.tostring(corehtml).split('<hr id="hr')
    if verbose:
        print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by hr elements with ids'
    if len(datasetparts) != numdatasets:
        if verbose:
            print 'This doesn\'t match. Trying with links to TOC'
        # If there is a TOC, this works. There isn't always one.
        datasetparts = etree.tostring(corehtml).split('nach oben')
        del datasetparts[len(datasetparts) - 1]
        for index in range(0, len(datasetparts)):
            datasetparts[index] = datasetparts[index] + '</a>'
        if verbose:
            print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by links to TOC'
        if len(datasetparts) != numdatasets:
            if verbose:
                print 'Well, that didn\'t work either. Giving up'
            print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
            exit()
    else:
        if numdatasets > 1:
            for index in range(1, len(datasetparts)):
                # That split makes for bad HTML. Make it better.
                datasetparts[index] = '<hr id="hr' + datasetparts[index]

    count = 1
    for datasetpart in datasetparts:
        data = etree.HTML(datasetpart)
        record = {}
        record['city'] = 'bochum'
        record['categories'] = []
        record['categories'].append(category)
        datasets = get_datasets(data)
        record['title'] = datasets[0]
        if verbose:
            print 'Parsing dataset ' + ascii_only(record['title'])
        if 'noch im Aufbau' in record['title']:
            # Nothing to see here
            if verbose:
                print 'Empty category'
            continue
        record['url'] = rooturl + category_link + '#par' + str(count)
        count += 1
        datatables, filetables = findfilesanddata(data)
        if len(datatables) == 0:
            if verbose:
                print 'This record contains no data... checking for link to another page...'
            checkforsubpage = data.xpath('//span//a')
            for link in checkforsubpage:
                if verbose:
                    print etree.tostring(link)
                if len(link.xpath('text()')) > 0 and u'zu den Daten' in link.xpath('text()')[0]:
                    testurl = link.xpath('@href')[0]
                    if verbose:
                        print 'Following/updating URL: ' + rooturl + testurl
                    record['url'] = rooturl + testurl
                    datatables, filetables = findfilesanddata(html.parse(rooturl + testurl))
        # Get the data on the files, and get each link in it
        record['filelist'] = []
        for table in filetables:
            record['filelist'].extend([(rooturl + x) for x in etree.HTML(table).xpath('//a/@href')])
        record['formats'] = set()
        record['spatial'] = False
        for file in record['filelist']:
            formatarray = file.split('/')[-1].split('.')
            format = 'Unknown'
            if len(formatarray) > 1:
                format = formatarray[1].upper().split('?')[0]
            elif 'WMS' in formatarray[0]:
                format = 'WMS'
            elif 'WFS' in formatarray[0]:
                format = 'WFS'
            record['formats'].add(format)
            if format.upper() in metautils.geoformats:
                record['spatial'] = True
        record['formats'] = list(record['formats'])
        if len(datatables) > 1:
            if verbose:
                print 'ERROR: More than one data table'
            print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
            exit()
        elif len(datatables) == 0:
            if verbose:
                print 'ERROR: No data table'
            print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
            exit()
        # Parse the data table by row
        if verbose:
            print 'Reading datatable...'
        rowelements = etree.HTML(datatables[0]).xpath('//tr')
        for row in rowelements:
            if len(row.xpath('td[1]/text()')) == 0:
                continue
            key = row.xpath('td[1]/text()')[0]
            if verbose:
                print ascii_only(key)
            if len(row.xpath('td[2]/text()')) != 0:
                val = row.xpath('td[2]/text()')[0]
            elif len(row.xpath('td[2]//a')) != 0:
                val = row.xpath('td[2]//a/text()')[0]
            else:
                if verbose:
                    print 'ERROR: Missing value'
                print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
                exit()
            if verbose:
                print ascii_only('Parsing key ' + key.replace(':', '') + ' with value ' + val)
            if u'veröffentlicht' in key:
                record['publisher'] = val
            elif u'geändert' in key:
                record['temporalextent'] = val.split(' ')[2]
            elif u'Lizenz' in key:
                record['licenseshort'] = metautils.long_license_to_short(val)
                record['open'] = metautils.isopen(record['licenseshort'])
            elif u'Webseite' in key:
                record['website'] = row.xpath('td[2]//a/@href')[0]  # keep, as 'original' metadata
                if 'http://' not in record['website']:
                    record['website'] = rooturl + record['website']
            elif u'Kontakt' in key:
                record['contact'] = rooturl + row.xpath('td[2]//a/@href')[0]
        allrecords.append(record)
    return allrecords
def test_is_open_uppercase(self):
    assert metautils.isopen('CC BY 3.0 DE') == 'Offen'
def test_isopen(self):
    assert metautils.isopen('dl-de-by-2.0') == 'Offen'
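# The two tests above only pin down metautils.isopen() for two open licence IDs.
# Minimal sketch of the behaviour the importers in this file rely on (NOT the real
# metautils implementation): a whitelist of open licence IDs returning 'Offen', with
# 'Unbekannt' as the fallback that importCity() compares against. The whitelist
# contents and the lower-casing are assumptions made for illustration only.
OPEN_LICENSE_IDS = {'cc by 3.0 de', 'cc-by-3.0', 'dl-de-by-2.0', 'dl-de-zero-2.0'}  # hypothetical subset

def isopen_sketch(licenseshort):
    key = licenseshort.strip().lower()  # assumed normalisation
    return 'Offen' if key in OPEN_LICENSE_IDS else 'Unbekannt'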
def importCity(cityname, url, package):
    if cityname == 'hamburg':
        # Only take 'open data'
        if package['type'] != 'dataset' or 'forward-reference' in package['title']:
            return {}
    # There is a version of CKAN that can output private datasets,
    # but DKAN uses this field for different purposes
    if package['private'] and cityname not in dkanCities:
        return {}

    resources = []
    formats = set()
    files = []
    # Key for the file link in the resource
    urlkeys = ['url']
    formatkey = 'format'

    if 'resources' in package:
        resources = package['resources']
    for file in resources:
        for urlkey in urlkeys:
            if file[urlkey] not in [None, '']:
                if '://' not in file[urlkey]:
                    files.append(url + file[urlkey])
                else:
                    files.append(file[urlkey])
                break
        if formatkey in file and file[formatkey] not in [None, '']:
            format = file[formatkey]
            formats.add(format.upper())

    row = {}
    row[u'Stadt'] = cityname
    row[u'Dateibezeichnung'] = package['title']
    if 'name' in package:
        row[u'URL PARENT'] = url + '/dataset/' + package['name']
    elif 'url' in package:
        row[u'URL PARENT'] = package['url']
    else:
        row[u'URL PARENT'] = ''

    if cityname in v3cities:
        licensekey = 'license_id'
        vstellekey = 'author'
        catskey = 'groups'
        catssubkey = 'title'
        if cityname == 'berlin':
            catssubkey = 'name'
    elif cityname == 'muenchen':
        licensekey = 'license_id'
        vstellekey = 'maintainer'
        catskey = 'groups'
        catssubkey = 'title'
    elif cityname in dkanCities:
        licensekey = 'license_title'
        vstellekey = 'maintainer'
        catskey = 'tags'
        catssubkey = 'name'

    # Generate URL for the catalog page
    if 'notes' in package and package['notes'] is not None:
        row[u'Beschreibung'] = package['notes']
        if cityname == 'koeln':
            soup = BeautifulSoup(row[u'Beschreibung'])
            row[u'Beschreibung'] = soup.getText('\n')
    else:
        row[u'Beschreibung'] = ''

    row[u'Zeitlicher Bezug'] = ''
    if licensekey in package and package[licensekey] is not None:
        row[u'Lizenz'] = package[licensekey]
        # If not already short, try to convert
        if metautils.isopen(row[u'Lizenz']) == 'Unbekannt':
            row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
    else:
        row[u'Lizenz'] = 'nicht bekannt'

    if vstellekey in package and package[vstellekey] is not None:
        row[u'Veröffentlichende Stelle'] = package[vstellekey]
    else:
        row[u'Veröffentlichende Stelle'] = ''
        if 'extras' in package:
            print 'WARNING: No author/maintainer/publisher, checking extras'
            for extra in package['extras']:
                if extra['key'] == 'contacts':
                    print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra['value']

    cat_groups = metautils.setofvaluesasarray(package[catskey], catssubkey)
    if cityname != 'berlin':
        odm_cats = metautils.matchCategories(cat_groups)
    else:
        for group in cat_groups:
            odm_cats = berlin_to_odm(group)
    row[u'categories'] = odm_cats

    row[u'Format'] = formats
    row[u'files'] = files
    row['metadata'] = package
    row[u'original_metadata'] = {
        u'metadata_created': package['metadata_created'],
        u'metadata_modified': package['metadata_modified']
    }
    return row
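# Hedged usage example for importCity(): a hypothetical CKAN package dict containing
# only keys the function actually reads. It assumes 'hamburg' is listed in v3cities
# and uses a stand-in portal URL; all values are illustrative, not real catalogue data.
example_package = {
    'type': 'dataset',
    'private': False,
    'title': 'Beispieldatensatz',
    'name': 'beispieldatensatz',
    'notes': 'Kurze Beschreibung des Datensatzes',
    'license_id': 'dl-de-by-2.0',
    'author': 'Fachbehoerde',
    'groups': [{'title': 'Verkehr'}],
    'resources': [{'url': '/download/datensatz.csv', 'format': 'csv'}],
    'metadata_created': '2014-01-01T00:00:00',
    'metadata_modified': '2014-06-01T00:00:00',
}
example_row = importCity('hamburg', 'http://example-portal.de', example_package)
# example_row[u'Format'] then holds set(['CSV']), example_row[u'files'] the resolved
# download URL, and example_row[u'Lizenz'] the licence ID taken from 'license_id'.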