def toDB(rec):
    db = {}
    db['city'] = 'badenwuerttemberg'  # Baden-Württemberg is not a city ?!
    db['source'] = 'd'
    db['costs'] = None
    db['categories'] = categoryToODM(rec['category'])
    db['url'] = rec['page-url']
    db['title'] = rec['title']
    db['description'] = rec['description']
    db['publisher'] = rec['herausgeber']
    db['filelist'] = [extractUrl(rec['url'])]
    db['formats'] = formatToODM(rec['format'])
    db['licenseshort'] = licenseToODM(rec['nutzungsbedingungen'])
    temps = filter(lambda x: x != "", [rec['zeitraum'], rec['stichtag'], rec['publiziert am']])
    db['temporalextent'] = temps[0] if temps else None
    db['open'] = metautils.isopen(db['licenseshort'])
    db['spatial'] = False
    db['metadata'] = ''  # db.copy() - not worth it
    db['metadata_xml'] = None
    return db
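# Hedged usage sketch (not from the original source): toDB expects a scraped
# record dict with the German field names read above. The concrete values below
# are hypothetical, and categoryToODM, formatToODM, licenseToODM and extractUrl
# are assumed to be the repository's own helpers available in this module.
example_rec = {
    'category': 'Umwelt und Klima',
    'page-url': 'https://www.daten.example/dataset/luftqualitaet',
    'title': 'Luftqualitaet 2014',
    'description': 'Messwerte der Luftqualitaet',
    'herausgeber': 'LUBW',
    'url': 'https://www.daten.example/files/luftqualitaet.csv',
    'format': 'CSV',
    'nutzungsbedingungen': 'Datenlizenz Deutschland Namensnennung 2.0',
    'zeitraum': '2014',
    'stichtag': '',
    'publiziert am': '01.01.2015',
}
# db_row = toDB(example_rec)  # yields a dict shaped for the ODM database table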
import re

def import_data(rec):
    rec['originating_portal'] = portalname
    rec['city'] = city
    rec['source'] = 'd'
    rec['publisher'] = ''
    rec['description'] = None
    rec['costs'] = None
    rec['metadata_xml'] = None
    rec['spatial'] = False
    rec['categories'] = [category_to_odm_map[rec['categories']]]
    rec['filelist'] = []
    rec['metadata'] = ''
    # According to http://www.arnsberg.de/open-data/nutzungsbedingungen.php
    # nothing seems to be marked differently
    rec['licenseshort'] = 'dl-de-zero-2.0'
    rec['open'] = metautils.isopen(rec['licenseshort'])
    # If a year of the 21st century appears in the title, use it as the temporalextent
    # instead of the date the file was added.
    # This is inconsistent, but still better?
    t = re.search(r'20\d\d', rec['title'])
    if t:
        rec['temporalextent'] = t.group(0)
    return rec
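# Hedged example (not from the original source): the regex above picks the first
# 21st-century year out of the title, so a hypothetical record titled
# 'Haushaltsplan 2014' would get temporalextent '2014' rather than its upload date.
import re
assert re.search(r'20\d\d', 'Haushaltsplan 2014').group(0) == '2014'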
vstellekey = 'author'
catskey = 'tags'
catssubkey = 'name'

# Generate URL for the catalog page
row[u'URL PARENT'] = url + '/dataset/' + package['name']
if 'notes' in package and package['notes'] != None:
    row[u'Beschreibung'] = package['notes']
    if cityname == 'koeln':
        row[u'Beschreibung'] = metautils.unrenderhtml(row[u'Beschreibung'])
else:
    row[u'Beschreibung'] = ''
row[u'Zeitlicher Bezug'] = ''
if licensekey in package and package[licensekey] != None:
    row[u'Lizenz'] = package[licensekey]
    # If not already short, try to convert
    if metautils.isopen(row[u'Lizenz'], quiet=True) is None:
        row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
else:
    row[u'Lizenz'] = 'nicht bekannt'
if vstellekey in package and package[vstellekey] != None:
    row[u'Veröffentlichende Stelle'] = package[vstellekey]
else:
    row[u'Veröffentlichende Stelle'] = ''
    if 'extras' in package:
        print 'WARNING: No author/maintainer/publisher, checking extras'
        for extra in package['extras']:
            if extra['key'] == 'contacts':
                print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra['value']
for group in metautils.setofvaluesasarray(package[catskey], catssubkey):
    if cityname != 'berlin':
        odm_cats = metautils.govDataLongToODM(group)
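# Hedged example (not from the original source): a minimal CKAN package dict this
# block can handle. 'tags' as a list of {'name': ...} dicts follows the CKAN API;
# the value of licensekey is not shown in this excerpt, so 'license_id' below is
# only an assumption, and setofvaluesasarray is assumed to collect the 'name'
# values from the tag dicts.
example_package = {
    'name': 'haushaltsplan-2014',
    'notes': 'Haushaltsplan der Stadt als CSV',
    'license_id': 'dl-de-by-2.0',  # assumed key; the code reads package[licensekey]
    'author': 'Stadtkaemmerei',
    'tags': [{'name': 'finanzen'}, {'name': 'haushalt'}],
    'extras': [{'key': 'contacts', 'value': 'opendata@stadt.example'}],
}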
if len(row.xpath('td[2]/text()')) != 0:
    val = row.xpath('td[2]/text()')[0]
elif len(row.xpath('td[2]//a')) != 0:
    val = row.xpath('td[2]//a/text()')[0]
else:
    if (verbose):
        print 'ERROR: Missing value'
    print 'Exiting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem'
    exit()
if (verbose):
    print 'Parsing key ' + key.replace(':', '') + ' with value ' + val
if u'veröffentlicht' in key:
    record['publisher'] = val
elif u'geändert' in key:
    record['temporalextent'] = val.split(' ')[2]
elif u'Lizenz' in key:
    record['licenseshort'] = metautils.long_license_to_short(val)
    record['open'] = metautils.isopen(record['licenseshort'])
elif u'Webseite' in key:
    record['website'] = row.xpath('td[2]//a/@href')[0]  # keep, as 'original' metadata
    if 'http://' not in record['website']:
        record['website'] = rooturl + record['website']
elif u'Kontakt' in key:
    record['contact'] = rooturl + row.xpath('td[2]//a/@href')[0]
allrecords.append(record)

# Find things in multiple categories
recordsdict = {}
for record in allrecords:
    if record['title'] not in recordsdict:
        recordsdict[record['title']] = record
    else:
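        # (The body of this else branch is not part of this excerpt. Given the
        # "multiple categories" comment above, it presumably merges the duplicate
        # record's categories into the record already stored for that title; a
        # hypothetical sketch, assuming each record carries a 'categories' list:
        #     recordsdict[record['title']]['categories'].extend(record['categories'])
        # )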
metautils.setsettings(settings)

print '\nMarking all Bonn Google data as rejected (needs to be changed if Google searches are ever resumed!)'
cur = metautils.getDBCursor(settings, dictCursor=True)
cur.execute('update data set accepted = %s where city = %s and source = %s', (False, 'bonn', 'g'))
metautils.dbCommit()

print '\nResetting open...'
cur = metautils.getDBCursor(settings, dictCursor=True)
cur.execute('select url, licenseshort from data')
for ores in cur.fetchall():
    if ores['licenseshort'].strip() == '':
        license = 'nicht bekannt'
        open = None
    else:
        open = metautils.isopen(ores['licenseshort'].strip())
        license = ores['licenseshort'].strip()
    cur.execute('update data set licenseshort = %s, open = %s where url = %s', (license, open, ores['url']))
metautils.dbCommit()

print 'Finding cities with data...'
cities = metautils.getCitiesWithData()
print cities

print '\nRemoving search engine data that has been found with own crawler...'
for city in cities:
    cur = metautils.getDBCursor(settings, dictCursor=True)
    # Get all Google and Bing data to see if the files have also been found by crawling
    cur.execute('SELECT source, url FROM data WHERE city LIKE %s AND (source = %s OR source = %s) AND accepted = %s',
                (city, 'b', 'g', True))
    gbres = cur.fetchall()
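    # Hedged continuation sketch (not part of this excerpt): gbres now holds the
    # accepted Google ('g') and Bing ('b') rows for the city. Per the message
    # printed above, the script presumably compares these URLs against the rows
    # found by the city's own crawler (source = 'c' is only an assumption here)
    # and rejects the search engine duplicates, roughly along these lines:
    #
    #     cur.execute('SELECT url FROM data WHERE city LIKE %s AND source = %s', (city, 'c'))
    #     crawled = set(r['url'] for r in cur.fetchall())
    #     for r in gbres:
    #         if r['url'] in crawled:
    #             cur.execute('update data set accepted = %s where url = %s and source = %s',
    #                         (False, r['url'], r['source']))
    #     metautils.dbCommit()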