def audit_fixme(db, collection):
    """Fold every FIXME-style tag into ``fixme`` and hand the result on.

    Each alternative spelling of the tag is renamed to the canonical
    ``fixme`` key in all documents that carry it. Afterwards
    ``db_ops.move_db`` is called with the ``'NeedsFix'`` target, the
    ``fixme`` field and ``None`` as value (presumably moving every document
    that has a ``fixme`` field -- confirm against ``db_ops``).

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    # The canonical key itself needs no renaming, so it is not listed here.
    variants = ('FIXME', 'FIXME2', 'fixme:de', 'note:FIXME', 'source:fixme')
    for variant in variants:
        # The legacy ``update`` call only touches the first matching document
        # unless ``multi=True`` is passed; with an empty filter that was just
        # the first document of the collection. Select the documents that
        # actually carry the variant and rename it in all of them.
        db[collection].update(
            {variant: {'$exists': True}},
            {'$rename': {variant: 'fixme'}},
            multi=True)
    db_ops.move_db(db, collection, 'NeedsFix', 'fixme', None)
def audit_postcode(db, collection):
    """Cross-validate postcodes against the Google geocoding API.

    For every value returned by ``db_ops.get_values`` for
    ``address.postcode`` the geocoder is queried and the federal state is
    read from the first result. ``address.state`` is then filled in on the
    documents of that postcode that do not have it yet, and postcodes whose
    state is not Hamburg are passed to ``db_ops.move_db`` with the
    ``'SurroundingStates'`` target.

    Postcodes for which no state could be determined (no results, query
    limit reached, or no known state among the address components) are
    skipped instead of reusing the state of a previous postcode.

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    known_states = ('Schleswig-Holstein', 'Niedersachsen', 'Hamburg',
                    'Lower Saxony')
    pcs = db_ops.get_values(db, collection, ['address.postcode'])
    for pc in pcs:
        # One given postcode did not exist:
        if pc == '22701':
            pc = '22765'  # found at google maps
            db[collection].update(
                {'address.postcode': '22701'},
                {'$set': {'address.postcode': pc}},
                multi=True)

        # Postcodes and states were cross-validated with the google maps API.
        # NOTE(review): the "urlencode($_REQUEST[...])" text is sent verbatim
        # as part of the address parameter. It is left unchanged because the
        # existing results were validated with this exact request -- confirm
        # before cleaning it up.
        url = ('http://maps.googleapis.com/maps/api/geocode/json'
               '?address=urlencode($_REQUEST[' + pc
               + ' Germany])&sensor=false')
        attempts = 0
        success = False
        while not success and attempts < 3:
            page = urllib.urlopen(url)
            try:
                data = json.loads(page.read())
            finally:
                page.close()
            attempts += 1
            if data['status'] == "OVER_QUERY_LIMIT":
                time.sleep(2)
                continue
            success = True
        # Report the limit only when no attempt got through; a successful
        # third attempt is not a failure.
        if not success:
            print("Daily limit has been reached")

        results = data.get('results')
        if not results:
            # Nothing to validate against (also the over-limit case).
            continue

        # Reset per postcode so a miss never inherits the previous state.
        state = None
        for component in results[0]['address_components']:
            if component['long_name'] in known_states:
                state = component['long_name']
                if state == 'Lower Saxony':
                    state = 'Niedersachsen'
        if state is None:
            continue

        # Set the state field, if not already given. The ``$exists`` test
        # needs a real boolean, and it has to be part of the update filter:
        # ``find`` always returns a cursor, so comparing it with None never
        # restricted anything.
        db[collection].update(
            {'address.postcode': pc, 'address.state': {'$exists': False}},
            {'$set': {'address.state': state}},
            multi=True)
        if state != 'Hamburg':
            db_ops.move_db(db, collection, 'SurroundingStates',
                           'address.postcode', pc)
def audit_state(db, collection):
    """Apply manual state corrections and hand non-Hamburg states on.

    The ``address.state`` field is corrected through ``db_ops.update_db``
    using a fixed mapping of abbreviations and English names. The values are
    then read again and every state other than ``'Hamburg'`` is passed to
    ``db_ops.move_db`` with the ``'SurroundingStates'`` target.

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    field = 'address.state'
    # Initial read; the result is replaced by the second read further down.
    states = db_ops.get_values(db, collection, [field])

    # One document had DE as state. Validation showed that Schleswig-Holstein
    # is the correct value. Be careful if using different data sets.
    manual_fixes = {
        'DE': 'Schleswig-Holstein',
        'NI': 'Niedersachsen',
        'Lower Saxony': 'Niedersachsen',
        'HH': 'Hamburg',
    }
    db_ops.update_db(db, collection, field, {'manual': manual_fixes})

    # Read the values again after the corrections were applied.
    states = db_ops.get_values(db, collection, [field])
    for state in states:
        if state == 'Hamburg':
            continue
        db_ops.move_db(db, collection, 'SurroundingStates', field, state)
def audit_postcode(db, collection):
    """Cross-validate postcodes against the Google geocoding API.

    For every value returned by ``db_ops.get_values`` for
    ``address.postcode`` the geocoder is queried and the federal state is
    read from the first result. ``address.state`` is then filled in on the
    documents of that postcode that do not have it yet, and postcodes whose
    state is not Hamburg are passed to ``db_ops.move_db`` with the
    ``'SurroundingStates'`` target.

    Postcodes for which no state could be determined (no results, query
    limit reached, or no known state among the address components) are
    skipped instead of reusing the state of a previous postcode.

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    known_states = ('Schleswig-Holstein', 'Niedersachsen', 'Hamburg',
                    'Lower Saxony')
    pcs = db_ops.get_values(db, collection, ['address.postcode'])
    for pc in pcs:
        # One given postcode did not exist:
        if pc == '22701':
            pc = '22765'  # found at google maps
            db[collection].update(
                {'address.postcode': '22701'},
                {'$set': {'address.postcode': pc}},
                multi=True)

        # Postcodes and states were cross-validated with the google maps API.
        # NOTE(review): the "urlencode($_REQUEST[...])" text is sent verbatim
        # as part of the address parameter. It is left unchanged because the
        # existing results were validated with this exact request -- confirm
        # before cleaning it up.
        url = ('http://maps.googleapis.com/maps/api/geocode/json'
               '?address=urlencode($_REQUEST[' + pc
               + ' Germany])&sensor=false')
        attempts = 0
        success = False
        while not success and attempts < 3:
            page = urllib.urlopen(url)
            try:
                data = json.loads(page.read())
            finally:
                page.close()
            attempts += 1
            if data['status'] == "OVER_QUERY_LIMIT":
                time.sleep(2)
                continue
            success = True
        # Report the limit only when no attempt got through; a successful
        # third attempt is not a failure.
        if not success:
            print("Daily limit has been reached")

        results = data.get('results')
        if not results:
            # Nothing to validate against (also the over-limit case).
            continue

        # Reset per postcode so a miss never inherits the previous state.
        state = None
        for component in results[0]['address_components']:
            if component['long_name'] in known_states:
                state = component['long_name']
                if state == 'Lower Saxony':
                    state = 'Niedersachsen'
        if state is None:
            continue

        # Set the state field, if not already given. The ``$exists`` test
        # needs a real boolean, and it has to be part of the update filter:
        # ``find`` always returns a cursor, so comparing it with None never
        # restricted anything.
        db[collection].update(
            {'address.postcode': pc, 'address.state': {'$exists': False}},
            {'$set': {'address.state': state}},
            multi=True)
        if state != 'Hamburg':
            db_ops.move_db(db, collection, 'SurroundingStates',
                           'address.postcode', pc)
def audit_state(db, collection):
    """Apply manual state corrections and hand non-Hamburg states on.

    The ``address.state`` field is corrected through ``db_ops.update_db``
    using a fixed mapping of abbreviations and English names. The values are
    then read again and every state other than ``'Hamburg'`` is passed to
    ``db_ops.move_db`` with the ``'SurroundingStates'`` target.

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    field = 'address.state'
    # Initial read; the result is replaced by the second read further down.
    states = db_ops.get_values(db, collection, [field])

    # One document had DE as state. Validation showed that Schleswig-Holstein
    # is the correct value. Be careful if using different data sets.
    manual_fixes = {
        'DE': 'Schleswig-Holstein',
        'NI': 'Niedersachsen',
        'Lower Saxony': 'Niedersachsen',
        'HH': 'Hamburg',
    }
    db_ops.update_db(db, collection, field, {'manual': manual_fixes})

    # Read the values again after the corrections were applied.
    states = db_ops.get_values(db, collection, [field])
    for state in states:
        if state == 'Hamburg':
            continue
        db_ops.move_db(db, collection, 'SurroundingStates', field, state)
def audit_city(db, collection):
    """Clean city names and cross-validate them with the geocoding API.

    Known problematic city names are corrected through
    ``db_ops.update_db``. Each city is then looked up at the Google
    geocoder:

    * cities without results are appended to the
      ``<collection>_unknown-cities.txt`` file in the ``Output`` directory;
    * a postal code found in the first result is written to a document of
      that city whose postcode differs;
    * cities whose state is not Hamburg are passed to ``db_ops.move_db``
      with the ``'SurroundingStates'`` target.

    Any error while evaluating a response is handled by pretty-printing the
    response and moving on to the next city.

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    cities = db_ops.get_values(db, collection, ['address.city'])
    # clean city names shown to have problem characters
    cities_to_clean = {
        'wrong_state': {
            'Barendorf, Kreis Lüneburg': 'Barendorf',
            'Wintermoor a. d. Ch.': 'Wintermoor an der Chaussee',
            'Moisburg/Hollenstedt': 'Moisburg',
            'Lauenburg/Elbe': 'Lauenburg Elbe',
        }
    }
    db_ops.update_db(db, collection, 'address.city', cities_to_clean)

    # Cross-validate cities with state and postal codes.
    # NOTE(review): ``cities`` was read before the clean-up above, so the
    # loop presumably still iterates the uncorrected names -- confirm
    # whether a re-read is wanted here.
    for city in cities:
        # NOTE(review): the "urlencode($_REQUEST[...])" text is sent verbatim
        # as part of the address parameter; left unchanged on purpose.
        url = ('http://maps.googleapis.com/maps/api/geocode/json'
               '?address=urlencode($_REQUEST["' + city.encode('utf8')
               + '" Germany])&sensor=false&oe=utf-8')
        attempts = 0
        success = False
        while not success and attempts < 3:
            page = urllib.urlopen(url)
            try:
                data = json.loads(page.read())
            finally:
                page.close()
            attempts += 1
            if data['status'] == "OVER_QUERY_LIMIT":
                time.sleep(2)
                continue
            success = True
        # Report the limit only when no attempt got through; a successful
        # third attempt is not a failure.
        if not success:
            print("Daily limit has been reached")

        if data['status'] == 'ZERO_RESULTS':
            with open('Output\\' + collection + '_unknown-cities.txt',
                      'a') as f:
                f.write(unicode(city + '\n').encode("utf-8"))
            continue

        try:
            for component in data['results'][0]['address_components']:
                if component['types'] == ['postal_code']:
                    pc = component['long_name']
                    mismatch = {'address.city': city,
                                'address.postcode': {'$ne': pc}}
                    # ``find`` always returns a cursor, so it cannot be
                    # tested against None; ``find_one`` tells whether a
                    # mismatching document really exists.
                    if db[collection].find_one(mismatch) is not None:
                        print(city, pc)
                        # Deliberately a single-document update, as before:
                        # a city can have several valid postcodes.
                        db[collection].update(
                            mismatch, {'$set': {'address.postcode': pc}})
                if component['types'] == ["administrative_area_level_1",
                                          "political"]:
                    state = component['long_name']
                    if state != 'Hamburg':
                        db_ops.move_db(db, collection, 'SurroundingStates',
                                       'address.city', city)
        except Exception:
            # Best effort: show the unexpected response and continue with
            # the next city. KeyboardInterrupt/SystemExit are no longer
            # swallowed, so the run can still be aborted.
            pprint.pprint(data)
def audit_city(db, collection):
    """Clean city names and cross-validate them with the geocoding API.

    Known problematic city names are corrected through
    ``db_ops.update_db``. Each city is then looked up at the Google
    geocoder:

    * cities without results are appended to the
      ``<collection>_unknown-cities.txt`` file in the ``Output`` directory;
    * a postal code found in the first result is written to a document of
      that city whose postcode differs;
    * cities whose state is not Hamburg are passed to ``db_ops.move_db``
      with the ``'SurroundingStates'`` target.

    Any error while evaluating a response is handled by pretty-printing the
    response and moving on to the next city.

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    cities = db_ops.get_values(db, collection, ['address.city'])
    # clean city names shown to have problem characters
    cities_to_clean = {
        'wrong_state': {
            'Barendorf, Kreis Lüneburg': 'Barendorf',
            'Wintermoor a. d. Ch.': 'Wintermoor an der Chaussee',
            'Moisburg/Hollenstedt': 'Moisburg',
            'Lauenburg/Elbe': 'Lauenburg Elbe',
        }
    }
    db_ops.update_db(db, collection, 'address.city', cities_to_clean)

    # Cross-validate cities with state and postal codes.
    # NOTE(review): ``cities`` was read before the clean-up above, so the
    # loop presumably still iterates the uncorrected names -- confirm
    # whether a re-read is wanted here.
    for city in cities:
        # NOTE(review): the "urlencode($_REQUEST[...])" text is sent verbatim
        # as part of the address parameter; left unchanged on purpose.
        url = ('http://maps.googleapis.com/maps/api/geocode/json'
               '?address=urlencode($_REQUEST["' + city.encode('utf8')
               + '" Germany])&sensor=false&oe=utf-8')
        attempts = 0
        success = False
        while not success and attempts < 3:
            page = urllib.urlopen(url)
            try:
                data = json.loads(page.read())
            finally:
                page.close()
            attempts += 1
            if data['status'] == "OVER_QUERY_LIMIT":
                time.sleep(2)
                continue
            success = True
        # Report the limit only when no attempt got through; a successful
        # third attempt is not a failure.
        if not success:
            print("Daily limit has been reached")

        if data['status'] == 'ZERO_RESULTS':
            with open('Output\\' + collection + '_unknown-cities.txt',
                      'a') as f:
                f.write(unicode(city + '\n').encode("utf-8"))
            continue

        try:
            for component in data['results'][0]['address_components']:
                if component['types'] == ['postal_code']:
                    pc = component['long_name']
                    mismatch = {'address.city': city,
                                'address.postcode': {'$ne': pc}}
                    # ``find`` always returns a cursor, so it cannot be
                    # tested against None; ``find_one`` tells whether a
                    # mismatching document really exists.
                    if db[collection].find_one(mismatch) is not None:
                        print(city, pc)
                        # Deliberately a single-document update, as before:
                        # a city can have several valid postcodes.
                        db[collection].update(
                            mismatch, {'$set': {'address.postcode': pc}})
                if component['types'] == ["administrative_area_level_1",
                                          "political"]:
                    state = component['long_name']
                    if state != 'Hamburg':
                        db_ops.move_db(db, collection, 'SurroundingStates',
                                       'address.city', city)
        except Exception:
            # Best effort: show the unexpected response and continue with
            # the next city. KeyboardInterrupt/SystemExit are no longer
            # swallowed, so the run can still be aborted.
            pprint.pprint(data)
def audit_fixme(db, collection):
    """Fold every FIXME-style tag into ``fixme`` and hand the result on.

    Each alternative spelling of the tag is renamed to the canonical
    ``fixme`` key in all documents that carry it. Afterwards
    ``db_ops.move_db`` is called with the ``'NeedsFix'`` target, the
    ``fixme`` field and ``None`` as value (presumably moving every document
    that has a ``fixme`` field -- confirm against ``db_ops``).

    Args:
        db: Database handle, indexed by collection name.
        collection: Name of the collection to audit.
    """
    # The canonical key itself needs no renaming, so it is not listed here.
    variants = ('FIXME', 'FIXME2', 'fixme:de', 'note:FIXME', 'source:fixme')
    for variant in variants:
        # The legacy ``update`` call only touches the first matching document
        # unless ``multi=True`` is passed; with an empty filter that was just
        # the first document of the collection. Select the documents that
        # actually carry the variant and rename it in all of them.
        db[collection].update(
            {variant: {'$exists': True}},
            {'$rename': {variant: 'fixme'}},
            multi=True)
    db_ops.move_db(db, collection, 'NeedsFix', 'fixme', None)