if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='augments the data from dbpedia with data from wikipedia'
        ' directly, like population/elevation/climate')
    parser.add_argument('input_file', help='dbpedia dump')
    parser.add_argument('output_file',
                        help='file where to dump the augmented data')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--min-pop', default=1e6, type=int,
                        help='minimum population to keep the city (if there'
                        ' are multiple population fields, we keep the'
                        ' maximum)')
    args = parser.parse_args()

    configure_logging()

    dump_in = pickle.load(open(args.input_file))

    if ask_before_overwrite(args.output_file):
        dump_out = open(args.output_file, 'w')
    else:
        sys.exit()

    timer = Timer(len(dump_in))
    new_data = {}
    nb_no_climate = 0
    nb_coords_from_wiki = 0
    nb_coords_from_dbpedia = 0
    for i, (city, infos) in enumerate(dump_in.items()):
        timer.update(i)
        if args.max_cities is not None and i + 1 > args.max_cities:
            break
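# The scripts in this dump lean on a few helpers from meteomap.utils
# (configure_logging, ask_before_overwrite, Timer, plus a custom open used
# in the last snippet). Their implementations are not part of this dump;
# the following is only a hypothetical sketch reconstructed from the call
# sites above, assuming Python 2 (the dumps are pickled to text-mode files).
import logging
import os
import time


def configure_logging(level='INFO'):
    # assumed: a plain basicConfig setup at the requested level
    logging.basicConfig(level=getattr(logging, level))


def ask_before_overwrite(path):
    # assumed: return True when it is fine to (over)write `path`
    if not os.path.exists(path):
        return True
    answer = raw_input('%s already exists, overwrite? [y/N] ' % path)
    return answer.strip().lower() == 'y'


class Timer(object):
    # assumed: a progress reporter over `total` iterations that prints every
    # `every` updates; update() is called both with and without an explicit
    # index in the scripts above, so both styles are supported here
    def __init__(self, total, every=1000):
        self.total = total
        self.every = every
        self.count = 0
        self.start = time.time()

    def update(self, i=None):
        self.count = self.count + 1 if i is None else i + 1
        if self.count % self.every == 0:
            elapsed = time.time() - self.start
            print('%d/%d done in %.1fs' % (self.count, self.total, elapsed))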
    parser.add_argument(
        '--update-only', action='store_true',
        help='We assume we already have cities in the output and we will'
        ' only (re-)augment those, skipping all the others')
    parser.add_argument(
        '--logging-level',
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        default='info')
    args = parser.parse_args()

    configure_logging(args.logging_level.upper())

    # validation of the passed arguments
    if args.append and args.update_only:
        raise Exception('cannot use --append and --update-only at the'
                        ' same time')

    with open(args.input_file) as f:
        dump_in = pickle.load(f)

    if not (args.append or args.force
            or ask_before_overwrite(args.output_file)):
        sys.exit()

    if args.skip_wiki:
        logger.info('skipping wikipedia')
        for c in dump_in:
            c.month_stats = {'avgHigh': [0] * 12,
                             'precipitation': [0] * 12}
            c.wiki_source = ''
        with open(args.output_file, 'w') as dump_out:
            pickle.dump(dump_in, dump_out)
        sys.exit()

    if args.append or args.update_only:
    for city in cities_dict.keys():
        # get the properties of the city
        results = sparql_query(
            sparql, """
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
            SELECT ?p ?o
            WHERE {{
                <{}> ?p ?o.
                FILTER(
                    regex(?p, "population", "i")
                    || regex(?p, "elevation", "i"))
            }}
            """.format(city))
        for c in results:
            att = c['p']['value']
            val = c['o']['value']
            # some negative values are weird
            val = clean_minuses(val)
            cities_dict[city][att].append(val)
        timer.update()

    # pprint(cities_dict)
    with open(output, 'w') as f:
        pickle.dump(dict(cities_dict), f)
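# sparql_query and the `sparql` endpoint object are not defined in this
# fragment. A minimal sketch of what they presumably look like, assuming
# the standard SPARQLWrapper library and the public dbpedia endpoint (both
# assumptions, not confirmed by this dump):
from SPARQLWrapper import SPARQLWrapper, JSON

# sparql = SPARQLWrapper('http://dbpedia.org/sparql')


def sparql_query(sparql, query):
    # run a SELECT query and return the list of variable bindings; each
    # binding is a dict like {'p': {'value': ...}, 'o': {'value': ...}},
    # which matches how the results are consumed above
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()['results']['bindings']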
            logger.info('deleting cities')
            session.query(MonthlyStat).delete()
            session.query(City).delete()
            session.commit()
        else:
            print('Did nothing.')
            sys.exit()
    else:
        # Here we are simply appending the new cities. Be sure that this
        # won't conflict in some way...
        if not are_you_sure(
                'The database is not empty, there are already {} cities,'
                ' do you still wish to proceed with loading data? If you'
                ' want to clear already existing cities, use the'
                ' --clear-cities flag'.format(nb_cities)):
            print('Did nothing.')
            sys.exit()

    logger.info('loading the data')
    with open(args.input_file) as f:
        data = pickle.load(f)

    if args.max_cities is not None:
        data = data[:args.max_cities]

    with session_scope() as session:
        logger.info('filling the database')
        fill_cities(data, session)
        logger.info('adding the priority index')
        add_priority_index(session, args.fast_priority_index)
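# session_scope is used above but not shown. It is presumably the standard
# SQLAlchemy "transactional scope" context manager from the library's docs;
# a sketch, assuming a sessionmaker called Session is configured elsewhere
# in meteomap:
from contextlib import contextmanager


@contextmanager
def session_scope():
    # provide a transactional scope around a series of operations:
    # commit on success, roll back on error, always close
    session = Session()
    try:
        yield session
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()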
    parser.add_argument('--force', action='store_true',
                        help='do not ask before overwriting the output file')
    parser.add_argument('--max-cities', '-m', type=int)
    parser.add_argument('--too-close', type=float, default=25.,
                        help='a city will be ignored if there is a bigger'
                        ' city closer than this radius')
    args = parser.parse_args()

    configure_logging()

    output = args.output_file
    if not (args.force or ask_before_overwrite(output)):
        sys.exit()

    fields = ['country_region', 'name', 'asciiname', 'geonameid']
    regions = defaultdict(dict)
    with open(args.admin1codes_file) as f:
        reader = csv.DictReader(f, delimiter='\t', fieldnames=fields)
        for row in reader:
            country, region = row['country_region'].split('.')
            if region in regions[country]:
                raise Exception('A region is present twice in the file')
            regions[country][region] = row['name']
    # pprint(regions)

    countries = {}
    with open(args.country_infos_file) as f:
        reader = csv.reader((line for line in f if not line.startswith('#')),
                            delimiter='\t')
        for row in reader:
            countries[row[0]] = row[4]
    # pprint(countries)
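# The --too-close option implies a later pass that drops a city whenever a
# bigger one sits within that radius (presumably in km). That pass is not
# part of this fragment; a sketch of the great-circle distance test it
# would need, using the haversine formula (hypothetical helper, not from
# the original source):
from math import asin, cos, radians, sin, sqrt


def distance_km(lat1, lon1, lat2, lon2):
    # haversine distance between two (lat, lon) points given in degrees,
    # using a mean earth radius of 6371 km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    a = (sin((lat2 - lat1) / 2) ** 2
         + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371 * asin(sqrt(a))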
            stats['precipitation'] = prec
        if 'precipitationDays' not in stats and 'rainDays' in stats:
            rainDays = stats['rainDays']
            if 'snowDays' in stats:
                print('warning: summing rain and snow days; days with both'
                      ' rain and snow may be double-counted')
                rainDays += stats['snowDays']
            stats['precipitationDays'] = rainDays
    return month_stats


if __name__ == '__main__':
    # arg 1 : file to open
    city_data = pickle.load(open(sys.argv[1]))
    # arg 2 : output dump
    output = sys.argv[2]

    if not ask_before_overwrite(output):
        sys.exit()

    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        name = city.split('/')[-1]

        # remove keys we want to ignore
        for k in list(data.keys()):
            for regex in IGNORE:
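# The fragment above cuts off inside the IGNORE loop. Going by its comment
# ("remove keys we want to ignore"), the missing body is presumably a regex
# match that deletes the key; a sketch, assuming IGNORE is a list of raw
# regex strings defined elsewhere in the file:
import re


def drop_ignored_keys(data, ignore):
    # delete every key of `data` that matches one of the `ignore` regexes
    for k in list(data.keys()):
        for regex in ignore:
            if re.search(regex, k):
                del data[k]
                break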
import pickle
import pprint
import sys

from meteomap.utils import open

if __name__ == '__main__':
    some_file = sys.argv[1]
    pprint.pprint(pickle.load(open(some_file)))
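# usage example for the viewer above (the script filename is hypothetical):
#   python view_dump.py dbpedia_dump.pkl
# a small debugging helper that pretty-prints any of the pickled
# intermediate files produced by the other scripts; note it uses the
# custom open from meteomap.utils rather than the builtin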