def fill_cities(data, session):
    """Insert the parsed city data and their monthly stats into the DB.

    :param data: iterable of parsed city objects exposing .coords,
        .name, .region, .country, .wiki_source, .pop, .country_rank,
        .region_rank and .month_stats (dict: stat code -> 12 values)
    :param session: SQLAlchemy session bound to the target database
    """
    # map stat code -> stat id so MonthlyStat rows can reference them
    stats_dict = dict(session.query(Stat.code, Stat.id))
    timer = Timer(len(data))
    for city in data:
        # BUG FIX: srid=4326 was previously a keyword argument of
        # str.format() — which silently ignores unused kwargs — instead of
        # WKTElement, so the geometry was stored without the intended SRID.
        geom = WKTElement('POINT({:.8f} {:.8f})'.format(city.coords[1],
                                                        city.coords[0]),
                          srid=4326)
        city_db = City(
            location=geom,
            name=city.name,
            region=city.region,
            country=city.country,
            source=city.wiki_source,
            population=city.pop,
            country_rank=city.country_rank,
            region_rank=city.region_rank)
        session.add(city_db)
        # commit the cities so we can use them
        # I'm not sure this is the clean way to do it...
        # (a session.flush() would be enough to get city_db.id and is
        # cheaper, but the original commit-per-city behaviour is kept)
        session.commit()
        for code, month_stats in city.month_stats.items():
            if code not in stats_dict:
                continue
            for month_idx, value in enumerate(month_stats):
                ms = MonthlyStat(
                    month=month_idx,
                    city_id=city_db.id,
                    stat_id=stats_dict[code],
                    value=value)
                session.add(ms)
        timer.update()
    # for the last monthly stats
    session.commit()
def fill_cities(data, session):
    """Insert the parsed city data and their monthly stats into the DB.

    :param data: iterable of parsed city objects exposing .coords,
        .name, .region, .country, .wiki_source, .pop, .country_rank,
        .region_rank and .month_stats (dict: stat code -> 12 values)
    :param session: SQLAlchemy session bound to the target database
    """
    # map stat code -> stat id so MonthlyStat rows can reference them
    stats_dict = dict(session.query(Stat.code, Stat.id))
    timer = Timer(len(data))
    for city in data:
        # BUG FIX: srid=4326 was previously a keyword argument of
        # str.format() — which silently ignores unused kwargs — instead of
        # WKTElement, so the geometry was stored without the intended SRID.
        geom = WKTElement('POINT({:.8f} {:.8f})'.format(city.coords[1],
                                                        city.coords[0]),
                          srid=4326)
        city_db = City(location=geom, name=city.name, region=city.region,
                       country=city.country, source=city.wiki_source,
                       population=city.pop, country_rank=city.country_rank,
                       region_rank=city.region_rank)
        session.add(city_db)
        # commit the cities so we can use them
        # I'm not sure this is the clean way to do it...
        # (a session.flush() would be enough to get city_db.id and is
        # cheaper, but the original commit-per-city behaviour is kept)
        session.commit()
        for code, month_stats in city.month_stats.items():
            if code not in stats_dict:
                continue
            for month_idx, value in enumerate(month_stats):
                ms = MonthlyStat(month=month_idx, city_id=city_db.id,
                                 stat_id=stats_dict[code], value=value)
                session.add(ms)
        timer.update()
    # for the last monthly stats
    session.commit()
def add_priority_index(session, fast_mode=False):
    """Decide the order in which the cities should be selected.

    Assigns City.priority_index for every city that has monthly stats.
    Cities are first ranked by region_rank, country_rank and (descending)
    number of monthly stats. Unless fast_mode, the order is then rebuilt
    greedily: each next city is chosen among the candidates farthest from
    all previously chosen ones, preferring the best-ranked of those.

    :param session: SQLAlchemy session bound to the cities database
    :param fast_mode: if True, just use the SQL ORDER BY ranking as-is
    """
    # each row is (City, latitude, longitude) — ST_Y/ST_X of the POINT geom
    cities = session.query(City,
                           func.ST_Y(cast(City.location, Geometry())),
                           func.ST_X(cast(City.location, Geometry()))) \
        .join(MonthlyStat) \
        .order_by(City.region_rank, City.country_rank,
                  desc(func.count(MonthlyStat.id))) \
        .group_by(City.id) \
        .yield_per(1000).all()
    if fast_mode:
        logger.info('doing the fast version of priority index')
        for i, city in enumerate(cities):
            city[0].priority_index = i
        session.commit()
        return

    # NOTE(review): distance_fn is never referenced below (the precomputed
    # 'distances' matrix is used instead) — looks like dead code.
    def distance_fn(tuple1, tuple2):
        _, lat1, lon1 = tuple1
        _, lat2, lon2 = tuple2
        return lat_lon_fast_distance(lat1, lon1, lat2, lon2)

    # indices: cities already placed (seeded with the top-ranked city);
    # indices_left: candidates still to place, in ORDER BY order
    indices = [0]
    indices_left = list(range(1, len(cities)))
    # pre-calculate the distances between all the cities
    logger.info('pre-calculating the distances between all cities')
    lats = numpy.array([c[1] for c in cities])
    lons = numpy.array([c[2] for c in cities])
    # full pairwise matrix via broadcasting: distances[i, j] is the
    # distance between city i and city j
    distances = lat_lon_fast_distance(lats.reshape(-1, 1),
                                      lons.reshape(-1, 1),
                                      lats.reshape(1, -1),
                                      lons.reshape(1, -1))

    class CityComp(object):
        # heap entry: orders candidates by their distance to the nearest
        # already-chosen city
        idx = None  # NOTE(review): never used
        max_dist = None  # distance to the nearest chosen city
        max_dist_idx = None  # candidate's position in indices_left

        def __init__(self, max_dist, max_dist_idx):
            self.max_dist = max_dist
            self.max_dist_idx = max_dist_idx

        def __lt__(self, other):
            # min-heap on max_dist: heap[0] is the worst kept candidate
            return self.max_dist < other.max_dist

    # each city is compared to all the previous ones (maximum)
    timer = Timer(len(indices_left))
    # percent of closest cities to choose from
    perc_closest_cities = 0.1
    # same but max
    max_closest_cities = 200
    while len(indices_left) > 0:
        # let's find the next city amongst the next candidates
        # this will be our (heap) list of good candidates, i.e. the ones
        # farthest from all the others
        good_candidates = []
        # NOTE(review): nb_keep may be fractional (min of float and int);
        # the >=/< comparisons below still work, but int() would be clearer
        nb_keep = min(perc_closest_cities * len(indices_left),
                      max_closest_cities)
        nb_keep = max(1, nb_keep)  # at least 1!
        logger.debug('will keep the farthest %i', nb_keep)
        # max_dist = 0.
        # max_dist_idx = 0
        logger.debug('---------looking for the next one----------')
        for no_candidate, i_left in enumerate(indices_left):
            # logger.debug('candidate %i, idx %i', no_candidate, i_left)
            # find how close is the nearest neighbor for this city
            # we are looking for the city with the farthest nearest neighbor
            dist_nearest_neighbor = 1e9
            # get the distance of our candidate to the closest (already
            # chosen) city
            too_close = False
            for i_chosen in indices:
                cur_dist = distances[i_chosen, i_left]
                # if we already have enough candidates, and if the current is
                # worse than all others, let's skip it
                if len(good_candidates) >= nb_keep \
                        and cur_dist <= good_candidates[0].max_dist:
                    too_close = True
                    # logger.debug('too close @%f', cur_dist)
                    break
                dist_nearest_neighbor = min(dist_nearest_neighbor, cur_dist)
            # we don't compare the distance of this candidate with all cities
            # if it's closer to (already chosen) city than our best candidate
            # so far
            if too_close:
                continue
            # dist_nearest_neighbor = numpy.min(distances[indices][:,i_left])
            # logger.debug('candidate %i has a city at %f', no_candidate,
            #              dist_nearest_neighbor)
            # if dist_nearest_neighbor > best_candidate.max_dist:
            # logger.debug('(new max)')
            new_candidate = CityComp(dist_nearest_neighbor, no_candidate)
            # logger.debug('trying to add new candidate with dist %f',
            #              new_candidate.max_dist)
            # if we don't have enough anyway
            if len(good_candidates) < nb_keep:
                heapq.heappush(good_candidates, new_candidate)
            else:
                # if we have enough, just keep the n best
                # NOTE(review): rejected_cand is only used by the
                # commented-out debug log below
                rejected_cand = heapq.heappushpop(good_candidates,
                                                  new_candidate)
                # logger.debug('removed candidate %i with dist %f',
                #              rejected_cand.max_dist_idx,
                #              rejected_cand.max_dist)
        # take the smallest index in our good candidates. this corresponds to
        # the best (according to our first ORDER BY) amongst the "far enough"
        # candidates
        best_candidate = min(good_candidates, key=lambda x: x.max_dist_idx)
        logger.debug('keeping %s with pop %i',
                     cities[indices_left[best_candidate.max_dist_idx]][0].name,
                     cities[indices_left[best_candidate.max_dist_idx]][0].population,)
        # input('press to continue')
        indices.append(indices_left.pop(best_candidate.max_dist_idx))
        logger.debug('done, best candidate was %i with distance %f',
                     best_candidate.max_dist_idx, best_candidate.max_dist)
        logger.debug('done, chosen: %i, remaining: %i', len(indices),
                     len(indices_left))
        timer.update()
    assert len(indices) == len(cities)
    for priority_index, i in enumerate(indices):
        cities[i][0].priority_index = priority_index
    session.commit()
# NOTE(review): fragment begins mid parser.add_argument('--min-pop', ...)
# call; the opening of the call is outside this chunk.
                    help='minimum population to'
                    ' keep the city (if there are multiple population'
                    ' fields, we keep the maximum)', type=int)
args = parser.parse_args()
configure_logging()
# NOTE(review): the file handle is never closed, and pickle.load from a
# text-mode handle only works on Python 2 — confirm target interpreter.
dump_in = pickle.load(open(args.input_file))
# NOTE(review): 'True or' short-circuits the overwrite confirmation — this
# looks like a debugging leftover that should probably be removed.
if True or ask_before_overwrite(args.output_file):
    dump_out = open(args.output_file, 'w')
else:
    sys.exit()
timer = Timer(len(dump_in))
new_data = {}
# counters, presumably reported after the loop (tail not visible here)
nb_no_climate = 0
nb_coords_from_wiki = 0
nb_coords_from_dbpedia = 0
for i, (city, infos) in enumerate(dump_in.items()):
    timer.update(i)
    if args.max_cities is not None and i + 1 > args.max_cities:
        break
    logger.debug(city)
    # parsing population
    pop = parse_population(infos)
    if pop < args.min_pop:
        continue
    wikiurl = urlparse('http://' + infos['source'])
# NOTE(review): script fragment (argparse setup + main loop head); 'parser',
# 'configure_logging', 'parse_population', 'Timer' etc. are defined outside
# this chunk, and the loop body continues past its end.
parser.add_argument('--max-cities', '-m', type=int)
parser.add_argument('--min-pop', default=1e6,
                    help='minimum population to'
                    ' keep the city (if there are multiple population'
                    ' fields, we keep the maximum)', type=int)
args = parser.parse_args()
configure_logging()
# NOTE(review): the file handle is never closed, and pickle.load from a
# text-mode handle only works on Python 2 — confirm target interpreter.
dump_in = pickle.load(open(args.input_file))
# NOTE(review): 'True or' short-circuits the overwrite confirmation — this
# looks like a debugging leftover that should probably be removed.
if True or ask_before_overwrite(args.output_file):
    dump_out = open(args.output_file, 'w')
else:
    sys.exit()
timer = Timer(len(dump_in))
new_data = {}
# counters, presumably reported after the loop (tail not visible here)
nb_no_climate = 0
nb_coords_from_wiki = 0
nb_coords_from_dbpedia = 0
for i, (city, infos) in enumerate(dump_in.items()):
    timer.update(i)
    if args.max_cities is not None and i+1 > args.max_cities:
        break
    logger.debug(city)
    # parsing population
    pop = parse_population(infos)
    if pop < args.min_pop:
        continue
    wikiurl = urlparse('http://' + infos['source'])
# NOTE(review): fragment — this 'return' closes a function whose 'def' line
# is outside the chunk.
    return month_stats

if __name__ == '__main__':
    # arg 1 : file to open
    city_data = pickle.load(open(sys.argv[1]))
    # arg 2 : output dump
    output = sys.argv[2]
    if not ask_before_overwrite(output):
        sys.exit()
    filtered_cities = {}
    not_found = []
    timer = Timer(len(city_data), 100)
    for city, data in city_data.items():
        filtered_city = {}
        # city keys look like '<...>/<name>' — keep only the last segment
        name = city.split('/')[-1]
        # remove keys we want to ignore
        # (iterate over a copy of the keys since data is mutated below)
        for k in list(data.keys()):
            for regex in IGNORE:
                if regex.match(k):
                    # print(' removing', k, 'from', city)
                    # print(' using', regex.pattern)
                    data.pop(k)
                    break
            # break
            # else:
            #     print(' ', k, 'not match', regex.pattern)
# NOTE(review): fragment — begins inside one SPARQL triple-quoted string and
# ends inside another; the original line breaks within those strings were
# lost in this dump, so the string-bearing segments are kept verbatim.
HAVING(MAX(?pop) > %i) """ % args.min_pop, limit=args.max_cities))
print('got', len(cities))

# named helper (a lambda would do): missing cities get a defaultdict(list)
def f(): return defaultdict(list)
cities_dict = defaultdict(f)
# regroup the flat SPARQL result rows by city URI
for c in cities:
    city = c['city']['value']
    for k in c.keys():
        cities_dict[city][k] = c[k]['value']
timer = Timer(len(cities))
for city in cities_dict.keys():
    # get the properties of the city
    results = sparql_query( sparql, """ PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> SELECT ?p ?o WHERE {{ <{}> ?p ?o. FILTER( regex(?p, "population", "i") || regex(?p, "elevation", "i"))
def print_if(n):
    """Progress-print predicate: defer to Timer's default behaviour for the
    first 100 iterations, then only report every 100th one."""
    if n >= 100:
        return n % 100 == 0
    return Timer.default_print_if(n)
# NOTE(review): fragment — these first lines are the tail of a print_if(n)
# helper whose 'def' line is outside this chunk.
    if n < 100:
        return Timer.default_print_if(n)
    else:
        return n % 100 == 0

def keep_city(city):
    # keep only cities matching the optional --country/--region/--city
    # filters (each filter is ignored when its argument is None)
    if (args.country is None or city.country == args.country) and \
            (args.region is None or city.region == args.region) and \
            (args.city is None or city.name == args.city):
        return True
    return False

# count the cities we will actually process so the timer total is right
nb_cities = sum(1 for x in dump_in if keep_city(x))
if args.max_cities is not None and args.max_cities < nb_cities:
    nb_cities = args.max_cities
timer = Timer(nb_cities, print_if=print_if)
# counters, presumably reported after the loop (tail not visible here)
nb_no_wiki = 0
nb_no_climate = 0
nb_already_there = 0
nb_coords_from_wiki = 0
nb_coords_from_geonames = 0
nb_done = 0
for city in dump_in:
    if args.max_cities is not None and nb_done >= args.max_cities:
        break
    if not keep_city(city):
        continue
    timer.update()
    logger.debug(city)
    city_id = '{}/{}/{}'.format(city.name, city.region, city.country)
def add_priority_index(session, fast_mode=False):
    """Decide the order in which the cities should be selected.

    Assigns City.priority_index for every city that has monthly stats.
    Cities are first ranked by region_rank, country_rank and (descending)
    number of monthly stats. Unless fast_mode, the order is then rebuilt
    greedily: each next city is chosen among the candidates farthest from
    all previously chosen ones, preferring the best-ranked of those.

    :param session: SQLAlchemy session bound to the cities database
    :param fast_mode: if True, just use the SQL ORDER BY ranking as-is
    """
    # each row is (City, latitude, longitude) — ST_Y/ST_X of the POINT geom
    cities = session.query(City,
                           func.ST_Y(cast(City.location, Geometry())),
                           func.ST_X(cast(City.location, Geometry()))) \
        .join(MonthlyStat) \
        .order_by(City.region_rank, City.country_rank,
                  desc(func.count(MonthlyStat.id))) \
        .group_by(City.id) \
        .yield_per(1000).all()
    if fast_mode:
        logger.info('doing the fast version of priority index')
        for i, city in enumerate(cities):
            city[0].priority_index = i
        session.commit()
        return

    # NOTE(review): distance_fn is never referenced below (the precomputed
    # 'distances' matrix is used instead) — looks like dead code.
    def distance_fn(tuple1, tuple2):
        _, lat1, lon1 = tuple1
        _, lat2, lon2 = tuple2
        return lat_lon_fast_distance(lat1, lon1, lat2, lon2)

    # indices: cities already placed (seeded with the top-ranked city);
    # indices_left: candidates still to place, in ORDER BY order
    indices = [0]
    indices_left = list(range(1, len(cities)))
    # pre-calculate the distances between all the cities
    logger.info('pre-calculating the distances between all cities')
    lats = numpy.array([c[1] for c in cities])
    lons = numpy.array([c[2] for c in cities])
    # full pairwise matrix via broadcasting: distances[i, j] is the
    # distance between city i and city j
    distances = lat_lon_fast_distance(lats.reshape(-1, 1),
                                      lons.reshape(-1, 1),
                                      lats.reshape(1, -1),
                                      lons.reshape(1, -1))

    class CityComp(object):
        # heap entry: orders candidates by their distance to the nearest
        # already-chosen city
        idx = None  # NOTE(review): never used
        max_dist = None  # distance to the nearest chosen city
        max_dist_idx = None  # candidate's position in indices_left

        def __init__(self, max_dist, max_dist_idx):
            self.max_dist = max_dist
            self.max_dist_idx = max_dist_idx

        def __lt__(self, other):
            # min-heap on max_dist: heap[0] is the worst kept candidate
            return self.max_dist < other.max_dist

    # each city is compared to all the previous ones (maximum)
    timer = Timer(len(indices_left))
    # percent of closest cities to choose from
    perc_closest_cities = 0.1
    # same but max
    max_closest_cities = 200
    while len(indices_left) > 0:
        # let's find the next city amongst the next candidates
        # this will be our (heap) list of good candidates, i.e. the ones
        # farthest from all the others
        good_candidates = []
        # NOTE(review): nb_keep may be fractional (min of float and int);
        # the >=/< comparisons below still work, but int() would be clearer
        nb_keep = min(perc_closest_cities * len(indices_left),
                      max_closest_cities)
        nb_keep = max(1, nb_keep)  # at least 1!
        logger.debug('will keep the farthest %i', nb_keep)
        # max_dist = 0.
        # max_dist_idx = 0
        logger.debug('---------looking for the next one----------')
        for no_candidate, i_left in enumerate(indices_left):
            # logger.debug('candidate %i, idx %i', no_candidate, i_left)
            # find how close is the nearest neighbor for this city
            # we are looking for the city with the farthest nearest neighbor
            dist_nearest_neighbor = 1e9
            # get the distance of our candidate to the closest (already
            # chosen) city
            too_close = False
            for i_chosen in indices:
                cur_dist = distances[i_chosen, i_left]
                # if we already have enough candidates, and if the current is
                # worse than all others, let's skip it
                if len(good_candidates) >= nb_keep \
                        and cur_dist <= good_candidates[0].max_dist:
                    too_close = True
                    # logger.debug('too close @%f', cur_dist)
                    break
                dist_nearest_neighbor = min(dist_nearest_neighbor, cur_dist)
            # we don't compare the distance of this candidate with all cities
            # if it's closer to (already chosen) city than our best candidate
            # so far
            if too_close:
                continue
            # dist_nearest_neighbor = numpy.min(distances[indices][:,i_left])
            # logger.debug('candidate %i has a city at %f', no_candidate,
            #              dist_nearest_neighbor)
            # if dist_nearest_neighbor > best_candidate.max_dist:
            # logger.debug('(new max)')
            new_candidate = CityComp(dist_nearest_neighbor, no_candidate)
            # logger.debug('trying to add new candidate with dist %f',
            #              new_candidate.max_dist)
            # if we don't have enough anyway
            if len(good_candidates) < nb_keep:
                heapq.heappush(good_candidates, new_candidate)
            else:
                # if we have enough, just keep the n best
                # NOTE(review): rejected_cand is only used by the
                # commented-out debug log below
                rejected_cand = heapq.heappushpop(good_candidates,
                                                  new_candidate)
                # logger.debug('removed candidate %i with dist %f',
                #              rejected_cand.max_dist_idx,
                #              rejected_cand.max_dist)
        # take the smallest index in our good candidates. this corresponds to
        # the best (according to our first ORDER BY) amongst the "far enough"
        # candidates
        best_candidate = min(good_candidates, key=lambda x: x.max_dist_idx)
        logger.debug(
            'keeping %s with pop %i',
            cities[indices_left[best_candidate.max_dist_idx]][0].name,
            cities[indices_left[best_candidate.max_dist_idx]][0].population,
        )
        # input('press to continue')
        indices.append(indices_left.pop(best_candidate.max_dist_idx))
        logger.debug('done, best candidate was %i with distance %f',
                     best_candidate.max_dist_idx, best_candidate.max_dist)
        logger.debug('done, chosen: %i, remaining: %i', len(indices),
                     len(indices_left))
        timer.update()
    assert len(indices) == len(cities)
    for priority_index, i in enumerate(indices):
        cities[i][0].priority_index = priority_index
    session.commit()
# NOTE(review): fragment — 'reader'/'regions'/'args' come from a csv block
# opened before this chunk, and the trailing DictReader call is cut off
# mid-argument-list.
    for row in reader:
        # rows use geonames 'admin1' codes: '<country>.<region>'
        country, region = row['country_region'].split('.')
        if region in regions[country]:
            raise Exception('A region is present twice in the file')
        regions[country][region] = row['name']
# pprint(regions)
countries = {}
with open(args.country_infos_file) as f:
    # skip the '#'-prefixed comment header lines of countryInfo.txt
    reader = csv.reader((line for line in f if not line.startswith('#')),
                        delimiter='\t')
    for row in reader:
        # column 0 is the ISO country code, column 4 the country name
        countries[row[0]] = row[4]
# pprint(countries)
timer = Timer()
# column layout of the geonames main dump (tab-separated, no header)
fields = [
    'geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
    'longitude', 'feature class', 'feature code', 'country code', 'cc2',
    'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code', 'population',
    'elevation', 'dem', 'timezone', 'modification date'
]
logger.info('reading the data')
cities = defaultdict(lambda: defaultdict(dict))
nb_cities_kept = 0
with open(args.input_file) as f:
    reader = csv.DictReader(f, delimiter='\t', fieldnames=fields,