def save_dynamic_distance_year_graph(years, area_name, distance, node_type,
                                     crime_types=None):
    """ Saves a number of networks, where each network represents a year.

    Each network is saved to disk at a location that matches the parameters
    used to construct it. This method uses the box network method to make
    network creation faster. This means each network may not contain all the
    crimes available for that period of time.

    :param years: The time frame of networks to create. Should be a list of
                  integers.
    :param area_name: The name of the area to search for crimes. This should
                      be a valid entry in cities.json.
    :param distance: The maximum distance between two connected crimes.
    :param node_type: What each node represents. This is passed on to
                      network_creation.distance_graph.
    :param crime_types: The types of crimes to include in the network.

    Examples
    --------
    >>> save_dynamic_distance_year_graph([2008, 2009], 'miami', 1.6,
    ...                                  'crime', ['Theft'])
    """
    # build the start of each yearly window
    start_times = list(map(
        lambda args: datetime.datetime(**args),
        multithreading.combinations(month=[1], year=years, day=[1])))
    # add the end point of the final network to the list
    start_times.append(datetime.datetime(year=years[-1] + 1, month=1, day=1))

    path = 'data/{}/{}/distance/{}/{}'.format(
        area_name, get_crime_name(crime_types), distance, node_type)

    box_networks._multiprocess = False  # this is very bad, do not multiprocess!

    with open('cities.json', 'r') as f:
        zipcodes = json.load(f)[area_name]
    if crime_types is None:
        limits = {'zipcode': {'$in': zipcodes}}
    else:
        limits = {'zipcode': {'$in': zipcodes}, 'type': {'$in': crime_types}}

    i = 0
    while i < len(start_times) - 1:
        network_path = '{}/networks/{}_{}.graphml'.format(
            path, 'year', date_string(start_times[i]))
        if not os.path.exists(network_path):
            limits['date'] = {'$gte': start_times[i],
                              '$lt': start_times[i + 1]}
            logger.info('Building {}'.format(limits))
            g = box_networks.distance_crime_network_by_box(distance, area_name,
                                                           limits=limits)
            save_graph(g, network_path)
            gc.collect()
        else:
            logger.info('Network exists, skipping')
        i += 1
def get_box_networks(distance, bottom, left, width, height, num_box_x,
                     num_box_y, limits=None, keep_frac=1):
    """ Gets a number of networks contained in boxes.

    :param distance: The maximum distance between connected nodes.
    :param bottom: The bottom coordinate of the group of boxes.
    :param left: The left coordinate of the group of boxes.
    :param width: The total width of the group of boxes.
    :param height: The total height of the group of boxes.
    :param num_box_x: The number of boxes in a row.
    :param num_box_y: The number of boxes in a column.
    :param limits: Any restrictions on database output, e.g. {'type': 'Theft'}.
    :return: A map from the (x, y) index of a box to the network of crimes in
             the box.
    """
    box_width = width / num_box_x
    box_height = height / num_box_y
    params = multithreading.combinations(width=[box_width],
                                         height=[box_height],
                                         x=range(num_box_x),
                                         y=range(num_box_y),
                                         gl_bottom=[bottom],
                                         gl_left=[left],
                                         limits=[limits],
                                         distance=[distance],
                                         keep_frac=[keep_frac])
    logger.debug('Broke into {} boxes'.format(len(params)))

    if _multiprocess:
        results = multithreading.map_kwargs(get_box_network, params,
                                            failsafe=True)
    else:
        results = list(map(lambda args: get_box_network(**args), params))

    # fail if we are missing boxes
    if False in results:
        logger.fatal('Some boxes failed!')
        raise RuntimeError('some boxes failed to build')

    # map each box's (x, y) index to its network; a dict keyed by (x, y) is
    # the simplest structure here, though a 2d array would also work
    return {(params[i]['x'], params[i]['y']): results[i]
            for i in range(len(results))}
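# Hedged usage sketch for get_box_networks. The coordinates, box counts, and
# the 'Theft' limit below are illustrative assumptions, not values from any
# real run, and the number_of_nodes() call assumes the per-box networks are
# networkx graphs (suggested by the .graphml output elsewhere in this code).
def _example_box_networks():
    """ Hypothetical helper showing how the (x, y) -> network map is used. """
    networks = get_box_networks(distance=1.6,
                                bottom=25.5, left=-80.5,  # assumed SW corner
                                width=0.6, height=0.6,    # assumed extent
                                num_box_x=3, num_box_y=3,
                                limits={'type': {'$in': ['Theft']}})
    for (x, y), network in networks.items():
        logger.debug('box ({}, {}) has {} nodes'.format(
            x, y, network.number_of_nodes()))
    return networks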
def save_dynamic_distance_month_graph(years, area_name, distance, node_type,
                                      crime_types=None):
    """ Saves a number of networks, where each network represents a month.

    Each network is saved to disk at a location that matches the parameters
    used to construct it.

    :param years: The time frame of networks to create. Should be a list of
                  integers.
    :param area_name: The name of the area to search for crimes. This should
                      be a valid entry in cities.json.
    :param distance: The maximum distance between two connected crimes.
    :param node_type: What each node represents. This is passed on to
                      network_creation.distance_graph.
    :param crime_types: The types of crimes to include in the network.

    Examples
    --------
    >>> save_dynamic_distance_month_graph([2008, 2009], 'miami', 1.6,
    ...                                   'crime', ['Theft'])
    """
    # build the start of each monthly window
    start_times = list(map(
        lambda args: datetime.datetime(**args),
        multithreading.combinations(month=range(1, 13), year=years, day=[1])))
    # add the end point of the final network to the list
    start_times.append(datetime.datetime(year=years[-1] + 1, month=1, day=1))

    path = 'data/{}/{}/distance/{}/{}'.format(
        area_name, get_crime_name(crime_types), distance, node_type)

    with open('cities.json', 'r') as f:
        zipcodes = json.load(f)[area_name]

    i = 0
    while i < len(start_times) - 1:
        network_path = '{}/networks/{}_{}.graphml'.format(
            path, 'month', date_string(start_times[i]))
        if not os.path.exists(network_path):
            data = crime_window(start_times[i], start_times[i + 1],
                                zipcodes, crime_types)
            logger.info('{} crimes found for {}'.format(len(data),
                                                        start_times[i]))
            g = network_creation.distance_graph(data, distance, node_type)
            save_graph(g, network_path)
        else:
            logger.info('Network exists, skipping')
        i += 1
week_files = save_networks.week_files(datetime.datetime(2007, 1, 1),
                                      datetime.datetime(2011, 1, 1))
month_files = save_networks.month_files(range(2007, 2011))
year_files = save_networks.year_files(range(2007, 2011))
logger.debug(year_files)

_cities = ['baltimore', 'los_angeles', 'miami']
_clusterings = ['average', 'single', 'complete']
_levels = [50000, 25000]

params = multithreading.combinations(
    area=_cities,
    clustering=_clusterings,
    level=_levels,
    crime_types=[['all', 'assault', 'burglary', 'theft']],
    distances=[[3.2, 2.4, 1.6, 0.8, 0.1]],
    node_types=[['zip']],
    filenames=[year_files],
    algorithms=[['label_propagation']],
    iterations_list=[[1000]])

# filter out bad combinations, keeping only the clustering/level pairs we want
params = list(filter(
    lambda args: (args['clustering'] == 'average' and args['level'] == 50000)
    or (args['clustering'] == 'single' and args['level'] == 25000)
    or (args['clustering'] == 'complete' and args['level'] == 25000),
    params))

logger.info('{} Base Networks Found'.format(len(params)))
score_lists = multithreading.map_kwargs(get_z_scores, params)
logger.info('Combining Results')
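# For reference: multithreading.combinations is used throughout this code as
# if it returns the Cartesian product of its keyword arguments as a list of
# kwargs dicts. That is an inference from how params is indexed and passed to
# map_kwargs, not a documented contract. A minimal illustration:
#
#     multithreading.combinations(clustering=['average', 'single'],
#                                 level=[50000])
#     # -> [{'clustering': 'average', 'level': 50000},
#     #     {'clustering': 'single', 'level': 50000}]
#
# Wrapping a single value in a one-element list (e.g. node_types=[['zip']])
# therefore pins that argument to the same value in every combination.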
if __name__ == '__main__':
    # write info and debug to different files
    logging.config.dictConfig(json.load(open('logging_config.json', 'r')))
    fiona.log.setLevel(logging.WARNING)  # silence the noisy fiona logs

    week_files = save_networks.week_files(datetime.datetime(2007, 1, 1),
                                          datetime.datetime(2011, 1, 1))
    month_files = save_networks.month_files(range(2007, 2011))
    year_files = save_networks.year_files(range(2007, 2011))

    params = multithreading.combinations(
        city=['los_angeles', 'baltimore', 'miami'],
        crime_name=['all', 'theft', 'burglary', 'assault'],
        distance=[3.2, 2.4, 1.6, 0.8, 0.1],
        node_type=['crime', 'zip'],
        region_type=['voronoi', 'zip'],
        algorithm=['label_propagation'],
        filename=week_files + month_files + year_files,
        iterations=[100])

    # filter out bad combinations: only node_type/region_type pairs that match
    params = list(filter(
        lambda d: (d['node_type'] == 'crime' and d['region_type'] == 'voronoi')
        or (d['node_type'] == 'zip' and d['region_type'] == 'zip'),
        params))

    def work(city, crime_name, distance, node_type, region_type, algorithm,
             filename, iterations):
        unique_id = '{}-{}-{}-{}-{}-{}-{}'.format(city, distance, node_type,
                                                  region_type, algorithm,
                                                  filename, iterations)
def month_files(years, months=range(1, 13)):
    """ Returns the month network filename stems for the given years. """
    return list(map(
        lambda args: 'month_{year}-{month:0>2}-{day:0>2}'.format(**args),
        multithreading.combinations(year=years, month=months, day=[1])))
def year_files(years):
    """ Returns the year network filename stems for the given years. """
    return list(map(
        lambda args: 'year_{year}-{month:0>2}-{day:0>2}'.format(**args),
        multithreading.combinations(year=years, month=[1], day=[1])))
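# Hedged illustration of the filename stems the helpers above produce, read
# off their format strings rather than taken from captured output:
#
#     year_files([2007, 2008])
#     # -> ['year_2007-01-01', 'year_2008-01-01']
#     month_files([2007], months=[1, 2])
#     # -> ['month_2007-01-01', 'month_2007-02-01']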
if __name__ == '__main__':
    logging.config.dictConfig(json.load(open('logging_config.json', 'r')))
    # logging.basicConfig(level=logging.DEBUG)

    todo = 'year'
    areas = ['baltimore', 'los_angeles', 'miami']
    _distances = [0.1, 0.8, 1.6, 2.4, 3.2]
    node_types = ['crime']
    _crime_types = [None, ['Theft'], ['Burglary'], ['Assault']]

    logger.info('Starting')
    if todo == 'month':
        params = multithreading.combinations(years=[range(2007, 2011)],
                                             area_name=areas,
                                             distance=_distances,
                                             node_type=node_types,
                                             crime_types=_crime_types)
        multithreading.map_kwargs(save_dynamic_distance_month_graph, params)
    elif todo == 'week':
        params = multithreading.combinations(
            initial=[datetime.datetime(2007, 1, 1)],
            final=[datetime.datetime(2011, 1, 1)],
            delta_name=['week'],
            area_name=areas,
            distance=_distances,
            node_type=node_types,
            crime_types=_crime_types)
        logger.info('Generating {} dynamic networks'.format(len(params)))
        multithreading.map_kwargs(save_dynamic_distance_delta_graph, params)
        # map(lambda args: save_dynamic_distance_graph(**args), params)