def read_realtime_logs(filenames):
    for fn in filenames:
        try:
            for r in commons.zipjson_load(fn):
                yield r
        except json.decoder.JSONDecodeError as e:
            commons.logger.warning("Could not read {} ({})".format(fn, e))
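# Illustrative usage sketch for the generator above (the 'realtime' file
# pattern is assumed from this project's IFILE table; adjust as needed).
# Unreadable files are skipped with a warning; the rest stream record by record.
def _demo_read_realtime_logs():
    files = commons.ls(IFILE['realtime'].format(city="Kaohsiung", date="*", time="*"))
    for record in read_realtime_logs(files):
        print(record.get('PlateNumb'))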
def img_transit():
    for fn in commons.ls(OFILE['transit_map'].format(uuid="*", ext="json")):
        with open(commons.reformat(OFILE['transit_map'], fn, {'ext': "png"}), 'wb') as fd:
            J = commons.zipjson_load(fn)
            fd.write(make_transit_img(J))
def segment_by_route():
    # A "case" is the result of "RUN_KEY", i.e. a pair (routeid, direction)
    commons.logger.info("Collecting cases...")

    # Associate to each case the set of files that contain instances of it
    # case_directory : run_key --> set of filenames
    case_directory = {
        case: set(r[1] for r in g)
        for (case, g) in groupby(
            sorted(
                (RUN_KEY(s), busfile)
                for busfile in commons.ls(IFILE['segment_by_bus'].format(busid="*", ext="json"))
                for s in commons.zipjson_load(busfile)
            ),
            key=(lambda r: r[0])
        )
    }
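# The grouping idiom above on toy data (a self-contained sketch; the sorted()
# matters because itertools.groupby only merges *adjacent* equal keys):
def _demo_case_directory():
    from itertools import groupby
    pairs = sorted([
        (('KHH100', 0), 'bus_A.json'),
        (('KHH100', 0), 'bus_B.json'),
        (('KHH200', 1), 'bus_A.json'),
    ])
    directory = {
        case: set(fn for (_, fn) in g)
        for (case, g) in groupby(pairs, key=(lambda p: p[0]))
    }
    assert directory == {('KHH100', 0): {'bus_A.json', 'bus_B.json'}, ('KHH200', 1): {'bus_A.json'}}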
def keep_ttfile(fn):
    return True  # NOTE: short-circuits -- the bbox filter below is currently disabled

    J = commons.zipjson_load(fn)

    # "Inner" Kaohsiung
    bbox = (120.2593, 22.5828, 120.3935, 22.6886)
    (left, bottom, right, top) = bbox

    (lat, lon) = map(np.asarray, zip(*map(
        commons.inspect({'StopPosition': ('PositionLat', 'PositionLon')}),
        J['route']['Stops']
    )))

    # All stops must lie within the bbox (elementwise over the numpy arrays)
    return np.all((bottom <= lat) & (lat <= top) & (left <= lon) & (lon <= right))
def generate_timetables():
    motc_routes = commons.index_dicts_by_key(
        commons.zipjson_load(IFILE['MOTC_routes'].format(City="Kaohsiung")),
        ROUTE_KEY
    )

    run_files = commons.ls(IFILE['segment_by_route'].format(scenario="**", routeid="*", dir="*"))
    print("Found {} route files.".format(len(run_files)))

    for run_file in run_files:
        print("===")
        print("Analyzing route file {}.".format(run_file))

        (scenario, routeid, dir) = re.fullmatch(
            IFILE['segment_by_route'].format(scenario="(.*)", routeid="(.*)", dir="(.*)"),
            run_file
        ).groups()
        case = {'scenario': scenario, 'routeid': routeid, 'dir': int(dir)}
        print("Route: {routeid}, direction: {dir} (from scenario: {scenario})".format(**case))

        # # DEBUG: trouble case
        # if not ((routeid, int(dir)) == ('KHH116', 1)): continue

        fn_output = commons.makedirs(OFILE['timetable_json'].format(**case))
        if os.path.isfile(fn_output):
            print("Output file exists ({}) -- skipping".format(fn_output))
            continue

        # Load all bus run segments for this case
        runs = commons.zipjson_load(run_file)
        print("Number of runs: {} ({})".format(len(runs), "total"))

        runs = [run for run in runs if (run.get('quality') == "+")]
        print("Number of runs: {} ({})".format(len(runs), "quality"))

        try:
            route = motc_routes[(case['routeid'], case['dir'])]
            stops = route['Stops']
        except KeyError:
            print("Warning: No stops info for route {routeid}, direction {dir}".format(**case))
            continue

        # ETA table of Busrun x Stop
        ETA = np.vstack(
            Parallel(n_jobs=PARAM['n_parallel_jobs'])(
                delayed(bus_at_stops)(run, stops) for run in progressbar(runs)
            )
            # Serial alternative:
            # bus_at_stops(run, stops) for run in progressbar(runs)
        )

        # 2018-12-12: pandas does not digest dt.datetime with timezones
        # https://github.com/pandas-dev/pandas/issues/13287
        # Note: datetime64 automatically converts to UTC,
        # i.e. these are the same:
        #     np.datetime64(dt.datetime.utcnow())
        #     np.datetime64(dt.datetime.now().astimezone())
        ETA = ETA.astype('datetime64[ms]')

        # Timetable as DataFrame
        df = pd.DataFrame(data=ETA, columns=[s['StopUID'] for s in stops])

        J = {
            'case': case,
            'route': route,
            'run_file': run_file,
            'timetable_df': df.to_json(),
        }
        with open(fn_output, 'w') as fd:
            json.dump(J, fd)
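# A quick check of the datetime64/timezone note above (a minimal sketch;
# it relies on NumPy converting timezone-aware datetimes to UTC, as the
# comment in generate_timetables states):
def _demo_datetime64_is_utc():
    import datetime as dt
    import numpy as np
    (t1, t2) = (np.datetime64(dt.datetime.utcnow()), np.datetime64(dt.datetime.now().astimezone()))
    # Both are naive UTC timestamps; they agree up to the time between the two calls
    assert abs(t2 - t1) < np.timedelta64(1, 's')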
def match_routes():
    # MOTC route info
    motc_routes = commons.index_dicts_by_key(
        commons.zipjson_load(IFILE['MOTC_routes']),
        (lambda r: r['SubRouteUID'])
    )

    # for (route_id, route) in route_stops.items():
    #     stops = dict(zip(route['Direction'], route['Stops']))

    OSM = pickle.load(open(IFILE['OSM'], 'rb'))

    for (route_id, route) in OSM['rels']['route'].items():

        # Skip non-bus routes
        if not (route['t'].get('route') == 'bus'): continue

        # Note: most routes have relations in route['r']
        (route_tags, route_stops, route_ways) = (route['t'], route['n'], route['w'])

        # https://wiki.openstreetmap.org/wiki/Buses
        route_name = route_tags['name']

        # Common routines
        def strip_brackets(s):
            return re.match(r'(?P<name>\w+)+[ ]*(?P<extra>\(\w+\))*', s).group('name')

        def matchratio_stop_names(name1, name2):
            return difflib.SequenceMatcher(None, strip_brackets(name1), strip_brackets(name2)).ratio()

        # Method 0: Match route names
        top_namematch_motc_ids = sorted(
            motc_routes.keys(),
            key=(lambda j: matchratio_stop_names(route_name, motc_routes[j]['RouteName']['Zh_tw'])),
            reverse=True
        )[0:6]
        # print("Route {} best matches: {}".format(route_name, ",".join([motc_routes[j]['RouteName']['Zh_tw'] for j in top_namematch_motc_ids])))

        # Method 1: Match route start/end stops
        def zip_listify(a, b):
            return zip(a, b) if (type(a) is list) else zip([a], [b])

        try:
            (route_a, route_b) = (route_tags['from'], route_tags['to'])

            def matchratio_ab(motc_route):
                # motc_name = motc_route['RouteName']['Zh_tw']
                for (dir, stops) in zip_listify(motc_route['Direction'], motc_route['Stops']):
                    (motc_a, motc_b) = map(commons.inspect({'StopName': 'Zh_tw'}), [stops[0], stops[-1]])
                    ab_ratio = (matchratio_stop_names(route_a, motc_a) + matchratio_stop_names(route_b, motc_b)) / 2
                    assert (0 <= ab_ratio <= 1)
                    yield (ab_ratio, {'SubRouteUID': motc_route['SubRouteUID'], 'Direction': dir})

            ab_matchratios = sorted(
                chain.from_iterable(matchratio_ab(motc_routes[j]) for j in top_namematch_motc_ids),
                key=(lambda p: p[0]),
                reverse=True
            )

            print(route_name, ab_matchratios)

        except KeyError:
            # print("Method 1 failed on route {}".format(route_name))
            continue

        # print(route_tags)
        continue  # NOTE: the methods below are currently unreachable

        if (len(route_stops) < 2):
            # print("Route {} has fewer than two stops".format(route_name))
            # print(route_ways)
            continue

        # Method 2: Match all stops

        # Get stop info
        if not all(OSM['node_tags'].get(i) for i in route_stops):
            print("Nodes of route {} not found".format(route_tags['name']))
            continue

        route_stops = {i: OSM['node_tags'].get(i) for i in route_stops}
        print(route_stops)

        # print(route['n'])
        # time.sleep(1)

        # route_name = route['t'].get('name')
        # route_ref = route['t']['ref']
        # if (route_ref == '88'):
        #     print(route_name, route_id, route['t'])
        #     exit(39)

    return
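# Behavior of the name-matching helpers above, in isolation (a small
# self-contained sketch; the regex is copied from strip_brackets):
def _demo_matchratio_stop_names():
    import difflib
    import re
    def strip_brackets(s):
        return re.match(r'(?P<name>\w+)+[ ]*(?P<extra>\(\w+\))*', s).group('name')
    # The bracketed suffix is stripped, so these two names match perfectly
    ratio = difflib.SequenceMatcher(None, strip_brackets("MainStation (North)"), strip_brackets("MainStation")).ratio()
    assert ratio == 1.0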
for ((scenario, routeid, dir), files) in files_by_case.items():

    commons.logger.info("===")
    commons.logger.info("Mapping route {}, direction {} (from scenario '{}')...".format(routeid, dir, scenario))

    try:
        if not files:
            commons.logger.warning("No mapmatch files to distill")
            continue

        # Load map-matched variants
        sources = {fn: preprocess_source(commons.zipjson_load(fn)) for fn in files}
        commons.logger.info("Number of sources before quality filter: {}".format(len(sources)))

        # Quality filter
        def is_qualified(src):
            if (len(src['waypoints_used']) < PARAM['quality_min_wp/src']):
                return False
            return True

        # Filter quality
        sources = {
def mapmatch_all():
    commons.seed()

    PARAM['graph_bbox'] = maps.bbox_for_points(
        nx.get_node_attributes(
            trim_graph_to_busable(pickle.load(open(IFILE['OSM_graph_file'], 'rb'))['main_component_with_knn']['g']),
            'pos'
        ).values()
    )

    for route_file_template in IFILE['segment_by_route']:

        route_files = commons.ls(route_file_template.format(scenario="**", routeid="*", direction="*"))

        commons.logger.info("Route file template: {}".format(route_file_template))
        commons.logger.info("Found {} route files".format(len(route_files)))

        for route_file in route_files:
            # time.sleep(2)

            commons.logger.info("===")
            commons.logger.info("Analyzing route file {}.".format(route_file))

            case = commons.unformat(route_file_template, route_file)
            commons.logger.info("Route: {routeid}, direction: {direction} (from scenario: {scenario})".format(**case))

            # # DEBUG
            # if not ("KHH239-0" == "{routeid}-{direction}".format(**case)): continue

            # Load all bus run segments for this case
            runs = commons.zipjson_load(route_file)
            commons.logger.info("Number of runs: {} ({})".format(len(runs), "total"))

            # Check that the file indeed contains only one type of route
            assert ({(case['routeid'], int(case['direction']))} == set(RUN_KEY(r) for r in runs))

            # Remove runs that have a negative quality flag
            runs = [run for run in runs if not (run.get('quality') == "-")]
            commons.logger.info("Number of runs: {} ({})".format(len(runs), "not marked as bad quality"))

            # Keep only runs within the map
            runs = [run for run in runs if all(is_in_map(*p) for p in run[KEYS.pos])]
            commons.logger.info("Number of runs: {} ({})".format(len(runs), "within the map bbox"))

            if (len(runs) > PARAM['max_runs_to_mapmatch']):
                commons.logger.info("Out of {} available runs, will mapmatch only random {}".format(len(runs), PARAM['max_runs_to_mapmatch']))
                runs = commons.random_subset(runs, k=PARAM['max_runs_to_mapmatch'])

            if (len(runs) < PARAM['min_runs_to_mapmatch']):
                commons.logger.warning("Skipping mapmatch: too few runs.")
                continue

            # Q: clustering here?

            # Existing mapmatched runs for this route
            existing = commons.ls(OFILE['mapmatched'].format(**case, mapmatch_uuid="*", ext="json"))
            if existing:
                commons.logger.warning("Skipping mapmatch: {} mapmatched files found".format(len(existing)))
                continue

            try:
                mapmatch_runs(case['scenario'], runs)
            except Exception as e:
                commons.logger.error("Mapmatch failed ({}) \n{}".format(e, traceback.format_exc()))
                commons.logger.warning("Mapmatch incomplete on route {routeid}-{direction} from scenario '{scenario}'".format(**case))
                time.sleep(5)
def vis1():

    # OSM = pickle.load(open(IFILE['OSM'], 'rb'))
    # for (route_id, route) in OSM['rels']['route'].items():
    #     # Skip non-bus routes
    #     if not (route['t'].get('route') == 'bus'): continue
    #     route_name = route['t'].get('name')
    #     route_ref = route['t']['ref']
    #     # if (route_ref == '88'):
    #     print(route_name, route_id, route['t'])
    # exit(39)

    routeid_of = (lambda r: r['SubRouteUID'])

    # List of filenames, one file per physical bus, identified by plate number
    bus_files = commons.ls(IFILE['busses'].format(busid="*"))

    # Refile bus runs by their route ID
    runs_by_route = defaultdict(list)
    for fn in bus_files:
        runs = commons.zipjson_load(fn)
        for run in runs:
            runs_by_route[routeid_of(run)].append(run)

    #
    route_stops = commons.index_dicts_by_key(commons.zipjson_load(IFILE['route-stops']), routeid_of)

    # Are those valid route IDs that can be found among the routes?
    unknown_route_ids = sorted(set(runs_by_route.keys()) - set(route_stops.keys()))
    if unknown_route_ids:
        print("The following route IDs from bus records are unknown:")
        print(", ".join(unknown_route_ids))
        raise KeyError("Unknown route IDs in bus records")

    #
    route_uid = 'KHH24'
    runs = runs_by_route[route_uid]
    route = route_stops[route_uid]

    # Kaohsiung (left, bottom, right, top)
    bbox = (120.2593, 22.5828, 120.3935, 22.6886)
    (left, bottom, right, top) = bbox

    # Download the background map
    i = maps.get_map_by_bbox(bbox, token=PARAM['mapbox_api_token'])

    # Show the background map
    (fig, ax) = plt.subplots()
    plt.ion()
    ax.axis([left, right, bottom, top])
    ax.imshow(i, extent=(left, right, bottom, top), interpolation='quadric')
    # fig.canvas.draw_idle()
    plt.pause(0.1)

    stops_by_direction = dict(zip(route['Direction'], route['Stops']))

    # Draw stops for both route directions
    for (dir, stops) in stops_by_direction.items():

        # Stop locations
        (y, x) = zip(*[
            commons.inspect({'StopPosition': ('PositionLat', 'PositionLon')})(stop)
            for stop in stops
        ])

        # Plot as dots
        ax.scatter(x, y, c=('b' if dir else 'g'), marker='o', s=4)

    # Show bus location
    for run in runs:

        # Trace bus
        (y, x) = (run['PositionLat'], run['PositionLon'])
        h1 = ax.plot(x, y, '--+', c='r', linewidth=1)
        h2 = ax.plot(x[0], y[0], 'o', c='r')
        h3 = ax.plot(x[-1], y[-1], 's', c='r')

        plt.title(run['PlateNumb'])
        # plt.savefig("{}.png".format(route_uid), dpi=180)
        plt.pause(0.1)

        bus_at_stops(run, stops_by_direction[run['Direction']])

        plt.pause(0.1)
        [h[0].remove() for h in [h1, h2, h3]]

    return
def compress():
    realtime_files = commons.ls(IFILE['realtime'].format(city=PARAM['city'], date="*", time="*"))
    # commons.logger.debug(realtime_files)

    # Allow for pending write operations
    time.sleep(1)

    if True:
        # Brutal compression step
        commons.logger.info("COMPRESSION 0: Zip all")

        for fn in commons.progressbar(realtime_files):
            try:
                # See if file is in a valid format
                commons.zipjson_load(fn)

                try:
                    commons.zipjson_load(fn, insist=True)
                    # commons.logger.info("File {}: compressed already".format(fn))
                except RuntimeError:
                    commons.zipjson_dump(commons.zipjson_load(fn), fn)
                    commons.logger.info("File {}: compressed".format(fn))
                except:
                    commons.logger.exception("File {}: unexpected error".format(fn))
            except:
                commons.logger.warning("File {}: reading error".format(fn))

    if False:
        commons.logger.info("COMPRESSION I: Remove duplicates in back-to-back records")

        for (fn1, fn2) in zip(realtime_files[:-1], realtime_files[1:]):

            def hashable(J):
                assert (type(J) is list)
                return list(map(json.dumps, J))

            def unhashable(J):
                assert (type(J) is list)
                return list(map(json.loads, J))

            try:
                J1 = set(hashable(commons.zipjson_load(fn1)))
                J2 = set(hashable(commons.zipjson_load(fn2)))
            except EOFError:
                # Raised by zipjson_load if a file is empty
                continue
            except Exception as e:
                commons.logger.warning("Cannot open {}/{} ({})".format(fn1, fn2, e))
                continue

            if not J1.intersection(J2):
                continue

            J1 = J1.difference(J2)
            J1 = list(unhashable(list(J1)))
            J2 = list(unhashable(list(J2)))

            commons.logger.info("Compressing {}".format(fn1))
            commons.zipjson_dump(J1, fn1)

    if False:
        commons.logger.info("COMPRESSION II: Remove redundancies from individual records")

        unknown_subroutes = set()

        # Route meta
        R = commons.zipjson_load(IFILE['routes'].format(city=PARAM['city']))

        # Reindex by subroute-direction
        S = defaultdict(dict)
        for r in R:
            for s in r['SubRoutes']:
                sid = s['SubRouteUID']
                dir = s['Direction']
                assert (dir not in S[sid])
                S[sid][dir] = s
        #
        S = dict(S)

        # Reindex by RouteUID
        assert (commons.all_distinct([g['RouteUID'] for g in R]))
        R = {g['RouteUID']: g for g in R}

        def remove_single_route_redundancies(j):

            subroute_id = j['SubRouteUID']

            if not (subroute_id in S):
                if not (subroute_id in unknown_subroutes):
                    commons.logger.warning("Unknown subroute {} [warning will not be repeated]".format(subroute_id))
                    unknown_subroutes.add(subroute_id)
                return j

            assert (j['Direction'] in S[subroute_id])
            s = S[subroute_id][j['Direction']]

            for key in ['SubRouteName', 'SubRouteID']:
                if key in j:
                    if (j[key] == s[key]):
                        del j[key]
                    else:
                        # commons.logger.warning("Unexpected attribute value {}={}".format(key, j[key]))
                        pass

            if ('RouteUID' in j):
                route_id = j['RouteUID']
                assert (route_id in R)
                r = R[route_id]

                for key in ['RouteName', 'RouteID']:
                    if key in j:
                        if not (j[key] == r[key]):
                            commons.logger.warning("Unexpected attribute value {}={}".format(key, j[key]))
                        else:
                            del j[key]

                if (j['RouteUID'] == j['SubRouteUID']):
                    del j['RouteUID']

            assert ('GPSTime' in j)

            for key in ['SrcUpdateTime', 'UpdateTime']:
                if key in j:
                    del j[key]

            # Note:
            # - we keep the 'OperatorID' field, even if s['OperatorIDs'] has length 1
            # - of the time stamps, we keep 'GPSTime' which is the bus on-board time

            return j

        for fn in realtime_files:
            try:
                J = commons.zipjson_load(fn)
            except EOFError:
                commons.logger.warning("{} appears empty".format(fn))
                continue
            except Exception:
                commons.logger.warning("Failed to open {}".format(fn))
                continue

            b = len(json.dumps(J))  # Before compression

            try:
                J = list(map(remove_single_route_redundancies, J))
            except ValueError as e:
                commons.logger.exception("ValueError at {} -- {}".format(fn, e))
                continue
            except AssertionError as e:
                commons.logger.exception("Assertion error at {} -- {}".format(fn, e))
                continue
            except Exception as e:
                commons.logger.exception("Warning: Compression attempt failed for {} -- {}".format(fn, e))
                continue

            # J = remove_global_route_redundancies(J)

            a = len(json.dumps(J))  # After compression

            assert (a <= b)
            if (a == b):
                continue

            commons.logger.info("Compressing {}".format(fn))
            commons.zipjson_dump(J, fn)

    commons.logger.info("DONE")
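# The set-difference deduplication used in COMPRESSION I, in isolation
# (a minimal sketch): records become hashable as JSON strings, are
# set-differenced, then parsed back. Note this relies on identical key
# order in the serialized records.
def _demo_json_set_difference():
    import json
    (J1, J2) = ([{'a': 1}, {'b': 2}], [{'b': 2}, {'c': 3}])
    (S1, S2) = (set(map(json.dumps, J1)), set(map(json.dumps, J2)))
    # Records of J1 not already present in J2
    assert list(map(json.loads, S1 - S2)) == [{'a': 1}]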
# # Filter to specific routes (DEBUG)
# commons.logger.warning("Filtering the case directory")
# case_directory = {
#     case: files
#     for (case, files) in case_directory.items()
#     # DEBUG:
#     if (case[0] in ["KHH239"])
# }

#
for (case, files) in sorted(case_directory.items(), key=(lambda cf: -len(cf[1]))):

    segments = [
        {**s, PARAM['quality_key']: ("+" if is_run_acceptable(s) else "-")}
        for busfile in files
        for s in commons.zipjson_load(busfile)
        if (RUN_KEY(s) == case)
    ]

    if not segments:
        commons.logger.warning("No valid bus runs found for {}".format(case))
        continue

    fn = OFILE['segment_by_route'].format(
        **{k: segments[0].get(K) for (k, K) in KEYS.items()},
        ext="json"
    )

    with open(fn, 'w') as fd:
        json.dump(segments, fd)
def debug_compare_two():
    uuids = ["16b767f12ac841fea47ad9b735df1504", "69e47ef6a81a4a3aae0529b8b974896b"]

    (J1, J2) = (
        commons.zipjson_load(OFILE['transit_map'].format(uuid=uuid, ext="json"))
        for uuid in uuids
    )

    o = tuple(J1['origin']['x'])
    assert (J1['origin'] == J2['origin'])

    (H1, H2) = ({}, {})
    (O1, O2) = ({}, {})

    for (J, H, O) in zip([J1, J2], [H1, H2], [O1, O2]):
        # Location --> Transit time in minutes; keep track of duplicates
        J['gohere'] = commons.index_dicts_by_key(
            J['gohere'],
            key_func=(lambda __: tuple(__['x'])),
            collapse_repetitive=False
        )

        # Keep the *time* field
        H.update({x: attr['s'] for (x, attr) in J['gohere'].items()})

        # Keep the *origin* field
        O.update({x: attr['o'] for (x, attr) in J['gohere'].items()})

    # The two datasets cover the same geo-locations
    assert (set(H1) == set(H2))

    X = sorted(
        [x for x in H1 if (set(H1[x]) != set(H2[x]))],
        key=(lambda x: sum(H1[x]) + sum(H2[x]))
    )
    # commons.logger.debug("Earliest differing location: {}".format(X[0]))

    for x in X[0:4]:

        g1 = nx.DiGraph()
        g2 = nx.DiGraph()

        def retrace(O, g, x):
            for o in O[x]:
                if o:
                    o = tuple(o)
                    if not g.has_edge(o, x):
                        g.add_edge(o, x)
                        retrace(O, g, o)
            g.nodes[x]['xy'] = ll2xy(x)

        retrace(O1, g1, x)
        retrace(O2, g2, x)

        commons.logger.debug("Graph 1: {}".format(g1.nodes))
        commons.logger.debug("Graph 2: {}".format(g2.nodes))

        import matplotlib as mpl
        mpl.use("TkAgg")
        import matplotlib.pyplot as plt

        (fig, ax) = plt.subplots()

        # # "Inner" Kaohsiung
        # bbox = (120.2593, 22.5828, 120.3935, 22.6886)
        # # Set plot view to the bbox
        # ax.axis(maps.mb2ax(*bbox))
        # ax.autoscale(enable=False)

        nx.draw_networkx(g1, ax=ax, pos=nx.get_node_attributes(g1, 'xy'), edge_color='b', node_size=1, with_labels=False)
        nx.draw_networkx(g2, ax=ax, pos=nx.get_node_attributes(g2, 'xy'), edge_color='g', node_size=1, with_labels=False)

        plt.show()
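# The retrace recursion above on a toy provenance map (a self-contained
# sketch): O maps each location to the list of origins it was reached from,
# with None marking the root. The has_edge check doubles as a visited test.
def _demo_retrace():
    import networkx as nx
    O = {(2, 2): [(1, 1), (0, 0)], (1, 1): [(0, 0)], (0, 0): [None]}
    g = nx.DiGraph()
    def retrace(O, g, x):
        for o in O[x]:
            if o:
                o = tuple(o)
                if not g.has_edge(o, x):
                    g.add_edge(o, x)
                    retrace(O, g, o)
    retrace(O, g, (2, 2))
    assert set(g.edges) == {((1, 1), (2, 2)), ((0, 0), (2, 2)), ((0, 0), (1, 1))}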