def imn_extract(filename, path, type_user, traj_table, evnt_table, min_traj_nbr, min_length, min_duration, area, overwrite=False):
    """Build two-period Individual Mobility Networks (IMNs) for a hard-coded
    list of users and append them to ``path + filename`` as gzip'd JSON lines.

    For each user, trajectories are split into a Jan-Feb 2017 set and a
    Mar-Apr 2017 set (by substring match on the trajectory start time), one
    IMN is built per period, and a single record
    ``{'uid': ..., '01-02': imn, '03-04': imn}`` is appended to the output.

    Parameters mirror the other extractors in this module; ``min_traj_nbr``
    is the minimum number of trajectories a user needs to be processed,
    ``min_length``/``min_duration`` filter trajectories at load time, and
    ``overwrite=False`` enables resume-by-skipping users already present in
    the output file.

    NOTE(review): the user list is hard-coded below (looks like a debug
    run); the ``filename`` parameter only names the output file here.
    """
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()
    #users_list = [100225,101127,100742,100747,100690,100578,1003,100191,100192,100193,321463]
    # Hard-coded test users; earlier candidate list kept commented above.
    users_list = [100619, 100554]
    users_list = sorted(users_list)
    # nbr_users is taken BEFORE the resume filter below, so progress
    # percentages are relative to the original list size.
    nbr_users = len(users_list)
    print("user ids before checking :")
    print(nbr_users, len(users_list))
    if os.path.isfile(output_filename) and not overwrite:
        # Resume support: collect uids already written and skip them.
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
        print("user ids after checking :")
        print(nbr_users, len(users_list))
    for i, uid in enumerate(users_list):
        if i % 1 == 0:  # i % 1 is always 0: progress is printed every user
            print(
                datetime.datetime.now(),
                '%s %s %s [%s/%s] - %.2f' % (traj_table, area, type_user, i,
                                             nbr_users,
                                             i / nbr_users * 100.0))
        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        # events stays None when no event table is configured.
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        #imh['trajectories']=dict(list(islice(imh['trajectories'].items(), 200)))
        if len(imh['trajectories']) < min_traj_nbr:
            print('len trajectories]) < min_traj_nbr',
                  len(imh['trajectories']), min_traj_nbr)
            continue
        # Keep a reference to the full trajectory dict: imh['trajectories']
        # is overwritten twice below (once per period).
        main_imh = imh['trajectories']
        jan_feb_tid = []
        march_april_id = []
        for tid, t in imh['trajectories'].items():
            # Period membership by substring match on the stringified
            # start time (e.g. '2017-01-...').
            start_time = str(t.start_time())
            if ('2017-01' in start_time) or ('2017-02' in start_time):
                jan_feb_tid.append(tid)
            if ('2017-03' in start_time) or ('2017-04' in start_time):
                march_april_id.append(tid)
        imh['trajectories'] = {x: imh['trajectories'][x] for x in jan_feb_tid}
        imn1 = individual_mobility_network.build_imn(imh, reg_loc=True,
                                                     events=events,
                                                     verbose=False)
        period_imn1 = {"01-02": imn1}
        imh['trajectories'] = {x: main_imh[x] for x in march_april_id}
        imn2 = individual_mobility_network.build_imn(imh, reg_loc=True,
                                                     events=events,
                                                     verbose=False)
        period_imn2 = {"03-04": imn2}
        # One flat JSON record per user: uid + one key per period.
        customer_obj = {'uid': uid}
        period_imn1.update(period_imn2)
        customer_obj.update(period_imn1)
        json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                       default=agenda_converter)
        json_bytes = json_str.encode('utf-8')
        # Append mode: each user is flushed immediately, enabling resume.
        with gzip.GzipFile(output_filename, 'a') as fout:
            fout.write(json_bytes)
    print("done")
    cur.close()
    con.close()
def imn_extract_for_one_month(filename, path, type_user, traj_table, evnt_table, min_traj_nbr, min_length, min_duration, area, overwrite=False):
    """Build January-2017 Individual Mobility Networks for every user found
    in ``traj_table`` and append them to ``path + filename`` as gzip'd JSON
    lines (one record ``{'uid': ..., '01': imn}`` per user).

    With ``overwrite=False`` the output file is scanned first and users
    already written are skipped, so an interrupted run can be resumed.
    A ``TypeError`` raised while processing a user is logged and the user
    is skipped.
    """
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()
    users_list = find_user_list(cur, traj_table)
    # Progress percentages below are relative to the pre-filter count.
    nbr_users = len(users_list)
    print("user ids before checking :")
    print(nbr_users, len(users_list))
    if os.path.isfile(output_filename) and not overwrite:
        # Resume support: drop users whose record is already on disk.
        already_done = set()
        reader = gzip.GzipFile(output_filename, 'r')
        for line in reader:
            already_done.add(json.loads(line)['uid'])
        reader.close()
        users_list = [u for u in users_list if u not in already_done]
        print("user ids after checking :")
        print(nbr_users, len(users_list))
    for i, uid in enumerate(users_list):
        try:
            if i % 1 == 0:  # always true: progress printed for every user
                progress = '%s %s %s [%s/%s] - %.2f' % (
                    traj_table, area, type_user, i, nbr_users,
                    i / nbr_users * 100.0)
                print(datetime.datetime.now(), progress)
            imh = database_io.load_individual_mobility_history(
                cur, uid, traj_table, min_length, min_duration)
            events = None
            if evnt_table is not None:
                events = database_io.load_individual_event_history(
                    cur, uid, evnt_table)
            if len(imh['trajectories']) < min_traj_nbr:
                print('len trajectories]) < min_traj_nbr',
                      len(imh['trajectories']), min_traj_nbr)
                continue
            main_imh = imh['trajectories']
            # Keep only trajectories whose start time falls in January 2017
            # (substring match on the stringified timestamp).
            january_tids = [
                tid for tid, t in imh['trajectories'].items()
                if '2017-01' in str(t.start_time())
            ]
            imh['trajectories'] = {tid: main_imh[tid] for tid in january_tids}
            network = individual_mobility_network.build_imn(
                imh, reg_loc=True, events=events, verbose=False)
            record = {'uid': uid}
            record.update({"01": network})
            payload = ('%s\n' % json.dumps(clear_tuples4json(record),
                                           default=agenda_converter))
            # Append mode: each user is flushed immediately (resumable).
            with gzip.GzipFile(output_filename, 'a') as sink:
                sink.write(payload.encode('utf-8'))
        except TypeError:
            print("type error")
            continue
    print("done")
    cur.close()
    con.close()
def imn_extract_all_year(filename, path, type_user, traj_table, evnt_table, min_traj_nbr, min_length, min_duration, area, overwrite=False):
    """Build one whole-history Individual Mobility Network per user and
    append each as a gzip'd JSON line to ``path + filename``.

    Unlike the period-split extractors in this module, the full trajectory
    history is fed to ``build_imn`` in one shot, and the IMN's keys are
    merged directly into the per-user record next to ``'uid'``.

    ``overwrite=False`` enables resume-by-skipping users already present in
    the output file. A ``TypeError`` during a user's processing is logged
    and that user is skipped.

    NOTE(review): the user list is hard-coded to ``[7925]`` below (looks
    like a debug run); earlier candidate lists are kept commented out.
    """
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()
    #users_list=find_user_list(cur,traj_table):
    #users_list = [100225,101127,100742,100747,100690,100578,1003,100191,100192,100193,318819,100619,100554,100498]
    #users_list = [100843,100836,100827,100795,100747,100717,100681,100669,101293,101194,101091]
    users_list = [7925]
    users_list = sorted(users_list)
    # nbr_users is taken BEFORE the resume filter, so progress percentages
    # are relative to the original list size.
    nbr_users = len(users_list)
    print("user ids before checking :")
    print(nbr_users, len(users_list))
    if os.path.isfile(output_filename) and not overwrite:
        # Resume support: collect uids already written and skip them.
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
        print("user ids after checking :")
        print(nbr_users, len(users_list))
    for i, uid in enumerate(users_list):
        try:
            if i % 1 == 0:  # i % 1 is always 0: progress printed every user
                print(
                    datetime.datetime.now(),
                    '%s %s %s [%s/%s] - %.2f' % (traj_table, area, type_user,
                                                 i, nbr_users,
                                                 i / nbr_users * 100.0))
            imh = database_io.load_individual_mobility_history(
                cur, uid, traj_table, min_length, min_duration)
            # events stays None when no event table is configured.
            events = database_io.load_individual_event_history(
                cur, uid, evnt_table) if evnt_table is not None else None
            #imh['trajectories']=dict(list(islice(imh['trajectories'].items(), 200)))
            if len(imh['trajectories']) < min_traj_nbr:
                print('len trajectories]) < min_traj_nbr',
                      len(imh['trajectories']), min_traj_nbr)
                continue
            imn = individual_mobility_network.build_imn(imh, reg_loc=True,
                                                        events=events,
                                                        verbose=False)
            # The IMN's keys are merged into the record alongside 'uid'.
            customer_obj = {'uid': uid}
            customer_obj.update(imn)
            json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                           default=agenda_converter)
            json_bytes = json_str.encode('utf-8')
            # Append mode: each user is flushed immediately (resumable).
            with gzip.GzipFile(output_filename, 'a') as fout:
                fout.write(json_bytes)
        except (TypeError):
            print("type error")
            continue
    print("done")
    cur.close()
    con.close()
def _empty_cell_features():
    """Return a zeroed feature accumulator for one quadtree cell/path."""
    return {
        'nbr_traj_start': 0, 'nbr_traj_stop': 0, 'nbr_traj_move': 0,
        'traj_speed_sum': 0, 'traj_speed_count': 0,
        'nbr_evnt_A': 0, 'nbr_evnt_B': 0, 'nbr_evnt_C': 0, 'nbr_evnt_Q': 0,
        'nbr_evnt_start': 0, 'nbr_evnt_stop': 0,
        'speed_A_sum': 0, 'max_acc_A_sum': 0, 'avg_acc_A_sum': 0,
        'speed_B_sum': 0, 'max_acc_B_sum': 0, 'avg_acc_B_sum': 0,
        'speed_C_sum': 0, 'max_acc_C_sum': 0, 'avg_acc_C_sum': 0,
        'speed_Q_sum': 0, 'max_acc_Q_sum': 0, 'avg_acc_Q_sum': 0,
        'nbr_crash': 0,
    }


def _aggregate_window_features(quadtrees_features, data_map):
    """Sum per-month quadtree features into one accumulator per time window.

    ``quadtrees_features`` maps a month key ``m`` to ``{path: {feature: value}}``;
    ``data_map`` maps ``(lower, upper)`` pandas timestamps to a window index.
    Returns ``{window_index: {path: {feature: summed_value}}}``.

    NOTE(review): the month comparison is month-number only; a window whose
    upper bound is January of the next year has ``upper.month == 1`` and
    would match nothing — confirm windows never wrap a year boundary.
    """
    window_features = dict()
    for m in quadtrees_features:
        for lu, index in data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in window_features:
                    window_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in window_features[index]:
                        window_features[index][path] = _empty_cell_features()
                    for k, v in quadtrees_features[m][path].items():
                        window_features[index][path][k] += v
    return window_features


def main():
    """Partition per-user IMNs, trajectories and events into sliding
    train/test windows, extract features, and store one train/test record
    per user per window in ``traintest/<area>_<type_user>_traintest_<i>.json.gz``.

    Command line: ``area type_user overwrite`` (overwrite as 0/1). With
    overwrite=0 the run resumes, skipping users already present in any
    output partition.

    NOTE(review): this module defines ``main`` twice; at import time the
    later definition shadows this one — confirm which is intended to run.
    """
    area = sys.argv[1]  # 'rome' 'tuscany' 'london'
    type_user = sys.argv[2]  # 'crash' 'nocrash'
    overwrite = int(sys.argv[3])
    country = 'uk' if area == 'london' else 'italy'
    min_length = 1.0
    min_duration = 60.0
    print(datetime.datetime.now(), 'Crash Prediction - Train Test Partitioner')
    if not overwrite:
        print(datetime.datetime.now(), '(restart)')
    path = './'
    path_imn = path + 'imn_new/'
    path_dataset = path + 'dataset/'
    path_traintest = path + 'traintest/'
    path_quadtree = path + 'quadtree/'
    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country
    # For london/nocrash the 'all' list is used minus the crash users.
    if area == 'london' and type_user == 'nocrash':
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, 'all')
        users_filename_crash = path_dataset + '%s_%s_users_list.csv' % (
            area, 'crash')
    else:
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area,
                                                                  type_user)
        users_filename_crash = None
    users_list = pd.read_csv(users_filename).values[:, 0].tolist()
    users_list = sorted(users_list)
    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(
            users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]
    nbr_users = len(users_list)
    print(datetime.datetime.now(), 'Reading quadtree')
    quadtree_poi_filename = path_quadtree + '%s_personal_osm_poi_lv17.json.gz' % area
    with gzip.GzipFile(quadtree_poi_filename, 'r') as fin:
        quadtree = json.loads(fin.readline())
    print(datetime.datetime.now(), 'Reading quadtree features')
    quadtree_features_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    with gzip.GzipFile(quadtree_features_filename, 'r') as fin:
        quadtrees_features_str = json.loads(fin.readline())
    # JSON keys are strings; restore the integer month keys.
    quadtrees_features = {int(k): v for k, v in quadtrees_features_str.items()}
    processed_users = set()
    if overwrite:
        # Fresh run: delete all existing partitions.
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.exists(output_filename):
                os.remove(output_filename)
    else:
        # Resume: collect every uid already written to any partition.
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.isfile(output_filename):
                with gzip.GzipFile(output_filename, 'r') as fin:
                    for row in fin:
                        customer_obj = json.loads(row)
                        processed_users.add(customer_obj['uid'])
    window = 4
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00',
                                               '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00',
                                             '%Y-%m-%d %H:%M:%S')
    print(datetime.datetime.now(), 'Generating month boundaries')
    # 4-month windows at month-start boundaries; test window = next window.
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um] for lm, um in zip(months[:-window], months[window:])]
    training_months = list()
    test_months = list()
    for i in range(len(boundaries) - 1):
        training_months.append(boundaries[i])
        test_months.append(boundaries[i + 1])
    index = 0
    tr_data_map = dict()
    ts_data_map = dict()
    for tr_months, ts_months in zip(training_months, test_months):
        tr_data_map[tuple(tr_months)] = index
        ts_data_map[tuple(ts_months)] = index
        index += 1
    print(datetime.datetime.now(), 'Initializing quadtree features')
    tr_quadtree_features = _aggregate_window_features(quadtrees_features,
                                                      tr_data_map)
    # BUG FIX: the original aggregated the *test* features over tr_data_map
    # (copy-paste), so test-window quadtree features were summed over the
    # training months. Use ts_data_map here.
    ts_quadtree_features = _aggregate_window_features(quadtrees_features,
                                                      ts_data_map)
    print(datetime.datetime.now(), 'Connecting to database')
    con = database_io.get_connection()
    cur = con.cursor()
    count = 0
    imn_filedata = gzip.GzipFile(
        path_imn + '%s_imn_%s.json.gz' % (area, type_user), 'r')
    print(datetime.datetime.now(),
          'Calculating features and partitioning dataset')
    for row in imn_filedata:
        if len(row) <= 1:
            print('new file started ;-)')
            continue
        user_obj = json.loads(row)
        uid = user_obj['uid']
        count += 1
        if uid in processed_users:
            continue
        if count % 10 == 0:
            print(
                datetime.datetime.now(),
                'train test partition %s %s [%s/%s] - %.2f' %
                (area, type_user, count, nbr_users, 100 * count / nbr_users))
        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        trajectories = imh['trajectories']
        tr_data = dict()
        ts_data = dict()
        # Partition the user's per-period IMNs into train and test windows.
        # IMN keys look like 'MM-MM' (two month numbers).
        for imn_months in user_obj:
            if imn_months == 'uid':
                continue
            m0 = int(imn_months.split('-')[0])
            m1 = int(imn_months.split('-')[1])
            for lu, index in tr_data_map.items():
                if lu[0].month <= m0 < m1 < lu[1].month:
                    if index not in tr_data:
                        tr_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    tr_data[index]['imns'][imn_months] = user_obj[imn_months]
            for lu, index in ts_data_map.items():
                if lu[0].month <= m0 < lu[1].month:
                    if index not in ts_data:
                        ts_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    ts_data[index]['imns'][imn_months] = user_obj[imn_months]
        # Partition trajectories into the windows selected above.
        for tid, traj in trajectories.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in tr_data:
                    tr_data[index]['trajectories'][tid] = traj
            for lu, index in ts_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in ts_data:
                    ts_data[index]['trajectories'][tid] = traj
        # Partition events into the windows selected above.
        for eid, evnt in events.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in tr_data:
                    tr_data[index]['events'][eid] = evnt[0]
            for lu, index in ts_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in ts_data:
                    ts_data[index]['events'][eid] = evnt[0]
        # Label: does the user have a crash in the month after the window?
        # NOTE(review): query is built by string interpolation; uid and
        # dates come from trusted local files/code here, but parameterized
        # queries would be safer if inputs ever become external.
        for lu, index in tr_data_map.items():
            if index not in tr_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s' AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[1]),
                str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            tr_data[index]['crash'] = has_crash_next_month
        for lu, index in ts_data_map.items():
            if index not in ts_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s' AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[1]),
                str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            ts_data[index]['crash'] = has_crash_next_month
        tr_features, ts_features = feature_extractor.extract_features(
            uid, tr_data, ts_data, quadtree, tr_quadtree_features,
            ts_quadtree_features)
        # Store only windows that have both a train and a test record.
        for index in tr_features:
            if index in ts_features:
                output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                    area, type_user, index)
                store_obj = {
                    'uid': uid,
                    'train': tr_features[index],
                    'test': ts_features[index]
                }
                feature_extractor.store_features(output_filename, store_obj)
    imn_filedata.close()
def _store_checkpoint(output_filename, quadtrees_features, last_processed_user):
    """Persist the feature accumulator plus resume marker as two gzip'd
    JSON lines (whole file rewritten each time)."""
    features_bytes = ('%s\n' % json.dumps(quadtrees_features)).encode('utf-8')
    lpu_bytes = ('%s\n' % json.dumps(last_processed_user)).encode('utf-8')
    with gzip.GzipFile(output_filename, 'w') as fout:
        fout.write(features_bytes)
        fout.write(lpu_bytes)


def main():
    """Accumulate monthly quadtree features (trajectories, events, crash
    locations) over all users of an area and checkpoint them periodically to
    ``quadtree/<area>_quadtree_features.json.gz``.

    Command line: ``area``. With ``overwrite`` hard-coded to True the run
    always starts fresh; when False, the previous checkpoint (features +
    last processed uid) is loaded and processing resumes after that uid.

    NOTE(review): this module defines ``main`` twice; whichever definition
    comes later shadows the other — confirm which is intended to run.
    """
    area = sys.argv[1]
    country = 'uk' if area == 'london' else 'italy'
    overwrite = True
    depth = 16
    store_evry = 100  # checkpoint frequency, in users
    path = './'
    path_dataset = path + 'dataset/'
    path_quadtree = path + 'quadtree/'
    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country
    users_filename = path_dataset + '%s_all_users_list.csv' % area
    quadtree_output_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    quadtrees_features = dict()
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00',
                                               '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00',
                                             '%Y-%m-%d %H:%M:%S')
    # One window per calendar month of 2017.
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um] for lm, um in zip(months[:-1], months[1:])]
    index = 0
    data_map = dict()
    for bounds in boundaries:
        data_map[tuple(bounds)] = index
        quadtrees_features[index] = dict()
        index += 1
    users_list = sorted(pd.read_csv(users_filename).values[:, 0].tolist())
    last_processed_user = None
    if os.path.isfile(quadtree_output_filename) and not overwrite:
        # Resume from the previous checkpoint (features + last uid).
        with gzip.GzipFile(quadtree_output_filename, 'r') as fin:
            quadtrees_features_str = json.loads(fin.readline())
            quadtrees_features = {
                int(k): v
                for k, v in quadtrees_features_str.items()
            }
            last_processed_user = json.loads(fin.readline())
    con = database_io.get_connection()
    cur = con.cursor()
    for i, uid in enumerate(users_list):
        if last_processed_user is not None and uid <= last_processed_user:
            continue
        if i % store_evry == 0:
            print(
                datetime.datetime.now(),
                '%s %s %.2f' % (traj_table, area,
                                i / len(users_list) * 100.0))
        trajectories = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length=1.0,
            min_duration=60.0)['trajectories']
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table)
        quadtree_data = dict()
        # Assign each trajectory to its month window.
        for tid, traj in trajectories.items():
            for lu, index in data_map.items():
                if lu[0] <= traj.start_time() < lu[1]:
                    if index not in quadtree_data:
                        quadtree_data[index] = {
                            'uid': uid,
                            'crash': None,
                            'trajectories': dict(),
                            'events': dict(),
                        }
                    quadtree_data[index]['trajectories'][tid] = traj
        # Assign events to windows that already have trajectories.
        for eid, evnt in events.items():
            for lu, index in data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in quadtree_data:
                    quadtree_data[index]['events'][eid] = evnt[0]
        # Attach the first crash location in each active window, if any.
        # NOTE(review): string-built SQL; inputs are local/trusted here,
        # but parameterized queries would be safer.
        for lu, index in data_map.items():
            if index not in quadtree_data:
                continue
            query = """SELECT lat, lon FROM %s WHERE uid = '%s' AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[0]), str(lu[1]))
            cur.execute(query)
            rows = cur.fetchall()
            if len(rows) > 0:
                quadtree_data[index]['crash'] = {
                    'lat': float(rows[0][0]),
                    'lon': float(rows[0][1])
                }
        quadtrees_features = quadtrees_features_extract(
            quadtrees_features, quadtree_data, depth)
        # BUG FIX: advance the resume marker BEFORE checkpointing. The
        # original wrote the *previous* uid alongside features that already
        # included the current user, so a resumed run re-processed (and
        # double-counted) that user.
        last_processed_user = uid
        if i % store_evry == 0:
            _store_checkpoint(quadtree_output_filename, quadtrees_features,
                              last_processed_user)
    # BUG FIX: flush the tail of the run. The original only wrote on the
    # periodic condition, losing up to store_evry-1 users' features
    # processed after the last checkpoint.
    _store_checkpoint(quadtree_output_filename, quadtrees_features,
                      last_processed_user)
    # BUG FIX: the original leaked the cursor and connection.
    cur.close()
    con.close()
def imn_extract(filename, path, type_user, traj_table, evnt_table, min_traj_nbr, min_length, min_duration, area, overwrite=False, users_filename_crash=None):
    """Build per-time-window Individual Mobility Networks for every user in
    the CSV ``filename`` and append them to
    ``path + '<area>_imn_<type_user>.json.gz'``, one JSON line per user.

    Each trajectory is mapped to one or more window keys by
    ``start_time_map``; one IMN is built per window that has at least
    ``min_traj_nbr // 12`` trajectories, and the user's record is
    ``{'uid': ..., <window_key>: imn, ...}``.

    ``users_filename_crash`` optionally names a CSV of users to exclude;
    ``overwrite=False`` enables resume-by-skipping users already written.

    NOTE(review): this module defines ``imn_extract`` twice; whichever
    definition comes later shadows the other — confirm which is intended.
    Note also that ``filename`` here is the *input* users CSV, while the
    output name is derived from ``path``/``area``/``type_user``.
    """
    output_filename = path + '%s_imn_%s.json.gz' % (area, type_user)
    con = database_io.get_connection()
    cur = con.cursor()
    users_list = pd.read_csv(filename).values[:, 0].tolist()
    users_list = sorted(users_list)
    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(
            users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]
    # Progress percentages are relative to the pre-resume-filter count.
    nbr_users = len(users_list)
    print(nbr_users, len(users_list))
    if os.path.isfile(output_filename) and not overwrite:
        # Resume support: skip users whose record is already on disk.
        processed_users = set()
        with gzip.GzipFile(output_filename, 'r') as fin:
            for row in fin:
                customer_obj = json.loads(row)
                processed_users.add(customer_obj['uid'])
        users_list = [uid for uid in users_list if uid not in processed_users]
        print(nbr_users, len(users_list))
    for i, uid in enumerate(users_list):
        if i % 1 == 0:  # always true: progress printed for every user
            print(
                datetime.datetime.now(),
                '%s %s %s [%s/%s] - %.2f' % (traj_table, area, type_user, i,
                                             nbr_users,
                                             i / nbr_users * 100.0))
        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        # events is None when no event table is configured.
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        if len(imh['trajectories']) < min_traj_nbr:
            continue
        # Bucket trajectories (and their events) per window key.
        wimh_dict = dict()
        wevents_dict = dict()
        for tid, traj in imh['trajectories'].items():
            st = traj.start_time()
            stk_list = start_time_map(st)
            for stk in stk_list:
                if stk is None:
                    continue
                if stk not in wimh_dict:
                    wimh_dict[stk] = {'uid': uid, 'trajectories': dict()}
                    wevents_dict[stk] = dict()
                wimh_dict[stk]['trajectories'][tid] = traj
                # BUG FIX: the original did `if tid in events:` which raises
                # TypeError when evnt_table is None (events is None).
                if events is not None and tid in events:
                    wevents_dict[stk][tid] = events[tid]
        customer_obj = {'uid': uid}
        for stk in wimh_dict:
            wimh = wimh_dict[stk]
            wevents = wevents_dict[stk]
            # Require a proportional minimum per window (year split ~12 ways).
            if len(wimh['trajectories']) < min_traj_nbr // 12:
                continue
            imn = individual_mobility_network.build_imn(wimh, reg_loc=True,
                                                        events=wevents,
                                                        verbose=False)
            customer_obj[stk] = imn
        json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                       default=agenda_converter)
        json_bytes = json_str.encode('utf-8')
        # Append mode: each user is flushed immediately (resumable).
        with gzip.GzipFile(output_filename, 'a') as fout:
            fout.write(json_bytes)
    cur.close()
    con.close()