def run(path, verbose=1):
    """Evaluate ``TimeSensitiveMostFrequentRoute`` on the data found at *path*.

    The trajectories returned by ``read_geolife`` are passed through
    ``time_duplication_filter`` and ``speed_filter_abs``, converted to a frame
    with ``make_df`` and clustered with ``cluster_into_spots``.  The frame is
    then split chronologically: the first 4/5 of the covered time span,
    truncated to whole weeks, is used for fitting and the rest for testing.

    Args:
        path: Location handed to ``read_geolife``.
        verbose: ``> 0`` prints a summary, ``> 1`` additionally prints
            progress messages.

    Returns:
        A tuple ``(ranks, n_test)``.  ``ranks`` holds one
        ``(rank, n_predictions)`` tuple per prediction, where ``rank`` is the
        1-based position of the true end cluster among the predictions or
        ``None`` if it was not predicted.  ``n_test`` is ``len(test)``.
    """
    print(path)
    trajs = read_geolife(path)
    # 300 with in_kmh=True: presumably a 300 km/h speed cap -- confirm
    # against speed_filter_abs.
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]
    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)
    if verbose > 1:
        print('Trajectories split at staypoints')

    df = cluster_into_spots(df, init_eps=300, levels=1, threshold=0.2)
    if verbose > 1:
        print('Trajectories clustered in sps')

    # Chronological split; assumes df is indexed by timestamps so that the
    # label slices below select by time -- TODO confirm against make_df.
    train_start = df['start_date'].iloc[0]
    span = df['end_date'].iloc[-1] - train_start
    train_weeks = int((span.days / 7) * 4 / 5)
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    model = TimeSensitiveMostFrequentRoute()
    model.fit(train)
    preds = model.predict_proba(test)

    ranks = []
    # preds is indexed positionally (not iterated) because its container
    # type is not visible here; preds[i] is matched with the i-th test row.
    for i in range(len(preds)):
        candidates = preds[i]
        rank = None
        for position, cluster in enumerate(candidates.index, start=1):
            if cluster == test['end_cluster'].iloc[i]:
                rank = position
                break
        ranks.append((rank, len(candidates)))

    if verbose > 0:
        # Misses carry rank None, which np.mean cannot average (it raises
        # TypeError on None + int), so the average is taken over the
        # predictions that contained the true end cluster.
        ranked = [entry for entry in ranks if entry[0] is not None]
        print('Average rank/pred', np.mean(ranked, axis=0))
        print('Total predictions:', len(test))
    return ranks, len(test)
def prepare(path, verbose=1):
    """Load, clean and cluster the trajectories found at *path*.

    Each trajectory returned by ``read_geolife`` is run through
    ``time_duplication_filter`` and then ``speed_filter_abs``; the cleaned
    trajectories are turned into a frame by ``make_df`` and clustered with
    ``kmeans_cluster_into_spots(df, 20, 100)``.

    Args:
        path: Location handed to ``read_geolife``; it is also printed.
        verbose: Values greater than 1 enable progress messages.

    NOTE(review): as visible here the clustered frame is never returned, so
    callers receive ``None`` -- confirm whether that is intended.
    """
    print(path)
    raw_trajectories = read_geolife(path)
    cleaned = [
        speed_filter_abs(time_duplication_filter(raw), 300, in_kmh=True)
        for raw in raw_trajectories
    ]
    if verbose > 1:
        print('Trajectories preprocessed')

    frame = make_df(cleaned)
    if verbose > 1:
        print('Trajectories split at staypoints')

    frame = kmeans_cluster_into_spots(frame, 20, 100)
    if verbose > 1:
        print('Trajectories clustered in sps')
def run(path, test_traj_len, predict_after_frac=100, verbose=1):
    """Evaluate ``KnownStartEstimator`` on partial test trajectories.

    The trajectories returned by ``read_geolife`` are filtered, converted to
    a frame with ``make_df`` and clustered with ``cluster_into_spots``.  The
    first 4/5 of the covered time span (truncated to whole weeks) is used for
    fitting; test trajectories are obtained from ``get_test_trajs`` for the
    remainder.

    Args:
        path: Location handed to ``read_geolife``; it is also printed.
        test_traj_len: Minimum ``len()`` a test trajectory must have to be
            evaluated.
        predict_after_frac: Slice bound applied to each test trajectory
            before prediction (``traj[:predict_after_frac]``).  Despite the
            name it is used as an element count, not a fraction.
        verbose: Values greater than 1 enable progress messages.

    Returns:
        A list with one tuple per distinct trajectory end element:
        ``(rank, n_predictions)`` with a 1-based ``rank``,
        ``(None, n_predictions)`` when the true end cluster was not
        predicted, or ``(None, None)`` when there were no predictions.
    """
    print(path)
    trajs = read_geolife(path)
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]
    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)
    if verbose > 1:
        print('Trajectories split at staypoints')

    df = cluster_into_spots(df, init_eps=300, levels=0, threshold=0.2)
    if verbose > 1:
        print('Trajectories clustered in sps')

    # Chronological split; assumes df is indexed by timestamps so that the
    # label slices below select by time -- TODO confirm against make_df.
    train_start = df['start_date'].iloc[0]
    span = df['end_date'].iloc[-1] - train_start
    train_weeks = int((span.days / 7) * 4 / 5)
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    test_trajs = get_test_trajs(test, preprocessed)
    kse = KnownStartEstimator()
    kse = kse.fit(train)

    filtered_test_trajs = [
        traj for traj in test_trajs if len(traj) >= test_traj_len
    ]
    if verbose > 1:
        print(len(filtered_test_trajs), "of", len(test_trajs),
              "have the minimal length of", test_traj_len)

    # Predictions are keyed by each trajectory's last element, so
    # trajectories sharing that element overwrite one another.
    res = dict()
    for traj in filtered_test_trajs:
        res[traj[-1]] = kse.predict_proba(traj[:predict_after_frac])

    ranks = []
    for endpoint, predictions in res.items():
        truth = resolve_endcluster(train, endpoint)
        # len() instead of truthiness: predictions exposes ``.index`` and is
        # presumably a pandas object, whose truth value would be ambiguous.
        if predictions is None or len(predictions) < 1:
            ranks.append((None, None))
            continue
        rank = None
        for position, cluster in enumerate(predictions.index, start=1):
            if cluster == truth:
                rank = position
                break
        ranks.append((rank, len(predictions)))
    return ranks
def run(path, verbose=1):
    """Evaluate ``FrequentistEstimator`` on the data found at *path*.

    The trajectories returned by ``read_geolife`` are filtered, converted to
    a frame with ``make_df`` and clustered with
    ``agglomerative_cluster_into_spots``.  The first 4/5 of the covered time
    span (truncated to whole weeks) is used for fitting.  For every distinct
    day in the remaining test rows one prediction is requested, and each
    (start_cluster, end_cluster) pair observed on that day is ranked within
    that prediction.

    Args:
        path: Location handed to ``read_geolife``; it is also printed.
        verbose: ``> 0`` prints a summary, ``> 1`` prints progress messages,
            ``> 2`` prints per-row details.

    Returns:
        ``None`` when fewer than 30 trajectories were read.  Otherwise a
        tuple ``(ranks, n_test, counter_no_prob)``: ``ranks`` is a list of
        ``(rank, n_predictions)`` tuples with 1-based ranks, ``n_test`` is
        ``len(test)`` and ``counter_no_prob`` counts rows whose cluster pair
        was absent from the prediction.
    """
    print(path)
    trajs = read_geolife(path)
    if len(trajs) < 30:
        # Inputs with fewer than 30 trajectories are not evaluated.
        return None

    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]
    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)
    if verbose > 1:
        print('Trajectories split at staypoints')

    df = agglomerative_cluster_into_spots(df, 20, 100)
    if verbose > 1:
        print('Trajectories clustered in sps')

    train_start = df['start_date'].iloc[0]
    span = df['end_date'].iloc[-1] - train_start
    train_weeks = int((span.days / 7) * 4 / 5)
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    # Collapse consecutive test rows of the same (month, day) into one entry.
    # The guard must fire as soon as one date is stored; otherwise the first
    # day can be listed twice and its rows would be scored twice.
    dates = []
    for stamp in test.index:
        day_key = (stamp.month, stamp.day)
        if dates and dates[-1] == day_key:
            continue
        dates.append(day_key)

    bwe = FrequentistEstimator()
    bwe = bwe.fit(train)

    counter_no_prob = 0
    ranks = []
    for month, day in dates:
        # NOTE(review): the query location, year and time of day are
        # hard-coded; only month and day vary -- confirm this is intended.
        x = pd.DataFrame(
            data=[[49.475752, 8.482531]],
            index=pd.DatetimeIndex(
                [pd.Timestamp("2008-{}-{} 19:45:21".format(month, day))]),
            columns=['lat', 'lon'])
        if verbose > 2:
            print(x)
        pred = bwe.predict_proba(x)
        sorted_pred = sorted(pred.items(), key=operator.itemgetter(1),
                             reverse=True)

        # NOTE(review): rows are selected from the whole frame ``df``, not
        # only from ``test`` -- confirm that matching rows from the training
        # period are meant to be included.
        same_day = df.loc[(df.index.month == month) & (df.index.day == day)]
        for _, row in same_day.iterrows():
            route = (row['start_cluster'], row['end_cluster'])
            if verbose > 2:
                print(route)
            if route in pred:
                if verbose > 2:
                    print(pred[route])
                for j, s_pred in enumerate(sorted_pred):
                    if s_pred[0] == route:
                        if verbose > 2:
                            print('Ranked:', j + 1, 'of total', len(pred),
                                  'predictions')
                        ranks.append((j + 1, len(pred)))
                        # Keys are unique, so there is nothing left to find.
                        break
            else:
                if verbose > 2:
                    print("no prob")
                counter_no_prob += 1

    if verbose > 0:
        print('Average rank/pred', np.mean(ranks, axis=0))
        print('Number of no prob:', counter_no_prob)
        print('Total predictions:', len(test))
    return ranks, len(test), counter_no_prob
def run(path, verbose=1):
    """Evaluate ``BayesDepartureTimeEstimator`` on the data found at *path*.

    The trajectories returned by ``read_geolife`` are filtered, converted to
    a frame with ``make_df`` and clustered with ``cluster_into_spots``.  The
    first 4/5 of the covered time span (truncated to whole weeks) is
    transformed with ``DenseDepartureTimes(0.03)`` and used for fitting.  For
    every test row a prediction is requested for the row's start coordinates;
    each predicted key whose resolved time window contains the row's
    timestamp is ranked within that prediction.

    Args:
        path: Location handed to ``read_geolife``; it is also printed.
        verbose: ``> 0`` prints a summary, ``> 1`` additionally prints
            progress messages and every rank.

    Returns:
        A tuple ``(ranks, n_test, counter_prob)``: ``ranks`` is a list of
        ``(rank, n_predictions)`` tuples with 1-based ranks, ``n_test`` is
        ``len(test)`` and ``counter_prob`` is the number of ranked matches.
    """
    print(path)
    trajs = read_geolife(path)
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]
    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)
    if verbose > 1:
        print('Trajectories split at staypoints')

    df = cluster_into_spots(df, init_eps=300, levels=1, threshold=0.2)
    if verbose > 1:
        print('Trajectories clustered in sps')

    train_start = df['start_date'].iloc[0]
    span = df['end_date'].iloc[-1] - train_start
    train_weeks = int((span.days / 7) * 4 / 5)
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    ddf = DenseDepartureTimes(0.03)
    # Fit on a copy so the transformer cannot modify ``train`` itself.
    train_ddt = ddf.fit_transform(train.copy())
    bdte = BayesDepartureTimeEstimator()
    bdte = bdte.fit(train_ddt)

    counter_prob = 0
    ranks = []
    for stamp, row in test.iterrows():
        # Single-row query frame indexed by the test row's own index label.
        x = pd.DataFrame(data=[[row['start_lat'], row['start_lon']]],
                         index=[stamp], columns=['lat', 'lon'])
        pred = bdte.predict_proba(x)
        sorted_pred = sorted(pred.items(), key=operator.itemgetter(1),
                             reverse=True)
        for key in pred.keys():
            # The window is used exactly as resolved; no extra margin is
            # added on either side.
            time_wa, time_wb = bdte.resolve_start_time_cluster(key)
            if len(x.between_time(time_wa, time_wb)) != 1:
                continue
            for j, s_pred in enumerate(sorted_pred):
                if s_pred[0] == key:
                    if verbose > 1:
                        print('Ranked:', j + 1, 'of total', len(pred),
                              'predictions')
                    # Record the rank so the returned list and the printed
                    # average reflect the matches found.
                    ranks.append((j + 1, len(pred)))
                    counter_prob += 1
                    # Keys are unique, so there is nothing left to find.
                    break

    if verbose > 0:
        print('Average rank/pred', np.mean(ranks, axis=0))
        # NOTE(review): the label says "no prob" although the counter holds
        # the number of matches -- confirm which one is intended.
        print('Number of no prob:', counter_prob)
        print('Total predictions:', len(test))
    return ranks, len(test), counter_prob