def find_similar_subroutes_per_test_trip(test_points, train_df, k, paropts=None, verbosity=False):
    """Collect the training subroutes most similar to one test trip.

    Args:
        test_points: Test trip points, converted with ``utils.idx_to_lonlat``.
        train_df: Training trips, handed unchanged to the selected backend.
        k: Number of subsequences requested from the backend.
        paropts: Optional ``(partype, numpar)`` pair. ``partype`` selects
            ``"processes"`` or ``"threads"``; a falsy ``paropts`` (or a falsy
            ``partype``) selects serial execution.
        verbosity: Forwarded to ``serial_execution`` only.

    Returns:
        Whatever the selected backend returned. A truthy but unrecognised
        ``partype`` runs no backend and yields an empty list.
    """
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")

    max_subseqs = []
    if not partype:
        max_subseqs = serial_execution(train_df, test_lonlat, k, verbosity=verbosity)
    elif partype == "processes":
        max_subseqs = exec_with_processes(train_df, numpar, test_lonlat, k)
    elif partype == "threads":
        max_subseqs = exec_with_threads(train_df, numpar, test_lonlat, k)

    if len(max_subseqs) != k:
        print("WARNING: Specified %d subseqs!" % k)
    # The arguments used to be passed as (len(test_points), k), which put the
    # trip length in the "extracted" slot and k in the "-long" slot.
    print("Extracted %d nearest subsequences of a %d-long test trip in: %s" %
          (len(max_subseqs), len(test_points), utils.tictoc(timestart)))
    return max_subseqs
def map_to_features_bow(data_df, grid, output_file):
    """Encode every trajectory as a signed bag-of-cells histogram.

    Each point of a trajectory is assigned to a grid cell with
    ``find_cell_index`` and the cell's counter is incremented. The histogram
    is then multiplied by the value returned by ``find_trip_direction``.

    Args:
        data_df: Dataframe holding the trajectories as strings in a
            ``"points"`` column (or ``"Trajectory"`` when ``"points"`` is
            absent). The column is overwritten in place with the features.
        grid: ``(rows, columns, cell_names)`` triple; ``cell_names`` is
            indexed as ``cell_names[row_idx][col_idx]``.
        output_file: When not ``None``, the modified dataframe is written
            there with ``to_csv`` and nothing is returned.

    Returns:
        The list of feature vectors when ``output_file`` is ``None``,
        otherwise ``None``.
    """
    rows, columns, cell_names = grid
    points_header = "points" if "points" in data_df else "Trajectory"

    # The histogram is the row-major flattening of cell_names, so the stride
    # between grid rows has to be the width of a cell_names row. The previous
    # stride, len(columns), makes neighbouring cells share a bin whenever a
    # cell_names row is wider than len(columns).
    num_cells = sum(len(name_row) for name_row in cell_names)
    row_width = len(cell_names[0]) if len(cell_names) else 0

    labels, features = [], []
    for label, row in data_df.iterrows():
        bow_vector = [0] * num_cells
        # NOTE(review): eval() executes whatever the column contains; only
        # safe while the dataframe comes from a trusted source.
        train_points = eval(row[points_header])
        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")
        for lonlat in train_lonlats:
            lon, lat = lonlat[0], lonlat[1]  # lon -> columns, lat -> rows
            row_idx = find_cell_index(rows, lat)
            col_idx = find_cell_index(columns, lon)
            bow_vector[row_idx * row_width + col_idx] += 1
        bus_direction = find_trip_direction(train_lonlats)
        features.append([bus_direction * count for count in bow_vector])
        labels.append(label)

    # Write back by the labels iterrows() produced; a plain 0..n-1 counter
    # only matches them when the dataframe has a default index.
    for label, feats in zip(labels, features):
        data_df.at[label, points_header] = feats

    if output_file is not None:
        data_df.to_csv(output_file)
    else:
        return features
def serial_execution(df, test_lonlat, k, verbosity=False):
    """Match the test trip against every training trip, one after another.

    For each row of ``df`` the ``"points"`` string is evaluated, converted
    with ``utils.idx_to_lonlat`` and compared to ``test_lonlat`` through
    ``calc_lcss``. The matched indices of all returned subsequences are merged
    into one de-duplicated list, sorted in descending order, and passed to
    ``update_current_maxsubseq`` together with the elapsed time and the row.

    Args:
        df: Training trips; each row needs a ``"points"`` entry.
        test_lonlat: Test trip as lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.
        verbosity: When true, print the retained matches at the end.

    Returns:
        The list maintained by ``update_current_maxsubseq``.
    """
    best_matches = []
    for _, trip in df.iterrows():
        # NOTE(review): eval() on dataframe content - trusted input only.
        candidate_lonlat = utils.idx_to_lonlat(eval(trip["points"]), format="tuples")

        started = utils.tic()
        _, matched_runs = calc_lcss(test_lonlat, candidate_lonlat)
        # Non-consecutive subroutes count too: pool the indices of all runs.
        matched_idxs = list({idx for run in matched_runs for idx in run})
        elapsed = utils.tictoc(started)

        matched_idxs.sort(reverse=True)
        if not matched_idxs:
            continue
        best_matches = update_current_maxsubseq(best_matches, matched_idxs, k, elapsed, trip)

    if verbosity:
        summary = [(x, y, z["tripId"]) for (x, y, z) in best_matches]
        print("Got %d subseqs:" % len(best_matches), summary)
        # The reverse-direction pass (check_reverse_lcss) is disabled, so this
        # second report shows the same matches again.
        print("Got %d reversed: subseqs:" % len(best_matches), summary)
    return best_matches
def exec_with_threads(df, numpar, test_lonlat, k):
    """Run ``calc_lcss`` in ``numpar`` threads, one per sub-dataframe.

    ``df`` is split with ``utils.get_sub_dataframes``; one thread is started
    per part, all threads are joined, and the gathered index lists are sorted
    by decreasing length before a single ``update_current_maxsubseq`` call.

    NOTE(review): the nesting below is reconstructed - confirm it against the
    intended layout.

    Args:
        df: Training trips; each row needs a ``"points"`` entry.
        numpar: Number of threads / sub-dataframes.
        test_lonlat: Test trip as lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.

    Returns:
        The value returned by ``update_current_maxsubseq``.
    """
    max_subseqs = []
    # One result list per thread; both containers are handed, whole, to every
    # thread.
    res1 = [[] for _ in range(numpar)]
    res2 = [[] for _ in range(numpar)]
    subframes = utils.get_sub_dataframes(df, numpar)
    # assign data and start the threads
    threads = []
    timestart = utils.tic()
    for i in range(numpar):
        train_lonlat = []
        # NOTE(review): train_lonlat is overwritten on every row, so with this
        # nesting only the last trip of each sub-dataframe reaches the thread.
        for index, row in subframes[i].iterrows():
            train_points = row["points"]
            # NOTE(review): eval() on dataframe content - trusted input only.
            train_points = eval(train_points)
            train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")
        # NOTE(review): calc_lcss receives four arguments here but two in the
        # other callers, and the thread is not told which slot of res1/res2 is
        # its own - verify against calc_lcss's signature.
        threads.append(
            threading.Thread(target=calc_lcss, args=(test_lonlat, train_lonlat, res1, res2)))
        threads[i].start()
    # gather and merge results
    subseqs = []
    subseqs_idx = []
    for i in range(numpar):
        threads[i].join()
        subseqs += res1[i]
        subseqs_idx += res2[i]
    # longest index lists first
    subseqs_idx = sorted(subseqs_idx, key=lambda x: len(x), reverse=True)
    elapsed = utils.tictoc(timestart)
    # NOTE(review): `row` is whatever the last iterrows() step left behind
    # (and is unbound if every sub-dataframe was empty).
    max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx, k, elapsed, row)
    return max_subseqs
def map_to_features_vlad(data_df, grid, output_file):
    """Encode every trajectory as a soft assignment to grid-cell centroids.

    Centroids are the midpoints between consecutive grid lines, plus one
    extra midpoint beyond each outermost line placed half a mean spacing away.
    For every point the distances to all centroids are L2-normalised, turned
    into ``1 - d / norm`` and accumulated; the accumulated vector is then
    passed through the same ``1 - v / norm`` transform.

    Args:
        data_df: Dataframe holding the trajectories as strings in a
            ``"points"`` column (or ``"Trajectory"`` when ``"points"`` is
            absent). The column is overwritten in place with the features.
        grid: ``(rows, columns, cell_names)`` triple; only the grid lines
            ``rows`` and ``columns`` are used here.
        output_file: When not ``None``, the modified dataframe is written
            there with ``to_csv`` and nothing is returned.

    Returns:
        The list of feature vectors when ``output_file`` is ``None``,
        otherwise ``None``.
    """
    print("Computing VLAD encoding")
    rows, columns, cell_names = grid

    # internal midpoints
    r_dists = [u - d for (u, d) in zip(rows[1:], rows[:-1])]
    c_dists = [u - d for (u, d) in zip(columns[1:], columns[:-1])]
    r_mids = [r + m / 2 for (r, m) in zip(rows[:-1], r_dists)]
    c_mids = [c + m / 2 for (c, m) in zip(columns[:-1], c_dists)]
    # edge midpoints, with distance the mean of the internals
    r_step, c_step = np.mean(r_dists), np.mean(c_dists)
    r_mids = [rows[0] - r_step / 2] + r_mids + [rows[-1] + r_step / 2]
    c_mids = [columns[0] - c_step / 2] + c_mids + [columns[-1] + c_step / 2]
    centroids = [(c, r) for c in c_mids for r in r_mids]

    points_header = "points" if "points" in data_df else "Trajectory"
    labels, features = [], []
    for label, row in data_df.iterrows():
        vlad_vector = [0 for _ in centroids]
        # NOTE(review): eval() on dataframe content - trusted input only.
        train_points = eval(row[points_header])
        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")
        for lonlat in train_lonlats:
            # distance from each centroid, normalised over all centroids
            dists = [utils.euc_dist(centroid, lonlat) for centroid in centroids]
            dnorm = np.sqrt(sum(d ** 2 for d in dists))
            closeness = [1 - d / dnorm for d in dists]
            vlad_vector = [v + c for (v, c) in zip(vlad_vector, closeness)]

        # Normalise by the norm of the accumulated vector itself. The norm
        # used to be taken from `dists`, i.e. from the last point only, which
        # was stale or undefined for a trajectory without points.
        # NOTE(review): the trailing "1 - ..." mirrors the per-point transform
        # and is kept as it was - confirm it is wanted at this stage.
        vnorm = np.sqrt(sum(v ** 2 for v in vlad_vector))
        if vnorm:
            vlad_vector = [1 - v / vnorm for v in vlad_vector]
        features.append(vlad_vector)
        labels.append(label)
    print("Done computing VLAD encoding")

    # Write back by the labels iterrows() produced; a plain 0..n-1 counter
    # only matches them when the dataframe has a default index.
    for label, feats in zip(labels, features):
        data_df.at[label, points_header] = feats

    if output_file is not None:
        data_df.to_csv(output_file)
    else:
        return features
def calculate_nns(test_points, train_df, paropts=None, k=5, unique_jids=False):
    """Return the ``k`` training trips closest to a test trip.

    Args:
        test_points: Test trip points, converted with ``utils.idx_to_lonlat``.
        train_df: Training trips, handed unchanged to the selected backend.
        paropts: Optional ``(partype, numpar)`` pair. ``partype`` selects
            ``"processes"`` or ``"threads"``; a falsy ``paropts`` (or a falsy
            ``partype``) selects serial execution via ``calculate_dists``.
        k: Number of neighbours to keep.
        unique_jids: When true, keep only the closest neighbour of every
            journey id (element 2 of each neighbour record).

    Returns:
        At most ``k`` neighbour records sorted by increasing distance
        (element 1 of each record).

    Raises:
        ValueError: If ``partype`` is neither ``"processes"`` nor
            ``"threads"``. This case used to surface as a ``TypeError`` from
            sorting a placeholder list.
    """
    # parallelization type
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")

    if not partype:
        nearest_neighbours = calculate_dists(test_lonlat, train_df)
    elif partype == "processes":
        nearest_neighbours = run_with_processes(numpar, test_lonlat, train_df)
    elif partype == "threads":
        nearest_neighbours = run_with_threads(numpar, test_lonlat, train_df)
    else:
        raise ValueError("Unknown parallelization type: %r" % (partype,))

    # sort the list to increasing distance
    nearest_neighbours = sorted(nearest_neighbours, key=lambda neighbour: neighbour[1])

    # keep unique jids, if needed
    if unique_jids:
        print("Restricting to single neighbour per jid")
        seen_jids = set()
        closest_per_jid = []
        for neighbour in nearest_neighbours:
            jid = neighbour[2]
            if jid not in seen_jids:
                seen_jids.add(jid)
                closest_per_jid.append(neighbour)
        nearest_neighbours = closest_per_jid

    # return the top k
    nearest_neighbours = nearest_neighbours[:k]
    print("Neighbours:", [n[0] for n in nearest_neighbours])
    # The arguments used to be passed as (len(test_points), k), which put the
    # trip length in the "extracted" slot and k in the "-long" slot.
    print("Extracted %d nearest neighbours of a %d-long test trip in: %s" %
          (len(nearest_neighbours), len(test_points), utils.tictoc(timestart)))
    return nearest_neighbours
def check_reverse_lcss(max_subseqs, test_lonlat, k):
    """Re-match every retained trip against the test trip in reverse order.

    For each ``(indices, elapsed, row)`` entry the trip's points are reversed
    and run through ``calc_lcss`` again. When that yields index lists, each
    list is reversed, all indices are pooled into one de-duplicated list and
    offered to ``update_current_maxsubseq``.

    Args:
        max_subseqs: Entries of the form ``(indices, elapsed, row)``.
        test_lonlat: Test trip as lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.

    Returns:
        The list as last returned by ``update_current_maxsubseq`` (or the
        input list when no reverse match was found).
    """
    # The loop walks the list object passed in, even though the name
    # max_subseqs is rebound inside the body.
    for _, elapsed, trip in max_subseqs:
        # NOTE(review): eval() on dataframe content - trusted input only.
        forward_lonlat = utils.idx_to_lonlat(eval(trip["points"]), format="tuples")
        # Forward match: its result is not used, the call is kept as it was.
        calc_lcss(test_lonlat, forward_lonlat)

        backward_lonlat = forward_lonlat[::-1]
        _, reversed_runs = calc_lcss(test_lonlat, backward_lonlat)
        if not reversed_runs:
            continue

        # re-reverse each run, then pool the indices of all runs
        restored_runs = [run[::-1] for run in reversed_runs]
        pooled_idxs = list({idx for run in restored_runs for idx in run})
        max_subseqs = update_current_maxsubseq(max_subseqs, pooled_idxs, k, elapsed, trip)
    return max_subseqs
def visualize_trips(output_folder, df, num_visualize=5):
    """Render a random selection of trips to HTML and JPG files.

    Files are named ``mapplot<i>.html`` and ``mapplot<i>.html.jpg`` inside
    ``output_folder``, with ``i`` counting the selected trips from 0.

    Args:
        output_folder: Directory the output files are written to.
        df: Dataframe of trips; rows are picked by position.
        num_visualize: Maximum number of trips to draw. Capped at the number
            of rows, so a short dataframe no longer raises ``IndexError``.
    """
    output_file_base = os.path.join(output_folder, "mapplot")
    num_visualize = max(0, min(num_visualize, len(df)))

    idxs = list(range(len(df)))
    random.shuffle(idxs)
    selected = idxs[:num_visualize]
    print("Randomly selected %d trips to visualize, with indexes" % num_visualize, selected)

    for i, idx in enumerate(selected):
        # One-row positional slice, so the helper sees a dataframe.
        total_pts = utils.get_total_points(df[idx : idx + 1])[0]
        file_name = output_file_base + str(i) + ".html"
        # get point coordinates lon-lat
        points_lonlat = [utils.idx_to_lonlat(total_pts)]
        # produce output htmls
        utils.write_group_gml(points_lonlat, file_name)
        # produce output jpg
        utils.html_to_png(file_name, file_name + ".jpg")
def map_to_features_bow_bigrams(data_df, grid, output_file):
    """Encode every trajectory as a histogram of consecutive cell pairs.

    Every pair of consecutive points contributes one count to the unordered
    pair of cells the two points fall in. The vector lists all pairs of
    distinct cells first (in ``itertools.combinations`` order), followed by
    the same-cell pairs.

    Args:
        data_df: Dataframe holding the trajectories as strings in a
            ``"points"`` column (or ``"Trajectory"`` when ``"points"`` is
            absent). The column is overwritten in place with the features.
        grid: ``(rows, columns, cell_names)`` triple; ``cell_names`` is
            treated as a row-major table of cells.
        output_file: When not ``None``, the modified dataframe is written
            there with ``to_csv`` and nothing is returned.

    Returns:
        The list of feature vectors when ``output_file`` is ``None``,
        otherwise ``None``.
    """
    rows, columns, cell_names = grid
    points_header = "points" if "points" in data_df else "Trajectory"

    num_cells = sum(len(name_row) for name_row in cell_names)
    # Cells are numbered by flattening cell_names row by row, so the stride
    # between grid rows is the width of a cell_names row. The previous stride,
    # len(columns), aliases cells whenever a row is wider than len(columns).
    row_width = len(cell_names[0]) if len(cell_names) else 0

    # vector order: all (cell_a, cell_b) with a < b, then every (cell, cell)
    cell_pairs = list(combinations(range(num_cells), 2))
    cell_pairs.extend((n, n) for n in range(num_cells))
    pairs_to_idxs = {pair: position for position, pair in enumerate(cell_pairs)}
    bigram_dim = len(pairs_to_idxs)

    labels, features = [], []
    for label, row in data_df.iterrows():
        bow_vector = [0] * bigram_dim
        # NOTE(review): eval() on dataframe content - trusted input only.
        train_points = eval(row[points_header])
        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")
        # loop into bigrams
        for i in range(len(train_lonlats) - 1):
            lon1, lat1 = tuple(train_lonlats[i][0:2])
            lon2, lat2 = tuple(train_lonlats[i + 1][0:2])
            # Longitudes are located on the column lines; they used to be
            # looked up against `rows`.
            r1, c1 = find_cell_index(rows, lat1), find_cell_index(columns, lon1)
            r2, c2 = find_cell_index(rows, lat2), find_cell_index(columns, lon2)
            linear_idx1 = r1 * row_width + c1
            linear_idx2 = r2 * row_width + c2
            # pairs are unordered: look them up smallest index first
            pair = tuple(sorted((linear_idx1, linear_idx2)))
            bow_vector[pairs_to_idxs[pair]] += 1
        features.append(bow_vector)
        labels.append(label)

    # Write back by the labels iterrows() produced; a plain 0..n-1 counter
    # only matches them when the dataframe has a default index.
    for label, feats in zip(labels, features):
        data_df.at[label, points_header] = feats

    if output_file is not None:
        data_df.to_csv(output_file)
    else:
        return features
def calculate_dists(test_lonlat, train_df, ret_container=None, paropts=None):
    """Compute the DTW distance from the test trip to every training trip.

    Args:
        test_lonlat: Test trip as lon/lat tuples.
        train_df: Training trips with ``"points"``, ``"journeyId"`` and
            ``"tripId"`` entries per row.
        ret_container: Optional pre-allocated sequence to write results into.
            When ``None`` a list of ``len(train_df)`` placeholders (``-1``)
            is created.
        paropts: Forwarded to ``calculate_dynamic_time_warping``.

    Returns:
        The container, holding at slot ``index`` (the row's dataframe index)
        the tuple ``(int(tripId), distance, journeyId, evaluated points)``.
    """
    dists = ret_container if ret_container is not None else [-1] * len(train_df)

    for index, trip in train_df.iterrows():
        raw_points = trip["points"]
        jid = trip["journeyId"]
        # NOTE(review): eval() on dataframe content - trusted input only.
        train_points = eval(raw_points)
        trip_lonlat = utils.idx_to_lonlat(train_points, format="tuples")

        distance = calculate_dynamic_time_warping(test_lonlat, trip_lonlat, paropts)
        # Results are stored under the dataframe index label, not a counter.
        dists[index] = (int(trip["tripId"]), distance, jid, train_points)
    return dists
def map_to_features_pointwise(data_df, grid):
    """Translate every trajectory point into the name of its grid cell.

    Also prints how many points fell into each cell.

    Args:
        data_df: Dataframe holding the trajectories as strings in a
            ``"points"`` column (or ``"Trajectory"`` when ``"points"`` is
            absent).
        grid: ``(rows, columns, cell_names)`` triple; ``cell_names`` is
            indexed as ``cell_names[row_idx][col_idx]``.

    Returns:
        ``(features, timestamps)``: per trajectory, the list of
        ``'C' + cell name`` strings and the list of first elements of its
        raw points.
    """
    rows, columns, cell_names = grid
    numcells = (len(rows) + 1) * (len(columns) + 1)
    points_header = "points" if "points" in data_df else "Trajectory"

    # per-cell hit counters, keyed by the prefixed cell name
    grid_hist = {'C' + str(c): 0 for cc in cell_names for c in cc}
    total_points = 0

    features, timestamps = [], []
    for _, trip in data_df.iterrows():
        # NOTE(review): eval() on dataframe content - trusted input only.
        raw_points = eval(trip[points_header])
        timestamps.append([p[0] for p in raw_points])

        visited_cells = []
        for lonlat in utils.idx_to_lonlat(raw_points, format="tuples"):
            # latitude picks the grid row, longitude the grid column
            row_idx = find_cell_index(rows, lonlat[1])
            col_idx = find_cell_index(columns, lonlat[0])
            cell_name = 'C' + cell_names[row_idx][col_idx]
            grid_hist[cell_name] += 1
            total_points += 1
            visited_cells.append(cell_name)
        features.append(visited_cells)

    # show stats
    print()
    print("Grid assignment frequencies of the total of %d points:" % total_points)
    for i, name in enumerate(grid_hist):
        print(i, "/", numcells, name, grid_hist[name])
    return features, timestamps
def exec_with_processes(df, process_num, test_lonlat, k):
    """Match the test trip against every training trip through a worker pool.

    Each ``calc_lcss`` call is submitted to the pool and its result awaited
    before the next row is handled, so the rows are still processed one at a
    time. The returned index lists are sorted by decreasing length and passed
    to ``update_current_maxsubseq``.

    Args:
        df: Training trips; each row needs a ``"points"`` entry.
        process_num: Number of workers of the ``ThreadPool``.
        test_lonlat: Test trip as lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.

    Returns:
        The list maintained by ``update_current_maxsubseq``.
    """
    max_subseqs = []
    pool = ThreadPool(processes=process_num)
    # try/finally: the pool used to be left open whenever a row raised.
    try:
        for _, row in df.iterrows():
            # NOTE(review): eval() on dataframe content - trusted input only.
            train_points = eval(row["points"])
            train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")

            timestart = utils.tic()
            # compute common subsequences between the test trip and the
            # current candidate
            async_result = pool.apply_async(calc_lcss, (test_lonlat, train_lonlat))
            _, subseqs_idx = async_result.get()
            elapsed = utils.tictoc(timestart)

            # sort by decr. length
            subseqs_idx = sorted(subseqs_idx, key=len, reverse=True)
            # update the list of the longest subsequences
            max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx, k, elapsed, row)
        print("Got %d common subsequences" % len(max_subseqs))
    finally:
        pool.close()
        pool.join()
    return max_subseqs
def preprocessing_for_visualisation(test_points, max_subseqs, file_name, index):
    """Build coloured point sequences for the test trip and its matches.

    The test trip is drawn as one blue ('b') sequence. Every entry of
    ``max_subseqs`` is split into index ranges: matched ranges are coloured
    red ('r') and the stretches before / between / after them blue. The
    result is handed to ``utils.visualize_point_sequences``.

    NOTE(review): the nesting below is reconstructed - confirm it against the
    intended layout.

    Args:
        test_points: Points of the test trip, passed to
            ``utils.get_lonlat_tuple``.
        max_subseqs: Entries indexed as ``[0]`` lists of matched indices,
            ``[1]`` a value printed as "Delta-t", ``[2]`` a row with
            ``"journeyId"`` and ``"points"``.
        file_name: Forwarded to ``utils.visualize_point_sequences``.
        index: Identifier shown in the test trip's label.
    """
    # initialize to the test trip data
    labels = ["test trip: %s" % index]
    # test jid
    points = [[utils.get_lonlat_tuple(test_points)]]
    colors = [['b']]
    for j, sseq in enumerate(max_subseqs):
        # colours, index ranges and point tuples of this neighbour, in step
        cols, print_idxs, pts = [], [], []
        # trip jid
        jid = sseq[2]["journeyId"]
        subseq_idxs = sseq[0]
        num_points = sum([len(x) for x in subseq_idxs])
        # label
        # NOTE(review): this local shadows the builtin `str` for the rest of
        # the function.
        str = [ "neighbour %d" % j, "jid: %s" % jid, "Matching pts: %d" % num_points, "Delta-t: %s " % sseq[1] ]
        labels.append("\n".join(str))
        # print("seq first/last idxs:",[(s[0],s[-1]) for s in subseq_idxs])
        # get the points data from the pandas dataframe to lonlat tuples
        train_points = sseq[2]["points"]
        # NOTE(review): eval() on dataframe content - trusted input only.
        train_points = utils.idx_to_lonlat(eval(train_points), format='tuples')
        # prepend blue, if list is not starting at first point
        if subseq_idxs[0][0] > 0:
            # net effect: indices 0 .. first matched index, inclusive
            idxs = list(range(subseq_idxs[0][0] + 2))
            print_idxs.append(list(range(idxs[0], idxs[-1])))
            cols.append('b')
        # for each sequence, make the matching red and the following blue, if any
        for seq_idx, idxs in enumerate(subseq_idxs):
            # the match
            if idxs:
                print_idxs.append(idxs)
                cols.append('r')
            # check for a following blue portion: not existent iff last seq idx is last idx of the trip
            # NOTE(review): with this nesting an empty idxs raises IndexError
            # here - confirm whether the rest of the loop belongs under
            # `if idxs:`.
            if idxs[-1] == len(train_points) - 1:
                continue
            # else, either up to first point of next subsequence, or last point in row
            if seq_idx == len(subseq_idxs) - 1:
                next_seq_first_pt = len(train_points) - 1
            else:
                next_seq_first_pt = subseq_idxs[seq_idx + 1][0]
            # blue it up
            b = list(range(idxs[-1], next_seq_first_pt + 1))
            # NOTE(review): b[0] is the first index itself, so the stretch is
            # skipped when it starts at index 0, and an empty b raises
            # IndexError.
            if b[0]:
                print_idxs.append(b)
                cols.append('b')
        # append the points corresponding to the indexes
        for i, idx_list in enumerate(print_idxs):
            pts.append( utils.get_lonlat_tuple([train_points[i] for i in idx_list]))
            # print("Idx list:",idx_list[0],idx_list[-1],"col:",cols[i])
        # add to the list of points to draw
        points.append(pts)
        colors.append(cols)
        # print("Added pts:", pts)
        # print("Added cols:", cols)
    # send the whole parameter bundle to be drawn
    utils.visualize_point_sequences(points, colors, labels, file_name)