예제 #1
0
def find_similar_subroutes_per_test_trip(test_points,
                                         train_df,
                                         k,
                                         paropts=None,
                                         verbosity=False):
    """Find the training subroutes most similar to one test trip.

    The test points are converted with ``utils.idx_to_lonlat`` and the search
    is delegated to ``serial_execution``, ``exec_with_processes`` or
    ``exec_with_threads`` depending on ``paropts``.

    Args:
        test_points: Points of the test trip, in the format accepted by
            ``utils.idx_to_lonlat``.
        train_df: Training trips, forwarded unchanged to the executor.
        k: Number of subsequences requested.
        paropts: Optional ``(partype, numpar)`` pair. ``partype`` may be
            ``"processes"`` or ``"threads"``; any other truthy value runs no
            search and leaves the result empty. A falsy ``paropts`` (or
            ``partype``) selects serial execution.
        verbosity: Forwarded to ``serial_execution`` only.

    Returns:
        The list produced by the selected executor.
    """
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")
    max_subseqs = []
    if partype:
        # numpar is the number of threads or processes
        if partype == "processes":
            max_subseqs = exec_with_processes(train_df, numpar, test_lonlat, k)
        elif partype == "threads":
            max_subseqs = exec_with_threads(train_df, numpar, test_lonlat, k)
    else:
        max_subseqs = serial_execution(train_df,
                                       test_lonlat,
                                       k,
                                       verbosity=verbosity)
    if len(max_subseqs) != k:
        print("WARNING: Specified %d subseqs!" % k)
    # Report the number of subsequences actually found first and the length
    # of the test trip second, matching the placeholders in the message.
    print("Extracted %d nearest subsequences of a %d-long test trip in: %s" %
          (len(max_subseqs), len(test_points), utils.tictoc(timestart)))
    return max_subseqs
예제 #2
0
def map_to_features_bow(data_df, grid, output_file):
    """Encode every trajectory as a signed bag-of-cells count vector.

    For each row, every point is assigned to a grid cell with
    ``find_cell_index`` and the corresponding counter is incremented. The
    whole vector is then multiplied by the value returned by
    ``find_trip_direction``. The vectors replace the contents of the points
    column of ``data_df`` in place.

    Args:
        data_df: DataFrame with a ``"points"`` (or, failing that,
            ``"Trajectory"``) column holding the string representation of a
            point list.
        grid: ``(rows, columns, cell_names)`` triple.
        output_file: If not ``None``, ``data_df`` is written there as CSV.

    Returns:
        The list of feature vectors when ``output_file`` is ``None``;
        otherwise ``None``.
    """
    rows, columns, cell_names = grid
    points_header = "points" if "points" in data_df else "Trajectory"
    num_cells = sum(len(name_row) for name_row in cell_names)

    features = []
    for _, row in data_df.iterrows():
        # NOTE(security): eval executes arbitrary code; only use this on
        # trusted data files.
        train_points = eval(row[points_header])
        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")

        bow_vector = [0] * num_cells
        for lonlat in train_lonlats:
            # longitude selects the column, latitude selects the row
            row_idx = find_cell_index(rows, lonlat[1])
            col_idx = find_cell_index(columns, lonlat[0])
            # NOTE(review): the row stride is len(columns); confirm that this
            # equals the number of cells per row in cell_names.
            bow_vector[row_idx * len(columns) + col_idx] += 1

        bus_direction = find_trip_direction(train_lonlats)
        features.append([bus_direction * x for x in bow_vector])

    # Write back by index label rather than by position, so frames whose
    # index is not 0..n-1 are updated instead of silently enlarged.
    for label, feats in zip(data_df.index, features):
        data_df.at[label, points_header] = feats
    if output_file is not None:
        data_df.to_csv(output_file)
    else:
        return features
예제 #3
0
def serial_execution(df, test_lonlat, k, verbosity=False):
    """Scan the training trips one by one and keep the best LCSS matches.

    For every row of ``df`` the common subsequences with ``test_lonlat`` are
    computed with ``calc_lcss``. The indices of all returned subsequences are
    merged into one duplicate-free list, sorted in descending order, and
    handed to ``update_current_maxsubseq`` together with the elapsed time and
    the row.

    Args:
        df: Training trips; each row needs a ``"points"`` column holding the
            string representation of a point list.
        test_lonlat: Test trip, already converted to lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.
        verbosity: When true, print the collected result (twice, under two
            different headings).

    Returns:
        The list maintained by ``update_current_maxsubseq``.
    """
    best = []
    for _, trip in df.iterrows():
        # NOTE(security): eval executes arbitrary code; trusted input only.
        raw_points = eval(trip["points"])
        candidate = utils.idx_to_lonlat(raw_points, format="tuples")

        started = utils.tic()
        _, idx_lists = calc_lcss(test_lonlat, candidate)
        # merge all subsequences, so non-consecutive subroutes count as one
        merged = {idx for seq in idx_lists for idx in seq}
        elapsed = utils.tictoc(started)

        if not merged:
            continue
        ordered = sorted(merged, reverse=True)
        best = update_current_maxsubseq(best, ordered, k, elapsed, trip)

    if verbosity:
        # The reverse-LCSS pass is currently disabled, so both headings
        # report the same list.
        for heading in ("Got %d subseqs:", "Got %d reversed: subseqs:"):
            print(heading % len(best),
                  [(x, y, z["tripId"]) for (x, y, z) in best])

    return best
예제 #4
0
def exec_with_threads(df, numpar, test_lonlat, k):
    """Compute the best LCSS matches using ``numpar`` worker threads.

    ``df`` is split with ``utils.get_sub_dataframes``; each thread processes
    every row of its own sub-frame and stores
    ``(subseqs_idx, elapsed, row)`` triples in a private result slot. After
    all threads have finished, the triples are merged on the calling thread
    through ``update_current_maxsubseq``.

    The previous implementation only handed the last row of each sub-frame to
    its thread and reused a leaked loop variable for the final update.
    ``calc_lcss`` is called here in the two-argument form that the serial and
    pool-based executors use.

    Args:
        df: Training trips; each row needs a ``"points"`` column holding the
            string representation of a point list.
        numpar: Number of threads (and of sub-frames requested).
        test_lonlat: Test trip, already converted to lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.

    Returns:
        The list maintained by ``update_current_maxsubseq``.
    """
    subframes = utils.get_sub_dataframes(df, numpar)
    # one private result list per thread, so no locking is needed
    results = [[] for _ in range(numpar)]

    def _worker(slot):
        """Process all rows of sub-frame ``slot`` into ``results[slot]``."""
        for _, row in subframes[slot].iterrows():
            # NOTE(security): eval executes arbitrary code; trusted input only.
            train_points = eval(row["points"])
            train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")
            timestart = utils.tic()
            _, subseqs_idx = calc_lcss(test_lonlat, train_lonlat)
            elapsed = utils.tictoc(timestart)
            # sort by decreasing length
            subseqs_idx = sorted(subseqs_idx, key=len, reverse=True)
            results[slot].append((subseqs_idx, elapsed, row))

    threads = [
        threading.Thread(target=_worker, args=(slot,))
        for slot in range(numpar)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # merge on the calling thread, in sub-frame order
    max_subseqs = []
    for slot_results in results:
        for subseqs_idx, elapsed, row in slot_results:
            max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx,
                                                   k, elapsed, row)
    return max_subseqs
예제 #5
0
def map_to_features_vlad(data_df, grid, output_file):
    """Encode every trajectory against the grid's cell midpoints.

    Centroids are built from the midpoints between consecutive grid lines
    plus one extra midpoint beyond each edge (placed half the mean spacing
    outside). For each point, its distances to all centroids are
    L2-normalised and turned into ``1 - d / norm``; these values are summed
    over the trajectory. The summed vector is finally transformed the same
    way using its own L2 norm.

    The final step previously reused the distance list of the last processed
    point, which was stale (or undefined for an empty trajectory); it now
    uses the norm of the accumulated vector itself.

    Args:
        data_df: DataFrame with a ``"points"`` (or, failing that,
            ``"Trajectory"``) column holding the string representation of a
            point list. The column is overwritten with the feature vectors.
        grid: ``(rows, columns, cell_names)`` triple; only the first two are
            used here.
        output_file: If not ``None``, ``data_df`` is written there as CSV.

    Returns:
        The list of feature vectors when ``output_file`` is ``None``;
        otherwise ``None``.
    """
    print("Computing VLAD encoding")
    rows, columns, cell_names = grid
    # internal midpoints
    r_dists = [u - d for (u, d) in zip(rows[1:], rows[:-1])]
    c_dists = [u - d for (u, d) in zip(columns[1:], columns[:-1])]
    r_mids = [r + m / 2 for (r, m) in zip(rows[:-1], r_dists)]
    c_mids = [c + m / 2 for (c, m) in zip(columns[:-1], c_dists)]
    # edge midpoints, with distance the mean of the internals
    r_dists, c_dists = np.mean(r_dists), np.mean(c_dists)
    r_mids = [rows[0] - r_dists / 2] + r_mids + [rows[-1] + r_dists / 2]
    c_mids = [columns[0] - c_dists / 2] + c_mids + [columns[-1] + c_dists / 2]

    centroids = [(c, r) for c in c_mids for r in r_mids]

    points_header = "points" if "points" in data_df else "Trajectory"

    features = []
    for _, row in data_df.iterrows():
        vlad_vector = [0 for _ in centroids]
        # NOTE(security): eval executes arbitrary code; trusted input only.
        train_points = eval(row[points_header])

        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")
        for lonlat in train_lonlats:
            # distance from each centroid, L2-normalised per point
            dists = [utils.euc_dist(centroid, lonlat)
                     for centroid in centroids]
            dnorm = np.sqrt(sum(d ** 2 for d in dists))
            vlad_vector = [
                v + (1 - d / dnorm) for (v, d) in zip(vlad_vector, dists)
            ]

        # Normalise with the accumulated vector's own norm. An all-zero
        # vector (e.g. an empty trajectory) is left unchanged to avoid a
        # division by zero.
        vnorm = np.sqrt(sum(v ** 2 for v in vlad_vector))
        if vnorm > 0:
            vlad_vector = [1 - v / vnorm for v in vlad_vector]
        features.append(vlad_vector)

    print("Done computing VLAD encoding")
    # Write back by index label rather than by position, so frames whose
    # index is not 0..n-1 are updated instead of silently enlarged.
    for label, feats in zip(data_df.index, features):
        data_df.at[label, points_header] = feats
    if output_file is not None:
        data_df.to_csv(output_file)
    else:
        return features
예제 #6
0
def calculate_nns(test_points, train_df, paropts=None, k=5, unique_jids=False):
    """Return the ``k`` training trips closest to a test trip.

    Distances are computed by ``calculate_dists`` (serial),
    ``run_with_processes`` or ``run_with_threads``; the resulting entries are
    sorted by their second element (the distance) in increasing order.

    Args:
        test_points: Points of the test trip, in the format accepted by
            ``utils.idx_to_lonlat``.
        train_df: Training trips, forwarded to the distance routine.
        paropts: Optional ``(partype, numpar)`` pair; ``partype`` must be
            ``"processes"`` or ``"threads"``. A falsy ``paropts`` (or
            ``partype``) selects serial execution.
        k: Maximum number of neighbours to return.
        unique_jids: When true, keep only the closest neighbour for each
            journey id (third element of every entry).

    Returns:
        A list of at most ``k`` neighbour entries, nearest first.

    Raises:
        ValueError: If ``partype`` is truthy but not a supported value.
    """
    # parallelization type
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")

    if not partype:
        # serial execution
        nearest_neighbours = calculate_dists(test_lonlat, train_df)
    elif partype == "processes":
        nearest_neighbours = run_with_processes(numpar, test_lonlat, train_df)
    elif partype == "threads":
        nearest_neighbours = run_with_threads(numpar, test_lonlat, train_df)
    else:
        # Previously this fell through with placeholder values and crashed
        # in the sort below with an unrelated TypeError.
        raise ValueError("Unknown parallelization type: %r" % (partype,))

    # sort the list by increasing distance
    nearest_neighbours = sorted(nearest_neighbours, key=lambda nn: nn[1])

    # keep unique jids, if needed; the list is sorted, so the first entry
    # seen for a jid is its closest one
    if unique_jids:
        print("Restricting to single neighbour per jid")
        seen_jids = set()
        unique_neighbours = []
        for nn in nearest_neighbours:
            jid = nn[2]
            if jid not in seen_jids:
                seen_jids.add(jid)
                unique_neighbours.append(nn)
        nearest_neighbours = unique_neighbours

    # return the top k
    nearest_neighbours = nearest_neighbours[:k]
    print("Neighbours:", [n[0] for n in nearest_neighbours])
    # Report the number of neighbours returned first and the length of the
    # test trip second, matching the placeholders in the message.
    print("Extracted %d nearest neighbours of a %d-long test trip in: %s" %
          (len(nearest_neighbours), len(test_points),
           utils.tictoc(timestart)))
    return nearest_neighbours
예제 #7
0
def check_reverse_lcss(max_subseqs, test_lonlat, k):
    """Re-score the current matches against their reversed trajectories.

    For every ``(indices, elapsed, row)`` entry, the row's trajectory is
    reversed and matched against ``test_lonlat`` with ``calc_lcss``. If any
    subsequence is found, the merged, duplicate-free indices are passed to
    ``update_current_maxsubseq`` (sorted in descending order, as
    ``serial_execution`` does).

    Args:
        max_subseqs: Current list of ``(indices, elapsed, row)`` entries.
        test_lonlat: Test trip as lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.

    Returns:
        The list returned by the last ``update_current_maxsubseq`` call, or
        ``max_subseqs`` itself if no reversed match was found.
    """
    # Iterate over a snapshot: the name is rebound (and the list may be
    # modified by the helper) while we loop.
    for (_, elapsed, row) in list(max_subseqs):
        # NOTE(security): eval executes arbitrary code; trusted input only.
        train_pts = eval(row["points"])
        train_lonlat = utils.idx_to_lonlat(train_pts, format="tuples")
        # get reversed points
        reversed_lonlat = train_lonlat[::-1]
        _, idx_lists = calc_lcss(test_lonlat, reversed_lonlat)
        if not idx_lists:
            continue
        # NOTE(review): the indices are not mapped back to positions in the
        # un-reversed trajectory; confirm whether that is intended.
        idxs = sorted({idx for seq in idx_lists for idx in seq}, reverse=True)
        max_subseqs = update_current_maxsubseq(max_subseqs, idxs, k, elapsed,
                                               row)
    return max_subseqs
예제 #8
0
def visualize_trips(output_folder, df):
    """Render up to five randomly chosen trips of ``df`` as map plots.

    For each selected trip an HTML file ``mapplot<i>.html`` is written into
    ``output_folder`` with ``utils.write_group_gml`` and then converted with
    ``utils.html_to_png`` into ``mapplot<i>.html.jpg``.

    Args:
        output_folder: Directory that receives the output files.
        df: DataFrame of trips; rows are selected by position.
    """
    output_file_base = os.path.join(output_folder, "mapplot")
    # never ask for more trips than the frame holds
    num_visualize = min(5, len(df))
    idxs = list(range(len(df)))
    random.shuffle(idxs)
    selected = idxs[:num_visualize]
    print("Randomly selected %d trips to visualize, with indexes" % num_visualize, selected)
    for i, idx in enumerate(selected):
        total_pts = utils.get_total_points(df[idx : idx + 1])[0]
        file_name = output_file_base + str(i) + ".html"
        # get point coordinates lon-lat
        points_lonlat = [utils.idx_to_lonlat(total_pts)]
        # produce output html
        utils.write_group_gml(points_lonlat, file_name)
        # produce output image
        utils.html_to_png(file_name, file_name + ".jpg")
예제 #9
0
def map_to_features_bow_bigrams(data_df, grid, output_file):
    """Encode every trajectory as a bag of unordered cell bigrams.

    Each pair of consecutive points is mapped to the (sorted) pair of grid
    cells the two points fall in, and the counter of that pair is
    incremented. The vector holds one slot per unordered pair of distinct
    cells, in ``itertools.combinations`` order, followed by one slot per
    ``(cell, cell)`` pair.

    Args:
        data_df: DataFrame with a ``"points"`` (or, failing that,
            ``"Trajectory"``) column holding the string representation of a
            point list. The column is overwritten with the feature vectors.
        grid: ``(rows, columns, cell_names)`` triple.
        output_file: If not ``None``, ``data_df`` is written there as CSV.

    Returns:
        The list of feature vectors when ``output_file`` is ``None``;
        otherwise ``None``.
    """
    rows, columns, cell_names = grid

    points_header = "points" if "points" in data_df else "Trajectory"

    num_cells = sum(len(name_row) for name_row in cell_names)
    # vector order for N cells: (0, 1), (0, 2), ..., then (0, 0), (1, 1), ...
    cell_pairs = list(combinations(range(num_cells), 2))
    cell_pairs.extend((n, n) for n in range(num_cells))
    # map each cell pair to a vector index
    pairs_to_idxs = {pair: idx for idx, pair in enumerate(cell_pairs)}
    bigram_dim = len(pairs_to_idxs)

    def _linear_cell_index(lonlat):
        """Return the flattened grid index of the cell containing a point."""
        lon, lat = lonlat[0], lonlat[1]
        row_idx = find_cell_index(rows, lat)
        # Longitude must be looked up against the column boundaries (it was
        # previously looked up against the row boundaries).
        col_idx = find_cell_index(columns, lon)
        # NOTE(review): the row stride is len(columns); confirm that this
        # equals the number of cells per row in cell_names.
        return row_idx * len(columns) + col_idx

    features = []
    for _, row in data_df.iterrows():
        bow_vector = [0] * bigram_dim
        # NOTE(security): eval executes arbitrary code; trusted input only.
        train_points = eval(row[points_header])

        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")
        # loop over consecutive point pairs
        for i in range(len(train_lonlats) - 1):
            linear_idx1 = _linear_cell_index(train_lonlats[i])
            linear_idx2 = _linear_cell_index(train_lonlats[i + 1])
            stuple = tuple(sorted((linear_idx1, linear_idx2)))
            bow_vector[pairs_to_idxs[stuple]] += 1
        features.append(bow_vector)

    # Write back by index label rather than by position, so frames whose
    # index is not 0..n-1 are updated instead of silently enlarged.
    for label, feats in zip(data_df.index, features):
        data_df.at[label, points_header] = feats
    if output_file is not None:
        data_df.to_csv(output_file)
    else:
        return features
예제 #10
0
def calculate_dists(test_lonlat, train_df, ret_container=None, paropts=None):
    """Compute the DTW distance from the test trip to every training trip.

    Each row's result is stored at ``container[index]``, where ``index`` is
    the row's DataFrame index label, as the tuple
    ``(tripId as int, distance, journeyId, evaluated points)``.

    Args:
        test_lonlat: Test trip as lon/lat tuples.
        train_df: Training trips with ``"points"``, ``"journeyId"`` and
            ``"tripId"`` columns.
        ret_container: Optional pre-allocated container to fill; when
            ``None`` a list of ``len(train_df)`` placeholders (``-1``) is
            created.
        paropts: Forwarded to ``calculate_dynamic_time_warping``.

    Returns:
        The filled container.
    """
    if ret_container is None:
        results = [-1] * len(train_df)
    else:
        results = ret_container

    for label, trip in train_df.iterrows():
        raw_points = trip["points"]
        journey_id = trip["journeyId"]
        # NOTE(security): eval executes arbitrary code; trusted input only.
        parsed_points = eval(raw_points)
        trip_lonlat = utils.idx_to_lonlat(parsed_points, format="tuples")
        dtw_distance = calculate_dynamic_time_warping(test_lonlat,
                                                      trip_lonlat, paropts)
        trip_id = int(trip["tripId"])
        results[label] = (trip_id, dtw_distance, journey_id, parsed_points)
    return results
예제 #11
0
def map_to_features_pointwise(data_df, grid):
    """Map every point of every trajectory to the name of its grid cell.

    Cell names are prefixed with ``'C'``. While mapping, a per-cell
    histogram is collected and printed at the end.

    Args:
        data_df: DataFrame with a ``"points"`` (or, failing that,
            ``"Trajectory"``) column holding the string representation of a
            point list.
        grid: ``(rows, columns, cell_names)`` triple; ``cell_names`` is
            indexed as ``cell_names[row_idx][col_idx]``.

    Returns:
        ``(features, timestamps)``: per trajectory, the list of cell names
        and the list of the first element of every point.
    """
    rows, columns, cell_names = grid
    # measure some statistics
    numcells = (len(rows) + 1) * (len(columns) + 1)
    grid_hist = {'C' + str(c): 0 for cc in cell_names for c in cc}
    total_points = 0

    points_header = "points" if "points" in data_df else "Trajectory"
    features, timestamps = [], []
    for _, row in data_df.iterrows():
        # NOTE(security): eval executes arbitrary code; trusted input only.
        train_points = eval(row[points_header])

        timestamps.append([p[0] for p in train_points])
        train_lonlats = utils.idx_to_lonlat(train_points, format="tuples")
        feature_list = []
        for lonlat in train_lonlats:
            # longitude selects the column, latitude selects the row
            row_idx = find_cell_index(rows, lonlat[1])
            col_idx = find_cell_index(columns, lonlat[0])
            # str() keeps the key consistent with how grid_hist was built,
            # so non-string cell names work as well
            cell_name = 'C' + str(cell_names[row_idx][col_idx])
            grid_hist[cell_name] += 1
            feature_list.append(cell_name)
        total_points += len(feature_list)
        features.append(feature_list)

    # show stats
    print()
    print("Grid assignment frequencies of the total of %d points:" %
          total_points)
    for i, (name, count) in enumerate(grid_hist.items()):
        print(i, "/", numcells, name, count)
    return features, timestamps
예제 #12
0
def exec_with_processes(df, process_num, test_lonlat, k):
    """Compute the best LCSS matches using a pool of ``process_num`` workers.

    All rows are submitted to the pool first and the results are collected
    afterwards in row order, so the workers can actually run concurrently
    (previously every task was awaited right after submission, which made
    the loop sequential). Each task measures its own ``calc_lcss`` time.

    Args:
        df: Training trips; each row needs a ``"points"`` column holding the
            string representation of a point list.
        process_num: Number of pool workers.
        test_lonlat: Test trip as lon/lat tuples.
        k: Forwarded to ``update_current_maxsubseq``.

    Returns:
        The list maintained by ``update_current_maxsubseq``.
    """

    def _timed_lcss(train_lonlat):
        """Run calc_lcss and return (subseqs_idx, elapsed)."""
        timestart = utils.tic()
        _, subseqs_idx = calc_lcss(test_lonlat, train_lonlat)
        return subseqs_idx, utils.tictoc(timestart)

    max_subseqs = []
    # NOTE: despite the function name this is a ThreadPool, which is why a
    # local closure can be submitted as the task.
    pool = ThreadPool(processes=process_num)
    try:
        pending = []
        for _, row in df.iterrows():
            # NOTE(security): eval executes arbitrary code; trusted input only.
            train_points = eval(row["points"])
            train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")
            pending.append(
                (row, pool.apply_async(_timed_lcss, (train_lonlat,))))

        for row, async_result in pending:
            subseqs_idx, elapsed = async_result.get()
            # sort by decreasing length
            subseqs_idx = sorted(subseqs_idx, key=len, reverse=True)
            # update the list of the longest subsequences
            max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx,
                                                   k, elapsed, row)
        print("Got %d common subsequences" % len(max_subseqs))
    finally:
        # release the workers even if a task raised
        pool.close()
        pool.join()
    return max_subseqs
예제 #13
0
def preprocessing_for_visualisation(test_points, max_subseqs, file_name,
                                    index):
    """Build the point, colour and label lists for plotting matched subroutes.

    The first drawn sequence is the test trip (blue). For every entry of
    ``max_subseqs`` the neighbour trajectory is split into segments: matched
    index lists are coloured red (``'r'``) and the stretches before and
    between them blue (``'b'``). Everything is passed to
    ``utils.visualize_point_sequences``.

    Args:
        test_points: Points of the test trip, passed to
            ``utils.get_lonlat_tuple``.
        max_subseqs: Entries indexable as ``entry[0]`` (list of index lists),
            ``entry[1]`` (shown as "Delta-t") and ``entry[2]`` (row with
            ``"journeyId"`` and ``"points"``).
        file_name: Output file forwarded to the visualiser.
        index: Identifier shown in the test trip's label.
    """
    # initialize to the test trip data
    labels = ["test trip: %s" % index]  # test jid
    points = [[utils.get_lonlat_tuple(test_points)]]
    colors = [['b']]

    for j, sseq in enumerate(max_subseqs):
        cols, print_idxs, pts = [], [], []
        # trip jid
        jid = sseq[2]["journeyId"]
        # Empty index lists carry no points; dropping them up front avoids
        # indexing into them below.
        subseq_idxs = [idxs for idxs in sseq[0] if idxs]
        num_points = sum(len(idxs) for idxs in subseq_idxs)
        # label
        label_lines = [
            "neighbour %d" % j,
            "jid: %s" % jid,
            "Matching pts: %d" % num_points,
            "Delta-t: %s " % sseq[1]
        ]
        labels.append("\n".join(label_lines))

        # get the points data from the pandas dataframe to lonlat tuples
        # NOTE(security): eval executes arbitrary code; trusted input only.
        train_points = sseq[2]["points"]
        train_points = utils.idx_to_lonlat(eval(train_points), format='tuples')
        last_point_idx = len(train_points) - 1

        # prepend blue, if the first match does not start at the first point;
        # the blue stretch runs up to and including the first matched index
        if subseq_idxs and subseq_idxs[0][0] > 0:
            print_idxs.append(list(range(subseq_idxs[0][0] + 1)))
            cols.append('b')

        # for each sequence, make the matching red and the following blue, if any
        for seq_idx, idxs in enumerate(subseq_idxs):
            # the match
            print_idxs.append(idxs)
            cols.append('r')

            # no following blue portion if the match ends at the trip's end
            if idxs[-1] == last_point_idx:
                continue
            # else, either up to first point of next subsequence, or last point in row
            if seq_idx == len(subseq_idxs) - 1:
                next_seq_first_pt = last_point_idx
            else:
                next_seq_first_pt = subseq_idxs[seq_idx + 1][0]
            # blue it up; the range is empty when the next match starts
            # before this one ends, and (as before) a stretch starting at
            # index 0 is skipped
            gap = list(range(idxs[-1], next_seq_first_pt + 1))
            if gap and gap[0]:
                print_idxs.append(gap)
                cols.append('b')

        # append the points corresponding to the indexes
        for idx_list in print_idxs:
            pts.append(
                utils.get_lonlat_tuple([train_points[p] for p in idx_list]))

        # add to the list of points to draw
        points.append(pts)
        colors.append(cols)
    # send the whole parameter bundle to be drawn
    utils.visualize_point_sequences(points, colors, labels, file_name)