示例#1
0
def build_flight_trajectory_df(flights_to_airport,
                               label_encoder,
                               flight_ids,
                               max_flights=1000,
                               epsilon=None):
    """
    Build a data-frame holding the encoded ID, the raw flight ID and the
    trajectory coordinates of (at most) the first ``max_flights`` flights.

    Args:
        flights_to_airport (pd.DataFrame): track points; the columns
            'Flight_ID', 'DRemains', 'Latitude' and 'Longitude' are read.
        label_encoder (LabelEncoder): encoder whose ``transform`` maps a list
            of flight IDs to their encoded values.
        flight_ids (list[str]): flight IDs to extract, in output order.
        max_flights (int): only ``flight_ids[:max_flights]`` is processed.
        epsilon (float|None): when truthy, every trajectory is passed through
            ``simplify_coordinator`` with this value; otherwise the raw points
            are kept.

    Returns:
        tuple: ``(flight_df, flight_dicts)``.
            ``flight_df`` (pd.DataFrame) has one row per flight with the
            columns 'idx' (encoded ID), 'flight_id' and 'trajectory'.
            ``flight_dicts`` (dict) maps each encoded ID to the *unsimplified*
            array of [Latitude, Longitude] rows of that flight.
    """
    selected_ids = flight_ids[:max_flights]
    # Encode the whole batch once instead of one transform() call per flight.
    encoded_idx = list(label_encoder.transform(selected_ids))

    trajectories = []
    flight_dicts = {}
    for fid, encode_id in zip(selected_ids, encoded_idx):
        df_min = flights_to_airport[flights_to_airport['Flight_ID'] == fid]
        # Largest 'DRemains' first.
        df_min = df_min.sort_values(by='DRemains', ascending=False)
        coords = df_min[['Latitude', 'Longitude']].values
        # The dict always keeps the raw points; only the data-frame column
        # receives the simplified curve.
        flight_dicts[encode_id] = coords
        if epsilon:
            coords = simplify_coordinator(coords, epsilon=epsilon)
        trajectories.append(coords)

    flight_df = pd.DataFrame()
    flight_df['idx'] = encoded_idx
    flight_df['flight_id'] = selected_ids
    flight_df['trajectory'] = trajectories
    print("Total extracted flights %s" % len(flight_df))

    return flight_df, flight_dicts
示例#2
0
def main(input_path,
         airport_code,
         distance,
         min_sample,
         max_flights,
         min_dr,
         max_dr,
         epsilon=0.001):
    """
    Cluster the trajectories of flights towards one airport over a
    pre-computed distance matrix, trying several neighbourhood radii.

    For every candidate radius the labels returned by
    ``cluster_trajectories`` are stored as one column of the result table and
    a plot is produced through ``traffic_flight_plot``. The search stops early
    once a run yields two clusters or fewer. The table is finally written to
    ``../tmp/<file>_<airport>_ms_<min_sample>.csv``.

    Args:
        input_path (str): CSV file loaded with ``pd.read_csv``; its base name
            (without ".csv") is reused in the output file names.
        airport_code (str): forwarded to ``filter_by_airport``.
        distance (str): forwarded to ``build_matrix_distances`` as
            ``dist_type``.
        min_sample (int): forwarded to ``cluster_trajectories`` as
            ``min_samples``.
        max_flights (int): only the first ``max_flights`` flight IDs are used.
        min_dr: forwarded to ``filter_by_airport``.
        max_dr: forwarded to ``filter_by_airport``.
        epsilon (float): tolerance handed to ``simplify_coordinator``. It is
            NOT the clustering radius; the radii come from the fixed list
            built below.

    Returns:
        None
    """
    history = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_")
    logger.info("=============================================")
    logger.info(
        "================ DATETIME {} ================".format(history))
    df = pd.read_csv(input_path)
    logger.info(df.head())
    file_name = input_path.split("/")[-1].replace(".csv", "")

    flights_to_airport = filter_by_airport(df=df,
                                           airport_code=airport_code,
                                           min_dr=min_dr,
                                           max_dr=max_dr)

    logger.info("Encoding flight ID ...")
    flight_ids = flights_to_airport['Flight_ID'].unique().tolist()
    logger.info("Total # flight ID {}".format(len(flight_ids)))
    flight_encoder = flight_id_encoder(flight_ids)

    logger.info(
        "Extracting trajectory coordinators and flight id from dataset")
    flight_df, flight_dicts = build_flight_trajectory_df(
        flights_to_airport=flights_to_airport,
        label_encoder=flight_encoder,
        flight_ids=flight_ids,
        max_flights=max_flights,
        epsilon=epsilon)

    # The entrance segment used to be obtained from a second
    # filter_by_airport() call with exactly the same arguments, so the
    # already filtered frame is reused here.
    # NOTE(review): assumes filter_by_airport is deterministic and does not
    # depend on call order - confirm.
    entrance_trajectories = []
    for fid in flight_ids[:max_flights]:
        tmp_df = flights_to_airport[flights_to_airport['Flight_ID'] == fid]
        tmp_df = tmp_df.sort_values(by='DRemains', ascending=False)
        entrance_trajectories.append(tmp_df[['Latitude', 'Longitude']].values)
    simplified_coords = [
        simplify_coordinator(coord_curve=curve, epsilon=epsilon)
        for curve in entrance_trajectories
    ]

    # result table: one row per flight, one column per clustering run
    clusters_df = pd.DataFrame()
    clusters_df['Flight_ID'] = flight_encoder.inverse_transform(
        flight_df['idx'])

    logger.info("Building distance matrix - {} ...".format(distance))
    dist_matrix = build_matrix_distances(coords=simplified_coords,
                                         dist_type=distance)

    # The spread of the first matrix row is reported for reference only; the
    # candidate radii below are a fixed list and do not use `step`.
    alpha = 0.001
    upper_bound = max(dist_matrix[0, :])
    lower_bound = min(dist_matrix[0, :])
    step = (upper_bound - lower_bound) * alpha
    logger.info("upper_bound {}, lower_bound {}, step {}".format(
        upper_bound, lower_bound, step))
    eps_list = [max_km / KM_PER_RADIAN / 10.0 for max_km in [5, 10, 15, 20]]
    print(eps_list)

    for eps in eps_list:
        # `eps` is the clustering radius. The `epsilon` argument is left
        # untouched so the simplification tolerance is never confused with it.
        clusters, labels, silhouette = cluster_trajectories(
            dist_matrix=dist_matrix, epsilon=eps, min_samples=min_sample)

        # column name records the number of distinct labels and the radius
        clusters_df['c_{}_eps_{}'.format(len(set(labels)), eps)] = labels

        # export images
        result_file_name = "../tmp/{}_{}_dbscan_sil_{}_ms_{}_eps_{}.png".format(
            file_name, airport_code, silhouette, min_sample, eps)
        traffic_flight_plot(flight_ids=flight_df['idx'].tolist(),
                            clusters=labels,
                            flight_dicts=flight_dicts,
                            file_path=result_file_name,
                            info={
                                'file_name': file_name,
                                'airport_code': airport_code
                            })
        # stop once a run yields two clusters or fewer
        if len(clusters) <= 2:
            break

    # export result
    clusters_df.to_csv("../tmp/{}_{}_ms_{}.csv".format(file_name, airport_code,
                                                       min_sample),
                       index=False)
    logger.info("\n {}".format(clusters_df.head()))
示例#3
0
def main(
        input_path,
        airport_code='WSSS',
        max_flights=1000,
        estimated_n_entrance=9,
        threshold=0.6,
        algo='k-means',
        min_dr=1.0,
        max_dr=2.0,
        filter_date='',
        epsilon=0.001
):
    """
    Cluster flight trajectories towards an airport with entrance groups + LSH.

    Steps:
        1. load the CSV and optionally keep only rows accepted by
           ``filter_by_date``;
        2. extract the trajectories (``min_dr=0.0``) and the entrance
           segments (``min_dr=min_dr``) through ``filter_by_airport``;
        3. group all simplified entrance points with ``detect_entrance_ways``;
        4. describe each flight by the list of group labels ("G<n>") of its
           entrance points and bucket the flights with ``LSHClusteringLib``;
        5. turn buckets into cluster numbers (buckets holding 5% of the
           flights or less become outliers, label -1), compute a silhouette
           score, plot the result and write a CSV to ``../tmp``.

    Args:
        input_path (str): CSV file loaded with ``pd.read_csv``; its base name
            (without ".csv") is reused in the output file names.
        airport_code (str): forwarded to ``filter_by_airport``.
        max_flights (int): only the first ``max_flights`` flight IDs are used.
        estimated_n_entrance (int): forwarded to ``detect_entrance_ways``.
        threshold (float): forwarded to ``LSHClusteringLib``.
        algo (str): entrance detection algorithm, 'k-means' or 'dbscan'.
        min_dr: lower bound forwarded to ``filter_by_airport`` for the
            entrance segments.
        max_dr: upper bound forwarded to ``filter_by_airport``.
        filter_date (str): when non-empty, rows are filtered on the
            'Actual_Arrival_Time_(UTC)' column via ``filter_by_date``; it is
            also part of the log file name.
        epsilon (float): tolerance handed to ``simplify_coordinator``.

    Returns:
        None

    Raises:
        ValueError: if ``algo`` is neither 'dbscan' nor 'k-means'.
    """
    # load raw-data from csv
    logger = gen_log_file(path_to_file='../tmp/lsh_clustering_{}.log'.format(filter_date))
    df = pd.read_csv(input_path)
    file_name = input_path.split("/")[-1].replace(".csv", "")

    if filter_date != '':
        print("before filtering %s" % len(df))
        df['filtered'] = df['Actual_Arrival_Time_(UTC)'].apply(
            lambda x: filter_by_date(datetime=x, filter_date=filter_date)
        )
        df = df[df['filtered']]
        print("after filtering %s" % len(df))

    # trajectories used for plotting/evaluation start from min_dr=0.0
    flights_to_airport = filter_by_airport(
        df=df,
        airport_code=airport_code,
        min_dr=0.0,
        max_dr=max_dr
    )

    # segment used to detect the entrance points toward the airport
    entrance_to_airport = filter_by_airport(
        df=df,
        airport_code=airport_code,
        min_dr=min_dr,
        max_dr=max_dr
    )

    logger.info("Encoding flight ID ... %s" % airport_code)
    flight_ids = flights_to_airport['Flight_ID'].unique().tolist()
    logger.info("Total # flight ID {}".format(len(flight_ids)))
    flight_encoder = flight_id_encoder(flight_ids)

    flight_df, flight_dicts = build_flight_trajectory_df(
        flights_to_airport=flights_to_airport,
        label_encoder=flight_encoder,
        flight_ids=flight_ids,
        max_flights=max_flights,
        epsilon=epsilon
    )

    entrance_trajectories = []
    total_original_points = 0
    for fid in flight_ids[:max_flights]:
        tmp_df = entrance_to_airport[entrance_to_airport['Flight_ID'] == fid]
        tmp_df = tmp_df.sort_values(by='DRemains', ascending=False)
        lat_lon_values = tmp_df[['Latitude', 'Longitude']].values
        total_original_points += len(lat_lon_values)
        entrance_trajectories.append(lat_lon_values)

    simplified_coords = [
        simplify_coordinator(coord_curve=curve, epsilon=epsilon)
        for curve in entrance_trajectories
    ]

    logger.info("Total original points at entrance %s" % total_original_points)
    # Stack every simplified curve in a single call; concatenating pairwise
    # in a loop re-copies the accumulated array on each iteration.
    point_coords = np.concatenate(simplified_coords)
    logger.info("Total points at entrance %s" % len(point_coords))

    detect_entrance_algo = algo
    reduced_groups, classifier = detect_entrance_ways(
        point_coords=point_coords,
        algorithm=detect_entrance_algo,
        estimated_n_entrance=estimated_n_entrance
    )

    # Each group label is treated as a term, so every trajectory becomes a
    # list of terms/tokens.
    if detect_entrance_algo == 'dbscan':
        flight_df['groups'] = [classifier.fit_predict(X=coord)
                               for coord in entrance_trajectories]
    elif detect_entrance_algo == 'k-means':
        # trajectories with at most one entrance point get the placeholder
        # group -1 instead of a prediction
        flight_df['groups'] = [
            classifier.predict(X=traj) if len(traj) > 1 else [-1]
            for traj in entrance_trajectories
        ]
    else:
        # Without this guard an unknown value only fails later with an
        # unrelated-looking KeyError on the missing 'groups' column.
        raise ValueError(
            "Unsupported algo %r, expected 'dbscan' or 'k-means'"
            % (detect_entrance_algo,)
        )

    # convert clustering number to group label
    flight_df['groups'] = flight_df['groups'].apply(
        lambda clusters: ["G{}".format(c) for c in clusters])

    # hash the token lists and look up near-duplicate trajectories via LSH
    lsh_clustering = LSHClusteringLib(
        threshold=threshold,
        num_perm=128
    )
    flight_df['hash'] = lsh_clustering.compute_min_hash_lsh_over_data(
        record_ids=flight_df['idx'].tolist(),
        data=flight_df['groups'].tolist()
    )

    flight_df['duplicated'] = flight_df['hash'].apply(
        lambda x: lsh_clustering.query_duplicated_record(x)
    )

    # the joined list of duplicated records acts as the bucket key
    flight_df['buckets'] = flight_df['duplicated'].apply(
        lambda x: '_'.join(x)
    )
    unique_buckets = flight_df['buckets'].unique().tolist()
    logger.info("number buckets %s" % len(unique_buckets))
    bucket_sizes = flight_df.groupby('buckets').size()
    logger.info(len(bucket_sizes))
    n_curve_per_bucket = bucket_sizes.to_dict()

    # Cluster number = position of the bucket in order of first appearance;
    # a dict lookup replaces the per-row linear list.index() scan.
    bucket_position = {
        bucket: position for position, bucket in enumerate(unique_buckets)
    }
    total_flights = len(flight_df)
    outlier_percentage = 5.0  # buckets at or below this share are outliers
    cluster_labels = [
        -1
        if n_curve_per_bucket[bucket] * 100.0 / total_flights <= outlier_percentage
        else bucket_position[bucket]
        for bucket in flight_df['buckets'].tolist()
    ]
    flight_df['cluster'] = cluster_labels

    non_outlier_clusters = flight_df[flight_df['cluster'] != -1]['cluster'].unique()
    logger.info("Non-outlier cluster number %s" %
                len(non_outlier_clusters.tolist()))
    logger.info(non_outlier_clusters)
    n_curve_per_cluster = flight_df.groupby('cluster').size()
    logger.info(n_curve_per_cluster)

    # evaluation
    dist_matrix = build_matrix_distances(
        coords=flight_df['trajectory'].tolist(),
        dist_type='directed_hausdorff'
    )
    silhouette_val = compute_silhouette_score(
        feature_matrix=dist_matrix, labels=cluster_labels
    )
    logger.info("Silhouette Coefficient via LSH %s" % silhouette_val)

    # shared stem of the plot and CSV file names
    output_stem = (
        "{file_name}_{airport_code}_lsh_{threshold}_{algo}_{n_entrance}"
        "_dr_{dr_range}_sil_{silhoette}"
    ).format(
        file_name=file_name,
        airport_code="{}_{}_flights".format(airport_code, len(flight_df)),
        threshold=threshold,
        algo=detect_entrance_algo,
        n_entrance=estimated_n_entrance,
        dr_range="{}_{}".format(min_dr, max_dr),
        silhoette=silhouette_val
    )

    # NOTE(review): unlike the CSV below, the plot path carries no "../tmp/"
    # prefix - confirm where traffic_flight_plot is expected to write.
    plot_file_name = output_stem + ".png"
    traffic_flight_plot(
        flight_ids=flight_df['idx'].tolist(),
        clusters=cluster_labels,
        flight_dicts=flight_dicts,
        file_path=plot_file_name,
        group_clusters=reduced_groups,
        info={'file_name': file_name, 'airport_code': airport_code}
    )

    # The CSV name previously reused the ".png" plot name and ended up as
    # "<stem>.png.csv"; it is now "<stem>.csv".
    flight_df[['flight_id', 'buckets', 'cluster']].to_csv(
        "../tmp/{}.csv".format(output_stem), index=False)