def build_flight_trajectory_df(flights_to_airport, label_encoder, flight_ids,
                               max_flights=1000, epsilon=None):
    """
    Build a data-frame holding the encoded flight ID, the raw flight ID and
    the trajectory coordinates of each selected flight.

    Args:
        flights_to_airport (pd.DataFrame): rows with at least the columns
            'Flight_ID', 'DRemains', 'Latitude' and 'Longitude'.
        label_encoder (LabelEncoder): object whose ``transform`` maps a list
            of flight IDs to their encoded values.
        flight_ids (list[str]): flight IDs to extract; only the first
            ``max_flights`` entries are used.
        max_flights (int): upper bound on the number of extracted flights.
        epsilon (float | None): when truthy, every trajectory stored in the
            data-frame is passed through ``simplify_coordinator`` with this
            value. The returned dictionary always keeps the unsimplified
            points.

    Returns:
        tuple: ``(flight_df, flight_dicts)``
            flight_df (pd.DataFrame): columns 'idx' (encoded ID),
                'flight_id' and 'trajectory'.
            flight_dicts (dict): encoded ID -> array of
                [Latitude, Longitude] rows ordered by descending 'DRemains'.
    """
    selected_ids = flight_ids[:max_flights]

    # Split the frame once instead of re-scanning every row per flight ID.
    # groupby keeps the original row order inside each group, so the input
    # handed to sort_values below is the same as with a boolean mask.
    wanted = set(selected_ids)
    rows_by_flight = {
        fid: rows
        for fid, rows in flights_to_airport.groupby('Flight_ID', sort=False)
        if fid in wanted
    }
    # A requested ID without any row yields an empty trajectory.
    no_rows = flights_to_airport.iloc[0:0]

    encoded_idx = []
    trajectories = []
    flight_dicts = {}
    for fid in selected_ids:
        rows = rows_by_flight.get(fid, no_rows)
        rows = rows.sort_values(by='DRemains', ascending=False)
        encode_id = label_encoder.transform([fid])[0]
        encoded_idx.append(encode_id)
        coords = rows[['Latitude', 'Longitude']].values
        # The dictionary is filled before simplification on purpose: it
        # keeps the full-resolution points.
        flight_dicts[encode_id] = coords
        if epsilon:
            coords = simplify_coordinator(coords, epsilon=epsilon)
        trajectories.append(coords)

    flight_df = pd.DataFrame()
    flight_df['idx'] = encoded_idx
    flight_df['flight_id'] = selected_ids
    flight_df['trajectory'] = trajectories
    print("Total extracted flights %s" % len(flight_df))
    return flight_df, flight_dicts
def main(input_path, airport_code, distance, min_sample, max_flights, min_dr,
         max_dr, epsilon=0.001):
    """
    Cluster the trajectories of flights heading to one airport over a fixed
    list of radii and export the labels (CSV) and one plot per radius (PNG).

    Args:
        input_path (str): path of the CSV file read with ``pd.read_csv``.
        airport_code (str): airport code forwarded to ``filter_by_airport``.
        distance (str): distance type forwarded to
            ``build_matrix_distances`` as ``dist_type``.
        min_sample (int): forwarded to ``cluster_trajectories`` as
            ``min_samples``.
        max_flights (int): only the first ``max_flights`` flight IDs are used.
        min_dr (float): lower bound forwarded to ``filter_by_airport``.
        max_dr (float): upper bound forwarded to ``filter_by_airport``.
        epsilon (float): tolerance forwarded to ``simplify_coordinator`` and
            ``build_flight_trajectory_df``.

    Returns:
        None. Results are written under ``../tmp/``.
    """
    run_stamp = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_")
    logger.info("=============================================")
    logger.info(
        "================ DATETIME {} ================".format(run_stamp))

    raw_df = pd.read_csv(input_path)
    logger.info(raw_df.head())
    # base name of the input file, reused in every output file name
    file_name = input_path.split("/")[-1].replace(".csv", "")

    airport_filter = dict(df=raw_df, airport_code=airport_code,
                          min_dr=min_dr, max_dr=max_dr)
    flights_to_airport = filter_by_airport(**airport_filter)

    logger.info("Encoding flight ID ...")
    flight_ids = flights_to_airport['Flight_ID'].unique().tolist()
    logger.info("Total # flight ID {}".format(len(flight_ids)))
    flight_encoder = flight_id_encoder(flight_ids)

    logger.info(
        "Extracting trajectory coordinators and flight id from dataset")
    flight_df, flight_dicts = build_flight_trajectory_df(
        flights_to_airport=flights_to_airport,
        label_encoder=flight_encoder,
        flight_ids=flight_ids,
        max_flights=max_flights,
        epsilon=epsilon)

    # data-frame used to detect entrance points toward the airport
    entrance_to_airport = filter_by_airport(**airport_filter)

    def ordered_lat_lon(flight_id):
        """Return the [Latitude, Longitude] rows of one flight ordered by
        descending 'DRemains'."""
        rows = entrance_to_airport[
            entrance_to_airport['Flight_ID'] == flight_id]
        rows = rows.sort_values(by='DRemains', ascending=False)
        return rows[['Latitude', 'Longitude']].values

    entrance_trajectories = [
        ordered_lat_lon(fid) for fid in flight_ids[:max_flights]
    ]
    simplified_coords = []
    for curve in entrance_trajectories:
        simplified_coords.append(
            simplify_coordinator(coord_curve=curve, epsilon=epsilon))

    # result table: one row per flight, one column per tested radius
    clusters_df = pd.DataFrame()
    clusters_df['Flight_ID'] = flight_encoder.inverse_transform(
        flight_df['idx'])

    logger.info("Building distance matrix - {} ...".format(distance))
    dist_matrix = build_matrix_distances(coords=simplified_coords,
                                         dist_type=distance)

    # These bounds are only logged; the radii actually tested come from the
    # fixed kilometre list below.
    alpha = 0.001
    first_row = dist_matrix[0, :]
    upper_bound = max(first_row)
    lower_bound = min(first_row)
    step = (upper_bound - lower_bound) * alpha
    logger.info("upper_bound {}, lower_bound {}, step {}".format(
        upper_bound, lower_bound, step))

    eps_list = [km / KM_PER_RADIAN / 10.0 for km in [5, 10, 15, 20]]
    print(eps_list)

    plot_info = {'file_name': file_name, 'airport_code': airport_code}
    for radius in eps_list:
        clusters, labels, silhouette = cluster_trajectories(
            dist_matrix=dist_matrix, epsilon=radius, min_samples=min_sample)

        # one label column per radius, named after the number of distinct
        # labels and the radius
        n_labels = len(set(labels))
        clusters_df['c_{}_eps_{}'.format(n_labels, radius)] = labels

        # export images
        plot_path = "../tmp/{}_{}_dbscan_sil_{}_ms_{}_eps_{}.png".format(
            file_name, airport_code, silhouette, min_sample, radius)
        traffic_flight_plot(flight_ids=flight_df['idx'].tolist(),
                            clusters=labels,
                            flight_dicts=flight_dicts,
                            file_path=plot_path,
                            info=plot_info)

        # stop sweeping once at most two clusters are returned
        if len(clusters) <= 2:
            break

    # export result
    csv_path = "../tmp/{}_{}_ms_{}.csv".format(
        file_name, airport_code, min_sample)
    clusters_df.to_csv(csv_path, index=False)
    logger.info("\n {}".format(clusters_df.head()))
def main(
        input_path,
        airport_code='WSSS',
        max_flights=1000,
        estimated_n_entrance=9,
        threshold=0.6,
        algo='k-means',
        min_dr=1.0,
        max_dr=2.0,
        filter_date='',
        epsilon=0.001
):
    """
    Group flight trajectories toward an airport by the entrance groups they
    pass through, using MinHash/LSH buckets, then score, plot and export the
    resulting clusters.

    Args:
        input_path (str): path of the CSV file read with ``pd.read_csv``.
        airport_code (str): airport code forwarded to ``filter_by_airport``.
        max_flights (int): only the first ``max_flights`` flight IDs are used.
        estimated_n_entrance (int): forwarded to ``detect_entrance_ways``.
        threshold (float): forwarded to ``LSHClusteringLib``.
        algo (str): entrance detection algorithm, 'dbscan' or 'k-means'.
        min_dr (float): lower bound used when selecting the entrance points;
            the full trajectories are always selected with a lower bound
            of 0.0.
        max_dr (float): upper bound forwarded to ``filter_by_airport``.
        filter_date (str): when not empty, only rows for which
            ``filter_by_date`` returns a truthy value are kept; also part of
            the log file name.
        epsilon (float): tolerance forwarded to ``simplify_coordinator`` and
            ``build_flight_trajectory_df``.

    Returns:
        None. A plot is written to the working directory and a CSV file
        under ``../tmp/``.

    Raises:
        ValueError: if no flight is selected, or if ``algo`` is neither
            'dbscan' nor 'k-means'.
    """
    # load raw-data from csv
    logger = gen_log_file(
        path_to_file='../tmp/lsh_clustering_{}.log'.format(filter_date))
    df = pd.read_csv(input_path)
    file_name = input_path.split("/")[-1].replace(".csv", "")

    if filter_date != '':
        print("before filtering %s" % len(df))
        df['filtered'] = df['Actual_Arrival_Time_(UTC)'].apply(
            lambda x: filter_by_date(datetime=x, filter_date=filter_date)
        )
        df = df[df['filtered']]
        print("after filtering %s" % len(df))

    # filter data by airport code-name
    flights_to_airport = filter_by_airport(
        df=df, airport_code=airport_code,
        min_dr=0.0, max_dr=max_dr
    )
    # prepare data-frame for detect entrance points toward the airport
    entrance_to_airport = filter_by_airport(
        df=df, airport_code=airport_code,
        min_dr=min_dr, max_dr=max_dr
    )

    logger.info("Encoding flight ID ... %s" % airport_code)
    flight_ids = flights_to_airport['Flight_ID'].unique().tolist()
    logger.info("Total # flight ID {}".format(len(flight_ids)))
    flight_encoder = flight_id_encoder(flight_ids)

    flight_df, flight_dicts = build_flight_trajectory_df(
        flights_to_airport=flights_to_airport,
        label_encoder=flight_encoder,
        flight_ids=flight_ids,
        max_flights=max_flights,
        epsilon=epsilon
    )

    # Split the entrance frame once instead of masking every row per flight.
    # groupby keeps the original row order inside each group, so the input
    # handed to sort_values is the same as with a boolean mask.
    selected_ids = flight_ids[:max_flights]
    wanted = set(selected_ids)
    rows_by_flight = {
        fid: rows
        for fid, rows in entrance_to_airport.groupby('Flight_ID', sort=False)
        if fid in wanted
    }
    # A flight without any row in the entrance range gets an empty trajectory.
    no_rows = entrance_to_airport.iloc[0:0]
    entrance_trajectories = []
    for fid in selected_ids:
        rows = rows_by_flight.get(fid, no_rows)
        rows = rows.sort_values(by='DRemains', ascending=False)
        entrance_trajectories.append(rows[['Latitude', 'Longitude']].values)
    total_original_points = sum(len(traj) for traj in entrance_trajectories)

    simplified_coords = [
        simplify_coordinator(coord_curve=curve, epsilon=epsilon)
        for curve in entrance_trajectories
    ]
    logger.info("Total original points at entrance %s" % total_original_points)

    if not simplified_coords:
        raise ValueError(
            "No flight selected for airport {}: nothing to cluster".format(
                airport_code)
        )
    # One concatenation instead of re-copying the accumulated array per flight.
    point_coords = np.concatenate(simplified_coords)
    logger.info("Total points at entrance %s" % len(point_coords))

    detect_entrance_algo = algo
    reduced_groups, classifier = detect_entrance_ways(
        point_coords=point_coords,
        algorithm=detect_entrance_algo,
        estimated_n_entrance=estimated_n_entrance
    )

    # we trick each group label as a term, then each trajectory will contains
    # list of terms/tokens
    if detect_entrance_algo == 'dbscan':
        # NOTE(review): fit_predict is run separately on every trajectory
        # here, while the k-means branch only predicts - confirm intended.
        flight_df['groups'] = [
            classifier.fit_predict(X=coord) for coord in entrance_trajectories
        ]
    elif detect_entrance_algo == 'k-means':
        entrance_groups = []
        for traj in entrance_trajectories:
            if len(traj) > 1:
                entrance_groups.append(classifier.predict(X=traj))
            else:
                # trajectories with fewer than two points get a placeholder
                entrance_groups.append([-1])
        flight_df['groups'] = entrance_groups
    else:
        # Without this branch the missing 'groups' column only showed up as
        # a KeyError on the next statement.
        raise ValueError(
            "Unsupported entrance detection algorithm: {!r}".format(
                detect_entrance_algo)
        )

    # convert clustering number to group label,
    flight_df['groups'] = flight_df['groups'].apply(
        lambda clusters: ["G{}".format(c) for c in clusters])

    # Now we will apply Jaccard similarity and LSH for theses trajectories
    lsh_clustering = LSHClusteringLib(
        threshold=threshold,
        num_perm=128
    )
    flight_df['hash'] = lsh_clustering.compute_min_hash_lsh_over_data(
        record_ids=flight_df['idx'].tolist(),
        data=flight_df['groups'].tolist()
    )

    flight_df['duplicated'] = flight_df['hash'].apply(
        lambda x: lsh_clustering.query_duplicated_record(x)
    )

    flight_df['buckets'] = flight_df['duplicated'].apply(
        lambda x: '_'.join(x)
    )
    unique_buckets = flight_df['buckets'].unique().tolist()
    logger.info("number buckets %s" % len(unique_buckets))
    bucket_sizes = flight_df.groupby('buckets').size()
    logger.info(len(bucket_sizes))
    n_curve_per_bucket = bucket_sizes.to_dict()

    # Cluster number = position of the bucket in order of first appearance;
    # a dict gives the same answer as list.index without the linear scan.
    cluster_of_bucket = {
        bucket: position for position, bucket in enumerate(unique_buckets)
    }
    total_flights = len(flight_df)
    cluster_labels = []
    for bucket in flight_df['buckets'].tolist():
        share = n_curve_per_bucket[bucket] * 100.0 / total_flights
        # buckets holding at most 5% of the flights are labelled -1
        cluster_labels.append(-1 if share <= 5.0 else cluster_of_bucket[bucket])
    flight_df['cluster'] = cluster_labels

    kept_clusters = flight_df[flight_df['cluster'] != -1]['cluster'].unique()
    logger.info("Non-outlier cluster number %s" % len(kept_clusters.tolist()))
    logger.info(kept_clusters)
    n_curve_per_cluster = flight_df.groupby('cluster').size()
    logger.info(n_curve_per_cluster)

    # evaluation
    dist_matrix = build_matrix_distances(
        coords=flight_df['trajectory'].tolist(),
        dist_type='directed_hausdorff'
    )
    silhouette_val = compute_silhouette_score(
        feature_matrix=dist_matrix, labels=cluster_labels
    )
    logger.info("Silhouette Coefficient via LSH %s" % silhouette_val)

    # The plot and the CSV share the same base name.
    output_name = "{file_name}_{airport_code}_lsh_{threshold}_{algo}_{n_entrance}_dr_{dr_range}_sil_{silhoette}.png".format(
        file_name=file_name,
        airport_code="{}_{}_flights".format(airport_code, len(flight_df)),
        threshold=threshold,
        algo=detect_entrance_algo,
        n_entrance=estimated_n_entrance,
        dr_range="{}_{}".format(min_dr, max_dr),
        silhoette=silhouette_val
    )
    traffic_flight_plot(
        flight_ids=flight_df['idx'].tolist(),
        clusters=cluster_labels,
        flight_dicts=flight_dicts,
        file_path=output_name,
        group_clusters=reduced_groups,
        info={'file_name': file_name, 'airport_code': airport_code}
    )
    # NOTE(review): the CSV name ends in ".png.csv" because the base name
    # carries the plot extension; kept unchanged so existing consumers of
    # the file still find it - confirm before renaming.
    flight_df[['flight_id', 'buckets', 'cluster']].to_csv(
        "../tmp/{}.csv".format(output_name), index=False
    )