def merge_cpr_fr24_data(date, *, max_speed=DEFAULT_MAX_SPEED, distance_accuracy=DEFAULT_DISTANCE_ACCURACY):
    """
    Match, merge and clean refined CPR and FR24 ADS-B data for the given date.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16
    max_speed: string
        The maximum ground speed permitted between adjacent positions
        [Knots], default: 750 Knots.
    distance_accuracy: string
        The maximum distance between positions at the same time
        [Nautical Miles], default: 0.25 NM.

    Returns
    -------
    True if successful, False otherwise.
    """
    # Reject invalid dates up front
    if not is_valid_iso8601_date(date):
        log.error("The date is not valid: %s", date)
        return False

    os.chdir(REFINED_DIR)
    tasks.match_cpr_adsb_trajectories_on(date)
    tasks.merge_cpr_adsb_trajectories_on(date)
    # Clean the merged positions; the task result is the overall result
    return tasks.clean_raw_positions_data(CPR_FR24, date,
                                          float(max_speed),
                                          float(distance_accuracy))
def convert_airport_ids(flights_filename, airports_filename=DEFAULT_AIRPORTS_FILENAME):
    """
    Convert the IATA airport codes in an FR24 flights file to ICAO codes.

    Reads the flights and airports files, maps the ADEP and ADES columns
    from IATA to ICAO airport codes and writes a new FR24 flights file.

    Parameters
    ----------
    flights_filename: string
        The name of an 'iata_' prefixed FR24 flights file.
    airports_filename: string
        The name of a file mapping IATA to ICAO airport codes.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    # Validate the filename: it must carry the iata_ prefix
    if not os.path.basename(flights_filename).startswith(IATA):
        log.error('File is not an iata_ flights file: %s', flights_filename)
        return errno.EINVAL

    # Extract the date string from the filename and validate it
    flights_date = read_iso8601_date_string(flights_filename)
    if not is_valid_iso8601_date(flights_date):
        log.error('iata fr24 flights file: %s, invalid date: %s',
                  flights_filename, flights_date)
        return errno.EINVAL

    log.info('iata fr24 file: %s', flights_filename)
    log.info('airports file: %s', airports_filename)

    # Read the flights into a pandas DataFrame
    try:
        flights = pd.read_csv(flights_filename, memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', flights_filename)
        return errno.ENOENT
    log.info('flights file read ok')

    # Read the airports into a pandas DataFrame
    try:
        airports = pd.read_csv(airports_filename, memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', airports_filename)
        return errno.ENOENT
    log.info('airports file read ok')

    # Series mapping IATA codes (index) to ICAO codes (values)
    iata_to_icao = airports.set_index('IATA_AP_CODE')['ICAO_AP_CODE']

    # Replace flight IATA airport codes with ICAO airport codes
    for column in ('ADEP', 'ADES'):
        flights[column] = flights[column].replace(iata_to_icao)
    log.info('airport ids converted')

    # Write the converted flights to a new FR24 flights file
    output_filename = create_flights_filename(FR24, flights_date)
    try:
        flights.to_csv(output_filename, index=False)
        log.info('written file: %s', output_filename)
    except EnvironmentError:
        log.error('could not write file: %s', output_filename)
        return errno.EACCES

    log.info('airports conversion complete')
    return 0
def refine_cpr_data(date, *, max_speed=DEFAULT_MAX_SPEED, distance_accuracy=DEFAULT_DISTANCE_ACCURACY):
    """
    Refine a CPR file for the given date.

    Converts the raw CPR file, then cleans the raw positions.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16
    max_speed: string
        The maximum ground speed permitted between adjacent positions
        [Knots], default: 750 Knots.
    distance_accuracy: string
        The maximum distance between positions at the same time
        [Nautical Miles], default: 0.25 NM.

    Returns
    -------
    True if successful, False otherwise.
    """
    if is_valid_iso8601_date(date):
        # Tasks run in the refined data directory
        os.chdir(REFINED_DIR)
        tasks.convert_cpr_file(date)
        # The cleaning task's result is the overall result
        return tasks.clean_raw_positions_data(CPR, date, float(max_speed), float(distance_accuracy))
    else:
        log.error("The date is not valid: %s", date)
        return False
def merge_cpr_fr24_overnight_flights(date):
    """
    Match, merge and clean merged CPR and FR24 ADS-B data for the given date,
    with merged CPR and FR24 ADS-B data for the previous day.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16

    Returns
    -------
    True if successful, False otherwise.
    """
    # Fix: the original fell through (returning None) when the match or
    # merge task failed; now every path returns an explicit boolean.
    # Also removed max_speed/distance_accuracy from the docstring — the
    # function does not take those parameters.
    if not is_valid_iso8601_date(date):
        log.error("The date is not valid: %s", date)
        return False

    os.chdir(REFINED_DIR)
    if not tasks.match_previous_days_flights(date):
        return False
    if not tasks.merge_previous_days_data(date):
        return False
    return tasks.clean_overnight_cpr_fr24_positions(date)
def match_overnight_flights_on_day(
        date, max_time_difference=DEFAULT_MAXIMUM_TIME_DELTA):
    """
    Match flights for the given day with flights for the previous day.

    Fetches the daily merged CPR/FR24 files from the bucket, matches
    overnight flights, then extracts and uploads the overnight data.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16
    max_time_difference:
        The maximum time between the end of the previous flight and the
        start of the next flight for a match,
        default DEFAULT_MAXIMUM_TIME_DELTA.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    if is_valid_iso8601_date(date):
        # get the CPR data from the Google bucket
        log.info(f'Getting data for date: {date}')

        match_flights_files = create_match_overnight_flights_input_filenames(
            date)
        if not get_processed(REFINED_MERGED_DAILY_CPR_FR24,
                             match_flights_files):
            log.error('Flights file not found in daily_cpr_fr24 bucket')
            return errno.ENOENT

        # Match this day's flights against the previous day's
        error_code = match_overnight_flights(match_flights_files,
                                             max_time_difference)
        if error_code:
            return error_code

        # Free memory used by the matching step
        gc.collect()

        # Upload the previous-day matching ids
        prev_ids_filename = create_matching_ids_filename(PREV_DAY, date)
        if not put_processed(REFINED_MERGED_OVERNIGHT_CPR_FR24_IDS,
                             [prev_ids_filename]):
            log.error('Could not write ids to overnight_cpr_fr24/ids bucket')
            return errno.EACCES

        # Fetch positions and events, then extract the overnight data
        # NOTE(review): only files [2:] are fetched here — presumably the
        # first two entries were already downloaded above; confirm.
        extract_data_input_files = create_extract_overnight_data_input_filenames(
            date)
        if not get_processed(REFINED_MERGED_DAILY_CPR_FR24,
                             extract_data_input_files[2:]):
            log.error(
                'Positions or events file not found in daily_cpr_fr24 bucket')
            return errno.ENOENT

        error_code = extract_overnight_data(extract_data_input_files)
        if error_code:
            return error_code

        # Upload the extracted overnight data
        extract_data_output_files = create_extract_overnight_data_output_filenames(
            date)
        if not put_processed(REFINED_MERGED_OVERNIGHT_CPR_FR24,
                             extract_data_output_files):
            log.error('Could not write to overnight_cpr_fr24 bucket')
            return errno.EACCES
    else:
        log.error(f'invalid date: {date}')
        return errno.EINVAL

    return 0
def extract_fleet_data(flights_filename):
    """
    Extract fleet (aircraft registration) data from an FR24 flights file.

    Keeps flights with a non-null AIRCRAFT_REG, sorts them by registration
    and period start, drops duplicate aircraft records and writes the
    result to a fleet data file.

    Parameters
    ----------
    flights_filename: string
        The name of an FR24 flights file.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    # Extract the date string from the filename and validate it
    flights_date = read_iso8601_date_string(flights_filename)
    if not is_valid_iso8601_date(flights_date):
        log.error('fr24 flights file: %s, invalid date: %s',
                  flights_filename, flights_date)
        return errno.EINVAL

    log.info('fr24 flights file: %s', flights_filename)

    # Read only the columns needed for fleet data
    fleet_columns = ['AIRCRAFT_REG', 'AIRCRAFT_TYPE',
                     'AIRCRAFT_ADDRESS', 'PERIOD_START']
    try:
        flights = pd.read_csv(flights_filename,
                              usecols=fleet_columns,
                              memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', flights_filename)
        return errno.ENOENT
    log.info('flights file read ok')

    # Keep rows with a registration, order them, then drop duplicate
    # AIRCRAFT_REG / AIRCRAFT_TYPE / AIRCRAFT_ADDRESS combinations
    fleet = (flights
             .loc[flights['AIRCRAFT_REG'].notnull()]
             .sort_values(by=['AIRCRAFT_REG', 'PERIOD_START'])
             .drop_duplicates(subset=['AIRCRAFT_REG', 'AIRCRAFT_TYPE',
                                      'AIRCRAFT_ADDRESS']))

    # Output the fleet DataFrame to a csv file
    output_filename = create_fleet_data_filename(flights_date)
    try:
        fleet.to_csv(output_filename, index=False)
    except EnvironmentError:
        log.error('could not write file: %s', output_filename)
        return errno.EACCES

    log.info('written file: %s', output_filename)
    log.info('fleet data extraction complete')
    return 0
def refine_fr24_data(date, *, max_speed=DEFAULT_MAX_SPEED, distance_accuracy=DEFAULT_DISTANCE_ACCURACY, airports_filename=DEFAULT_AIRPORTS_FILENAME):
    """
    Refine FR24 ADS-B data for the given date.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16
    max_speed: string
        The maximum ground speed permitted between adjacent positions
        [Knots], default: 750 Knots.
    distance_accuracy: string
        The maximum distance between positions at the same time
        [Nautical Miles], default: 0.25 NM.
    airports_filename: string
        The name of a file containing airport codes for IATA to ICAO
        conversion, default: airports.csv

    Returns
    -------
    True if successful, False otherwise.
    """
    if is_valid_iso8601_date(date):
        os.chdir(REFINED_DIR)
        # NOTE(review): 'convert_fr24_date' — presumably a typo for
        # 'convert_fr24_data' (cf. tasks.convert_cpr_file above); confirm
        # against the tasks module.
        tasks.convert_fr24_date(date)
        # Note: these 3 tasks could run in parallel
        tasks.clean_raw_positions_data(FR24, date, float(max_speed), float(distance_accuracy))
        tasks.convert_airport_codes(date, airports_filename)
        # NOTE(review): 'extract_fleet_date' — presumably a typo for
        # 'extract_fleet_data'; confirm against the tasks module.
        return tasks.extract_fleet_date(date)
    else:
        log.error("The date is not valid: %s", date)
        return False
def validate_data_type_and_date(data_type, date=None):
    """
    Unprocessed data items are identified by type and date.
    Returns a tuple with a valid flag and if in error an error message.

    Parameters
    ----------
    data_type
        the data type
    date optional
        a date to match the data

    Returns
    -------
    (True, None) when both checks pass, otherwise (False, message) where
    the message reports the individual validity flags.
    """
    # Fix: use the membership operator instead of calling the
    # __contains__ dunder directly.
    valid_type = data_type in VALID_DATA_TYPES
    # A missing date is considered valid
    valid_date = is_valid_iso8601_date(date) if date else True

    if valid_type and valid_date:
        return True, None
    return False, "Validation failure, data_type: " + str(valid_type) + \
        " date: " + str(valid_date)
def merge_consecutive_day_trajectories(filenames):
    """
    Merge flights, positions and events for a day with the previous day's
    data, using the matching ids file.

    Parameters
    ----------
    filenames: list of 7 strings
        [day ids, prev flights, next flights, prev positions,
         next positions, prev events, next events] filenames.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    day_ids_filename = filenames[0]
    prev_flights_filename = filenames[1]
    next_flights_filename = filenames[2]
    prev_positions_filename = filenames[3]
    next_positions_filename = filenames[4]
    prev_events_filename = filenames[5]
    next_events_filename = filenames[6]

    # Extract date strings from the input filenames and validate them
    # NOTE(review): input_filenames is not defined in this function —
    # presumably a module-level list of descriptive names parallel to
    # filenames; confirm it exists at module scope.
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # Cross-check the dates: ids and 'next' files carry the next date,
    # 'prev' files carry the (strictly earlier) previous date
    prev_date = input_date_strings[1]
    next_date = input_date_strings[2]
    if (input_date_strings[0] != next_date) \
            or (next_date <= prev_date) \
            or (prev_date != input_date_strings[3]) \
            or (prev_date != input_date_strings[5]) \
            or (next_date != input_date_strings[4]) \
            or (next_date != input_date_strings[6]):
        log.error("Files are not for the correct dates: %s,%s,%s,%s,%s,%s,%s",
                  input_date_strings[0], input_date_strings[1],
                  input_date_strings[2], input_date_strings[3],
                  input_date_strings[4], input_date_strings[5],
                  input_date_strings[6])
        return errno.EINVAL

    ############################################################################
    # Read the Id file, indexed by (old) FLIGHT_ID, mapping to NEW_FLIGHT_ID
    ids_df = pd.DataFrame()
    try:
        ids_df = pd.read_csv(day_ids_filename, index_col='FLIGHT_ID',
                             converters={'FLIGHT_ID': lambda x: UUID(x),
                                         'NEW_FLIGHT_ID': lambda x: UUID(x)},
                             memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', day_ids_filename)
        return errno.ENOENT

    # Merge flights
    if not merge_flights(prev_flights_filename, next_flights_filename,
                         ids_df, log):
        return errno.ENOENT

    # free memory used by merge_flights
    gc.collect()

    # Merge positions
    if not merge_next_day_items(prev_positions_filename,
                                next_positions_filename, ids_df, log):
        return errno.ENOENT

    # free memory used by merge_next_day_items
    gc.collect()

    # Merge events
    if not merge_next_day_items(prev_events_filename, next_events_filename,
                                ids_df, log):
        return errno.ENOENT

    log.info('merging complete')
    return 0
def import_data_on_day(date, max_speed=DEFAULT_MAX_SPEED, distance_accuracy=DEFAULT_DISTANCE_ACCURACY):
    """
    Import, refine and merge the CPR and FR24 data for the given date.

    Orchestrates download, conversion, cleaning and merging; the cleaning
    and conversion steps run as subprocesses so they can overlap.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16
    max_speed: string
        The maximum ground speed permitted between adjacent positions
        [Knots], default: 750 Knots.
    distance_accuracy: string
        The maximum distance between positions at the same time
        [Nautical Miles], default: 0.25 NM.

    Returns
    -------
    Zero if successful, errno.EINVAL for an invalid date.
    """
    if is_valid_iso8601_date(date):
        log.info(f'Getting data for date: {date}')

        # convert the FR24 data
        # Note: needs over 13GB memory if run with convert_cpr_data.
        get_unprocessed(FR24, date, '.')
        convert_fr24_data_on_day(date)
        gc.collect()

        # Clean the FR24 data in parallel
        # NOTE(review): clean_raw_positions_data appears to return a
        # subprocess handle (communicate() is called below); confirm.
        procs = []
        procs.append(
            clean_raw_positions_data(FR24, date, max_speed,
                                     distance_accuracy))

        # convert the CPR data in parallel
        get_unprocessed(CPR, date, '.')
        cpr_proc = convert_cpr_data_on_day(date)

        # write the converted FR24 data to the Google bucket
        put_processed(REFINED_FR24, create_convert_fr24_filenames(date))
        process_fr24_flights(date, DEFAULT_AIRPORTS_FILENAME)
        gc.collect()

        # Wait for CPR conversion to finish
        out, err = cpr_proc.communicate()
        print(out)
        gc.collect()

        # Clean the CPR and FR24 data in parallel
        procs.append(
            clean_raw_positions_data(CPR, date, max_speed,
                                     distance_accuracy))

        # write the converted CPR data to the Google bucket
        put_processed(REFINED_CPR, create_convert_cpr_filenames(date))

        # Wait for the CPR and FR24 cleaning tasks
        for proc in procs:
            out, err = proc.communicate()
            print(out)

        # write the CPR and FR24 positions to the Google bucket
        write_clean_positions_data(CPR, date)
        write_clean_positions_data(FR24, date)
        gc.collect()

        merge_cpr_and_fr24_data(date)
        gc.collect()

        # Clean the merged positions
        proc = clean_raw_positions_data(CPR_FR24, date, max_speed,
                                        distance_accuracy)

        # put the merged CPR and FR24 data to the Google bucket
        put_processed(REFINED_MERGED_DAILY_CPR_FR24_IDS,
                      create_match_cpr_adsb_output_filenames(date))
        put_processed(REFINED_MERGED_DAILY_CPR_FR24,
                      create_merge_cpr_adsb_output_filenames(date))

        # Wait for cleaning to finish
        out, err = proc.communicate()
        print(out)

        # put the merged CPR and FR24 positions to the Google bucket
        write_clean_positions_data(CPR_FR24, date)
    else:
        log.error(f'invalid date: {date}')
        return errno.EINVAL

    return 0
def merge_overnight_flight_data(filenames):
    """
    Merge the positions and events data and update flight data with new times.

    Parameters
    ----------
    filenames: list of 5 strings
        [new flights, new positions, overnight positions, new events,
         overnight events] filenames; all must be for the same date.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    new_flights_filename = filenames[0]
    new_positions_filename = filenames[1]
    overnight_positions_filename = filenames[2]
    new_events_filename = filenames[3]
    overnight_events_filename = filenames[4]

    # Extract date strings from the input filenames and validate them
    # NOTE(review): input_filenames is not defined in this function —
    # presumably a module-level list of descriptive names parallel to
    # filenames; confirm it exists at module scope.
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # All five files must carry the same date
    date = input_date_strings[0]
    if (date != input_date_strings[1]) \
            or (date != input_date_strings[2]) \
            or (date != input_date_strings[3]) \
            or (date != input_date_strings[4]):
        log.error("Files are not for the correct dates: %s,%s,%s,%s,%s",
                  input_date_strings[0], input_date_strings[1],
                  input_date_strings[2], input_date_strings[3],
                  input_date_strings[4])
        return errno.EINVAL

    ############################################################################

    flights_df = pd.DataFrame()
    try:
        flights_df = pd.read_csv(new_flights_filename,
                                 converters={'FLIGHT_ID': lambda x: UUID(x)},
                                 memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', new_flights_filename)
        return errno.ENOENT

    overnight_pos_df = pd.DataFrame()
    try:
        overnight_pos_df = pd.read_csv(overnight_positions_filename,
                                       parse_dates=['TIME'],
                                       converters={'FLIGHT_ID': lambda x: UUID(x)},
                                       memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', overnight_positions_filename)
        return errno.ENOENT

    # Update the flight data with the new times of the overnight positions
    update_flight_data(flights_df, overnight_pos_df)

    # NOTE(review): [4:] strips a 4-character filename prefix (presumably
    # 'new_') to derive the output name; confirm against the filename
    # creation helpers.
    flights_filename = new_flights_filename[4:]
    try:
        flights_df.to_csv(flights_filename, index=False,
                          date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', flights_filename)
    except EnvironmentError:
        log.error('could not write file: %s', flights_filename)
        return errno.ENOENT

    ############################################################################
    # Now merge the positions

    merged_positions = merge_overnight_items(new_positions_filename,
                                             overnight_pos_df)
    if merged_positions.empty:
        log.error('Error merging: %s', new_positions_filename)
        return errno.ENOENT

    # write merged position data
    raw_positions_filename = '_'.join([RAW, new_positions_filename[4:]])
    try:
        merged_positions.to_csv(raw_positions_filename, index=False,
                                date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', raw_positions_filename)
    except EnvironmentError:
        log.error('could not write file: %s', raw_positions_filename)
        return errno.ENOENT

    ############################################################################
    #
    # Merge the events

    overnight_events_df = pd.DataFrame()
    try:
        overnight_events_df = pd.read_csv(overnight_events_filename,
                                          parse_dates=['TIME'],
                                          converters={'FLIGHT_ID': lambda x: UUID(x)},
                                          memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', overnight_events_filename)
        return errno.ENOENT

    merged_events = merge_overnight_items(new_events_filename,
                                          overnight_events_df)
    if merged_events.empty:
        log.error('Error merging: %s', new_events_filename)
        return errno.ENOENT

    events_filename = new_events_filename[4:]
    try:
        merged_events.to_csv(events_filename, index=False,
                             date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', events_filename)
    except EnvironmentError:
        log.error('could not write file: %s', events_filename)
        return errno.ENOENT

    return 0
def find_airport_intersections(
        flights_filename, trajectories_filename, radius=DEFAULT_RADIUS,
        airports_filename=DEFAULT_MOVEMENTS_AIRPORTS_FILENAME,
        distance_tolerance=DEFAULT_DISTANCE_TOLERANCE):
    """
    Find intersections between trajectories and airport cylinders.

    Parameters
    ----------
    flights_filename: a string
        The name of a flights file.

    trajectories_filename: a string
        The name of a trajectories file.

    radius: float
        The radius of the cylinder around each airport [Nautical Miles],
        default DEFAULT_RADIUS.

    airports_filename: a string
        The name of the airports file,
        default DEFAULT_MOVEMENTS_AIRPORTS_FILENAME.

    distance_tolerance: float
        The tolerance for path and cylinder distances,
        default DEFAULT_DISTANCE_TOLERANCE.

    Returns
    -------
    An errno error_code if an error occurred, zero otherwise.
    """
    # Extract the date string from the filename and validate it
    flights_date = read_iso8601_date_string(flights_filename)
    if is_valid_iso8601_date(flights_date):
        log.info(f'flights file: {flights_filename}')
    else:
        log.error(
            f'flights file: {flights_filename}, invalid date: {flights_date}')
        return errno.EINVAL

    trajectories_date = read_iso8601_date_string(trajectories_filename,
                                                 is_json=True)
    if is_valid_iso8601_date(trajectories_date):
        log.info(f'trajectories file: {trajectories_filename}')
    else:
        log.error(f'trajectories file, invalid date: {trajectories_date}')
        return errno.EINVAL

    # Both files must be for the same date
    if flights_date != trajectories_date:
        log.error(f'Files are not for the same date! Flights date: {flights_date}'
                  f', trajectories date: {trajectories_date}')
        return errno.EINVAL

    log.info(f'flights file: {flights_filename}')
    log.info(f'trajectories file: {trajectories_filename}')
    log.info(f'radius: {radius} NM')
    log.info(f'distance_tolerance: {distance_tolerance} NM')

    # Read the airports, indexed by airport name
    airports_df = pd.DataFrame()
    try:
        airports_df = pd.read_csv(airports_filename, index_col='AIRPORT',
                                  memory_map=True)
        log.info(f'{airports_filename} read ok')
    except EnvironmentError:
        log.error(f'could not read file: {airports_filename}')
        return errno.ENOENT

    # Read the flights, indexed by flight id, with departure/destination codes
    flights_df = pd.DataFrame()
    try:
        flights_df = pd.read_csv(flights_filename,
                                 usecols=['FLIGHT_ID', 'ADEP', 'ADES'],
                                 index_col='FLIGHT_ID', memory_map=True)
        log.info(f'{flights_filename} read ok')
    except EnvironmentError:
        log.error(f'could not read file: {flights_filename}')
        return errno.ENOENT

    # Determine the departure and arrival flights by joining against airports
    departures_df = pd.merge(flights_df, airports_df,
                             left_on='ADEP', right_index=True)
    destinations_df = pd.merge(flights_df, airports_df,
                               left_on='ADES', right_index=True)

    trajectories_filename = os.path.basename(trajectories_filename)
    is_bz2 = has_bz2_extension(trajectories_filename)
    if is_bz2:  # remove the .bz2 from the end of the filename
        trajectories_filename = trajectories_filename[:-len(BZ2_FILE_EXTENSION)]

    # Write the airport_intersections into a csv file with output_filename
    output_filename = trajectories_filename.replace(TRAJECTORIES,
                                                    AIRPORT_INTERSECTIONS)
    output_filename = output_filename.replace(JSON_FILE_EXTENSION,
                                              CSV_FILE_EXTENSION)
    try:
        with open(output_filename, 'w') as file:
            file.write(AIRPORT_INTERSECTION_FIELDS)

            flights_count = 0
            smoothed_trajectories = generate_SmoothedTrajectories(
                trajectories_filename)
            for smooth_traj in smoothed_trajectories:
                try:
                    flight_id = smooth_traj.flight_id
                    is_departure = flight_id in departures_df.index
                    is_arrival = flight_id in destinations_df.index
                    if is_departure or is_arrival:
                        traj_path = smooth_traj.path.ecef_path()

                        if is_departure:
                            dep_row = departures_df.loc[flight_id]
                            departure = dep_row['ADEP']
                            # Only consider valid (full-length) airport names
                            if len(departure) == AIRPORT_NAME_LENGTH:
                                latitude = dep_row['LATITUDE']
                                longitude = dep_row['LONGITUDE']
                                ref_point = global_Point3d(latitude, longitude)
                                dep_intersection = find_airport_intersection(
                                    smooth_traj, traj_path, departure,
                                    ref_point, radius, False,
                                    distance_tolerance)
                                if not dep_intersection.empty:
                                    dep_intersection.to_csv(
                                        file, index=False, header=False,
                                        mode='a',
                                        date_format=ISO8601_DATETIME_US_FORMAT)

                        if is_arrival:
                            dest_row = destinations_df.loc[flight_id]
                            destination = dest_row['ADES']
                            if len(destination) == AIRPORT_NAME_LENGTH:
                                latitude = dest_row['LATITUDE']
                                longitude = dest_row['LONGITUDE']
                                ref_point = global_Point3d(latitude, longitude)
                                dest_intersection = find_airport_intersection(
                                    smooth_traj, traj_path, destination,
                                    ref_point, radius, True,
                                    distance_tolerance)
                                if not dest_intersection.empty:
                                    dest_intersection.to_csv(
                                        file, index=False, header=False,
                                        mode='a',
                                        date_format=ISO8601_DATETIME_US_FORMAT)

                    flights_count += 1
                except ValueError:
                    # Log and continue with the next trajectory
                    log.exception(
                        f'find_airport_intersections id: {flight_id}')
                except StopIteration:
                    pass

            log.info(
                f'find_airport_intersections finished for {flights_count} trajectories.'
            )
    except EnvironmentError:
        log.error(f'could not write file: {output_filename}')
        return errno.EACCES

    return 0
import sys  # Fix: sys was used (sys.argv, sys.exit) but never imported
import errno

from pru.trajectory_fields import is_valid_iso8601_date
from scripts.tasks import \
    match_apds_trajectories_on_day, merge_apds_trajectories_on_day
from pru.logger import logger

log = logger(__name__)

if __name__ == '__main__':
    # Expect: <from_date> <to_date> <date>, all ISO8601 dates
    if len(sys.argv) < 4:
        print('Usage: merge_apds_data_on_day.py <from_date> <to_date> <date>')
        sys.exit(errno.EINVAL)

    from_date = sys.argv[1]
    if not is_valid_iso8601_date(from_date):
        log.error(f'invalid from_date: {from_date}')
        sys.exit(errno.EINVAL)

    to_date = sys.argv[2]
    if not is_valid_iso8601_date(to_date):
        log.error(f'invalid to_date: {to_date}')
        sys.exit(errno.EINVAL)

    date = sys.argv[3]
    if not is_valid_iso8601_date(date):
        log.error(f'invalid date: {date}')
        sys.exit(errno.EINVAL)

    # NOTE(review): merge_apds_trajectories_on_day is imported but never
    # called, although the script name suggests merging should follow
    # matching — confirm with the pipeline owner.
    if not match_apds_trajectories_on_day(from_date, to_date, date):
        sys.exit(errno.EACCES)
def match_apds_trajectories(filenames):
    """
    Match daily flights and events against an APDS (APT) flights/events
    file pair and write a matching ids file.

    Parameters
    ----------
    filenames: list of 4 strings
        [day flights, apds flights, day events, apds events] filenames.
        The daily files must be for the same date, the APDS files for the
        same date range, and the day must fall within that range.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    day_flights_filename = filenames[0]
    apds_flights_filename = filenames[1]
    day_events_filename = filenames[2]
    apds_events_filename = filenames[3]

    # Extract date strings from the input filenames and validate them
    # NOTE(review): input_filenames is not defined in this function —
    # presumably a module-level list of descriptive names parallel to
    # filenames; confirm it exists at module scope.
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    days_date = input_date_strings[0]
    flights_finish_date = input_date_strings[1]
    events_finish_date = input_date_strings[3]

    # Extract the start date string from the filename and validate it
    flights_start_date, _ = split_dual_date(
        os.path.basename(apds_flights_filename))
    if not is_valid_iso8601_date(flights_start_date):
        log.error('apds flights file: %s, invalid start date: %s',
                  apds_flights_filename, flights_start_date)
        return errno.EINVAL

    # Extract the start date string from the filename and validate it
    events_start_date, _ = split_dual_date(
        os.path.basename(apds_events_filename))
    if not is_valid_iso8601_date(events_start_date):
        log.error('apds events file: %s, invalid start date: %s',
                  apds_events_filename, events_start_date)
        return errno.EINVAL

    # Ensure that all files are for the same date
    if (input_date_strings[0] != input_date_strings[2]):
        log.error("Daily files are not for the same dates!"
                  " Flights date: %s, Events date: %s",
                  input_date_strings[0], input_date_strings[2])
        return errno.EINVAL

    # Ensure that all files are for the same date
    if (flights_start_date != events_start_date) or \
            (flights_finish_date != events_finish_date):
        log.error("APT files are not for the same dates!"
                  " Flights start date: %s, Events start date: %s",
                  flights_start_date, events_start_date)
        return errno.EINVAL

    # Ensure day is within APT data range
    if not (flights_start_date <= days_date <= flights_finish_date):
        log.error("Daily files are not in APT data range!"
                  " Flights date: %s, APT range: %s to %s",
                  days_date, flights_start_date, flights_finish_date)
        return errno.EINVAL

    ############################################################################
    # Read the data

    # Read days flights into a pandas DataFrame
    day_flights_df = pd.DataFrame()
    try:
        day_flights_df = pd.read_csv(
            day_flights_filename,
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'ADEP', 'ADES'],
            memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', day_flights_filename)
        return errno.ENOENT
    log.info('daily flights read ok')

    # Read APT flights into a pandas DataFrame
    apds_flights_df = pd.DataFrame()
    try:
        apds_flights_df = pd.read_csv(
            apds_flights_filename,
            usecols=['FLIGHT_ID', 'CALLSIGN', 'ADEP', 'ADES'],
            memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', apds_flights_filename)
        return errno.ENOENT
    log.info('apds flights read ok')

    # Read days events into a pandas DataFrame
    day_events_df = pd.DataFrame()
    try:
        day_events_df = pd.read_csv(
            day_events_filename,
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            parse_dates=['TIME'], memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', day_events_filename)
        return errno.ENOENT
    log.info('daily events read ok')

    # Read APT events into a pandas DataFrame
    apds_events_df = pd.DataFrame()
    try:
        apds_events_df = pd.read_csv(apds_events_filename,
                                     parse_dates=['TIME'], memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', apds_events_filename)
        return errno.ENOENT
    log.info('apds events read ok')

    ############################################################################
    # Match events

    apds_day_flights = match_events(day_flights_df, apds_flights_df,
                                    day_events_df, apds_events_df)

    # Output the days ids
    apds_ids_file = create_matching_ids_filename(APDS, days_date)
    try:
        apds_day_flights.to_csv(apds_ids_file, index=False,
                                columns=['FLIGHT_ID', 'NEW_FLIGHT_ID'])
    except EnvironmentError:
        log.error('could not write file: %s', apds_ids_file)
        # Fix: was the garbled (syntactically invalid) 'return errno.EACCESreturn'
        return errno.EACCES

    log.info('written file: %s', apds_ids_file)
    log.info('apds matching complete')
    return 0
import sys    # Fix: sys was used (sys.argv, sys.exit) but never imported
import errno  # Fix: errno was used (errno.EINVAL, errno.EACCES) but never imported

from pru.trajectory_fields import is_valid_iso8601_date
from pru.trajectory_files import CPR_FR24, create_trajectories_filename
from apps.find_user_airspace_intersections import DEFAULT_LOGGING_COUNT
from scripts.tasks import find_trajectory_user_airspace_intersections
from pru.logger import logger

log = logger(__name__)

if __name__ == '__main__':
    # Expect: <date> [logging_msg_count]
    if len(sys.argv) < 2:
        print('Usage: find_user_airspace_intersections_on_day.py <date>'
              ' [logging_msg_count]')
        sys.exit(errno.EINVAL)

    date = sys.argv[1]
    trajectory_filename = 'mas_05_'
    if not is_valid_iso8601_date(date):
        log.error(f'invalid date: {date}')
        sys.exit(errno.EINVAL)
    else:
        trajectory_filename += create_trajectories_filename(CPR_FR24, date)

    logging_msg_count = DEFAULT_LOGGING_COUNT
    if len(sys.argv) >= 3:
        logging_msg_count = int(sys.argv[2])

    if not find_trajectory_user_airspace_intersections(
            trajectory_filename, CPR_FR24, logging_msg_count):
        sys.exit(errno.EACCES)
def merge_overnight_data_on_day(date, max_speed=DEFAULT_MAX_SPEED, distance_accuracy=DEFAULT_DISTANCE_ACCURACY):
    """
    Match flights for the given day with flights for the previous day.

    Fetches overnight merge inputs from the bucket, merges the overnight
    flight data, cleans the merged positions and uploads the results.

    Parameters
    ----------
    date: string
        The date in ISO8601 format, e.g. 2017-08-16
    max_speed: string
        The maximum ground speed permitted between adjacent positions
        [Knots], default: 750 Knots.
    distance_accuracy: string
        The maximum distance between positions at the same time
        [Nautical Miles], default: 0.25 NM.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    # Fixes: three log calls interpolated {…} without the f prefix, so the
    # placeholders were logged literally; also corrected the grammar of the
    # merged-files error message.
    if is_valid_iso8601_date(date):
        # get the CPR data from the Google bucket
        log.info(f'Getting data for date: {date}')

        merge_files = create_merge_overnight_flight_data_input_filenames(date)
        if not get_processed(REFINED_MERGED_OVERNIGHT_CPR_FR24, merge_files):
            log.error('Flights file not found in overnight_cpr_fr24 bucket')
            return errno.ENOENT

        error_code = merge_overnight_flight_data(merge_files)
        if error_code:
            return error_code

        output_files = create_merge_overnight_flight_data_output_filenames(
            date)
        if not put_processed(REFINED_MERGED_OVERNIGHT_CPR_FR24, output_files):
            log.error('Could not write merged files to overnight_cpr_fr24 bucket')
            return errno.EACCES

        # Clean the merged raw positions
        raw_filename = output_files[1]
        error_code = clean_position_data(raw_filename, max_speed,
                                         distance_accuracy)
        if error_code:
            log.error(f'clean_position_data error file: {raw_filename}')
            return error_code

        # Upload the cleaned positions and the error metrics
        filenames = create_clean_position_data_filenames(CPR_FR24, date)
        source_path = REFINED_MERGED_OVERNIGHT_CPR_FR24
        if not put_processed(source_path, filenames[:1]):
            log.error(f'Could not write file: {filenames[:1]} to bucket')
            return errno.EACCES

        errors_path = PRODUCTS_ERROR_METRICS_CPR_FR24_OVERNIGHT
        if not put_processed(errors_path, filenames[1:]):
            log.error(f'Could not write file: {filenames[1:]} to bucket')
            return errno.EACCES
    else:
        log.error(f'invalid date: {date}')
        return errno.EINVAL

    return 0
def match_overnight_flights(filenames,
                            max_time_difference=DEFAULT_MAXIMUM_TIME_DELTA):
    """
    Match overnight flights with the same aircraft address or callsign.
    The end of the previous flight must be within max_time_difference of
    the start of the next positions flight.

    Parameters
    ----------
    filenames: list of 2 strings
        [previous day flights, next day flights] filenames.
    max_time_difference: float
        The maximum gap between flights [seconds] for a match,
        default DEFAULT_MAXIMUM_TIME_DELTA.

    Returns
    -------
    Zero if successful, an errno error code otherwise.
    """
    prev_flights_filename = filenames[0]
    next_flights_filename = filenames[1]

    # Extract date strings from the input filenames and validate them
    # NOTE(review): input_filenames is not defined in this function —
    # presumably a module-level list of descriptive names parallel to
    # filenames; confirm it exists at module scope.
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # Ensure that all files are for the correct dates
    if (input_date_strings[0] >= input_date_strings[1]):
        log.error("Files are not for the correct dates prev flights date: %s, "
                  "flights date: %s",
                  input_date_strings[0], input_date_strings[1])
        return errno.EINVAL

    next_days_date = input_date_strings[1]

    log.info('Maximum time difference: %f', max_time_difference)

    ############################################################################
    # Read the flight files

    # Read previous flights into a pandas DataFrame
    prev_flights_df = pd.DataFrame()
    try:
        prev_flights_df = pd.read_csv(
            prev_flights_filename,
            parse_dates=['PERIOD_START', 'PERIOD_FINISH'],
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'AIRCRAFT_ADDRESS',
                     'ADEP', 'ADES', 'PERIOD_START', 'PERIOD_FINISH'])
    except EnvironmentError:
        log.error('could not read file: %s', prev_flights_filename)
        return errno.ENOENT
    log.info('cpr flights read ok')

    # Read next flights into a pandas DataFrame
    next_flights_df = pd.DataFrame()
    try:
        next_flights_df = pd.read_csv(
            next_flights_filename,
            parse_dates=['PERIOD_START', 'PERIOD_FINISH'],
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'AIRCRAFT_ADDRESS',
                     'ADEP', 'ADES', 'PERIOD_START', 'PERIOD_FINISH'])
    except EnvironmentError:
        log.error('could not read file: %s', next_flights_filename)
        return errno.ENOENT
    log.info('adsb flights read ok')

    # Dict to hold the flight ids
    flight_ids = {}

    # Get the prev flights with aircraft addresses
    prev_flights_aa = prev_flights_df.loc[
        prev_flights_df['AIRCRAFT_ADDRESS'].notnull()]
    next_flights_aa = next_flights_df.loc[
        next_flights_df['AIRCRAFT_ADDRESS'].notnull()]

    ############################################################################
    # Match the flights

    # match previous and next flights on aircraft address and times within
    # max_time_difference; merge suffixes _x/_y are prev/next respectively
    merge_aa = pd.merge(prev_flights_aa, next_flights_aa,
                        on='AIRCRAFT_ADDRESS')
    merge_aa_time = merge_aa.loc[(
        (merge_aa.PERIOD_START_y - merge_aa.PERIOD_FINISH_x) /
        np.timedelta64(1, 's')) < max_time_difference]

    # add aircraft address matches
    aa_matches = add_matches(merge_aa_time, flight_ids)
    log.info('aircraft address matches: %d, flight_ids: %d',
             aa_matches, len(flight_ids))

    # match previous and next flights on callsign and times within
    # max_time_difference
    merge_cs = pd.merge(prev_flights_df, next_flights_df, on='CALLSIGN')
    merge_cs_time = merge_cs.loc[(
        (merge_cs.PERIOD_START_y - merge_cs.PERIOD_FINISH_x) /
        np.timedelta64(1, 's')) < max_time_difference]

    # add callsign matches
    cs_matches = add_matches(merge_cs_time, flight_ids)
    log.info('callsign matches: %d, total matches:%d, flight_ids: %d',
             cs_matches, aa_matches + cs_matches, len(flight_ids))

    ############################################################################
    # Output the previous day ids

    prev_ids_filename = create_matching_ids_filename(PREV_DAY, next_days_date)
    try:
        with open(prev_ids_filename, 'w') as file:
            file.write(NEW_ID_FIELDS)
            for key, value in flight_ids.items():
                print(key, value, sep=',', file=file)

        log.info('written file: %s', prev_ids_filename)
    except EnvironmentError:
        log.error('could not write file: %s', prev_ids_filename)
        return errno.EACCES

    log.info('overnight flight matching complete')
    return 0
def convert_apds_data(filename, stands_filename):
    """
    Convert an APDS data file into flight, position and event files.

    Parameters
    ----------
    filename: string
        The APDS data filename; its basename must start with a dual
        ISO8601 date (start and finish), may be bz2 compressed.
    stands_filename: string
        The airport stands filename, may be empty (stands are optional).
        Position output is only produced when stands data is provided.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    # Extract the start and finish date strings from the filename
    start_date, finish_date = split_dual_date(os.path.basename(filename))
    if not is_valid_iso8601_date(start_date):
        log.error('apds data file: %s, invalid start date: %s',
                  filename, start_date)
        return errno.EINVAL

    # validate the finish date string from the filename
    if not is_valid_iso8601_date(finish_date):
        log.error('apds data file: %s, invalid finish date: %s',
                  filename, finish_date)
        return errno.EINVAL

    log.info('apds data file: %s', filename)

    airport_stands_df = pd.DataFrame()
    if stands_filename:
        try:
            airport_stands_df = pd.read_csv(stands_filename,
                                            index_col=['ICAO_ID', 'STAND_ID'],
                                            memory_map=True)
            # BUG FIX: sort_index is not in-place by default; the previous
            # code discarded the sorted result, leaving the index unsorted.
            airport_stands_df.sort_index(inplace=True)
        except EnvironmentError:
            log.error('could not read file: %s', stands_filename)
            return errno.ENOENT

        log.info('airport stands file: %s', stands_filename)
    else:
        log.info('airport stands not provided')

    # A dict to hold the APDS flights, keyed by APDS_ID
    flights = {}

    # Read the APDS flights file into flights
    try:
        is_bz2 = has_bz2_extension(filename)
        with bz2.open(filename, 'rt', newline="") if (is_bz2) else \
                open(filename, 'r') as file:
            reader = csv.reader(file, delimiter=',')
            next(reader, None)  # skip the headers
            for row in reader:
                # setdefault keeps the first flight seen for each APDS_ID
                flights.setdefault(row[ApdsField.APDS_ID],
                                   ApdsFlight(row, airport_stands_df))
    except EnvironmentError:
        log.error('could not read file: %s', filename)
        return errno.ENOENT

    log.info('apds flights read ok')

    valid_flights = 0

    # Output the APDS flight data
    output_files = create_convert_apds_filenames(start_date, finish_date)
    flight_file = output_files[0]
    try:
        with open(flight_file, 'w') as file:
            file.write(FLIGHT_FIELDS)
            for key, value in sorted(flights.items()):
                print(value, file=file)
                valid_flights += 1

        log.info('written file: %s', flight_file)
    except EnvironmentError:
        # NOTE(review): deliberately no early return here — mirrors the
        # best-effort behaviour of the original code, which continues to
        # attempt the positions and events outputs.
        log.error('could not write file: %s', flight_file)

    # if airport stand data was provided
    if len(airport_stands_df):
        # Output the APDS position data
        positions_file = output_files[1]
        try:
            with open(positions_file, 'w') as file:
                file.write(POSITION_FIELDS)
                for key, value in sorted(flights.items()):
                    for position in sorted(value.positions):
                        print(position, file=file)

            log.info('written file: %s', positions_file)
        except EnvironmentError:
            log.error('could not write file: %s', positions_file)

    # Output the APDS event data
    event_file = output_files[2]
    try:
        with open(event_file, 'w') as file:
            file.write(FLIGHT_EVENT_FIELDS)
            for key, value in sorted(flights.items()):
                for event in sorted(value.events):
                    print(event, file=file)

        log.info('written file: %s', event_file)
    except EnvironmentError:
        log.error('could not write file: %s', event_file)
        return errno.EACCES

    log.info('apds conversion complete for %s flights on %s',
             valid_flights, start_date)
    return 0
def merge_apds_trajectories(filenames):
    """
    Merge a day's positions and events with the overlapping APDS data.

    Parameters
    ----------
    filenames: list of five filenames, in order:
        [apds ids, day points, apds points, day events, apds events].
        The APDS points/events filenames carry a dual (start, finish) date.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    apds_ids_filename = filenames[0]
    day_points_filename = filenames[1]
    apds_points_filename = filenames[2]
    day_events_filename = filenames[3]
    apds_events_filename = filenames[4]

    # Extract date strings from the input filenames and validate them
    # NOTE(review): input_filenames appears to be a module-level list of
    # descriptive names parallel to filenames — confirm lengths match.
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    days_date = input_date_strings[0]
    # the date read from the APDS points filename (its finish date)
    points_finish_date = input_date_strings[2]

    # Extract the start date string from the filename and validate it
    points_start_date, _ = split_dual_date(os.path.basename(apds_points_filename))
    if not is_valid_iso8601_date(points_start_date):
        log.error('apds points file: %s, invalid start date: %s',
                  apds_points_filename, points_start_date)
        return errno.EINVAL

    # Extract the start date string from the filename and validate it
    # NOTE(review): events_start_date is validated but not used in the
    # range check below — presumably points and events cover the same
    # range; confirm.
    events_start_date, _ = split_dual_date(os.path.basename(apds_events_filename))
    if not is_valid_iso8601_date(events_start_date):
        log.error('apds events file: %s, invalid start date: %s',
                  apds_events_filename, events_start_date)
        return errno.EINVAL

    # Ensure that all files are for the same date
    if (days_date != input_date_strings[1]) or \
            (days_date != input_date_strings[3]):
        log.error('Files are not for the same dates!'
                  ' Ids date: %s, Day Positions date: %s, Day Events date: %s',
                  days_date, input_date_strings[1], input_date_strings[3])
        return errno.EINVAL

    # Ensure day is within APDS date range (string compare is valid for
    # ISO8601 dates)
    if not (points_start_date <= days_date <= points_finish_date):
        log.error("Daily files are not in APDS data range!"
                  " Daily date: %s, APDS range: %s to %s",
                  days_date, points_start_date, points_finish_date)
        return errno.EINVAL

    ############################################################################
    # Merge positions and events

    # Read apds ids into a pandas DataFrame
    apds_ids_df = pd.DataFrame()
    try:
        apds_ids_df = pd.read_csv(apds_ids_filename, index_col='FLIGHT_ID',
                                  converters={'NEW_FLIGHT_ID': lambda x: UUID(x)},
                                  memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', apds_ids_filename)
        return errno.ENOENT

    # Merge the day's points with the APDS points, remapping ids
    points_df = merge_items(day_points_filename, apds_points_filename,
                            apds_ids_df, log)
    if not len(points_df):
        log.error('Error merging points')
        return errno.ENOENT

    # Merge the day's events with the APDS events, remapping ids
    events_df = merge_items(day_events_filename, apds_events_filename,
                            apds_ids_df, log)
    if not len(events_df):
        log.error('Error merging events')
        return errno.ENOENT

    ############################################################################
    # Output Data

    # Output the merged positions
    output_files = create_merge_apds_output_filenames(days_date)
    points_file = output_files[0]
    try:
        points_df.to_csv(points_file, index=False,
                         date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', points_file)
    except EnvironmentError:
        log.error('could not write file: %s', points_file)
        return errno.EACCES

    # Output the merged events
    events_file = output_files[1]
    try:
        events_df.to_csv(events_file, index=False,
                         date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', events_file)
    except EnvironmentError:
        log.error('could not write file: %s', events_file)
        return errno.EACCES

    log.info('apds merging complete')
    return 0
def match_consecutive_day_trajectories(
        filenames,
        max_time_difference=DEFAULT_MAXIMUM_TIME_DELTA,
        max_speed=DEFAULT_MAXIMUM_SPEED):
    """
    Match flights that span consecutive days.

    Candidate pairs are found by joining the previous and next days'
    flights on aircraft address, callsign and (ADEP, ADES) airport pair,
    filtered so the gap between the previous flight's finish and the next
    flight's start is within max_time_difference. Each candidate set is
    verified against the position data by verify_matches, which populates
    flight_ids. The resulting id mapping is written to a 'prev day' ids file.

    Parameters
    ----------
    filenames: list of four filenames, in order:
        [prev flights, next flights, prev positions, next positions].
        Note: the positions files must be 'clean'.
    max_time_difference: the maximum time between matched flights [seconds].
    max_speed: maximum speed between positions, passed to verify_matches.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    prev_flights_filename = filenames[0]
    next_flights_filename = filenames[1]
    # Note positions files must be 'clean'
    prev_positions_filename = filenames[2]
    next_positions_filename = filenames[3]

    # Extract date strings from the input filenames and validate them
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # Ensure that all files are for the correct dates:
    # flights/positions dates agree and prev day precedes next day
    if (input_date_strings[0] != input_date_strings[2]) \
            or (input_date_strings[1] != input_date_strings[3]) \
            or (input_date_strings[0] >= input_date_strings[1]):
        log.error(
            "Files are not for the correct dates prev flights date: %s, "
            "next flights date: %s prev Positions date: %s, "
            "next Positions date: %s",
            input_date_strings[0], input_date_strings[1],
            input_date_strings[2], input_date_strings[3])
        return errno.EINVAL

    next_days_date = input_date_strings[1]

    log.info('Maximum time difference: %f', max_time_difference)

    ############################################################################
    # Read the files

    # Read previous flights into a pandas DataFrame
    prev_flights_df = pd.DataFrame()
    try:
        prev_flights_df = pd.read_csv(
            prev_flights_filename,
            parse_dates=['PERIOD_START', 'PERIOD_FINISH'],
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'AIRCRAFT_ADDRESS',
                     'ADEP', 'ADES', 'PERIOD_START', 'PERIOD_FINISH'])
    except EnvironmentError:
        log.error('could not read file: %s', prev_flights_filename)
        return errno.ENOENT

    log.info('cpr flights read ok')

    # Read next flights into a pandas DataFrame
    next_flights_df = pd.DataFrame()
    try:
        next_flights_df = pd.read_csv(
            next_flights_filename,
            parse_dates=['PERIOD_START', 'PERIOD_FINISH'],
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'AIRCRAFT_ADDRESS',
                     'ADEP', 'ADES', 'PERIOD_START', 'PERIOD_FINISH'])
    except EnvironmentError:
        log.error('could not read file: %s', next_flights_filename)
        return errno.ENOENT

    log.info('adsb flights read ok')

    # Read previous points into a pandas DataFrame
    prev_points_df = pd.DataFrame()
    try:
        prev_points_df = pd.read_csv(
            prev_positions_filename, parse_dates=['TIME'],
            index_col='FLIGHT_ID',
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'TIME', 'LAT', 'LON', 'ALT'])
    except EnvironmentError:
        log.error('could not read file: %s', prev_positions_filename)
        return errno.ENOENT

    log.info('prev points read ok')

    # Read the next points into a pandas DataFrame
    next_points_df = pd.DataFrame()
    try:
        next_points_df = pd.read_csv(
            next_positions_filename, parse_dates=['TIME'],
            index_col='FLIGHT_ID',
            converters={'FLIGHT_ID': lambda x: UUID(x)},
            usecols=['FLIGHT_ID', 'TIME', 'LAT', 'LON', 'ALT'])
    except EnvironmentError:
        log.error('could not read file: %s', next_positions_filename)
        return errno.ENOENT

    log.info('next points read ok')

    # Dict to hold the flight ids
    flight_ids = {}

    # Get the prev and next flights with aircraft addresses
    prev_flights_aa = prev_flights_df.loc[
        prev_flights_df['AIRCRAFT_ADDRESS'].notnull()]
    next_flights_aa = next_flights_df.loc[
        next_flights_df['AIRCRAFT_ADDRESS'].notnull()]

    ############################################################################
    # Match the flights

    # match previous and next flights on aircraft address and times within
    # max_time_difference
    merge_aa = pd.merge(prev_flights_aa, next_flights_aa,
                        on='AIRCRAFT_ADDRESS')
    merge_aa_time = merge_aa.loc[(
        (merge_aa.PERIOD_START_y - merge_aa.PERIOD_FINISH_x) /
        np.timedelta64(1, 's')) < max_time_difference]

    # verify aircraft address matches
    aa_matches = verify_matches(merge_aa_time, prev_points_df,
                                next_points_df, flight_ids,
                                max_time_difference, max_speed)
    log.info('aircraft address matches: %d, flight_ids: %d',
             aa_matches, len(flight_ids))

    # match previous and next flights on callsign and times within
    # max_time_difference
    merge_cs = pd.merge(prev_flights_df, next_flights_df, on='CALLSIGN')
    merge_cs_time = merge_cs.loc[(
        (merge_cs.PERIOD_START_y - merge_cs.PERIOD_FINISH_x) /
        np.timedelta64(1, 's')) < max_time_difference]

    # verify callsign matches
    cs_matches = verify_matches(merge_cs_time, prev_points_df,
                                next_points_df, flight_ids,
                                max_time_difference, max_speed)
    log.info('callsign matches: %d, total matches:%d, flight_ids: %d',
             cs_matches, aa_matches + cs_matches, len(flight_ids))

    # match previous and next flights on departure, destination and times
    # within max_time_difference
    merge_dep_des = pd.merge(prev_flights_df, next_flights_df,
                             on=['ADEP', 'ADES'])
    # BUG FIX: the mask was previously applied to merge_cs (the callsign
    # merge) instead of merge_dep_des, selecting from the wrong DataFrame.
    merge_dep_des_time = merge_dep_des.loc[(
        (merge_dep_des.PERIOD_START_y - merge_dep_des.PERIOD_FINISH_x) /
        np.timedelta64(1, 's')) < max_time_difference]

    # verify departure and destination airport matches
    apt_matches = verify_matches(merge_dep_des_time, prev_points_df,
                                 next_points_df, flight_ids,
                                 max_time_difference, max_speed)
    log.info('airport matches: %d, total matches:%d, flight_ids: %d',
             apt_matches, apt_matches + aa_matches + cs_matches,
             len(flight_ids))

    # Output the previous day ids
    prev_ids_filename = create_matching_ids_filename(PREV_DAY, next_days_date)
    try:
        with open(prev_ids_filename, 'w') as file:
            file.write(NEW_ID_FIELDS)
            for key, value in flight_ids.items():
                print(key, value, sep=',', file=file)
    except EnvironmentError:
        log.error('could not write file: %s', prev_ids_filename)
        return errno.EACCES
    log.info('written file: %s', prev_ids_filename)

    log.info('consecutive day matching complete')
    return 0
def extract_overnight_data(filenames):
    """
    Extract flights, positions and events for the previous day from the data.

    It writes items (flights, positions or events) WITHOUT ids in the ids
    file into files with 'new_' prepended to the filename.

    It writes positions and events with ids in the ids file into positions
    and events files for the previous day, with the ids replaced by the
    previous ids in the ids file.

    Parameters
    ----------
    filenames: list of four filenames, in order:
        [day ids, flights, positions, events], all for the same date.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    day_ids_filename = filenames[0]
    flights_filename = filenames[1]
    positions_filename = filenames[2]
    events_filename = filenames[3]

    # Extract date strings from the input filenames and validate them
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # Ensure that all files are for the same date
    date = input_date_strings[0]
    if (date != input_date_strings[1]) \
            or (date != input_date_strings[2]) \
            or (date != input_date_strings[3]):
        log.error("Files are not for the same dates: %s,%s,%s,%s",
                  input_date_strings[0], input_date_strings[1],
                  input_date_strings[2], input_date_strings[3])
        return errno.EINVAL

    ############################################################################
    # Read the Id file

    ids_df = pd.DataFrame()
    try:
        ids_df = pd.read_csv(day_ids_filename, index_col='FLIGHT_ID',
                             converters={
                                 'FLIGHT_ID': lambda x: UUID(x),
                                 'NEW_FLIGHT_ID': lambda x: UUID(x)
                             },
                             memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', day_ids_filename)
        return errno.ENOENT

    # Don't require the flights
    # Just write flights WITHOUT matching ids into a "new" file
    _ = extract_next_day_items(flights_filename, ids_df)

    prev_positions = extract_next_day_items(positions_filename, ids_df,
                                            ['TIME'])

    # Output the previous day's positions
    prev_date = iso8601_previous_day(date)
    overnight_positions_filename = 'overnight_' + \
        create_positions_filename(CPR_FR24, prev_date)
    try:
        prev_positions.to_csv(overnight_positions_filename, index=False,
                              date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', overnight_positions_filename)
    except EnvironmentError:
        log.error('could not write file: %s', overnight_positions_filename)
        # CONSISTENCY FIX: write failures return EACCES elsewhere in this
        # file; this previously returned ENOENT ("no such file").
        return errno.EACCES

    prev_events = extract_next_day_items(events_filename, ids_df, ['TIME'])

    # Output the previous day's events
    overnight_events_filename = 'overnight_' + \
        create_events_filename(CPR_FR24, prev_date)
    try:
        prev_events.to_csv(overnight_events_filename, index=False,
                           date_format=ISO8601_DATETIME_FORMAT)
        log.info('written file: %s', overnight_events_filename)
    except EnvironmentError:
        log.error('could not write file: %s', overnight_events_filename)
        # CONSISTENCY FIX: as above, EACCES for a write failure.
        return errno.EACCES

    log.info('extraction complete')
    return 0
def convert_fr24_data(filenames):
    """
    Convert FR24 ADS-B flights and points files into flight and position
    files for the (common) date of the input files.

    Parameters
    ----------
    filenames: list of two filenames: [flights filename, points filename],
        both for the same date, may be bz2 compressed.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    flights_filename = filenames[0]
    points_filename = filenames[1]

    if flights_filename == points_filename:
        log.error(
            'Files are the same! Flights filename: %s, points filename: %s',
            flights_filename, points_filename)
        return errno.EINVAL

    # Extract the date string from the filename and validate it
    flights_date = read_iso8601_date_string(flights_filename)
    if is_valid_iso8601_date(flights_date):
        log.info('fr24 flights file: %s', flights_filename)
    else:
        log.error('fr24 flights file: %s, invalid date: %s',
                  flights_filename, flights_date)
        return errno.EINVAL

    # Extract the date string from the filename and validate it
    points_date = read_iso8601_date_string(points_filename)
    if is_valid_iso8601_date(points_date):
        log.info('fr24 points file: %s', points_filename)
    else:
        log.error('fr24 points file: %s, invalid date: %s',
                  points_filename, points_date)
        return errno.EINVAL

    if flights_date != points_date:
        log.error(
            'Files are not for the same date! Flights date: %s, points date: %s',
            flights_date, points_date)
        return errno.EINVAL

    # A dict to hold the ADS-B flights, keyed by FLIGHT_ID
    flights = {}

    # Read the ADS-B flights file into flights
    try:
        is_bz2 = has_bz2_extension(flights_filename)
        with bz2.open(flights_filename, 'rt', newline="") if (is_bz2) else \
                open(flights_filename, 'r') as file:
            reader = csv.reader(file, delimiter=',')
            next(reader, None)  # skip the headers
            for row in reader:
                # setdefault keeps the first flight seen for each id
                flights.setdefault(row[AdsbFlightField.FLIGHT_ID],
                                   AdsbFlight(row))
    except EnvironmentError:
        log.error('could not read file: %s', flights_filename)
        return errno.ENOENT

    log.info('fr24 flights read ok')

    # Read the ADS-B points file into flights; points without a matching
    # flight are dropped
    try:
        is_bz2 = has_bz2_extension(points_filename)
        with bz2.open(points_filename, 'rt', newline="") if (is_bz2) else \
                open(points_filename, 'r') as file:
            reader = csv.reader(file, delimiter=',')
            next(reader, None)  # skip the headers
            for row in reader:
                if row[AdsbPointField.FLIGHT_ID] in flights:
                    flights[row[AdsbPointField.FLIGHT_ID]].append(row)
    except EnvironmentError:
        log.error('could not read file: %s', points_filename)
        return errno.ENOENT

    log.info('fr24 points read ok')

    # sort positions in date time (of position) order
    for key, values in flights.items():
        values.sort()

    log.info('fr24 points sorted')

    valid_flights = 0

    # Output the ADS-B flight data for all flights
    output_files = create_convert_fr24_filenames(flights_date)
    flight_file = output_files[0]
    try:
        with open(flight_file, 'w') as file:
            file.write(FLIGHT_FIELDS)
            for key, values in sorted(flights.items()):
                if values.is_valid:
                    print(values, file=file)
                    valid_flights += 1

        log.info('written file: %s', flight_file)
    except EnvironmentError:
        # NOTE(review): no error return here (unlike the positions write
        # below) — looks like deliberate best-effort, confirm.
        log.error('could not write file: %s', flight_file)

    # Output the ADS-B position data for all flights
    positions_file = output_files[1]
    try:
        with open(positions_file, 'w') as file:
            file.write(POSITION_FIELDS)
            for key, values in sorted(flights.items()):
                if values.is_valid:
                    for pos in values.positions:
                        print(pos, file=file)

        log.info('written file: %s', positions_file)
    except EnvironmentError:
        log.error('could not write file: %s', positions_file)
        return errno.EACCES

    log.info('fr24 conversion complete for %s flights on %s',
             valid_flights, points_date)
    return 0
def merge_cpr_adsb_trajectories(filenames):
    """
    Merge matched CPR and ADS-B flights, positions and events for a day.

    Reads the CPR and ADS-B id mapping files, merges the flights and
    positions using those mappings, remaps the CPR event ids, and writes
    merged flights, positions and events files.

    Parameters
    ----------
    filenames: list of seven filenames, in order:
        [cpr ids, adsb ids, cpr flights, adsb flights,
         cpr positions, adsb positions, cpr events].
        Note: the positions files must be clean.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    cpr_ids_filename = filenames[0]
    adsb_ids_filename = filenames[1]
    cpr_flights_filename = filenames[2]
    adsb_flights_filename = filenames[3]
    # Note positions files must be clean
    cpr_positions_filename = filenames[4]
    adsb_positions_filename = filenames[5]
    cpr_events_filename = filenames[6]

    # Extract date strings from the input filenames and validate them
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # Ensure that all files are for the same date
    if input_date_strings[1:] != input_date_strings[:-1]:
        log.error("Files are not for the same dates: %s,%s,%s,%s,%s,%s,%s",
                  input_date_strings[0], input_date_strings[1],
                  input_date_strings[2], input_date_strings[3],
                  input_date_strings[4], input_date_strings[5],
                  input_date_strings[6])
        return errno.EINVAL

    ############################################################################
    # Read the files

    # Read the Id files
    cpr_ids_df = pd.DataFrame()
    try:
        cpr_ids_df = pd.read_csv(
            cpr_ids_filename, index_col='FLIGHT_ID',
            converters={'NEW_FLIGHT_ID': lambda x: UUID(x)},
            memory_map=True)
        cpr_ids_df.sort_index(inplace=True)
    except EnvironmentError:
        log.error('could not read file: %s', cpr_ids_filename)
        return errno.ENOENT

    adsb_ids_df = pd.DataFrame()
    try:
        adsb_ids_df = pd.read_csv(
            adsb_ids_filename, index_col='FLIGHT_ID',
            converters={'NEW_FLIGHT_ID': lambda x: UUID(x)},
            memory_map=True)
        # BUG FIX: previously sorted cpr_ids_df (again) instead of
        # adsb_ids_df, leaving the ADS-B ids index unsorted.
        adsb_ids_df.sort_index(inplace=True)
    except EnvironmentError:
        log.error('could not read file: %s', adsb_ids_filename)
        return errno.ENOENT

    log.info('read ids files: %s,%s', cpr_ids_filename, adsb_ids_filename)

    # Read and merge the flights
    flights_df = pd.DataFrame()
    try:
        flights_df = get_merged_flights(cpr_flights_filename,
                                        adsb_flights_filename,
                                        cpr_ids_df, adsb_ids_df)
    except EnvironmentError:
        log.error('could not read file: %s or %s',
                  cpr_flights_filename, adsb_flights_filename)
        return errno.ENOENT

    log.info('read and merged flights files: %s,%s',
             cpr_flights_filename, adsb_flights_filename)

    # Read and merge the positions
    positions_df = pd.DataFrame()
    try:
        positions_df = get_merged_positions(cpr_positions_filename,
                                            adsb_positions_filename,
                                            cpr_ids_df, adsb_ids_df)
    except EnvironmentError:
        log.error('could not read file: %s or %s',
                  cpr_positions_filename, adsb_positions_filename)
        return errno.ENOENT

    log.info('read and merged positions files: %s,%s',
             cpr_positions_filename, adsb_positions_filename)

    # Read the events and remap their flight ids
    events_df = pd.DataFrame()
    try:
        events_df = read_dataframe_with_new_ids(cpr_events_filename,
                                                cpr_ids_df)
        replace_old_flight_ids(events_df)
    except EnvironmentError:
        log.error('could not read file: %s', cpr_events_filename)
        return errno.ENOENT

    log.info('read and merged events file: %s', cpr_events_filename)

    update_flight_data(flights_df, positions_df)

    # Output the flights
    output_files = create_merge_cpr_adsb_output_filenames(
        input_date_strings[0])
    output_flights_filename = output_files[0]
    try:
        flights_df.to_csv(output_flights_filename, index=False,
                          date_format=ISO8601_DATETIME_FORMAT)
    except EnvironmentError:
        log.error('could not write file: %s', output_flights_filename)
        return errno.EACCES
    log.info('written file: %s', output_flights_filename)

    # Convert the positions prior to output
    replace_old_flight_ids(positions_df)

    # Output the positions
    output_positions_filename = output_files[1]
    try:
        positions_df.to_csv(output_positions_filename, index=False,
                            date_format=ISO8601_DATETIME_FORMAT)
    except EnvironmentError:
        log.error('could not write file: %s', output_positions_filename)
        return errno.EACCES
    log.info('written file: %s', output_positions_filename)

    # Output the events
    output_events_filename = output_files[2]
    try:
        events_df.to_csv(output_events_filename, index=False,
                         date_format=ISO8601_DATETIME_FORMAT)
    except EnvironmentError:
        log.error('could not write file: %s', output_events_filename)
        return errno.EACCES
    log.info('written file: %s', output_events_filename)

    log.info('merging complete')
    return 0
def match_cpr_adsb_trajectories(
        filenames,
        distance_threshold=DEFAULT_MATCHING_DISTANCE_THRESHOLD,
        alt_threshold=DEFAULT_MATCHING_ALTITUDE_THRESHOLD):
    """
    Match CPR and ADS-B trajectories for the same day.

    Candidate pairs are found by joining the CPR and ADS-B flights on
    aircraft address, callsign and (ADEP, ADES) airport pair, filtered to
    overlapping time periods. Each candidate set is verified against the
    position data by verify_flight_matches, which populates the id dicts.
    The resulting CPR and ADS-B id mappings are written to output files.

    Parameters
    ----------
    filenames: list of four filenames, in order:
        [cpr flights, adsb flights, cpr positions, adsb positions].
        Note: the positions files must be 'clean'.
    distance_threshold: maximum distance between matched positions,
        passed to verify_flight_matches.
    alt_threshold: maximum altitude difference between matched positions,
        passed to verify_flight_matches.

    Returns
    -------
    0 if successful, an errno error code otherwise.
    """
    # Extract date strings from the input filenames and validate them
    input_date_strings = [''] * len(input_filenames)
    for i in range(len(input_filenames)):
        filename = filenames[i]
        input_date_strings[i] = read_iso8601_date_string(filename)
        if is_valid_iso8601_date(input_date_strings[i]):
            log.info('%s: %s', input_filenames[i], filename)
        else:
            log.error('%s: %s, invalid date: %s',
                      input_filenames[i], filename, input_date_strings[i])
            return errno.EINVAL

    # Ensure that files are all for the same date
    if input_date_strings[1:] != input_date_strings[:-1]:
        log.error(
            'Files are not for the same date!'
            ' CPR Flights date: %s, ADSB Flights date: %s,'
            ' CPR Positions date: %s, ADSB Positions date: %s',
            input_date_strings[0], input_date_strings[1],
            input_date_strings[2], input_date_strings[3])
        return errno.EINVAL

    cpr_flights_filename = filenames[0]
    adsb_flights_filename = filenames[1]
    # Note positions files must be 'clean'
    cpr_positions_filename = filenames[2]
    adsb_positions_filename = filenames[3]

    log.info('Distance threshold: %f', distance_threshold)
    log.info('Altitude threshold: %f', alt_threshold)

    ############################################################################
    # Read the files

    # Read CPR flights into a pandas DataFrame; CPR flight ids are decimal
    cpr_flights_df = pd.DataFrame()
    try:
        cpr_flights_df = pd.read_csv(
            cpr_flights_filename,
            parse_dates=['PERIOD_START', 'PERIOD_FINISH'],
            converters={'FLIGHT_ID': lambda x: int(x)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'AIRCRAFT_ADDRESS',
                     'ADEP', 'ADES', 'PERIOD_START', 'PERIOD_FINISH'],
            memory_map=True)
        log.info('cpr flights read ok')
    except EnvironmentError:
        log.error('could not read file: %s', cpr_flights_filename)
        return errno.ENOENT

    # Read ADS-B flights into a pandas DataFrame; ADS-B flight ids are hex
    adsb_flights_df = pd.DataFrame()
    try:
        adsb_flights_df = pd.read_csv(
            adsb_flights_filename,
            parse_dates=['PERIOD_START', 'PERIOD_FINISH'],
            converters={'FLIGHT_ID': lambda x: int(x, 16)},
            usecols=['FLIGHT_ID', 'CALLSIGN', 'AIRCRAFT_ADDRESS',
                     'ADEP', 'ADES', 'PERIOD_START', 'PERIOD_FINISH'],
            memory_map=True)
        log.info('adsb flights read ok')
    except EnvironmentError:
        log.error('could not read file: %s', adsb_flights_filename)
        return errno.ENOENT

    # Read CPR points into a pandas DataFrame
    cpr_points_df = pd.DataFrame()
    try:
        cpr_points_df = pd.read_csv(
            cpr_positions_filename, parse_dates=['TIME'],
            index_col='FLIGHT_ID',
            converters={'FLIGHT_ID': lambda x: int(x)},
            usecols=['FLIGHT_ID', 'TIME', 'LAT', 'LON', 'ALT'],
            memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', cpr_positions_filename)
        return errno.ENOENT

    log.info('cpr points read ok')

    # Read the ADS-B points
    adsb_points_df = pd.DataFrame()
    try:
        adsb_points_df = pd.read_csv(
            adsb_positions_filename, parse_dates=['TIME'],
            index_col='FLIGHT_ID',
            converters={'FLIGHT_ID': lambda x: int(x, 16)},
            usecols=['FLIGHT_ID', 'TIME', 'LAT', 'LON', 'ALT'],
            memory_map=True)
    except EnvironmentError:
        log.error('could not read file: %s', adsb_positions_filename)
        return errno.ENOENT

    log.info('adsb points read ok')

    # Dicts to hold the flight ids
    cpr_flight_ids = {}
    adsb_flight_ids = {}
    merge_flight_ids = {}

    # Get the CPR flights with aircraft addresses
    cpr_flights_aa = cpr_flights_df.loc[
        cpr_flights_df['AIRCRAFT_ADDRESS'].notnull()]

    ############################################################################
    # Match the flights

    # match CPR and ADS-B flights on aircraft address and overlapping
    # start & end times
    merge_aa = pd.merge(cpr_flights_aa, adsb_flights_df,
                        on='AIRCRAFT_ADDRESS')
    merge_aa_time = merge_aa.loc[
        (merge_aa.PERIOD_START_x <= merge_aa.PERIOD_FINISH_y) &
        (merge_aa.PERIOD_START_y <= merge_aa.PERIOD_FINISH_x)]
    log.info('aircraft address time matches: %d', len(merge_aa_time))

    # verify aircraft address matches
    aa_matches = verify_flight_matches(merge_aa_time, cpr_points_df,
                                       adsb_points_df, cpr_flight_ids,
                                       adsb_flight_ids, merge_flight_ids,
                                       distance_threshold, alt_threshold)
    log.info(
        'aircraft address matches: %d, cpr_ids: %d, adsb_ids: %d, merge_ids: %d',
        aa_matches, len(cpr_flight_ids), len(adsb_flight_ids),
        len(merge_flight_ids))

    # match CPR and ADS-B flights on callsign and overlapping start & end
    # times
    merge_cs = pd.merge(cpr_flights_df, adsb_flights_df, on='CALLSIGN')
    merge_cs_time = merge_cs.loc[
        (merge_cs.PERIOD_START_x <= merge_cs.PERIOD_FINISH_y) &
        (merge_cs.PERIOD_START_y <= merge_cs.PERIOD_FINISH_x)]
    log.info('callsign time matches: %d', len(merge_cs_time))

    # verify callsign matches
    cs_matches = verify_flight_matches(merge_cs_time, cpr_points_df,
                                       adsb_points_df, cpr_flight_ids,
                                       adsb_flight_ids, merge_flight_ids,
                                       distance_threshold, alt_threshold)
    log.info('callsign matches: %d, cpr_ids: %d, adsb_ids: %d, merge_ids: %d',
             cs_matches, len(cpr_flight_ids), len(adsb_flight_ids),
             len(merge_flight_ids))

    # merge overlapping aircraft address and callsign matches
    if len(merge_flight_ids):
        merge_matches(cpr_flight_ids, merge_flight_ids)
        merge_matches(adsb_flight_ids, merge_flight_ids)
        merge_flight_ids.clear()

    # match CPR and ADS-B flights on departure, destination and overlapping
    # start & end times
    merge_dep_des = pd.merge(cpr_flights_df, adsb_flights_df,
                             on=['ADEP', 'ADES'])
    # BUG FIX: the mask was previously applied to merge_cs (the callsign
    # merge) instead of merge_dep_des, selecting from the wrong DataFrame.
    merge_dep_des_time = merge_dep_des.loc[
        (merge_dep_des.PERIOD_START_x <= merge_dep_des.PERIOD_FINISH_y) &
        (merge_dep_des.PERIOD_START_y <= merge_dep_des.PERIOD_FINISH_x)]

    # verify departure, destination matches
    dep_des_matches = verify_flight_matches(merge_dep_des_time, cpr_points_df,
                                            adsb_points_df, cpr_flight_ids,
                                            adsb_flight_ids, merge_flight_ids,
                                            distance_threshold, alt_threshold)
    log.info('airport matches: %d, cpr_ids: %d, adsb_ids: %d, merge_ids: %d',
             dep_des_matches, len(cpr_flight_ids), len(adsb_flight_ids),
             len(merge_flight_ids))

    # merge overlapping aircraft address, callsign and airport matches
    if len(merge_flight_ids):
        merge_matches(cpr_flight_ids, merge_flight_ids)
        merge_matches(adsb_flight_ids, merge_flight_ids)
        merge_flight_ids.clear()

    # Add unmatched flight ids to cpr_flight_ids and adsb_flight_ids
    allocate_remaining_ids(cpr_flight_ids, cpr_flights_df['FLIGHT_ID'].values)
    allocate_remaining_ids(adsb_flight_ids,
                           adsb_flights_df['FLIGHT_ID'].values)

    ############################################################################
    # Output the matching ids

    # Output the CPR ids
    output_files = create_match_cpr_adsb_output_filenames(
        input_date_strings[0])
    cpr_ids_file = output_files[0]
    try:
        with open(cpr_ids_file, 'w') as file:
            file.write(NEW_ID_FIELDS)
            for key in cpr_flight_ids:
                value = cpr_flight_ids[key]
                print(key, value, sep=',', file=file)
    except EnvironmentError:
        log.error('could not write file: %s', cpr_ids_file)
        return errno.EACCES
    log.info('written file: %s', cpr_ids_file)

    # Output the ADS-B ids, formatted as 24 bit hex values
    adsb_ids_file = output_files[1]
    try:
        with open(adsb_ids_file, 'w', newline='') as file:
            file.write(NEW_ID_FIELDS)
            for key in adsb_flight_ids:
                adsb_str = '0x{:06x},{}'.format(key, adsb_flight_ids[key])
                print(adsb_str, file=file)
    except EnvironmentError:
        log.error('could not write file: %s', adsb_ids_file)
        return errno.EACCES
    log.info('written file: %s', adsb_ids_file)

    log.info('matching complete')
    return 0