def remap_df(self, table, df):
    """Rename the columns of *df* to their database names and keep only
    the mapped subset, per the mapping registered for *table*."""
    logger.log(logging.INFO, "Remap df {}".format(table))
    named_map = self.dict_mapper[table]
    renamed = df.rename(columns=named_map.mapping)
    return renamed[named_map.sub_cols]
def route_direction(serie):
    """Disambiguate, in place, routes that share one name for the two
    directions by suffixing the first of each consecutive duplicate."""
    duplicate_positions = serie[serie.shift(-1) == serie].index.tolist()
    logger.log(
        logging.WARNING,
        "We have {} routes with the same name for 2 directions".format(
            len(duplicate_positions)))
    for position in duplicate_positions:
        serie.loc[position] = serie.loc[position] + "_other_dire"
def insert_shapes(self, table):
    """Insert the shape dataframe through the geography-aware writer."""
    frame = self.dict_df[table]
    try:
        self.db.insert_geography(frame)
    except Exception as exc:
        self.log_error(exc, table)
    else:
        logger.log(logging.INFO, "{} insert".format(table))
def update_table(self, table, set_col, where_col):
    """Run an UPDATE on *table*, setting *set_col* filtered on *where_col*;
    failures are counted and logged rather than raised."""
    frame = self.dict_df[table]
    try:
        self.db.update_table(frame, table, set_col, where_col)
    except Exception as exc:
        self.log_error(exc, table)
    else:
        logger.log(logging.INFO, "{} update".format(table))
def main(self):
    """Build and return every subset dataframe needed for the database."""
    subsets = {}
    for name in self.lst_df:
        logger.log(logging.INFO, "Subset the df {}".format(name))
        subsets[name] = self.select_sub_cols(name)
    return subsets
def gen_new_id_on_cols(self, table, cols, _id):
    """Attach a sha1-based schedule id (derived from *cols*) to *table*
    and return only the original id plus the new schedule-id column."""
    logger.log(logging.INFO, "generate sha1 for {}".format(table))
    frame = self.dict_df_mapped[table].copy()
    schedule_col = table + "ScheduleId"
    frame[schedule_col] = pt.sha1_for_named_columns(frame, cols)
    return frame[[_id, schedule_col]]
def insertion_strat(self, dict_df):
    """Insert directly when the database is empty, merge otherwise."""
    if self.reader.is_the_db_clean():
        logger.log(logging.INFO, "This is the first GTFS")
        self.insert_first_gtfs(dict_df)
        return
    logger.log(logging.INFO, "There is already a gtfs, need to merge them")
    self.merge_gtfs(dict_df)
def __init__(self, db_name):
    """Open a connection to *db_name* via the backend module selected by
    the module-level ``db_type``."""
    # NOTE(review): ``db_type`` is read from module scope, not a
    # parameter — confirm it is always set before instantiation.
    backend = importlib.import_module(
        "utilities.database.{}".format(db_type))
    DB = getattr(backend, "DB")
    logger.log(logging.INFO, "Initialize DB connection")
    self.db = DB(db_name)
    self.db_name = db_name
def get_close_stops(self):
    """Compute, in parallel, the close-stop pairs for every tstop.

    Returns one result item per element of ``self.lst_tstops``; ordering
    is not deterministic (``imap_unordered``).
    """
    logger.log(logging.INFO, "Define all stops pair close")
    pool = mp.Pool()
    try:
        total = len(self.lst_tstops)
        res = list(tqdm(pool.imap_unordered(self.one_to_n, self.lst_tstops),
                        total=total))
    finally:
        # Bug fix: the pool was closed but never joined, and not released
        # at all when a worker raised — always close AND join.
        pool.close()
        pool.join()
    return res
def set_gtfs(self):
    """Build the one-row Gtfs table (id plus validity dates)."""
    logger.log(
        logging.INFO,
        "The Id of this gtfs zip is {}".format(self.gtfs_id))
    row = {
        "gtfs_id": self.gtfs_id,
        "start_date": self.start_date,
        "end_date": self.end_date,
    }
    return pd.DataFrame([row])
def prepare_stop_times(df):
    """Convert arrival/departure times to seconds; estimate missing ones.

    NOTE(review): this reads ``self`` from the enclosing scope, so it is
    presumably defined as a closure inside a method — confirm.
    """
    try:
        df["arrival_time"] = df["arrival_time"].map(self.t2s)
        df["departure_time"] = df["departure_time"].map(self.t2s)
    except Exception:
        # Bug fix: the bare ``except:`` also swallowed KeyboardInterrupt
        # and SystemExit; catch Exception and fall back to estimation.
        msg = "Some stoptimes are generating..."
        logger.log(logging.WARNING, msg)
        df = EstimateStopTimes.main(df)
    else:
        # Success log moved to ``else`` so the try body stays minimal.
        msg = "We got all the stoptimes"
        logger.log(logging.INFO, msg)
    return df
def insert_table(self, table):
    """Insert the dataframe for *table*; shapes go through the geo path."""
    if table == "Shape":
        return self.insert_shapes(table)
    frame = self.dict_df[table]
    try:
        self.db.insert_dataframe(frame, table)
    except Exception as exc:
        self.log_error(exc, table)
    else:
        logger.log(logging.INFO, "{} insert".format(table))
def gen_shapes(self, trips):
    """Return (shapes, trips), generating shapes when the zip has none."""
    graph = EstimateShapes(self.gtfs_path)
    zip_has_shapes = 'shapes.txt' in tools.list_zip_files(self.gtfs_path)
    if zip_has_shapes:
        shapes, trips = self.fill_shapes(trips, graph)
    else:
        logger.log(
            logging.WARNING,
            "We don't have shapes.txt in GTFS zip, we are generating...")
        shapes, trips = graph.main(trips)
    shapes["gtfs_id"] = self.gtfs_id
    return shapes, trips
def read_database(self):
    """Fetch every known table into a dict of dataframes keyed by name."""
    template = " SELECT * FROM {} "
    frames = {}
    for table in _TABLE_IN_DATABASE:
        statement = template.format(table)
        logger.log(logging.INFO, statement)
        frames[table] = self.db.get_query_frame(statement)
    return frames
def strat_commit(self):
    """Commit, or roll back and raise, depending on ``safe_mode``.

    In safe mode any error triggers a rollback and a WriterException;
    otherwise the transaction is committed regardless.
    """
    msg = "There are {} errors".format(self.nb_errors)
    if not safe_mode:
        self.db.commit()
    elif self.nb_errors == 0:
        self.db.commit()
    else:
        self.db.rollback()
        raise WriterException(msg)
    logger.log(logging.WARNING, msg)
def normalize_gtfs(self):
    """Run the read -> normalize -> map -> version pipeline on the zip,
    remember the resulting gtfs id, and return the versioned dict."""
    logger.log(logging.INFO, EventLog.log_read_zip)
    raw = CR(self.gtfs_path).main()
    logger.log(logging.INFO, EventLog.log_normalize_gtfs)
    normalized = CN(self.gtfs_path, raw).main()
    logger.log(logging.INFO, EventLog.log_mapping_gtfs)
    mapped = CM(normalized).main()
    logger.log(logging.INFO, EventLog.log_versioning_gtfs)
    versioned = CV(mapped).main()
    logger.log(logging.INFO, EventLog.log_subset_gtfs)
    self.file_name = versioned["Gtfs"]["Id"].iloc[0]
    return versioned
def merge_gtfs(self, dict_df):
    """Merge and then insert gtfs."""
    # Read every existing table so the diff can compare old vs new.
    dict_gtfs_in_base = self.reader.read_database()
    # NOTE(review): ``list(DataFrame)`` yields the COLUMN NAMES, so the
    # membership test below compares gtfs_id against column names, not
    # stored ids — confirm ``dict_gtfs_in_base["Gtfs"]["Id"]`` was meant.
    gtfs_ = list(dict_gtfs_in_base["Gtfs"])
    gtfs_id = dict_df["Gtfs"]["Id"].iloc[0]
    if gtfs_id in gtfs_:
        msg = "{} gtfs is already in the database".format(gtfs_id)
        logger.log(logging.ERROR, msg)
        return 0
    diff = GD(dict_df, dict_gtfs_in_base)
    # Handle the GTFS intersection.
    ct = GC(dict_gtfs_in_base, dict_df, self.db_name)
    ct.gtfs_intersection()
    # Split the incoming gtfs into new / updated / ended records.
    new_data = diff.whats_new()
    up_data = diff.whats_up(new_data)
    end_data = diff.whats_end()
    # Transfer times must be computed between the NEW stops and ALL
    # stops (existing + new), not only among the new ones.
    new_stops = new_data["Stop"]
    other_stops = dict_gtfs_in_base["Stop"]
    all_stops = pd.concat([other_stops, new_stops])
    new_data["TransferTimesNDistances"] = self.gen_time_dist(
        new_stops, all_stops)
    # Apply the three change sets in order: insert, update, close.
    logger.log(logging.INFO, EventLog.insert_merged_gtfs)
    self.writer.insert_gtfs(new_data)
    logger.log(logging.INFO, EventLog.reup_data)
    self.writer.up_gtfs(up_data)
    logger.log(logging.INFO, EventLog.close_data)
    self.writer.end_gtfs(end_data)
def compute_time_dist_pair_stops(self, pair_stops):
    """Compute, in parallel, time and distance for each close stop pair.

    Each worker returns a 4-tuple
    (from_stop, to_stop, time_seconds, distance_meters); row order is not
    deterministic (``imap_unordered``).
    """
    fs, ts, tms, dm = [], [], [], []
    logger.log(logging.INFO, "Compute time & distance stops pair close")
    pool = mp.Pool()
    try:
        for item in tqdm(pool.imap_unordered(self.mp_h_t_d, pair_stops),
                         total=len(pair_stops)):
            fs.append(item[0])
            ts.append(item[1])
            tms.append(item[2])
            dm.append(item[3])
    finally:
        # Bug fix: the pool was never closed nor joined, leaking worker
        # processes on every call.
        pool.close()
        pool.join()
    return pd.DataFrame(data={
        "FromStopId": fs,
        "ToStopId": ts,
        "TimeSeconds": tms,
        "DistanceMeters": dm
    })
def gen_calendar_dates(self):
    """Gen or normalize calendar_dates.txt.

    Returns a dataframe with the columns ``date``, ``service_id`` and
    ``exception_type``; it is empty when the zip ships no
    calendar_dates.txt.
    """
    def remove_weird_service_id(calendar_dates, calendar):
        # Drop rows whose service_id never appears in calendar.txt.
        serv_cal_dates = set(calendar_dates["service_id"].unique())
        serv_cal = set(calendar["service_id"].unique())
        weird_services = list(serv_cal_dates - serv_cal)
        return calendar_dates[~calendar_dates["service_id"].
                              isin(weird_services)]

    if 'calendar_dates.txt' in tools.list_zip_files(self.gtfs_path):
        calendar = self.dict_gtfs["calendar.txt"]
        calendar_dates = self.set_calendar_dates()
        calendar_dates = remove_weird_service_id(calendar_dates, calendar)
    else:
        msg = "We don't have calendar_dates.txt in GTFS zip."
        logger.log(logging.WARNING, msg)
        # Bug fix: ``pd.DataFrame()`` has zero columns, so assigning a
        # 3-name list to ``.columns`` raised ValueError; build the empty
        # frame with its schema instead.
        calendar_dates = pd.DataFrame(
            columns=["date", "service_id", "exception_type"])
    return calendar_dates
def reader(self, schedule_type, key_type):
    """Create an instance of the right reader class, depending on the
    schedule_type ("Theo" or observed) & key_type ("Real" or hash).

    Returns the instantiated reader; behavior is identical to the
    original four-branch version, with the duplicated construct/log/
    return collapsed into a single path.
    """
    if schedule_type == "Theo":
        reader_cls = (GtfsTheoRealKey if key_type == "Real"
                      else GtfsTheoHashKey)
    else:
        reader_cls = (GtfsObsRealKey if key_type == "Real"
                      else GtfsObsHashKey)
    instance = reader_cls(self.db_name, self.date)
    logger.log(
        logging.INFO,
        "Instance of the class {}".format(instance.__class__.__name__))
    return instance
def gtfs_intersection(self):
    """Remove TripToDate rows inside the overlapping date range, then
    shift the Gtfs end date and the end dates of dependent tables."""
    if not self.crossing():
        return
    dates = range_date(self.start_date, self.end_date)
    logger.log(
        logging.INFO,
        "There are {} trip_to_date to remove".format(len(dates)))
    self.writer.remove_trip_to_date(change_list_type(dates, str))
    logger.log(
        logging.INFO, 'New gtfs EndDate is {}'.format(self.new_end_date))
    logger.log(logging.INFO, 'Insteed of {}'.format(self.end_date))
    self.writer.change_gtfs_end_date(self.gtfs)
    for table, nt in dict_nt.items():
        frame = self.update_end_date(table)
        logger.log(logging.INFO, "Update table {}".format(table))
        self.writer.update_end_date(frame, table, nt.set, nt.where)
    self.writer.db.commit()
def fill_shapes(self, trips, graph):
    """Generate the shapes missing from shapes.txt, merge them with the
    ones shipped in the zip, and return (shapes, trips)."""
    missing = trips[trips["shape_id"].isnull()]
    if len(missing) > 0:
        shapes = self.dict_gtfs["shapes.txt"]
        with_shape = trips[~trips["shape_id"].isnull()]
        logger.log(
            logging.WARNING,
            "{} shapes are generating...".format(len(missing)))
        gen_shapes, gen_shape_trips = graph.main(missing)
        logger.log(logging.INFO, "They have been generated")
        shapes = pd.concat([shapes, gen_shapes])
        shapes["gtfs_id"] = self.gtfs_id
        trips = pd.concat([with_shape, gen_shape_trips])
    else:
        logger.log(logging.INFO, "We got all shapes")
        shapes = self.set_shapes()
        shapes = self.format_df(shapes, dict_normalize["Shape"].requiered)
    return shapes, trips
def gen_time_dist(self, df1, df2):
    """Compute transfer times and distances between two stop frames."""
    logger.log(logging.INFO, "Generating the TransferTimesNDistances")
    return TimeDist(df1, df2).main()
def gen_date_trips(self, trips, cal, cal_dates, gtfs):
    """Build the TripToDate table from trips and calendar data."""
    logger.log(logging.INFO, "Generating the TripToDate...")
    return DateTrips(trips, cal, cal_dates, gtfs).main()
def insert_new_gtfs(self, insert=True):
    """Run the full pipeline on the zip and either insert the result
    (default) or return the subset dict for inspection."""
    logger.log(logging.INFO, EventLog.log_read_zip)
    raw = CR(self.gtfs_path).main()
    logger.log(logging.INFO, EventLog.log_normalize_gtfs)
    normalized = CN(self.gtfs_path, raw).main()
    logger.log(logging.INFO, EventLog.log_mapping_gtfs)
    mapped = CM(normalized).main()
    logger.log(logging.INFO, EventLog.log_versioning_gtfs)
    versioned = CV(mapped).main()
    logger.log(logging.INFO, EventLog.log_subset_gtfs)
    subset = SC(versioned).main()
    logger.log(logging.INFO, EventLog.log_is_this_first_gtfs)
    if not insert:
        return subset
    return self.insertion_strat(subset)
def insert_first_gtfs(self, dict_df):
    """Insert a gtfs into an empty database, first adding the transfer
    times computed among its own stops."""
    stops = dict_df["Stop"]
    dict_df["TransferTimesNDistances"] = self.gen_time_dist(stops, stops)
    logger.log(logging.INFO, EventLog.log_insert_gtfs)
    self.writer.insert_gtfs(dict_df)
def log_error(self, e, table):
    """Count the failure and log both the table name and the exception."""
    self.nb_errors = self.nb_errors + 1
    logger.log(logging.ERROR, table)
    logger.log(logging.ERROR, e)