def gen_rows(self, readers, prefixes):
    conn = self._conn
    cur = conn.cursor()
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            date = row['date']
            date_str = '%s-%s-%s' % (date[:4], date[4:6], date[6:8])
            service_id = prefix + row['service_id']
            # We need to find the service_I of this.  To do this we
            # need to check the calendar table, since that (and only
            # that) is the absolute list of service_ids.
            service_I = cur.execute(
                'SELECT service_I FROM calendar WHERE service_id=?',
                (decode_six(service_id),)).fetchone()
            if service_I is None:
                # We have to add a new fake row in order to get a
                # service_I.  calendar is *the* authoritative source
                # for service_I:s.
                cur.execute(
                    'INSERT INTO calendar '
                    '(service_id, m,t,w,th,f,s,su, start_date,end_date) '
                    'VALUES (?, 0,0,0,0,0,0,0, ?,?)',
                    (decode_six(service_id), date_str, date_str))
                service_I = cur.execute(
                    'SELECT service_I FROM calendar WHERE service_id=?',
                    (decode_six(service_id),)).fetchone()
            service_I = service_I[0]  # row tuple -> int
            yield dict(
                service_I=int(service_I),
                date=date_str,
                exception_type=int(row['exception_type']),
            )
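# For illustration (hypothetical input, not from any real feed): with
# prefix 'feed1_', a calendar_dates.txt row such as
#     {'date': '20160401', 'service_id': 'wkday', 'exception_type': '2'}
# is yielded as
#     {'service_I': 42, 'date': '2016-04-01', 'exception_type': 2}
# where 42 stands in for whatever service_I the calendar table holds
# (or assigns, via the fake-row insert above) for 'feed1_wkday'.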
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            yield dict(
                _stop_id=prefix + decode_six(row['stop_id']),
                _trip_id=prefix + decode_six(row['trip_id']),
                arr_time=row['arrival_time'],
                dep_time=row['departure_time'],
                seq=int(row['stop_sequence']),
            )
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            # min_transfer_time is optional and may be present but blank
            # (or None, for short csv rows).
            min_transfer_time = (row.get('min_transfer_time') or '').strip()
            yield dict(
                _from_stop_id=prefix + decode_six(row['from_stop_id']).strip(),
                _to_stop_id=prefix + decode_six(row['to_stop_id']).strip(),
                transfer_type=int(row['transfer_type']),
                min_transfer_time=int(min_transfer_time) if min_transfer_time else None,
            )
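# For illustration (hypothetical rows): a transfers.txt row with
# min_transfer_time='120' yields min_transfer_time=120, while a row
# whose min_transfer_time column is absent or blank yields
# min_transfer_time=None; the from/to stop ids are stripped of
# surrounding whitespace and prefixed like all other ids.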
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            yield dict(
                shape_id=prefix + decode_six(row['shape_id']),
                lat=float(row['shape_pt_lat']),
                lon=float(row['shape_pt_lon']),
                seq=int(row['shape_pt_sequence']),
            )
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            yield dict(
                _trip_id=prefix + decode_six(row['trip_id']),
                start_time=row['start_time'],
                end_time=row['end_time'],
                headway_secs=int(row['headway_secs']),
                # exact_times is optional; default to 0 (frequency-based
                # service) when it is missing or non-numeric.
                exact_times=int(row['exact_times'])
                if 'exact_times' in row and row['exact_times'].isdigit()
                else 0,
            )
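# For illustration (hypothetical row): {'trip_id': 'T1', 'start_time':
# '07:00:00', 'end_time': '10:00:00', 'headway_secs': '600'} yields
# headway_secs=600 and exact_times=0, since exact_times is absent.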
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            start = row['feed_start_date'] if 'feed_start_date' in row else None
            end = row['feed_end_date'] if 'feed_end_date' in row else None
            yield dict(
                feed_publisher_name=decode_six(row['feed_publisher_name'])
                if 'feed_publisher_name' in row else None,
                feed_publisher_url=decode_six(row['feed_publisher_url'])
                if 'feed_publisher_url' in row else None,
                feed_lang=decode_six(row['feed_lang'])
                if 'feed_lang' in row else None,
                feed_start_date='%s-%s-%s' % (start[:4], start[4:6], start[6:8])
                if start else None,
                feed_end_date='%s-%s-%s' % (end[:4], end[4:6], end[6:8])
                if end else None,
                feed_version=decode_six(row['feed_version'])
                if 'feed_version' in row else None,
                # The loader prefix ends in '_'; strip it to recover the
                # feed id.
                feed_id=prefix[:-1] if len(prefix) > 0 else prefix,
            )
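# For illustration: feed dates arrive in GTFS's YYYYMMDD form and are
# reformatted, so a (hypothetical) feed_start_date of '20160101' is
# yielded as '2016-01-01'; with prefix 'feed_1_', the yielded feed_id
# would be 'feed_1'.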
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            assert row['arrival_time'] != "", \
                "Some stop_times entries are missing arrival_time information."
            assert row['departure_time'] != "", \
                "Some stop_times entries are missing departure_time information."
            assert row['stop_sequence'] != "", \
                "Some stop_times entries are missing stop_sequence information."
            assert row['stop_id'] != "", \
                "Some stop_times entries are missing stop_id information."
            assert row['trip_id'] != "", \
                "Some stop_times entries are missing trip_id information."
            yield dict(
                _stop_id=prefix + decode_six(row['stop_id']),
                _trip_id=prefix + decode_six(row['trip_id']),
                arr_time=row['arrival_time'],
                dep_time=row['departure_time'],
                seq=int(row['stop_sequence']),
            )
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            # Transform the "row" dictionary into a new dictionary,
            # which is yielded.  There can be different transformations
            # here, as needed.
            yield dict(
                stop_id=prefix + decode_six(row['stop_id']),
                code=decode_six(row['stop_code']) if 'stop_code' in row else None,
                name=decode_six(row['stop_name']),
                desc=decode_six(row['stop_desc']) if 'stop_desc' in row else None,
                lat=float(row['stop_lat']),
                lon=float(row['stop_lon']),
                _parent_id=prefix + decode_six(row['parent_station'])
                if row.get('parent_station', '') else None,
                location_type=int(row['location_type'])
                if row.get('location_type') else None,
                wheelchair_boarding=int(row['wheelchair_boarding'])
                if row.get('wheelchair_boarding', '') else None,
            )
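# For illustration (hypothetical stops.txt row): with prefix 'feed1_',
#     {'stop_id': 'S1', 'stop_name': 'Central', 'stop_lat': '60.17',
#      'stop_lon': '24.94', 'parent_station': ''}
# yields stop_id='feed1_S1', name='Central', lat=60.17, lon=24.94, and
# _parent_id=None, since an empty parent_station is treated as missing.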
def gen_rows(self, readers, prefixes):
    from gtfspy import extended_route_types
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            yield dict(
                route_id=prefix + decode_six(row['route_id']),
                _agency_id=prefix + decode_six(row['agency_id'])
                if 'agency_id' in row else None,
                name=decode_six(row['route_short_name']),
                long_name=decode_six(row['route_long_name']),
                desc=decode_six(row['route_desc'])
                if 'route_desc' in row else None,
                # Normalize (possibly extended) route type codes to the
                # basic GTFS types.
                type=extended_route_types.ROUTE_TYPE_CONVERSION[int(row['route_type'])],
                url=decode_six(row['route_url']) if 'route_url' in row else None,
                color=decode_six(row['route_color'])
                if 'route_color' in row else None,
                text_color=decode_six(row['route_text_color'])
                if 'route_text_color' in row else None,
            )
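# Assuming ROUTE_TYPE_CONVERSION follows the Google "extended route
# types" scheme (an assumption about that mapping's contents), an
# extended code such as 700 ('Bus Service') would collapse to the basic
# GTFS bus type 3, while the plain codes 0-7 pass through unchanged.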
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            yield dict(
                # agency_id is optional in GTFS; default to '1' when absent.
                agency_id=prefix + decode_six(row.get('agency_id', '1')),
                name=decode_six(row['agency_name']),
                timezone=decode_six(row['agency_timezone']),
                url=decode_six(row['agency_url']),
                lang=decode_six(row['agency_lang'])
                if 'agency_lang' in row else None,
                phone=decode_six(row['agency_phone'])
                if 'agency_phone' in row else None,
            )
def gen_rows(self, readers, prefixes):
    for reader, prefix in zip(readers, prefixes):
        for row in reader:
            yield dict(
                _route_id=prefix + decode_six(row['route_id']),
                _service_id=prefix + decode_six(row['service_id']),
                trip_id=prefix + decode_six(row['trip_id']),
                direction_id=decode_six(row['direction_id'])
                if row.get('direction_id', '') else None,
                shape_id=prefix + decode_six(row['shape_id'])
                if row.get('shape_id', '') else None,
                headsign=decode_six(row['trip_headsign'])
                if 'trip_headsign' in row else None,
            )
def import_gtfs(gtfs_sources, output, preserve_connection=False,
                print_progress=True, location_name=None, **kwargs):
    """Import a GTFS database

    gtfs_sources: str, dict, list
        Paths to the gtfs zip file or to the directory containing the
        GTFS data.  Alternatively, a dict can be provided that maps GTFS
        file names (like 'stops.txt' and 'agency.txt') to their contents
        as strings.
    output: str or sqlite3.Connection
        Path to the new database to be created, or an existing
        sqlite3 connection.
    preserve_connection: bool, optional
        Whether to leave the connection open in the end (if False, the
        connection is closed).
    print_progress: bool, optional
        Whether to print progress output.
    location_name: str, optional
        Set the location of this database.
    """
    if isinstance(output, sqlite3.Connection):
        conn = output
    else:
        # if os.path.isfile(output):
        #     raise RuntimeError('File already exists')
        conn = sqlite3.connect(output)
    if not isinstance(gtfs_sources, list):
        gtfs_sources = [gtfs_sources]
    cur = conn.cursor()
    time_import_start = time.time()

    # These are a bit unsafe, but make importing much faster,
    # especially on scratch.
    cur.execute('PRAGMA page_size = 4096;')
    cur.execute('PRAGMA mmap_size = 1073741824;')
    cur.execute('PRAGMA cache_size = -2000000;')
    cur.execute('PRAGMA temp_store = 2;')
    # Changing the isolation level is a python3.6 workaround -
    # eventually this will probably be fixed and can be removed.
    conn.isolation_level = None  # change to autocommit mode (former default)
    cur.execute('PRAGMA journal_mode = OFF;')
    # cur.execute('PRAGMA journal_mode = WAL;')
    cur.execute('PRAGMA synchronous = OFF;')
    conn.isolation_level = ''  # change back to python default
    # end python3.6 workaround

    # Do the actual importing.
    loaders = [L(gtfssource=gtfs_sources, print_progress=print_progress, **kwargs)
               for L in Loaders]

    for loader in loaders:
        loader.assert_exists_if_required()

    # Do the initial import.  This consists of making tables, raw insert
    # of the CSVs, and then indexing.
    for loader in loaders:
        loader.import_(conn)

    # Do any operations that require all tables to be present.
    for loader in loaders:
        loader.post_import_round2(conn)

    # Make any views.
    for loader in loaders:
        loader.make_views(conn)

    # Run any post-processing functions.
    for F in postprocessors:
        F(conn)

    # Set up some basic metadata.
    from gtfspy import gtfs as mod_gtfs
    G = mod_gtfs.GTFS(output)
    G.meta['gen_time_ut'] = time.time()
    G.meta['gen_time'] = time.ctime()
    G.meta['import_seconds'] = time.time() - time_import_start
    G.meta['download_date'] = ''
    G.meta['location_name'] = ''
    G.meta['n_gtfs_sources'] = len(gtfs_sources)

    # Extract things from the GTFS sources.
    download_date_strs = []
    for i, source in enumerate(gtfs_sources):
        if len(gtfs_sources) == 1:
            prefix = ""
        else:
            prefix = "feed_" + str(i) + "_"
        if isinstance(source, string_types):
            G.meta[prefix + 'original_gtfs'] = decode_six(source) if source else None
            # Extract the GTFS date: use the last date pattern in the filename.
            filename_date_list = re.findall(r'\d{4}-\d{2}-\d{2}', source)
            if filename_date_list:
                date_str = filename_date_list[-1]
                G.meta[prefix + 'download_date'] = date_str
                download_date_strs.append(date_str)
            if location_name:
                G.meta['location_name'] = location_name
            else:
                location_name_list = re.findall(r'/([^/]+)/\d{4}-\d{2}-\d{2}', source)
                if location_name_list:
                    G.meta[prefix + 'location_name'] = location_name_list[-1]
                else:
                    try:
                        G.meta[prefix + 'location_name'] = source.split("/")[-4]
                    except IndexError:
                        G.meta[prefix + 'location_name'] = source

    if G.meta['download_date'] == "":
        unique_download_dates = list(set(download_date_strs))
        if len(unique_download_dates) == 1:
            G.meta['download_date'] = unique_download_dates[0]

    G.meta['timezone'] = cur.execute(
        'SELECT timezone FROM agencies LIMIT 1').fetchone()[0]
    stats.update_stats(G)
    del G

    if print_progress:
        print("Vacuuming...")
    # Next 3 lines are python3.6 workarounds again.
    conn.isolation_level = None  # former default of autocommit mode
    cur.execute('VACUUM;')
    conn.isolation_level = ''  # back to python default
    # end python3.6 workaround
    if print_progress:
        print("Analyzing...")
    cur.execute('ANALYZE')
    if not preserve_connection:
        conn.close()
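# Example usage (a minimal sketch; the file and location names below are
# hypothetical, not part of this module):
#
#     conn = sqlite3.connect("helsinki.sqlite")
#     import_gtfs(["helsinki_gtfs_2016-04-01.zip"], conn,
#                 preserve_connection=True, location_name="Helsinki")
#     # ... query conn here ...
#     conn.close()
#
# A plain path can be passed as `output` instead, in which case
# import_gtfs creates the database itself and closes the connection
# unless preserve_connection=True.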