def compile_stops_from_gtfs(input_gtfs_f, output_f, all_matching_f=None,
                            version=None, strip_suffixes='', agency_id=-1,
                            tts_hint_language=None, operators_f=None,
                            extra_f=None, local_languages=None,
                            license_notice_f=None):
    if all_matching_f is not None:
        all_matching_f = [codecs.getreader('utf-8-sig')(x)
                          for x in all_matching_f]

    if operators_f is not None:
        operators_f = codecs.getreader('utf-8-sig')(operators_f)

    if extra_f is not None:
        extra_f = codecs.getreader('utf-8-sig')(extra_f)

    # Trim whitespace
    strip_suffixes = [x.strip().lower() for x in strip_suffixes.split(',')]

    all_gtfs = [Gtfs(x) for x in input_gtfs_f]
    first_gtfs = all_gtfs[0]

    if version is None:
        try:
            feed_info = first_gtfs.open('feed_info.txt')
        except KeyError:
            # feed_info.txt is not in the file. Find the newest file in the
            # archive instead.
            feed_start_date = None
            for f in first_gtfs.infolist():
                ts = datetime(*f.date_time)
                if feed_start_date is None or feed_start_date < ts:
                    feed_start_date = ts
        else:
            row = next(feed_info)
            feed_start_date = row['feed_start_date']
            assert len(feed_start_date) == 8
            feed_start_date = datetime.strptime(feed_start_date, '%Y%m%d')

        version = (feed_start_date - VERSION_EPOCH).days
        print('Data version: %s (%s)' % (
            version, feed_start_date.date().isoformat()))

    operators = {}
    if operators_f is not None:
        operators = mdst.read_operators_from_csv(operators_f)
        operators_f.close()

    db = mdst.MdstWriter(
        fh=open(output_f, 'wb'),
        version=version,
        operators=operators,
        local_languages=(local_languages.split(',')
                         if local_languages is not None else []),
        tts_hint_language=tts_hint_language,
        license_notice_f=license_notice_f,
    )

    station_count = 0

    for num, gtfs in enumerate(all_gtfs):
        stops = gtfs.open('stops.txt')

        # See if there is a matching file for this GTFS feed.
        if all_matching_f is not None and len(all_matching_f) > num:
            matching_f = all_matching_f[num]
        else:
            matching_f = None

        if matching_f is None:
            # No matching data, dump all stops.
            stop_map = map(lambda stop: [
                stop['stop_id'],
                massage_name(stop['stop_name'], strip_suffixes),
                stop['stop_lat'].strip(),
                stop['stop_lon'].strip(),
            ], stops)

            for stop_id, stop_name, lat, lon in stop_map:
                s = Station()
                s.id = int(stop_id)
                s.name.english = stop_name
                if lat and lon:
                    s.latitude = float(lat)
                    s.longitude = float(lon)

                db.push_station(s)
                station_count += 1
        else:
            # Matching data is available. Let's use that.
            matching = csv.DictReader(matching_f)

            stop_codes = {}
            stop_ids = {}
            short_names = {}

            for match in matching:
                if 'stop_code' in match and match['stop_code']:
                    if match['stop_code'] not in stop_codes:
                        stop_codes[match['stop_code']] = []
                    stop_codes[match['stop_code']].append(match['reader_id'])
                elif 'stop_id' in match and match['stop_id']:
                    if match['stop_id'] not in stop_ids:
                        stop_ids[match['stop_id']] = []
                    stop_ids[match['stop_id']].append(match['reader_id'])
                else:
                    raise Exception(
                        'neither stop_id nor stop_code specified in row')

                if 'short_name' in match and match['short_name']:
                    short_names[match['reader_id']] = match['short_name']

            total_gtfs_stations = 0
            dropped_gtfs_stations = 0

            # Now run through the stops.
            for stop in stops:
                # Preprocess the stop data.
                name = massage_name(stop['stop_name'], strip_suffixes)
                y = float(stop['stop_lat'].strip())
                x = float(stop['stop_lon'].strip())

                used = False

                # Insert stations where a stop_id is specified for the
                # reader_id. The sentinel default means a missing stop_id
                # column never matches anything.
                for reader_id in stop_ids.get(
                        stop.get('stop_id', 'stop_id_absent'), []):
                    s = Station()
                    s.id = int(reader_id, 0)
                    s.name.english = name
                    if y and x:
                        s.latitude = y
                        s.longitude = x
                    if reader_id in short_names:
                        s.name.english_short = short_names[reader_id]
                    if agency_id >= 0:
                        s.operator_id = agency_id

                    db.push_station(s)
                    station_count += 1
                    used = True

                # Insert stations where a stop_code is specified for the
                # reader_id.
                for reader_id in stop_codes.get(
                        stop.get('stop_code', 'stop_code_absent'), []):
                    s = Station()
                    s.id = int(reader_id, 0)
                    s.name.english = name
                    if y and x:
                        s.latitude = y
                        s.longitude = x
                    if reader_id in short_names:
                        s.name.english_short = short_names[reader_id]
                    if agency_id >= 0:
                        s.operator_id = agency_id

                    db.push_station(s)
                    station_count += 1
                    used = True

                total_gtfs_stations += 1
                if not used:
                    dropped_gtfs_stations += 1

            matching_f.close()
            print("Finished parsing GTFS %d. Here's the stats:" % num)
            print(' - Dropped %d out of %d GTFS stations' % (
                dropped_gtfs_stations, total_gtfs_stations))
            print()

    if extra_f is not None:
        mdst.read_stops_from_csv(db, extra_f)
        extra_f.close()

    index_end_off = db.finalise()

    print("Finished writing database. Here's the stats:")
    print(' - total ............ %8d stations' % station_count)
    print('                      %8d bytes' % index_end_off)
    print()
    station_count = float(station_count)
    print(' - header ........... %8d bytes' % db.stationlist_off)
    stations_len = db.index_off - db.stationlist_off
    print(' - stations ......... %8d bytes (%.1f per record)' % (
        stations_len, stations_len / station_count))
    index_len = index_end_off - db.index_off
    print(' - index ............ %8d bytes (%.1f per record)' % (
        index_len, index_len / station_count))
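# --- Usage sketch (illustrative; not part of the original script) ---
# A minimal example of how the MdST variant above might be invoked. The file
# names here are hypothetical, and Gtfs, Station, massage_name, mdst and
# VERSION_EPOCH are assumed to be provided by the surrounding project.
# input_gtfs_f is a list (one Gtfs() is built per entry), and each matching
# file should be opened in binary mode, since the function re-wraps it with
# codecs.getreader('utf-8-sig').
def _example_usage_mdst():
    with open('gtfs.zip', 'rb') as gtfs_zip, \
            open('matching.csv', 'rb') as matching_csv:
        compile_stops_from_gtfs(
            input_gtfs_f=[gtfs_zip],         # one or more GTFS archives
            output_f='stations.mdst',        # MdST database to write
            all_matching_f=[matching_csv],   # reader_id <-> stop_id/stop_code map
            tts_hint_language='en',
            local_languages='en',
        )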
def compile_stops_from_gtfs(input_gtfs_f, output_f, matching_f=None,
                            version=None, strip_suffixes='', extra_fields='',
                            extra_fields_from_child=False, agency_id=-1,
                            skip_create_table=False):
    if matching_f is not None:
        matching_f = codecs.getreader('utf-8-sig')(matching_f)

    # Trim whitespace
    strip_suffixes = [x.strip().lower() for x in strip_suffixes.split(',')]

    if extra_fields is None or extra_fields == '':
        extra_fields = []
    else:
        extra_fields = [x.strip() for x in extra_fields.split(',')]

    if extra_fields:
        db_schema = DB_SCHEMA % dict(
            extra_fields=',\n\t' + (',\n\t'.join(extra_fields)))
        insert_query = INSERT_QUERY % dict(
            extra_fields=', ?' * len(extra_fields))
    else:
        db_schema = DB_SCHEMA % dict(extra_fields='')
        insert_query = INSERT_QUERY % dict(extra_fields='')

    gtfs = Gtfs(input_gtfs_f)

    if version is None:
        try:
            feed_info = gtfs.open('feed_info.txt')
        except KeyError:
            # feed_info.txt is not in the file. Find the newest file in the
            # archive instead.
            feed_start_date = None
            for f in gtfs.infolist():
                ts = datetime(*f.date_time)
                if feed_start_date is None or feed_start_date < ts:
                    feed_start_date = ts
        else:
            row = next(feed_info)
            feed_start_date = row['feed_start_date']
            assert len(feed_start_date) == 8
            feed_start_date = datetime.strptime(feed_start_date, '%Y%m%d')

        version = (feed_start_date - VERSION_EPOCH).days
        print('Data version: %s (%s)' % (
            version, feed_start_date.date().isoformat()))

    stops = gtfs.open('stops.txt')

    if extra_fields_from_child:
        child_data = {}

    db = sqlite3.connect(output_f)
    cur = db.cursor()
    if not skip_create_table:
        cur.execute(db_schema)

    # See if there is a matching file.
    if matching_f is None:
        # No matching data, dump all stops.
        stop_map = map(lambda stop: [
            stop['stop_id'],
            massage_name(stop['stop_name'], strip_suffixes),
            stop['stop_lat'].strip(),
            stop['stop_lon'].strip(),
        ] + [stop[x] for x in extra_fields], stops)

        cur.executemany(insert_query, stop_map)
    else:
        # Matching data is available. Let's use that.
        matching = csv.DictReader(matching_f)

        stop_codes = {}
        stop_ids = {}
        stop_extra_fields = {}

        for match in matching:
            if match['stop_code']:
                if match['stop_code'] not in stop_codes:
                    stop_codes[match['stop_code']] = []
                stop_codes[match['stop_code']].append(match['reader_id'])
            elif match['stop_id']:
                if match['stop_id'] not in stop_ids:
                    stop_ids[match['stop_id']] = []
                stop_ids[match['stop_id']].append(match['reader_id'])
            else:
                raise Exception(
                    'neither stop_id nor stop_code specified in row')

            # At least one of stop_id or stop_code was specified.
            # Allow an override of any custom fields.
            stop_extra_fields[match['reader_id']] = {}
            for extra_field in extra_fields:
                if not empty(match[extra_field]):
                    # There is an override available.
                    stop_extra_fields[match['reader_id']][extra_field] = \
                        match[extra_field]

        # Now run through the stops.
        for stop in stops:
            # Preprocess the stop data.
            name = massage_name(stop['stop_name'], strip_suffixes)
            y = stop['stop_lat'].strip()
            x = stop['stop_lon'].strip()

            if extra_fields_from_child and not empty(stop['parent_station']):
                parent = stop['parent_station'].strip()
                if parent in stop_ids and parent not in child_data:
                    # This child has a parent we are interested in, and whose
                    # data we don't already have.
                    child_data[parent] = {}
                    for k in extra_fields:
                        if k in stop:
                            child_data[parent][k] = stop[k]

            # For each extra field, fall back to the child's value when this
            # stop's own value is empty and child data is available.
            e = [None if i not in stop else (
                child_data[stop['stop_id']][i]
                if (empty(stop[i]) and stop['stop_id'] in child_data
                    and i in child_data[stop['stop_id']])
                else stop[i]
            ) for i in extra_fields]

            # Insert rows where a stop_id is specified for the reader_id.
            stop_rows = []
            for reader_id in stop_ids.get(stop['stop_id'], []):
                r = [reader_id, agency_id, name, y, x] + e

                # Check for any overrides.
                for k, v in stop_extra_fields[reader_id].items():
                    r[extra_fields.index(k) + 5] = v

                stop_rows.append(r)

            cur.executemany(insert_query, stop_rows)

            # Insert rows where a stop_code is specified for the reader_id.
            stop_rows = []
            for reader_id in stop_codes.get(stop['stop_code'], []):
                r = [reader_id, agency_id, name, y, x] + e

                # Check for any overrides.
                for k, v in stop_extra_fields[reader_id].items():
                    r[extra_fields.index(k) + 5] = v

                stop_rows.append(r)

            cur.executemany(insert_query, stop_rows)

        matching_f.close()

    # Increase the user_version only if it makes it newer.
    cur.execute('PRAGMA user_version')
    current_version = cur.fetchall()[0][0]
    if current_version < version:
        cur.execute('PRAGMA user_version = %d' % version)

    db.commit()
    db.close()
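# A sketch (an assumption, not the module's real definitions) of the shape
# DB_SCHEMA and INSERT_QUERY must take for the %-substitutions above to work:
# each needs an %(extra_fields)s placeholder, and the five fixed columns must
# line up with the row layout [reader_id, agency_id, name, y, x]. The table
# and column names below are guesses for illustration only.
EXAMPLE_DB_SCHEMA = '''CREATE TABLE IF NOT EXISTS stops (
	id TEXT,
	agency_id INTEGER,
	name TEXT,
	y TEXT,
	x TEXT%(extra_fields)s
)'''

EXAMPLE_INSERT_QUERY = 'INSERT INTO stops VALUES (?, ?, ?, ?, ?%(extra_fields)s)'

# With extra_fields=['zone_id'], the schema placeholder expands to
# ',\n\tzone_id' and the query gains one ', ?', keeping the column count in
# step with the rows built above.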