Example #1
# Assumed imports; Gtfs, Station, mdst, massage_name and VERSION_EPOCH come
# from the surrounding project.
import codecs
import csv
from datetime import datetime

def compile_stops_from_gtfs(input_gtfs_f,
                            output_f,
                            all_matching_f=None,
                            version=None,
                            strip_suffixes='',
                            agency_id=-1,
                            tts_hint_language=None,
                            operators_f=None,
                            extra_f=None,
                            local_languages=None,
                            license_notice_f=None):
    if all_matching_f is not None:
        all_matching_f = [
            codecs.getreader('utf-8-sig')(x) for x in all_matching_f
        ]
    if operators_f is not None:
        operators_f = codecs.getreader('utf-8-sig')(operators_f)
    if extra_f is not None:
        extra_f = codecs.getreader('utf-8-sig')(extra_f)
    # Normalise the comma-separated suffix list: trim whitespace and lowercase.
    strip_suffixes = [x.strip().lower() for x in strip_suffixes.split(',')]

    all_gtfs = [Gtfs(x) for x in input_gtfs_f]
    first_gtfs = all_gtfs[0]

    if version is None:
        try:
            feed_info = first_gtfs.open('feed_info.txt')
        except KeyError:
            # feed_info.txt is not in the file. Find the newest file in the archive
            feed_start_date = None
            for f in first_gtfs.infolist():
                ts = datetime(*f.date_time)
                if feed_start_date is None or feed_start_date < ts:
                    feed_start_date = ts
        else:
            row = next(feed_info)
            feed_start_date = row['feed_start_date']
            assert len(feed_start_date) == 8
            feed_start_date = datetime.strptime(feed_start_date, '%Y%m%d')

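        # Express the version as whole days since VERSION_EPOCH.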
        version = (feed_start_date - VERSION_EPOCH).days
        print('Data version: %s (%s)' %
              (version, feed_start_date.date().isoformat()))

    operators = {}

    if operators_f is not None:
        operators = mdst.read_operators_from_csv(operators_f)
        operators_f.close()

    db = mdst.MdstWriter(
        fh=open(output_f, 'wb'),
        version=version,
        operators=operators,
        local_languages=local_languages.split(',')
        if local_languages is not None else [],
        tts_hint_language=tts_hint_language,
        license_notice_f=license_notice_f,
    )

    station_count = 0

    for num, gtfs in enumerate(all_gtfs):
        stops = gtfs.open('stops.txt')
        # See if there is a matching file
        if all_matching_f is not None and len(all_matching_f) > num:
            matching_f = all_matching_f[num]
        else:
            matching_f = None
        if matching_f is None:
            # No matching data, dump all stops.
            stop_map = map(
                lambda stop: [
                    stop['stop_id'],
                    massage_name(stop['stop_name'], strip_suffixes), stop[
                        'stop_lat'].strip(), stop['stop_lon'].strip()
                ], stops)

            for stop_id, stop_name, lat, lon in stop_map:
                s = Station()
                s.id = int(stop_id)
                s.name.english = stop_name
                if lat and lon:
                    s.latitude = float(lat)
                    s.longitude = float(lon)

                db.push_station(s)
                station_count += 1
        else:
            # Matching data is available. Let's use that.
            matching = csv.DictReader(matching_f)

            stop_codes = {}
            stop_ids = {}
            short_names = {}
            for match in matching:
                if 'stop_code' in match and match['stop_code']:
                    if match['stop_code'] not in stop_codes:
                        stop_codes[match['stop_code']] = []
                    stop_codes[match['stop_code']].append(match['reader_id'])
                elif 'stop_id' in match and match['stop_id']:
                    if match['stop_id'] not in stop_ids:
                        stop_ids[match['stop_id']] = []
                    stop_ids[match['stop_id']].append(match['reader_id'])
                else:
                    raise Exception(
                        'neither stop_id nor stop_code specified in row')
                if 'short_name' in match and match['short_name']:
                    short_names[match['reader_id']] = match['short_name']

            total_gtfs_stations = 0
            dropped_gtfs_stations = 0

            # Now run through the stops
            for stop in stops:
                # preprocess stop data
                name = massage_name(stop['stop_name'], strip_suffixes)
                # tolerate blank coordinates; the `if y and x` guards below
                # skip stations without a position
                y = float(stop['stop_lat'].strip() or 0)
                x = float(stop['stop_lon'].strip() or 0)

                used = False

                # Push a station for each reader_id matched by stop_id
                for reader_id in stop_ids.get(
                        stop.get('stop_id', 'stop_id_absent'), []):
                    s = Station()
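                    # base 0 lets reader IDs be decimal or 0x-prefixed hex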
                    s.id = int(reader_id, 0)
                    s.name.english = name
                    if y and x:
                        s.latitude = y
                        s.longitude = x
                    if reader_id in short_names:
                        s.name.english_short = short_names[reader_id]
                    if agency_id >= 0:
                        s.operator_id = agency_id

                    db.push_station(s)
                    station_count += 1
                    used = True

                # Push a station for each reader_id matched by stop_code
                for reader_id in stop_codes.get(
                        stop.get('stop_code', 'stop_code_absent'), []):
                    s = Station()
                    s.id = int(reader_id, 0)
                    s.name.english = name

                    if y and x:
                        s.latitude = y
                        s.longitude = x

                    if reader_id in short_names:
                        s.name.english_short = short_names[reader_id]
                    if agency_id >= 0:
                        s.operator_id = agency_id

                    db.push_station(s)
                    station_count += 1
                    used = True
                total_gtfs_stations += 1
                if not used:
                    dropped_gtfs_stations += 1

            matching_f.close()
            print('Finished parsing GTFS %d. Here are the stats:' % num)
            print(' - Dropped %d out of %d GTFS stations' %
                  (dropped_gtfs_stations, total_gtfs_stations))
            print()

    if extra_f is not None:
        mdst.read_stops_from_csv(db, extra_f)
        extra_f.close()

    index_end_off = db.finalise()

    print('Finished writing database. Here are the stats:')
    print(' - total ............ %8d stations' % station_count)
    print('                      %8d bytes' % index_end_off)
    print()
    station_count = float(station_count)
    print(' - header ........... %8d bytes' % db.stationlist_off)
    stations_len = (db.index_off - db.stationlist_off)
    print(' - stations ......... %8d bytes (%.1f per record)' %
          (stations_len, stations_len / station_count))
    index_len = (index_end_off - db.index_off)
    print(' - index ............ %8d bytes (%.1f per record)' %
          (index_len, index_len / station_count))
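
A minimal invocation sketch (hypothetical file names; the matching handles are
opened in binary mode because the function wraps them with
codecs.getreader('utf-8-sig') itself):

with open('gtfs.zip', 'rb') as gtfs_f, open('matching.csv', 'rb') as match_f:
    compile_stops_from_gtfs(
        input_gtfs_f=[gtfs_f],
        output_f='stations.mdst',
        all_matching_f=[match_f],
        agency_id=0,
        tts_hint_language='en',
    )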
Example #2
# Assumed imports; Gtfs, massage_name, empty, DB_SCHEMA, INSERT_QUERY and
# VERSION_EPOCH come from the surrounding project.
import codecs
import csv
import sqlite3
from datetime import datetime

def compile_stops_from_gtfs(input_gtfs_f,
                            output_f,
                            matching_f=None,
                            version=None,
                            strip_suffixes='',
                            extra_fields='',
                            extra_fields_from_child=False,
                            agency_id=-1,
                            skip_create_table=False):
    if matching_f is not None:
        matching_f = codecs.getreader('utf-8-sig')(matching_f)
    # Normalise the comma-separated suffix list: trim whitespace and lowercase.
    strip_suffixes = [x.strip().lower() for x in strip_suffixes.split(',')]
    if extra_fields is None or extra_fields == '':
        extra_fields = []
    else:
        extra_fields = [x.strip() for x in extra_fields.split(',')]

    if extra_fields:
        db_schema = DB_SCHEMA % dict(extra_fields=',\n\t' +
                                     (',\n\t'.join(extra_fields)))
        insert_query = INSERT_QUERY % dict(extra_fields=', ?' *
                                           len(extra_fields))
    else:
        db_schema = DB_SCHEMA % dict(extra_fields='')
        insert_query = INSERT_QUERY % dict(extra_fields='')

    gtfs = Gtfs(input_gtfs_f)

    if version is None:
        try:
            feed_info = gtfs.open('feed_info.txt')
        except KeyError:
            # feed_info.txt is not in the file. Find the newest file in the archive
            feed_start_date = None
            for f in gtfs.infolist():
                ts = datetime(*f.date_time)
                if feed_start_date is None or feed_start_date < ts:
                    feed_start_date = ts
        else:
            row = next(feed_info)
            feed_start_date = row['feed_start_date']
            assert len(feed_start_date) == 8
            feed_start_date = datetime.strptime(feed_start_date, '%Y%m%d')

        version = (feed_start_date - VERSION_EPOCH).days
        print('Data version: %s (%s)' %
              (version, feed_start_date.date().isoformat()))

    stops = gtfs.open('stops.txt')
    if extra_fields_from_child:
        child_data = {}

    db = sqlite3.connect(output_f)
    cur = db.cursor()
    if not skip_create_table:
        cur.execute(db_schema)

    # See if there is a matching file
    if matching_f is None:
        # No matching data, dump all stops.
        stop_map = map(
            lambda stop: [
                stop['stop_id'],
                massage_name(stop['stop_name'], strip_suffixes), stop[
                    'stop_lat'].strip(), stop['stop_lon'].strip()
            ] + [stop[x] for x in extra_fields], stops)

        cur.executemany(insert_query, stop_map)
    else:
        # Matching data is available. Let's use that.
        matching = csv.DictReader(matching_f)

        stop_codes = {}
        stop_ids = {}
        stop_extra_fields = {}
        for match in matching:
            if match['stop_code']:
                if match['stop_code'] not in stop_codes:
                    stop_codes[match['stop_code']] = []
                stop_codes[match['stop_code']].append(match['reader_id'])
            elif match['stop_id']:
                if match['stop_id'] not in stop_ids:
                    stop_ids[match['stop_id']] = []
                stop_ids[match['stop_id']].append(match['reader_id'])
            else:
                raise Exception(
                    'neither stop_id nor stop_code specified in row')

            # At least one of stop_id or stop_code was specified
            # Let's allow an override of any custom fields
            stop_extra_fields[match['reader_id']] = {}
            for extra_field in extra_fields:
                if not empty(match[extra_field]):
                    # There is an override available
                    stop_extra_fields[
                        match['reader_id']][extra_field] = match[extra_field]

        # Now run through the stops
        for stop in stops:
            # preprocess stop data
            name = massage_name(stop['stop_name'], strip_suffixes)
            y = stop['stop_lat'].strip()
            x = stop['stop_lon'].strip()
            if extra_fields_from_child and not empty(stop['parent_station']):
                parent = stop['parent_station'].strip()
                if parent in stop_ids and parent not in child_data:
                    # This child has a parent we are interested in and don't
                    # yet have data for.
                    child_data[parent] = {}
                    for k in extra_fields:
                        if k in stop:
                            child_data[parent][k] = stop[k]

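            # For each extra field: use the stop's own value; when it is empty
            # and a child stop supplied one, fall back to the child's value;
            # None when the column is missing entirely.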
            e = [
                None if i not in stop else
                (child_data[stop['stop_id']][i] if
                 (empty(stop[i]) and stop['stop_id'] in child_data
                  and i in child_data[stop['stop_id']]) else (stop[i]))
                for i in extra_fields
            ]

            # Insert rows where a stop_id is specified for the reader_id
            stop_rows = []
            for reader_id in stop_ids.get(stop['stop_id'], []):
                r = [reader_id, agency_id, name, y, x] + e
                # Check for any overrides
                for k, v in stop_extra_fields[reader_id].items():
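                    # +5 skips the fixed columns [reader_id, agency_id, name, y, x]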
                    r[extra_fields.index(k) + 5] = v
                stop_rows.append(r)

            cur.executemany(insert_query, stop_rows)

            # Insert rows where a stop_code is specified for the reader_id
            stop_rows = []
            for reader_id in stop_codes.get(stop['stop_code'], []):
                r = [reader_id, agency_id, name, y, x] + e
                # Check for any overrides
                for k, v in stop_extra_fields[reader_id].items():
                    r[extra_fields.index(k) + 5] = v
                stop_rows.append(r)

            cur.executemany(insert_query, stop_rows)

        matching_f.close()

    # Increase the user_version only if it makes it newer.
    cur.execute('PRAGMA user_version')
    current_version = cur.fetchall()[0][0]
    if current_version < version:
        cur.execute('PRAGMA user_version = %d' % version)
    db.commit()
    db.close()
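
A minimal invocation sketch for this sqlite3 variant (hypothetical file names;
any extra_fields named must exist as columns in both the matching CSV and
DB_SCHEMA):

with open('gtfs.zip', 'rb') as gtfs_f, open('matching.csv', 'rb') as match_f:
    compile_stops_from_gtfs(
        input_gtfs_f=gtfs_f,
        output_f='stops.db',
        matching_f=match_f,
        extra_fields='zone_id',
        agency_id=0,
    )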