Example #1
def measure_naptan_groups(gdf, naptan_column_name="LocalityName"):
    """Measures the number of groups present within the given
    geodataframe when grouped by the given column.

    Args:
        gdf (geopandas.GeoDataFrame): the naptan stops dataframe.
        naptan_column_name (str, optional): the column to group by.
            Defaults to "LocalityName".

    Returns:
        tuple: a (counts, groups) pair; a dataframe of group sizes and
        the underlying pandas groupby object.
    """

    # filter the dataset down to the bare minimum of columns needed.
    gdf2 = gdf[[
        "AreaName", "LocalityName", "geometry", "Longitude", "Latitude"
    ]]
    # recompute the geometry, then drop the raw coordinate columns.
    gdf3 = geo.calculate_naptan_geometry(gdf2)
    gdf3 = gdf3.drop(["Longitude", "Latitude"], axis=1)
    # group by the given naptan column.
    groups = gdf3.groupby(naptan_column_name)
    # count the number of rows in each group.
    counts = groups.size().reset_index()
    counts.columns = [f"{naptan_column_name}", f"Size_{naptan_column_name}"]

    return counts, groups
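
A minimal usage sketch (hypothetical call; it assumes `gdf` is a naptan stops geodataframe already carrying the columns selected above, and that the project `geo` module is importable):

counts, groups = measure_naptan_groups(gdf, naptan_column_name="AreaName")
print(counts.sort_values("Size_AreaName", ascending=False).head())
# inspect one group by its key ("Leeds" is a hypothetical key).
print(groups.get_group("Leeds").head())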
Example #2
def group_naptan_datatypes(gdf, naptan_column='LocalityName'):
    """Groups the naptan dataset into subsets grouped by the given
    naptan column, and saves the centroid of each group to csv.

    Args:
        gdf (geopandas.GeoDataFrame): the naptan stops dataframe.
        naptan_column (str, optional): the column to group by.
            Defaults to 'LocalityName'.

    Returns:
        geopandas.GeoDataFrame: one centroid point per group.
    """
    # collapse the dataset to the minimum, keeping only possibly useful columns.
    gdf2 = gdf[[
        'LocalityName', 'NptgLocalityCode', 'AreaName', 'StopAreaCode',
        'Latitude', 'Longitude'
    ]]
    # calculates the centroid of each given naptan segment.
    gdf3 = gdf2.groupby([naptan_column],
                        as_index=False)[['Latitude', 'Longitude'
                                         ]].apply(lambda x: np.mean(x, axis=0))
    # convert the lat lon into centroid geometry points.
    gdf4 = geo.calculate_naptan_geometry(gdf3)
    # save output to csv.
    gdf4.to_csv(f'{naptan_column}.csv', encoding='utf-8', sep=',')
    return gdf4
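
A quick usage sketch (hypothetical call; assumes `np`, `pd`, and the project `geo` module are imported at module level). Note the csv write is a side effect named after the grouping column:

centroids = group_naptan_datatypes(gdf, naptan_column='AreaName')
print(centroids.head())  # one centroid per AreaName, also written to AreaName.csv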
Example #3
def naptan_gazette_localities():
    """Returns the gazette locality data for use with the stops data.

    Returns:
        geopandas.GeoDataFrame: the nptg localities with lat/long
        coordinates, geometry points, and a qualified locality name.
    """
    # just the basics
    cols = [
        'NptgLocalityCode', 'LocalityName', 'AdministrativeAreaCode',
        'QualifierName', 'NptgDistrictCode', 'SourceLocalityType', 'GridType',
        'Easting', 'Northing'
    ]
    # read the file
    gaz_locs = pd.read_csv(f'{nptg_dir}/Localities.csv',
                           encoding='iso-8859-1',
                           low_memory=True,
                           usecols=cols)
    # shorten the admin area code column name for the later merge.
    gaz_locs = gaz_locs.rename(columns={'AdministrativeAreaCode': 'AdminCode'})
    gaz_locs['AdminCode'] = gaz_locs['AdminCode'].astype(str)
    # convert lat long
    gaz_locs = geo.convert_to_lat_long(gaz_locs)
    # calculate geometry point for geodataframe.
    gaz_locs = geo.calculate_naptan_geometry(gaz_locs)
    # rename for the later merge; only the grid and coordinate columns
    # actually change name.
    gaz_locs.rename(columns={
        'GridType': 'NptgGridType',
        'Longitude': 'Gazette_Longitude',
        'Latitude': 'Gazette_Latitude',
        'geometry': 'Gazette_geometry'
    },
                    inplace=True)
    # merge the locality and qualifier names into a new qualified column.
    # TODO check if the qualified locality names are duplicated.
    gaz_locs['Qualified_Locality'] = gaz_locs['LocalityName'] + ', ' +\
        gaz_locs['QualifierName']
    return gaz_locs
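
A short usage sketch (assumes `nptg_dir` points at an extracted nptg download containing Localities.csv, and that `pd` and the `geo` helpers are available as above):

localities = naptan_gazette_localities()
print(localities[['NptgLocalityCode', 'Qualified_Locality']].head())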
Example #4
    def find_unused_localities(cls, gdf):
        """Returns the gazette localities that are not referenced by any
        node in the given nodes dataframe.

        Args:
            gdf (geopandas.GeoDataFrame): the naptan nodes dataframe.

        Returns:
            pandas.core.frame.DataFrame: localities that are not used in
            the nodes file.
        """
        # load the gazette locality reference data.
        localities = etl_pipe.naptan_gazette_localities()
        unused = localities[~localities['NptgLocalityCode'].
                            isin(gdf['NptgLocalityCode'])]
        # conversion for geometry.
        unused = unused.rename(columns={
            "Gazette_Longitude": "Longitude",
            "Gazette_Latitude": "Latitude"
        })
        # recalculate the geometry from the renamed coordinate columns.
        unused = geo_pipe.calculate_naptan_geometry(unused)
        # report the localities that are not used by any node.
        rep.report_failing_nodes(gdf,
                                 'unused localities near stops',
                                 failed_nodes=unused)
        # m = vis.generate_base_map(unused, 'LocalityName')
        # TODO find out if any stops are inside the boundaries of the
        # unused localities; the geometries are just points, so also find
        # the closest stops to these points.
        return unused
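
A hedged usage sketch; the enclosing class is not shown above, so `LocalityChecks` below is a hypothetical stand-in for whatever class hosts this classmethod, and `nodes_gdf` is assumed to be the naptan nodes frame with an NptgLocalityCode column:

unused = LocalityChecks.find_unused_localities(nodes_gdf)
print(f'{len(unused)} gazette localities are not referenced by any node.')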
Example #5
def main(named_area):
    """Downloads the naptan dataset and runs the basic internal
    consistency checks and geospatial checks against the given named
    administrative area."""
    # etl pipeline functions.
    etl.naptan_data_source("nptg", "csv")
    etl.naptan_data_source("naptan_nodes", "csv")
    nodes = Path(f"{dl_home}/{timestr}_naptan_nodes.zip")
    nptg = Path(f"{dl_home}/{timestr}_nptg.zip")
    etl.extract_naptan_files(nodes)
    etl.extract_naptan_files(nptg)
    # naptanfilenames = etl.file_verification('ext')

    # dataframe creation
    gdf = etl.read_naptan_file("Stops")
    gdf = etl.deactivated_nodes(gdf)
    # we join the gazette locality code and admin code data onto the nodes
    # dataframe; this gives us accurate locality and admin area names.
    locality_codes = etl.naptan_gazette_localities()
    gdf = etl.map_gazette_to_nodes(gdf, locality_codes, "NptgLocalityCode")
    admin_codes = etl.naptan_gazette_admin_area_codes()
    gdf = etl.map_gazette_to_nodes(gdf, admin_codes, "AdminCode")
    # we merge on the stop area data and corresponding codes for stop area
    gdf = etl.merge_stop_areas(gdf)
    gdf = geopipe.calculate_naptan_geometry(gdf)
    # Check that the naptan data structure downloaded is within acceptable
    # tolerances
    NaptanStructureChecks.check_naptan_stop_number_limits(gdf)
    # the cli provides a named administrative area within the naptan dataset.
    naptan_area_level = "AreaName"
    # TODO or locality.
    # TODO make the named area geojson polygon with feature data.
    gdf_sub = etl.create_naptan_subframe(gdf, naptan_area_level, named_area)

    # Data Cleansing functions
    # illegal capitals
    IllegalCaptials.check_illegal_caps(gdf_sub, "StopPoint")
    # illegal characters
    IllegalCharacters.check_illegal_characters(gdf_sub, "StopPoint")
    # check for illegal spaces in required string columns.
    IllegalSpaces.check_illegal_spaces(gdf_sub)
    # The internal data consistency checks
    LocalitiesIDStops.localities_with_identical_stops(gdf_sub)
    NameContainsLocality.stop_name_contains_locality_name(gdf_sub)
    BearingMissing.stop_with_bearing_missing(gdf_sub)
    StopNameHighRisks.stop_names_with_high_risk_words(gdf_sub)
    StopsDifferentNamedAdminArea.stops_in_different_admin_area(gdf_sub)
    # TODO new checks - add to release notes
    CheckDateTime.check_stop_dates_not_after_today(gdf_sub)
    CheckName.check_name_length(gdf_sub)
    MultiRoadName.stop_with_multiple_road_names(gdf_sub, "CommonName")
    AtcocodeCheck.check_atcocode_length(gdf_sub)
    print("All internal consistency checks have been completed.")

    # geospatial data checks
    CoastlineStops.naptan_coastal_nodes(gdf_sub)
    # checks that should only be performed at locality level are handed off
    # to this function collection, which runs them per locality group.
    etl.locality_level_checks(gdf_sub)
    # area specific checks
    print("All geospatial functions have been completed.")
    # make the map and populate with node cluster.
    generate_base_map(gdf_sub)
    return gdf_sub
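
A minimal entry-point sketch ('Nottingham' is a hypothetical area name; any value present in the AreaName column should work):

if __name__ == '__main__':
    gdf_checked = main('Nottingham')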
Example #6
def create_naptan_subframe(gdf, naptan_area_level, col_value):
    """Creates a naptan subframe filtered on a given column name; the
    value passed in must be present in that column.

    Arguments:
        gdf {geopandas.GeoDataFrame} -- the naptan dataframe.
        naptan_area_level {str} -- the name of the column on which the
        dataframe will be split.
        col_value {str} -- the value in the column to query.

    Returns:
        {geopandas.GeoDataFrame} -- the naptan subframe.
    """

    # normalise the column value to a lower case string.
    if not isinstance(col_value, str):
        col_value = f'{col_value}'
    col_value = col_value.lower()

    lower_case = gdf[naptan_area_level].str.lower()
    new_df = pd.DataFrame(lower_case)
    gdf.update(new_df)
    # we put this here so we can filter out all areas that are managed by
    # dft centrally
    dft_authorities = [
        'National - National Rail', 'National - National Air',
        'National - National Ferry', 'National - National Tram'
    ]
    # for grouping we need to pass in wildcard values, ideally a string
    # contains, but the stop area code will always start with the atcocode
    # for the area or mode, so a prefix match is enough.
    if naptan_area_level == 'AreaName':
        # if the user passes in a DfT managed area, we exit out; col_value
        # was lower-cased above, so compare against lower-cased names.
        if col_value in [a.lower() for a in dft_authorities]:
            sys.exit(f'{col_value} is a DfT central authority.')
        # We get the nptg locality codes within the given admin area, as
        # this will include all forms of transport for the area and not
        # just bus transport infrastructure.
        gdf_sub = gdf[gdf[naptan_area_level] == col_value]
        if gdf_sub.empty:
            # catch the case where the value isn't found.
            sys.exit(f"{col_value} was not found in the given dataframe.")
        gdf_subframe = geo.calculate_naptan_geometry(gdf_sub)
        return gdf_subframe

    elif naptan_area_level == 'StopType':
        try:
            gdf1 = gdf[gdf['StopType'].str.match(col_value)]
            gdf_subframe = geo.calculate_naptan_geometry(gdf1)
            return gdf_subframe
        except KeyError:
            # catch if the value just isn't found.
            sys.exit(f"{col_value} is not a known stop type.")

    elif naptan_area_level == 'StopAreaCode':
        mask = gdf[f'{naptan_area_level}'].str.startswith(f'{col_value}')
        gdf_subframe = gdf[mask]
        gdf_sub = geo.calculate_naptan_geometry(gdf_subframe)
        return gdf_sub

    # expects the full string of the nptg or admin code to work.
    elif naptan_area_level in ('NptgLocalityCode', 'LocalityName'):
        columngroup = gdf.groupby(naptan_area_level)
        gdf_subframe = columngroup.get_group(col_value)
        gdf_subframe.reset_index(drop=True, inplace=True)
        gdf_sub = geo.calculate_naptan_geometry(gdf_subframe)
        return gdf_sub
    else:
        sys.exit('Column type is not supported.')
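
A few usage sketches (hypothetical values; note the function lower-cases both the chosen column and the value before filtering, so mixed-case input is fine):

# subframe for a single named admin area.
leeds = create_naptan_subframe(gdf, 'AreaName', 'Leeds')
# subframe for one stop type ('BCT' is the naptan on-street bus stop type).
bus_stops = create_naptan_subframe(gdf, 'StopType', 'BCT')
# subframe for one locality, by its full nptg locality code
# ('E0034964' is a made-up code for illustration).
locality = create_naptan_subframe(gdf, 'NptgLocalityCode', 'E0034964')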