def locality_with_unusually_elongated_shape(cls, gdf_locality):
    """Flag localities whose enclosing shape is unusually elongated.

    The enclosing bounding box is arbitrary - this check and all other
    checks that require building a shape are variants of the Minimum
    Bounding Box / Convex Hull problems
    (https://en.wikipedia.org/wiki/Minimum_bounding_box_algorithms).
    "Elongated" still needs an agreed definition (for example: the
    longest edge is 10x longer than the shortest edge).

    Args:
        gdf_locality ([geopandas dataframe]): [the locality sub-frame
            to check.]

    Returns:
        str: empty placeholder for the failing nodes - the check body
        is not implemented yet.
    """
    check_name = "locality_with_unusually_elongated_shape"
    PolygonStructure.check_area_length_is_regular(gdf_locality,
                                                  "Name of locality")
    # TODO - build the list of stops failing this check; empty
    # placeholder until the elongation measure is implemented.
    failed_nodes = ""
    rep.report_failing_nodes(gdf_locality, check_name, failed_nodes)
    # BUG FIX: removed the `raise NotImplementedError` that followed the
    # return statement - it was unreachable dead code.
    return failed_nodes
def locality_not_unique(cls, gdf):
    """Flag localities whose name plus qualifier is not nationally unique.

    The name of the locality with its qualifier (if any) must be unique
    nationally, so that an NPTG-based locality search can differentiate
    identically named localities. Ambiguous entries should be given an
    appropriate qualifier.

    Args:
        gdf ([geopandas dataframe]): [the naptan master dataframe.]

    Returns:
        str: empty placeholder for the failing nodes (reporting of the
        duplicate set is still TODO).
    """
    check_name = "locality_not_unique"
    gdf['Local_Qualifer_Name'] = gdf['LocalityName'] + \
        ', ' + gdf['QualifierName']
    # remove all the na values from the dataframe.
    nodes = gdf[gdf["Local_Qualifer_Name"].notna()]
    # check for duplicates in the locality qualifier name column.
    boolean = nodes.duplicated(subset=['Local_Qualifer_Name'])
    nodes_dup = nodes[boolean]
    # TODO this might work, it returns 8000 localities out of 28000 that
    # are not unique... not sure that all is correct.
    # BUG FIX: the original indexed 'Locality_Qualifier_Name', which was
    # never created (the column above is 'Local_Qualifer_Name') and
    # raised a KeyError.
    a = nodes_dup.loc[~nodes_dup.duplicated(keep=False),
                      'Local_Qualifer_Name'].unique()
    # TODO check that the returned are correct
    failed_nodes = ''
    rep.report_failing_nodes(gdf, check_name, failed_nodes)
    # BUG FIX: removed the unreachable `raise NotImplementedError` that
    # followed this return.
    return failed_nodes
def check_area_length_is_regular(cls, gdf, naptan_locality):
    """Check that the polygon built for a locality has a regular area
    length, using the polygon's longest side as the measure.

    Args:
        gdf ([naptan master geodataframe]): [the naptan master.]
        naptan_locality ([str]): [the locality whose polygon is built
            and measured; also passed to the failure report.]

    Returns:
        None: outcomes are printed; irregular areas are reported via
        rep.report_failing_nodes.
    """
    check_name = cls.check_area_length_is_regular.__name__
    # make the polygon from area data, check length
    area_polygon = make_naptan_polygon(naptan_locality)
    # TODO get the longest length
    poly_long = cls.polygon_longest_side(area_polygon)
    # TODO get the shortest length
    # NOTE(review): poly_short is computed but never used below -
    # presumably intended for an elongation ratio; confirm.
    poly_short = cls.polygon_shortest_side(area_polygon)
    # check area length.
    # NOTE(review): when poly_long >= 800 the first branch fires, so the
    # elif only ever sees values < 800, which always satisfy <= 1000 -
    # the else branch (and the failure report) is unreachable. The
    # intended thresholds need confirming.
    if poly_long >= 800:
        print("Is not a locality and the area is excluded from this check")
        pass
    elif poly_long <= 1000:
        print("Polygon area is regular")
    else:
        print("Polygon area is irregular")
        rep.report_failing_nodes(gdf, check_name, naptan_locality)
def check_name_length(cls, gdf):
    """Flag StopPoints whose full name exceeds 80 characters.

    A stop point fails if StopPoint has a full name
    [Locality, CommonName (Indicator)] that is more than 80 characters
    in length.

    Arguments:
        gdf {[geopandas dataframe]} -- [The naptan master dataframe.]

    Returns:
        [pandas series] -- ATCOCodes of the nodes that failed the check.
    """
    # get name for report
    check_name = "check_name_length"
    # NOTE: plain alias - the "newName" column is added to the caller's
    # frame as well.
    gdf1 = gdf
    # build the stoppoint full name
    gdf1["newName"] = (
        gdf1["CommonName"].astype(str) + ", " +
        gdf1["LocalityName"].astype(str)
    )
    # mask the names against 80 chars
    mask = gdf1["newName"].str.len() > 80
    df_str = gdf1.loc[mask]
    # send to report
    rep.report_failing_nodes(gdf, check_name, df_str)
    # BUG FIX: removed the try/except whose handler immediately
    # re-raised and whose `sys.exit` after `raise e` was unreachable;
    # exceptions now propagate unchanged.
    return df_str.ATCOCode
def stop_with_bearing_missing(cls, gdf):
    """Flag BCT stops (other than the FLX sub-type) without a bearing.

    The data must include a value for "bearing" for all BCT stops
    except those in the FLX (flexible zone) sub-type.

    Args:
        gdf {[geopandas dataframe]} -- [The naptan master dataframe.]

    Returns:
        [geopandas dataframe]: the nodes failing the check.
    """
    check_name = "stop_with_bearing_missing"
    try:
        # the permitted bearings that can be present in that field.
        valid_bearing = ["SW", "NE", "SE", "S", "N", "NW", "E", "W"]
        # build the three conditions separately, then combine: a BCT
        # stop, not flexible-zone, whose bearing is not a valid value.
        is_bct = gdf["StopType"] == "BCT"
        not_flexible = gdf["BusStopType"] != "FLX"
        invalid_bearing = ~gdf["Bearing"].isin(valid_bearing)
        failed_nodes = gdf[is_bct & not_flexible & invalid_bearing]
        # reporting.
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        return failed_nodes
    except Exception as e:
        sys.exit(f"{check_name} has failed because of {e}")
def road_name_matches_coordinates(gdf, atcocode):
    """Check the recorded street name matches the mapped road name.

    The "street" shown in the data should correspond with the name
    attached to the road segment to which the stop is snapped in the
    Navteq mapping data used by Ito.

    Arguments:
        gdf {[geopandas dataframe]} -- [pass in the chosen dataframe]
        atcocode {[str]} -- [Pass in the given naptan unique stop id.]

    Returns:
        [pandas series] -- the failing ATCOCode when the names differ;
        None (implicitly) when they match.
    """
    # check name
    check_name = road_name_matches_coordinates.__name__
    # masking ideally.
    gdf1 = gdf
    node = gdf1.loc[gdf1['ATCOCode'] == atcocode]
    # api call to get nearest road name
    found_name = geopipe.get_nearest_road_name(gdf1, atcocode)
    # BUG FIX: use positional access - node['Street'][0] is label-based
    # and raised a KeyError whenever the matched row's index label was
    # not 0.
    if found_name[1] == node['Street'].iloc[0]:
        print('Road Name Matches')
    else:
        # TODO - needs testing.
        res = node["ATCOCode"]
        rep.report_failing_nodes(gdf, check_name, res)
        return res
def check_stop_dates_not_after_today(cls, gdf):
    """Report stops whose modification date lies in the future.

    Checks that bus-stop dates have not been added to the naptan
    database with a future date.

    Args:
        gdf ([geopandas dataframe]): [the naptan master dataframe.]

    Returns:
        [pandas dataframe]: the rows with a future ModificationDateTime
        (empty when none are found).
    """
    check_name = "Check stop dates are after today"
    check_geographic_level = "stop"
    check_warning_level = "low"
    # midnight today, as a Timestamp, for comparison against the column.
    today = pd.Timestamp(datetime.today().date())
    # rows whose modification date is strictly in the future.
    bad_timeframe = gdf[gdf.ModificationDateTime > today]
    if bad_timeframe.empty:
        print("No stop dates are in the future.")
    else:
        print(f"Stop creation or modification date after {today}.")
        report_failing_nodes(gdf, check_name, bad_timeframe)
    return bad_timeframe
def naptan_coastal_nodes(cls, gdf):
    # TODO - add a column to the master naptan dataframe, and then count up
    # false values, to get the percent of stops that fail, and then compare
    # those stops, to find out which ones are near the coast and how near
    # the coast they are.
    """Return localities with stops lying outside the UK coastline.

    Provided a dataframe, returns a list of nodes that are near the
    coast line; this uses the global land mask library (a numpy &
    pandas extension) for mapping the boundaries of the coastline.

    Arguments:
        gdf {[geospatial dataframe]} -- [the naptan master dataframe.]

    Raises:
        ve: [ValueError raised while masking / comparing.]

    Returns:
        [pandas series] -- count of off-coast stops per LocalityName.
    """
    check_name = "naptan_coastal_nodes"
    try:
        # remove ferry based stops / jetty stop types, as their proximity
        # to the coastline isn't a problem.
        coastal_infrastructure = ['FTD', 'FBT', 'FER']
        gdf = gdf[~gdf['StopType'].isin(coastal_infrastructure)]
        # we compare against the compressed land geometry dataset for
        # coordinates outside the coastline.
        gdf['Land_State'] = globe.is_land(gdf['Latitude'], gdf['Longitude'])
        # rows where is_land() was False, i.e. off-shore coordinates.
        coastal_nodes = gdf.loc[~gdf.Land_State]
        # get the count of failing nodes as a values
        high_node_areas = coastal_nodes['LocalityName'].value_counts()
        percent = ((len(coastal_nodes) / len(gdf)) * 100.0)
        # if the number of nodes is over this percent, console warning.
        # NOTE(review): percent can never be negative, so `percent <= 0`
        # only ever matches exactly zero - presumably intended as an
        # "all clear" branch; confirm.
        if percent >= 1.1:
            print(
                f"The {gdf.AreaName.iloc[0]} has {len(coastal_nodes)} stops\
 that are off the UK Coastline, that is {percent: 0.2f} %\
 of all stops in the named admin area.")
        elif percent <= 0:
            print('No Nodes were found along the coastline.')
            pass
        else:
            print(
                f"The area has {len(coastal_nodes)} nodes that are off the\
 coastline boundary. UK coastline, this is {percent: 0.2f} % of all nodes in the area.")
        rep.report_failing_nodes(gdf, check_name, coastal_nodes)
        return high_node_areas
    except ValueError as ve:
        raise(ve)
    except Exception as e:
        # NOTE(review): this swallows every other exception after
        # printing it, returning None - confirm this best-effort
        # behaviour is intended.
        print(e)
def detect_nan_values(gdf, col_name):
    """Report null values present in a required naptan column.

    This is an internal open naptan method; it shouldn't be exposed to
    the user unless required for reporting widespread data issues.

    Arguments:
        gdf {[pandas dataframe]} -- [A naptan dataframe.]
        col_name {[str]} -- [a column str name as expected in the naptan
            data column.]

    Returns:
        [pandas dataframe] -- the rows with missing values in the given
        required column; None when the column is optional or complete.
    """
    check_name = 'Required columns contain null values.'
    # the below list is columns which must not contain nulls.
    required_cols = [
        'ATCOCode', 'CommonName', 'Street', 'Indicator', 'Bearing',
        'NptgLocalityCode', 'Town', 'TownLang', 'Suburb',
        'LocalityCentre', 'Longitude', 'Latitude', 'StopType',
        'BusStopType', 'TimingStatus', 'AdminCode', 'CreationDateTime',
        'ModificationDateTime', 'Status', 'StopPoint', 'LocalityName',
        'QualifierName', 'AtcoAreaCode', 'AreaName', 'RegionCode',
        'Status_area', 'geometry'
    ]
    # check if the column is a required column. Defensive.
    if col_name not in required_cols:
        message = f'{col_name} for the area {gdf.AreaName.iloc[0]} the \
requested column can have null values. This '
        write_basic_log_file(message)
    # Check if the column contains any null or na values.
    elif gdf[col_name].isnull().values.any():
        nan_array = gdf[col_name].isnull()
        # build array of missing values, using masking.
        missing_values = gdf[nan_array]
        # BUG FIX: the original divided by 100 on top of the '%' format
        # specifier below (which already multiplies by 100),
        # under-reporting the share by a factor of 10,000.
        percent_missing = gdf[col_name].isna().sum() / len(gdf.Indicator)
        # return missing percentage of rows.
        print(f'{percent_missing:.4%}')
        report_failing_nodes(gdf, test_name=check_name,
                             failed_nodes=missing_values)
        # BUG FIX: `gdf[missing_values]` indexed the frame with another
        # DataFrame and failed; the failing rows are missing_values.
        return missing_values
    else:
        # BUG FIX: the original built a message claiming the column "has
        # missing values ... and has failed this test" on the success
        # path, and then printed 'all good.' instead of the message.
        print(f'{col_name} for the area {gdf.AreaName.iloc[0]} has no '
              'missing values in this required column.')
def test_report_failing_nodes(naptan_sample):
    """Smoke-test for rep.report_failing_nodes.

    Args:
        naptan_sample: pytest fixture expected to supply a report
            object (see the `naptan_sample` fixture).
    """
    # NOTE(review): `complete_gdf` and `gdf` are not defined anywhere in
    # this test or its fixtures - as written this raises NameError, and
    # `naptan_sample.report_failing_nodes()` assumes the fixture returns
    # an object exposing that method. Confirm the intended fixtures.
    assert naptan_sample.report_failing_nodes()
    assert rep.report_failing_nodes(complete_gdf, 'Check Name Length', gdf)
def check_atcocode_length(cls, gdf): """[summary] checks the atcocode (unique identifier) length is 12 and if not the stop fails the check. Args: gdf ([geopandas dataset, master or sub]): [description] Returns: [geopandas dataframe]: [Geopandas dataframe of failed nodes.] """ # variance? Stop type? Authority? check_name = "check_atcocode_length_is_12" gdf["AtcoCode_Character_Len"] = gdf["ATCOCode"].apply(len) fail_range = gdf["AtcoCode_Character_Len"].unique() try: # create a mask that include no inactive nodes and atcocodes under # 12 if len(fail_range) != 1: mask = (gdf["Status"] != "del") & (gdf["AtcoCode_Character_Len"] != 12) # get the failing nodes. fn = gdf[mask] # makes report rep.report_failing_nodes(gdf, check_name, fn) # TODO make a sample level map of the failing area with codes. # get the name of the area that is failing fail_area = gdf.AreaName.iloc[0] print(fail_area) # the below returns a short dataframe counting the number of # atcocodes. # that are less than 12 alphanumeric characters in length. result_agg = ( fn[["AtcoCode_Character_Len", "ATCOCode"]] .groupby(["AtcoCode_Character_Len"]) .count() ) return result_agg except ValueError as ve: sys.exit(f"This error occured {ve}") except Exception as e: sys.exit(f"{e} was encounter check has been cancelled.") else: message = f"{gdf.AreaName.iloc[0]} all Atcocode unique identifiers are the correct length." rep.write_basic_log_file(message)
def stop_road_distance(cls, gdf):
    """Placeholder check for the distance between a stop and its road.

    Args:
        gdf ([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    # BUG FIX: the original used "stop_road_distance".__name__, which
    # raises AttributeError (str objects have no __name__ attribute).
    check_name = "stop_road_distance"
    # list of stops not in correct admin areas by geo position.
    failed_nodes = ''
    rep.report_failing_nodes(gdf, check_name, failed_nodes)
    return failed_nodes
def stop_names_with_high_risk_words(cls, gdf):
    """Flag stops whose CommonName contains a high-risk word.

    Descriptions: StopPoint has a CommonName that contains one of the
    following high risk words: DELETE, DELETED, N/A, N/K, NOT IN USE,
    OBSOLETE, UNUSED (case-insensitive).

    Args:
        gdf ([geopandas ]): [a pandas dataframe of the current naptan
            file.]

    Returns:
        df_risks [pandas dataframe]: the nodes containing risk words,
        with a "Warning Flag" column appended.
    """
    # name of check.
    check_name = "stop_names_with_high_risk_words"
    # clone
    gdf1 = gdf
    try:
        # list of risk words.
        # BUG FIX: "NOT IN USE" and "N/K" were missing a separating
        # comma, which silently concatenated them into "NOT IN USEN/K"
        # so neither word was ever matched.
        riskwords = [
            "DELETE",
            "DELETED",
            "N/A",
            "NOT IN USE",
            "N/K",
            "OBSOLETE",
            "UNUSED",
        ]
        # text captialising managment
        gdf1["CommonName"] = gdf1["CommonName"].str.upper()
        gdf1["RiskWords"] = gdf1["CommonName"].apply(
            lambda x: 1 if any(i in x for i in riskwords) else 0)
        # BUG FIX: df_risks was returned below, but its assignment (and
        # the report call) were commented out, raising NameError;
        # restored the commented-out lines.
        df_risks = gdf1.loc[gdf1["RiskWords"] != 0]
        endcol = len(df_risks.columns)
        df_risks.insert(endcol, "Warning Flag", check_name)
        rep.report_failing_nodes(gdf, check_name, df_risks)
        # TODO indicate if it's a bus stop, if so flag locality or
        # TODO authorities that should confirm the stops deletion from the
        # TODO database.
        return df_risks
    except Exception as e:
        # BUG FIX: removed the unreachable sys.exit that followed this
        # re-raise.
        raise e
def stop_with_wrong_types(cls, gdf):
    """Placeholder check for stops recorded with the wrong stop type.

    Args:
        gdf ([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "stop_with_wrong_types"
    # list of stops not in correct admin areas by geo position.
    failed_nodes = ""
    rep.report_failing_nodes(gdf, check_name, failed_nodes)
    # BUG FIX: removed the `raise NotImplementedError` that followed the
    # return statement - it was unreachable dead code.
    return failed_nodes
def hail_ride_section_length(cls, gdf):
    """Hail and Ride bus stops where the total length of the section is
    greater than 1km.

    Args:
        gdf ([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "hail_ride_section_length"
    # Placeholder: the failing-node list has not been computed yet.
    failing = ''
    rep.report_failing_nodes(gdf, check_name, failing)
    return failing
def hail_ride_invalid(cls, gdf):
    """Hail and Ride bus stops that do not have a valid entry, centroid
    or exit record.

    Args:
        gdf([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "hail_ride_invalid"
    # Placeholder: the failing-node list has not been computed yet.
    failing = ''
    rep.report_failing_nodes(gdf, check_name, failing)
    return failing
def localities_with_identical_stops(cls, gdf_locality):
    """Find duplicated StopPoint names within a single locality.

    The CommonName of stops within a single stoparea should be the same
    as each other (and as the stoparea name) wherever possible. Given
    the frame for one locality, this reports StopPoints that appear
    more than once.

    Arguments:
        gdf_locality {[geopandas dataframe]} -- [a single-locality
            slice of the naptan node frame.]

    Returns:
        [geopandas dataframe] -- the duplicated StopPoint rows, or None
        when the frame spans more than one locality.
    """
    # for reporting
    check_name = "Check localities for identical stops."
    check_warning_level = "high"
    check_geographic_level = "localities"
    frame = gdf_locality
    try:
        # a single locality has exactly one NptgLocalityCode; anything
        # else means the caller passed a wider frame and we skip.
        locality_codes = frame["NptgLocalityCode"].unique()
        if len(locality_codes) == 1:
            duplicated_stops = frame[frame["StopPoint"].duplicated()]
            rep.report_failing_nodes(gdf_locality, check_name,
                                     duplicated_stops)
            return duplicated_stops
    except Exception as e:
        # pass if this is not a locality, we are just catching.
        print(f"Not a locality, test can not be performed. {e}")
def stops_area_members_without_identical_names(cls, gdf):
    """Placeholder: StopArea containing StopPoints that do not have
    identical CommonNames.

    Args:
        gdf ([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "stops_area_members_without_identical_names"
    gdf1 = gdf
    failed_nodes = ''
    rep.report_failing_nodes(gdf, check_name, failed_nodes)
    # BUG FIX: removed the `raise NotImplementedError` that followed the
    # return statement - it was unreachable dead code.
    return failed_nodes
def stops_in_alternate_localities(cls, gdf):
    """Placeholder: locality is an alternative but has members or
    children that should be connected to the primary Locality. Checks
    whether the stop can be linked to an nptg locality.

    Args:
        gdf ([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "stops_in_alternate_localities"
    gdf1 = gdf
    failed_nodes = ""
    rep.report_failing_nodes(gdf, check_name, failed_nodes)
    # BUG FIX: removed the `raise NotImplementedError` that followed the
    # return statement - it was unreachable dead code.
    return failed_nodes
def find_unused_localities(cls, gdf):
    """Return NPTG localities that do not appear in the nodes file.

    Loads the gazette localities and keeps those whose
    NptgLocalityCode is absent from the given nodes frame.

    Args:
        gdf ([geopandas dataframe]): [the naptan nodes frame.]

    Returns:
        [pandas.core.frame.DataFrame]: [localities that are not used in
        the nodes file.]
    """
    # node values
    localities = etl_pipe.naptan_gazette_localities()
    unused = localities[~localities['NptgLocalityCode'].
                        isin(gdf['NptgLocalityCode'])]
    # conversion for geometry.
    unused = unused.rename(columns={
        "Gazette_Longitude": "Longitude",
        "Gazette_Latitude": "Latitude"
    })
    # unused = geo_pipe.calculate_naptan_geometry(unused)
    # BUG FIX: the report was passed the undefined name `failedNodes`
    # (NameError); the failing set is the unused localities frame.
    rep.report_failing_nodes(gdf, 'unused localities near stops',
                             failed_nodes=unused)
    # m = vis.generate_base_map(unused, 'LocalityName')
    # m
    # TODO find out if any stops are inside the boundaries of the unused
    # areas
    # TODO the geometries are just points for the unused localites
    # TODO find out the closest stops to these points.
    return unused
def stops_in_different_admin_area(cls, gdf):
    """Flag stops whose ATCOCode prefix disagrees with AtcoAreaCode.

    We take the first 3 characters of the atcocode and check them
    against the atcoareacode for the admin area; they should match.

    Args:
        gdf ([pandas dataframe]): [The Master naptan node frame.]

    Returns:
        [pandas dataframe] -- the rows whose prefix does not match.
    """
    check_name = "stops_in_different_admin_area"
    gdf1 = gdf
    try:
        # get prefix from atcocode column
        gdf1["atcocodeprefix"] = gdf1["ATCOCode"].str[:3]
        # normalise both sides to numbers so 2-digit prefixes and int
        # typed columns compare correctly.
        gdf1["AtcoAreaCode"] = gdf1["AtcoAreaCode"].astype(str)
        gdf1["atcocodeprefix"] = pd.to_numeric(gdf1["atcocodeprefix"])
        gdf1["AtcoAreaCode"] = pd.to_numeric(gdf1["AtcoAreaCode"])
        # compare the two together, they should match
        # NOTE: despite its name, the "not matching" column holds True
        # where the prefix DOES match; the failures are the negated
        # rows selected below. Kept as-is since callers may read it.
        gdf1["not matching"] = gdf1["atcocodeprefix"].eq(
            pd.to_numeric(gdf1["AtcoAreaCode"], errors="coerce"))
        failed_nodes = gdf1[~gdf1["not matching"]]
        rep.report_failing_nodes(gdf, check_name, failed_nodes)
        # TODO compare the geometry point to the polygon boundaries of
        # the expected admin area
        # TODO if the geometry point is further 500 meters outside the
        # boundaries of the given area, then the node fails
        return failed_nodes
    except Exception as e:
        # BUG FIX: removed the unreachable sys.exit that followed this
        # re-raise.
        raise e
def stops_in_different_admin_authority_geo_position(cls, gdf):
    """Placeholder: the AtcoCode prefix represents an AdminArea other
    than the one associated with the stop's Locality.

    This highlights stops associated with a locality that is itself not
    in the same administrative area. This is often not wrong - but in
    some cases it indicates a stop that is incorrectly located, or
    associated with the wrong locality. Each example should be checked
    against the authority boundary, and the stop coordinates corrected
    if needed.

    Args:
        gdf ([gdf]): [the naptan total dataframe]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "stops_in_different_admin_authority_geo_position"
    # TODO - check if any other stops or points are within the authority
    # TODO polygon of this boundary,
    # TODO check from the surrounding admin areas
    # TODO if so add the stop to the failed nodes report file
    # TODO include how the name of the other area and distance outside.
    # list of stops not in correct admin areas by geo position.
    failed_nodes = ''
    rep.report_failing_nodes(gdf, check_name, failed_nodes)
    # BUG FIX: removed the `raise NotImplementedError` that followed the
    # return statement - it was unreachable dead code.
    return failed_nodes
def unused_locality_near_stops(cls, nodes, nptg):
    """Locality has no stops or child Localities, but is within 250
    metres of a StopPoint associated with a different Locality.

    Args:
        nodes ([type]): [description]
        nptg ([type]): [description]

    Returns:
        str: empty placeholder for failing nodes - the check body is
        not implemented yet.
    """
    check_name = "unused_locality_near_stops"
    # NOTE(review): the result of this call is never used below, and the
    # second argument is an empty string - presumably the unused
    # localities were meant to feed the distance search; confirm intent.
    unused_localities = check_nodes_match_nptg_data(nodes, "")
    # list of unused localities, finding nearest naptan stop within the area.
    # geos.g
    # list of stops not in correct admin areas by geo position.
    failed_nodes = ""
    rep.report_failing_nodes(nodes, check_name, failed_nodes)
    return failed_nodes
Arguments: gdf {[geopandas dataframe]} -- [the naptan master dataframe] columnName {[str]} -- [a given column name to search through] Returns: df -- a df object that consists of only stops with illegal characters removed, from the given field. """ check_name = 'Check Illegal Characters' # our regex pattern of allowed special characters in stop point names pattern = r"\bO/S|NO\.|P\.H\.|P\.O\.|ST\.|'s|St\.|st.\b" # our none allowed non-alphanumeric characters. searchfor = ['!', '[', ']', '.', ',', '/', '/?'] # clone dataframe, removing none gdf1 = IllegalCharacters.filter_bus_stops(gdf) # remove the nodes with the permitted exceptions. excluded_nodes = gdf1[gdf1[col_name].str.contains(pattern, case=False, regex=True)] mask = gdf1[col_name].isin(excluded_nodes[col_name]) # removing excluded nodes from bus stops frame. gdf_filter = gdf1[~mask] # use map and regex to create a generator that str contain pat = '|'.join(map(re.escape, searchfor)) # check the given column for any illegal characters filtered_nodes = gdf_filter[gdf_filter[col_name].str.contains(pat)] # report on failing nodes that contain illegal characters. report_failing_nodes(gdf, check_name, filtered_nodes) return filtered_nodes
[ mns["CommonName"].str.contains(t, regex=False, case=False) for t in terms ] ) # return common name mask and added mns = mns[cn_mask] # ln_mask = np.logical_or.reduce( [ mns["LocalityName"].str.contains(t, regex=False, case=False) for t in terms ] ) # mns = mns[ln_mask] # reports returns percentage of bad stops out of all stsops, # about 0.03% rep.report_failing_nodes(gdf, check_name, mns) return mns # double check if the return data frame is empty or not, if it's empty, we are good for that area. elif df["NameMatch"].isnull().all(): success_message = f"{gdf.AreaName.iloc[0]} has no stops names containing locality names." rep.write_basic_log_file(success_message) except ValueError as ve: # ValueError: Cannot mask with non-boolean array containing NA / # NaN values raise ve sys.exit(f"{check_name} failed because of {ve}")
def naptan_sample(): """[summary] """ # what t = rep.report_failing_nodes() return t
def check_illegal_caps(cls, gdf, col_name="StopPoint"):
    """Flag names containing disallowed capital-letter sequences.

    Descriptions: CommonNames should not contain acronyms as single
    capitals separated by spaces or full stops - with the exception of
    'R C', 'P.H.', and 'P.O.'. CommonNames should not contain a
    sequence of lowercase letter followed by uppercase letter - with
    the exceptions of 'McX' and 'MacX'.

    Args:
        gdf ([pandas dataframe]): [the master naptan nodes file.]
        col_name ([str]): [the column to scan; defaults to StopPoint.]

    Returns:
        illegal_caps [pandas dataframe]: rows whose capitals are not in
        the exception list.
    """
    # acronyms that are allowed to appear in capitals.
    except_caps = [
        "AFC", "ASDA", "BBC", "BP", "CE", "DHSS", "DLR", "FC", "GMEX",
        "HMP", "HQ", "HSBC", "II", "III", "IKEA", "IV", "IX", "MFI",
        "MOD", "NCP", "NE", "NR", "NW", "PH", "PO", "RAF", "RC",
        "RSPCA", "SE", "SPT", "SW", "VI", "VII", "VIII", "WMC", "XI",
        "XII", "YMCA", "YWCA",
    ]
    try:
        # clone
        gdf1 = gdf
        # keep only rows containing a run of 3+ capitals.
        gdf1["capitals"] = gdf1[col_name].str.count("[A-Z]{3,}")
        gdf1 = gdf1[gdf1["capitals"] != 0]
        # the below, compares a list against named column
        mask = ~gdf1[col_name].apply(
            lambda x: np.intersect1d(x, except_caps).size > 0)
        # masking if required.
        illegal_caps = gdf1[mask]
        # save the report.
        report_failing_nodes(gdf, "Check illegal capitals", illegal_caps)
        print("Illegal Captials has completed.")
        return illegal_caps
    # BUG FIX: the ValueError handler originally came AFTER
    # `except Exception`, making it unreachable (ValueError subclasses
    # Exception); most-specific handler now comes first.
    except ValueError as ve:
        print(f"{ve}")
    except Exception as e:
        print(f"{e}")
def check_nodes_match_nptg_data(cls, gdf, named_area):
    """Work-in-progress: cross-check the nodes file against NPTG data.

    Returns a list of admin areas in nptg, checks those are in the
    nodes file, if the nodes file has aac not in.

    Args:
        gdf ([type]): [the master or named area naptan data file]
        named_area ([type]): [the named area of the naptan subframe]

    Returns:
        None: the function body is unfinished (see review notes below).
    """
    # NOTE(review): check_name is commented out here but referenced in
    # the report call at the bottom of the function - as written that
    # call raises NameError.
    # check_name = "check_nodes_match_nptg_data"
    # list of all geographic admin areas
    admin_areas = [
        "Aberdeen", "Aberdeenshire", "Angus", "Argyll & Bute",
        "Bath & North East Somerset", "Bedford", "Blackburn with Darwen",
        "Blackpool", "Blaenau Gwent", "Bournemouth", "Bracknell Forest",
        "Bridgend", "Brighton and Hove", "Bristol", "Buckinghamshire",
        "Caerphilly", "Cambridgeshire", "Cardiff", "Carmarthenshire",
        "Central Bedfordshire", "Ceredigion", "Cheshire East",
        "Cheshire West & Chester", "Clackmannanshire", "Conwy",
        "Cornwall", "Cumbria", "Darlington", "Denbighshire", "Derby",
        "Derbyshire", "Devon", "Dorset", "Dumfries & Galloway", "Dundee",
        "Durham", "East Ayrshire", "East Dunbartonshire", "East Lothian",
        "East Renfrewshire", "East Riding of Yorkshire", "East Sussex",
        "Edinburgh", "Essex", "Falkirk", "Fife", "Flintshire", "Glasgow",
        "Gloucestershire", "Greater London", "Greater Manchester",
        "Gwynedd", "Halton", "Hampshire", "Hartlepool", "Herefordshire",
        "Hertfordshire", "Highland", "Inverclyde", "Isle of Anglesey",
        "Isle of Wight", "Kent", "Kingston upon Hull", "Lancashire",
        "Leicester", "Leicestershire", "Lincolnshire", "Luton", "Medway",
        "Merseyside", "Merthyr Tydfil", "Middlesbrough", "Midlothian",
        "Milton Keynes", "Monmouthshire", "Moray", "Neath Port Talbot",
        "Newport", "Norfolk", "North Ayrshire", "North East Lincolnshire",
        "North Lanarkshire", "North Lincolnshire", "North Somerset",
        "North Yorkshire", "Northamptonshire", "Northumberland",
        "Nottingham", "Nottinghamshire", "Orkney Islands", "Oxfordshire",
        "Pembrokeshire", "Perth & Kinross", "Peterborough", "Plymouth",
        "Poole", "Portsmouth", "Powys", "Reading", "Redcar & Cleveland",
        "Renfrewshire", "Rhondda Cynon Taff", "Rutland",
        "Scottish Borders", "Shetland Islands", "Shropshire", "Slough",
        "Somerset", "South Ayrshire", "South Gloucestershire",
        "South Lanarkshire", "South Yorkshire", "Southampton",
        "Southend-on-Sea", "Staffordshire", "Stirling",
        "Stockton-on-Tees", "Stoke-on-Trent", "Suffolk", "Surrey",
        "Swansea", "Swindon", "Telford & Wrekin", "Thurrock", "Torbay",
        "Torfaen", "Tyne & Wear", "Vale of Glamorgan", "Warrington",
        "Warwickshire", "West Berkshire", "West Dunbartonshire",
        "West Lothian", "West Midlands", "West Sussex", "West Yorkshire",
        "Western Isles", "Wiltshire", "Windsor & Maidenhead", "Wokingham",
        "Worcestershire", "Wrexham", "York",
    ]
    # TODO get the admin areas from teh nodes file, compare against the
    # list of area names
    # nptg values
    # NOTE(review): most of these loaded frames are never used below -
    # presumably staged for the unfinished comparison.
    adjanct_locals = etl.load_gazette_adjanct_localities()
    admin_codes = etl.naptan_gazette_admin_area_codes()
    districts = etl.naptan_gazette_districts()
    localities = etl.naptan_gazette_localities()
    locality_alternate = etl.load_gazette_localities_alternative_names()
    locality_hierarch = etl.load_gazette_locality_hierarchy()
    plusbusmap = etl.load_gazette_plusbus_mapping()
    plusbuszone = etl.load_gazette_plusbus_zones()
    regions = etl.naptan_gazette_region()
    # node values
    node_locs = gdf["LocalityName"].unique()
    # get nptg localities,
    nptg_locs = localities["LocalityName"].unique()
    # TODO filter to nptg to nodes, get all the localities in nptg for
    # this area
    # get the unique area code for this admin area.
    # NOTE(review): node_locs is a numpy ndarray of locality names -
    # indexing it with ["AdminCode"] raises; this line needs to use the
    # frame (gdf["AdminCode"].unique()), confirm intent.
    area_admin_code = node_locs["AdminCode"].unique()
    # check the area admin code in the nptg file for the corresponding
    # localities.
    # NOTE(review): nptg_locs is also an ndarray, so .AdminCode /
    # .isin() below will fail; and gaz_locs is undefined - the whole
    # tail of this function is unfinished.
    missing_localities = nptg_locs[~nptg_locs.AdminCode.isin(area_admin_code)]
    # check if locality is
    df3 = gaz_locs[gaz_locs.LocalityName.isin(gdf.LocalityName)]
    # get all the localities
    # TODO list the localities in nptg but not nodes
    # TODO plot sample on map
    # TODO write unused localities in given area to file.
    report_failing_nodes(
        gdf,
        check_name,
    )
    return
def stop_with_multiple_road_names(cls, gdf, col_name="CommonName"):
    """Flag CommonNames that combine two road-type words.

    CommonNames in NaPTAN should be simple and not composite. Most
    examples of commonnames which include two of the designated words
    are ones where two road names are used in a composite name,
    contrary to NaPTAN guidance.

    Arguments:
        gdf {[pandas dataframe]} -- [the naptan frame to scan.]
        col_name {[str]} -- [the column to scan; defaults to CommonName.]

    Returns:
        failed_nodes [pandas dataframe]: rows containing more than one
        road-type word, with the column re-title-cased.
    """
    swmrn_gdf = gdf
    swmrn_gdf[col_name] = swmrn_gdf[col_name].str.lower()
    try:
        # leave this here; the regex patterns below encode this set.
        targets = [
            "road", "roads", "street", "streets", "avenue", "avenues",
            "garden", "gardens", "lane", "lanes", "drive", "drives",
            "way", "ways",
        ]
        # BUG FIX: the original pattern contained stray "\ " escapes, a
        # "\a" escape ("\avenues" matched the BEL character + "venues"),
        # and missing "|" separators between "lanes"/"drive" and
        # "drives"/"way"; the alternation is rebuilt cleanly here.
        pattern = (r"\b(road|roads|street|streets|avenue|avenues|garden|"
                   r"gardens|lane|lanes|drive|drives|way|ways)\b")
        # BUG FIX: each fail_* pattern had stray apostrophes inside the
        # group, so the first and last alternatives matched "'street" /
        # "ways'" literally and never hit real names.
        fail_rds_re = (r"\b(street|streets|avenue|avenues|garden|"
                       r"gardens|lane|lanes|drive|drives|way|ways)\b")
        fail_aves_re = (r"\b(road|roads|street|streets|garden|gardens|"
                        r"lane|lanes|drive|drives|way|ways)\b")
        fail_gdns_re = (r"\b(road|roads|street|streets|avenue|avenues|"
                        r"lane|lanes|drive|drives|way|ways)\b")
        fail_lanes_re = (r"\b(road|roads|street|streets|avenue|avenues|"
                         r"garden|gardens|drive|drives|way|ways)\b")
        fail_drives_re = (r"\b(road|roads|street|streets|avenue|avenues|"
                          r"garden|gardens|lane|lanes|way|ways)\b")
        fail_ways_re = (r"\b(road|roads|street|streets|avenue|avenues|"
                        r"garden|gardens|lane|lanes|drive|drives)\b")
        # rows containing at least one road-type word.
        tn = swmrn_gdf[swmrn_gdf[col_name].str.contains(pattern,
                                                        regex=True)]
        # for each road-type word, keep rows that ALSO contain another.
        roads = tn[tn[col_name].str.contains(r"\b(road|roads)\b")]
        fail_rds = roads[roads[col_name].str.contains(fail_rds_re,
                                                      regex=True)]
        aves = tn[tn[col_name].str.contains(r"\b(avenue|avenues)\b")]
        fail_aves = aves[aves[col_name].str.contains(fail_aves_re,
                                                     regex=True)]
        gdns = tn[tn[col_name].str.contains(r"\b(garden|gardens)\b")]
        failgdns = gdns[gdns[col_name].str.contains(fail_gdns_re,
                                                    regex=True)]
        lanes = tn[tn[col_name].str.contains(r"\b(lane|lanes)\b")]
        faillanes = lanes[lanes[col_name].str.contains(fail_lanes_re,
                                                       regex=True)]
        drives = tn[tn[col_name].str.contains(r"\b(drive|drives)\b")]
        faildrives = drives[drives[col_name].str.contains(fail_drives_re,
                                                          regex=True)]
        ways = tn[tn[col_name].str.contains(r"\b(way|ways)\b")]
        failways = ways[ways[col_name].str.contains(fail_ways_re,
                                                    regex=True)]
        all_dfs = [
            fail_rds, fail_aves, failgdns, faillanes, faildrives, failways
        ]
        failed_nodes = pd.concat(all_dfs)
        failed_nodes[col_name] = failed_nodes[col_name].str.title()
        rep.report_failing_nodes(gdf, "Stop with Multiple road type names",
                                 failed_nodes)
        return failed_nodes
    except Exception as e:
        raise (e)