def remove_illegal_caps(gdf, column_name):
    """Flag stops whose name contains an illegal all-caps sequence.

    CommonNames should not contain acronyms as single capitals separated by
    spaces or full stops, with the exception of a whitelist of known-good
    acronyms (e.g. 'RC', 'PH', 'PO', 'YMCA', roman numerals, ...).

    Args:
        gdf (pandas.DataFrame): the master naptan nodes frame.
        column_name (str): name of the column to check, e.g. 'CommonName'.

    Returns:
        pandas.DataFrame: the nodes that failed the check.
    """
    check_name = remove_illegal_caps.__name__
    # Work on a copy so the caller's frame is not mutated with the helper
    # 'capitals' column.
    gdf1 = gdf.copy()
    except_caps = [
        'AFC', 'ASDA', 'BBC', 'BP', 'CE', 'DHSS', 'DLR', 'FC', 'GMEX', 'HMP',
        'HQ', 'HSBC', 'II', 'III', 'IKEA', 'IV', 'IX', 'MFI', 'MOD', 'NCP',
        'NE', 'NR', 'NW', 'PH', 'PO', 'RAF', 'RC', 'RSPCA', 'SE', 'SPT',
        'SW', 'VI', 'VII', 'VIII', 'WMC', 'XI', 'XII', 'YMCA', 'YWCA'
    ]
    # Only rows containing a run of three or more capitals are candidates.
    gdf1['capitals'] = gdf1[column_name].str.count('[A-Z]{3,}')
    gdf1 = gdf1[gdf1['capitals'] != 0]
    # BUG FIX: compare the *words* of the name against the whitelist.
    # Previously np.intersect1d(x, except_caps) treated the whole string as
    # a single element, so a name was only excluded if it equalled a
    # whitelist entry exactly.
    mask = ~gdf1[column_name].apply(
        lambda x: np.intersect1d(x.split(), except_caps).size > 0)
    iic = gdf1[mask]
    report.nodes_error_reporting(gdf, check_name, iic)
    return iic
def localities_with_identical_stops(gdf_locality):
    """Find duplicated StopPoints within a single locality.

    The CommonName of stops within a single stop area should be the same as
    each other (and as the stop area's own name) wherever possible. Given a
    frame restricted to one locality, this reports duplicated StopPoint
    entries.

    Args:
        gdf_locality (pandas.DataFrame): the master naptan node frame,
            restricted to a single locality.

    Returns:
        pandas.DataFrame or None: the duplicated nodes, or None when the
        frame covers more than one locality (the check is then skipped) or
        when the required columns are missing.
    """
    check_name = localities_with_identical_stops.__name__
    gdf1 = gdf_locality
    try:
        # The check is only meaningful when the frame covers exactly one
        # locality; otherwise we fall through and return None.
        if len(gdf1['NptgLocalityCode'].unique()) == 1:
            mask = gdf1['StopPoint'].duplicated()
            failed_nodes = gdf1[mask]
            report.nodes_error_reporting(gdf_locality, check_name,
                                         failed_nodes)
            return failed_nodes
    except Exception as e:
        # Missing columns (e.g. not a locality frame) make the test
        # impossible; report and return None rather than crash the run.
        print(f'Not a locality, test can not be performed. {e}')
def stops_in_different_admin_authority_geo_position(gdf, stops, authorities):
    """Stub: find stops whose AtcoCode prefix disagrees with the AdminArea
    of the stop's locality, by geographic position.

    This is often not wrong, but in some cases it indicates a stop that is
    incorrectly located or associated with the wrong locality. Check each
    example, confirm it is close to the authority boundary, and correct the
    stop coordinates if not.

    Args:
        gdf (pandas.DataFrame): the naptan total dataframe.
        stops: node-type stops (unused until implemented).
        authorities: admin-authority geometries (unused until implemented).

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = stops_in_different_admin_authority_geo_position.__name__
    # TODO: geo-positional cross-check not implemented yet; an empty result
    # is reported for now. (A previous `raise NotImplementedError` after the
    # return statement was unreachable dead code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def stops_in_different_admin_area(gdf):
    """Check whether a stop is in a different administrative area.

    The first three characters of the ATCOCode are compared against the
    AtcoAreaCode for the admin area; they should match.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        pandas.DataFrame: the nodes whose prefix does not match.
    """
    check_name = stops_in_different_admin_area.__name__
    # Work on a copy so the caller's AtcoAreaCode column is not coerced
    # in place (the original mutated gdf directly).
    gdf1 = gdf.copy()
    # Numeric prefix taken from the first three characters of the ATCOCode.
    gdf1['atcocodeprefix'] = pd.to_numeric(gdf1['ATCOCode'].str[:3])
    # Coerce AtcoAreaCode to numeric, accounting for 2-digit prefixes and
    # mixed int/str types; invalid values become NaN and therefore fail.
    # (The original converted this column twice; once is enough.)
    area_code = pd.to_numeric(gdf1['AtcoAreaCode'].astype(str),
                              errors='coerce')
    # NOTE: column name kept for output compatibility; True means the
    # prefix *does* match despite the name.
    gdf1['not matching'] = gdf1['atcocodeprefix'].eq(area_code)
    failed_nodes = gdf1[~gdf1['not matching']]
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def stop_names_with_high_risk_words(gdf):
    """Find StopPoints whose CommonName contains a high-risk word.

    High-risk words (case-insensitive): DELETE, DELETED, N/A, NOT IN USE,
    N/K, OBSOLETE, UNUSED.

    Args:
        gdf (pandas.DataFrame): the current naptan frame.

    Returns:
        pandas.DataFrame: the flagged nodes, with a 'Warning Flag' column
        appended naming this check.
    """
    check_name = stop_names_with_high_risk_words.__name__
    # Work on a copy: the original uppercased the caller's CommonName
    # column in place.
    gdf1 = gdf.copy()
    # BUG FIX: a missing comma fused 'NOT IN USE' and 'N/K' into the single
    # string 'NOT IN USEN/K', so neither word was ever matched.
    riskwords = [
        'DELETE', 'DELETED', 'N/A', 'NOT IN USE', 'N/K', 'OBSOLETE', 'UNUSED'
    ]
    gdf1['CommonName'] = gdf1['CommonName'].str.upper()
    gdf1['RiskWords'] = gdf1['CommonName'].apply(
        lambda x: 1 if any(i in x for i in riskwords) else 0)
    # .copy() so the insert below does not warn about writing to a slice.
    df_risks = gdf1.loc[gdf1['RiskWords'] != 0].copy()
    endcol = len(df_risks.columns)
    df_risks.insert(endcol, 'Warning Flag', check_name)
    report.nodes_error_reporting(gdf, check_name, df_risks)
    return df_risks
def stop_with_multiple_road_names(gdf):
    """Find CommonNames that combine two or more road-type words.

    CommonNames in NaPTAN should be simple and not composite. Most names
    containing two of the designated road words are composite names built
    from two road names, contrary to NaPTAN guidance.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        pandas.DataFrame: the failed nodes, with CommonName title-cased.
    """
    check_names = stop_with_multiple_road_names.__name__
    # Copy so the lower-casing below does not mutate the caller's frame.
    swmrn_gdf = gdf.copy()
    swmrn_gdf['CommonName'] = swmrn_gdf['CommonName'].str.lower()
    # BUG FIX: the original regexes were malformed — stray backslash-space
    # escapes ('\ street', '\avenues'), missing alternation bars
    # ('lanes\ drive', 'drives\ way'), and literal apostrophes inside the
    # per-family fail patterns — so matching was unreliable. Each family
    # below counts as one road word; a name containing words from two or
    # more different families is a composite name and fails the check.
    families = {
        'road': r"\b(?:road|roads)\b",
        'street': r"\b(?:street|streets)\b",
        'avenue': r"\b(?:avenue|avenues)\b",
        'garden': r"\b(?:garden|gardens)\b",
        'lane': r"\b(?:lane|lanes)\b",
        'drive': r"\b(?:drive|drives)\b",
        'way': r"\b(?:way|ways)\b",
    }
    hits = pd.DataFrame({
        name: swmrn_gdf['CommonName'].str.contains(pat, regex=True)
        for name, pat in families.items()
    })
    # Two or more distinct families present -> composite name. Unlike the
    # original pd.concat of per-family frames, this cannot emit duplicate
    # rows for names matching three or more families.
    failed_nodes = swmrn_gdf[hits.sum(axis=1) >= 2].copy()
    failed_nodes['CommonName'] = failed_nodes['CommonName'].str.title()
    report.nodes_error_reporting(gdf, check_names, failed_nodes)
    return failed_nodes
def nodes_error_reporting_tests(self):
    """Smoke-test the report module's error-reporting entry point.

    NOTE(review): this calls report.nodes_error_reporting() with no
    arguments, while every other call in this file passes
    (gdf, check_name, failed_nodes) — confirm the no-argument call is
    actually valid for that function's signature.
    """
    report.nodes_error_reporting()

def error_folder_created(self):
    """Placeholder test: should verify the error-report folder is created.

    Currently unimplemented (body is `pass`).
    """
    pass

def error_report_created(self):
    """Placeholder test: should verify the error-report file is created.

    Currently unimplemented (body is `pass`).
    """
    pass
def stop_area_members_with_different_localities(gdf):
    """Stub: find stop-area members associated with different localities.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = stop_area_members_with_different_localities.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def unused_locality_near_stops(gdf):
    """Stub: find unused localities that are near existing stops.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = unused_locality_near_stops.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def check_name_too_long(gdf):
    """Flag StopPoints whose full display name exceeds 80 characters.

    A stop fails when the combined name built from its CommonName and
    LocalityName is more than 80 characters long.

    Args:
        gdf (pandas.DataFrame): the naptan master frame.

    Returns:
        pandas.Series: the ATCOCode values of the failed nodes.
    """
    check_name = check_name_too_long.__name__
    # Work on a copy so the helper 'newName' column is not written onto the
    # caller's frame (the original mutated gdf in place).
    gdf1 = gdf.copy()
    gdf1['newName'] = (gdf1['CommonName'].astype(str) + ', '
                       + gdf1['LocalityName'].astype(str))
    mask = gdf1['newName'].str.len() > 80
    df_str = gdf1.loc[mask]
    report.nodes_error_reporting(gdf, check_name, df_str)
    return df_str.ATCOCode
def stop_with_bearing_missing(gdf):
    """Flag BCT stops that lack a valid Bearing value.

    Every BCT stop must carry a compass bearing, except those of the FLX
    (flexible zone) sub-type.

    Args:
        gdf (pandas.DataFrame): the naptan master frame.

    Returns:
        pandas.DataFrame: the nodes that failed the check.
    """
    check_name = stop_with_bearing_missing.__name__
    valid_bearing = ['SW', 'NE', 'SE', 'S', 'N', 'NW', 'E', 'W']
    # Build each condition separately for readability, then combine.
    is_bct = gdf['StopType'] == 'BCT'
    is_flexible = gdf['BusStopType'] == 'FLX'
    bearing_ok = gdf['Bearing'].isin(valid_bearing)
    failed_nodes = gdf[is_bct & ~is_flexible & ~bearing_ok]
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def locality_not_unique(gdf):
    """Stub: find localities whose names are not unique.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = locality_not_unique.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def stops_in_alternate_localities(gdf):
    """Stub: find stops assigned to alternate (non-primary) localities.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = stops_in_alternate_localities.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def stops_area_members_without_identical_names(gdf):
    """Stub: find stop-area members whose names are not identical.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = stops_area_members_without_identical_names.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def stops_in_parent_locality(gdf):
    """Stub: find stops that belong to a parent locality.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    check_name = stops_in_parent_locality.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def locality_with_unusually_elongated_shape(gdf):
    """Stub: find localities with an unusually elongated geographic shape.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    # BUG FIX: check_name previously referenced hail_ride_section_length's
    # __name__, mislabeling this check in the error reports.
    check_name = locality_with_unusually_elongated_shape.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def localities_contained_by_non_parent(gdf):
    """Stub: find localities geographically contained by a non-parent.

    Args:
        gdf (pandas.DataFrame): the master naptan node frame.

    Returns:
        str: an empty placeholder result, so the reporting pipeline stays
        consistent until the check is implemented.
    """
    # TODO: use for both the 90% overlap rule and the 40-89% rule.
    check_name = localities_contained_by_non_parent.__name__
    # TODO: check not implemented yet; an empty result is reported for now.
    # (A `raise NotImplementedError` after the return was unreachable dead
    # code and has been removed.)
    failed_nodes = ''
    report.nodes_error_reporting(gdf, check_name, failed_nodes)
    return failed_nodes
def naptan_coastal_nodes(gdf):
    # TODO: count up the False values of the Land_State column on the master
    # frame to rank stops by proximity to the coast.
    """Report nodes that fall outside the land mask (i.e. at sea).

    Uses the global-land-mask library (`globe.is_land`) to classify each
    node by its Latitude/Longitude, then reports the localities with the
    most off-land nodes.

    Args:
        gdf (pandas.DataFrame): the naptan master frame. NOTE: a
            'Land_State' boolean column is written onto this frame, per the
            TODO above.

    Raises:
        ValueError: re-raised from the land-mask classification.

    Returns:
        pandas.Series: counts of coastal nodes per LocalityName, or None if
        an unexpected error was printed and swallowed.
    """
    check_name = naptan_coastal_nodes.__name__
    try:
        gdf['Land_State'] = globe.is_land(gdf['Latitude'], gdf['Longitude'])
        coastal_nodes = gdf.loc[~gdf.Land_State]
        high_node_areas = coastal_nodes['LocalityName'].value_counts()
        percentage = (len(coastal_nodes) / len(gdf)) * 100.0
        # BUG FIX: the messages previously interpolated the entire
        # coastal_nodes DataFrame where a count was intended.
        if percentage >= 1.1:
            print(f"The area has a total of {len(coastal_nodes)} nodes "
                  f"which are at sea; error ratio {percentage:0.2f}% is "
                  f"too high.")
        elif percentage <= 0:
            print('No Nodes were found along the coastline')
        else:
            print(f"The area has a total of {len(coastal_nodes)} coastal "
                  f"nodes in the area ({percentage:0.2f}%).")
        report.nodes_error_reporting(gdf, check_name, coastal_nodes)
        return high_node_areas
    except ValueError as ve:
        # Propagate data errors to the caller unchanged.
        raise ve
    except Exception as e:
        # Best-effort check: log and continue rather than abort the run.
        print(e)
def road_name_matches_coordinates(gdf, ATCOCode):
    """Check the recorded street name against the nearest mapped road.

    Compares the 'Street' value of the stop with the road name found at the
    stop's coordinates via `get_nearest_road_name`, and reports the node if
    they differ.

    Args:
        gdf (pandas.DataFrame): the chosen dataframe.
        ATCOCode (str): the naptan unique stop id.

    Returns:
        pandas.Series or None: the ATCOCode of the failed node, or None when
        the road name matches.
    """
    check_name = road_name_matches_coordinates.__name__
    gdf1 = gdf
    node = gdf1.loc[gdf1['ATCOCode'] == ATCOCode]
    found_name = get_nearest_road_name(gdf1, ATCOCode)
    # BUG FIX: node['Street'][0] was a *label* lookup that raised KeyError
    # unless the frame's index happened to contain the label 0; use
    # positional access instead.
    if found_name[1] == node['Street'].iloc[0]:
        print('Road Name Matches')
    else:
        res = node["ATCOCode"]
        report.nodes_error_reporting(gdf, check_name, res)
        return res
# for reporting check_name = remove_illegal_chars.__name__ gdf1 = gdf # our regex pattern of allowed special characters. pattern = r"\bO/S|NO\.|P\.H\.|P\.O\.|ST\.|'s\b" excluded_nodes = gdf1[gdf1[col_name].str.contains(pattern, case=False, regex=True)] mask = gdf1[col_name].isin(excluded_nodes[col_name]) df_filter = gdf1[~mask] # removing excluded nodes from stops frame. regex = re.compile(r"\[^a-zA-Z !@#$%&\*_\+=\|:;<>,./[\]\{\}\']", flags=re.IGNORECASE) df_filter[col_name] = df_filter[col_name].str.replace(regex, '', regex=True) report.nodes_error_reporting(gdf, check_name, df_filter) result = df_filter.append(excluded_nodes) return result # %% def stop_with_bearing_missing(gdf): """[summary] The data does not include a value for “bearing” for all BCT stops except those in the FLX (flexible zone) sub-type. Args: gdf {[geopandas dataframe]} -- [The naptan master dataframe.] Returns: [type]: [description] """