def guess_stations(flats_list, constraint, config):
    """
    Try to match the station field with a list of available stations nearby.

    For each flat, the free-text ``station`` field is fuzzy-matched against
    the names of the loaded public transport stations. A matched station is
    kept only when at least one station object bearing that name is closer
    than ``config['max_distance_housing_station']`` to the coordinates
    associated with the flat postal code. Kept stations are stored in
    ``flat["flatisfy"]["matched_stations"]``; flats without any kept station
    are left untouched.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :return: An updated list of flats dict with guessed nearby stations.
    """
    distance_threshold = config['max_distance_housing_station']
    opendata = {
        "postal_codes": data.load_data(PostalCode, constraint, config),
        "stations": data.load_data(PublicTransport, constraint, config)
    }

    # These lookups do not depend on the flat: build them once instead of
    # rescanning the whole stations list for every flat / matched station.
    station_names = [x.name for x in opendata["stations"]]
    stations_by_name = {}
    for station_data in opendata["stations"]:
        # Multiple stations with the same name exist in a city, hence the
        # list of stations objects for a given station name.
        stations_by_name.setdefault(station_data.name, []).append(station_data)

    for flat in flats_list:
        flat_station = flat.get("station", None)

        if not flat_station:
            # Skip everything if empty station
            LOGGER.info(
                "No stations field for flat %s, skipping stations lookup.",
                flat["id"])
            continue

        # Weboob modules can return several stations in a comma-separated
        # list.
        flat_stations = flat_station.split(',')
        # But some stations containing a comma exist, so let's add the
        # initial value to the list of stations to check if there was one.
        if len(flat_stations) > 1:
            flat_stations.append(flat_station)

        # Keep only one occurrence of each station. Matches are indexed as
        # (name, confidence); the same name can be matched several times with
        # different confidences (one per tentative station), so deduplicate
        # on the name and keep the highest confidence.
        best_confidence = {}
        for tentative_station in flat_stations:
            matches = fuzzy_match(
                tentative_station, station_names, limit=10, threshold=50)
            for match in matches:
                name, confidence = match[0], match[1]
                if (name not in best_confidence
                        or confidence > best_confidence[name]):
                    best_confidence[name] = confidence

        # Filter out the stations that are obviously too far and not well
        # guessed
        good_matched_stations = []
        postal_code = flat["flatisfy"].get("postal_code", None)
        postal_code_gps = None
        if postal_code:
            # A default is required here: the postal code may be missing
            # from the loaded data, and a bare next() would then raise
            # StopIteration and abort the whole processing.
            postal_code_gps = next(
                ((x.lat, x.lng) for x in opendata["postal_codes"]
                 if x.postal_code == postal_code),
                None)

        if not postal_code:
            LOGGER.info(
                "No postal code for flat %s, skipping stations detection.",
                flat["id"])
        elif postal_code_gps is None:
            LOGGER.warning(
                ("Postal code %s of flat %s has no known coordinates, "
                 "skipping stations detection."),
                postal_code, flat["id"])
        else:
            # There is a postal code with known coordinates, check that the
            # matched stations are close to it.
            for name, confidence in best_confidence.items():
                for station_data in stations_by_name.get(name, []):
                    station_gps = (station_data.lat, station_data.lng)
                    distance = tools.distance(station_gps, postal_code_gps)
                    if distance < distance_threshold:
                        # If at least one of the coordinates for a given
                        # station is close enough, that's ok and we can add
                        # the station
                        good_matched_stations.append({
                            "key": name,
                            "name": station_data.name,
                            "confidence": confidence,
                            "gps": station_gps
                        })
                        break
                    LOGGER.info(
                        ("Station %s is too far from flat %s (%dm > %dm), "
                         "discarding this station."),
                        name, flat["id"],
                        int(distance), int(distance_threshold))

        if not good_matched_stations:
            # No stations found, log it and continue with next housing
            LOGGER.info("No stations found for flat %s, matching %s.",
                        flat["id"], flat["station"])
            continue

        LOGGER.info("Found stations for flat %s: %s (matching %s).",
                    flat["id"],
                    ", ".join(x["name"] for x in good_matched_stations),
                    flat["station"])

        # If some stations were already filled in and the result is
        # different, display some warning to the user. This is a set
        # comparison, as ordering is not important.
        if "matched_stations" in flat["flatisfy"]:
            previous_names = {
                station["name"]
                for station in flat["flatisfy"]["matched_stations"]
            }
            new_names = {station["name"] for station in good_matched_stations}
            if previous_names != new_names:
                LOGGER.warning(
                    "Replacing previously fetched stations for flat %s. Found "
                    "stations differ from the previously found ones.",
                    flat["id"])

        flat["flatisfy"]["matched_stations"] = good_matched_stations

    return flats_list
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    A five-digit postal code found in the ``location`` field is used when it
    is present in the loaded postal codes data; otherwise the location is
    fuzzy-matched against the known city names. The guessed postal code is
    stored in ``flat["flatisfy"]["postal_code"]``.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.
    :return: An updated list of flats dict with guessed postal code.
    """
    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}

    # Lookups which do not depend on the flat are built once, outside of the
    # loop. When a city name appears several times, the last entry wins.
    cities = {x.name: x for x in opendata["postal_codes"]}
    # Coordinates of the first entry found for each postal code.
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng))

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(("No location field for flat %s, skipping postal "
                         "code lookup."), flat["id"])
            continue

        postal_code = None
        # Try to find a postal code directly, and check it is within the db.
        # Plain conditionals are used rather than ``assert``, as assertions
        # are stripped when Python runs with ``-O``.
        match = re.search(r"[0-9]{5}", location)
        if match is not None and match.group(0) in gps_by_postal_code:
            postal_code = match.group(0)
            LOGGER.info("Found postal code in location field for flat %s: %s.",
                        flat["id"], postal_code)

        # If not found, try to find a city
        if not postal_code:
            matched_city = fuzzy_match(location, cities.keys(), limit=1)
            if matched_city:
                # Store the matching postal code
                matched_city_name = matched_city[0][0]
                postal_code = cities[matched_city_name].postal_code
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."), flat["id"], postal_code)

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes without known coordinates are ignored
            # instead of aborting the whole processing.
            constraint_gps = [
                gps_by_postal_code[constraint_postal_code]
                for constraint_postal_code in constraint["postal_codes"]
                if constraint_postal_code in gps_by_postal_code
            ]
            if found_gps is None or not constraint_gps:
                LOGGER.warning(
                    ("Unable to compute the distance between postal code %s "
                     "found for flat %s and the constraint postal codes, "
                     "skipping the distance check."),
                    postal_code, flat["id"])
            else:
                distance = min(
                    tools.distance(found_gps, gps) for gps in constraint_gps)
                if distance > distance_threshold:
                    LOGGER.info(
                        ("Postal code %s found for flat %s is off-constraints "
                         "(distance is %dm > %dm). Let's consider it is an "
                         "artifact match and keep the post without this postal "
                         "code."), postal_code, flat["id"], int(distance),
                        int(distance_threshold))
                    postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"])
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    A five-digit postal code found in the ``location`` field is used when it
    is present in the loaded postal codes data; otherwise the location is
    fuzzy-matched against the known city names, preferring postal codes which
    are listed in the constraint. The guessed postal code is stored in
    ``flat["flatisfy"]["postal_code"]``.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.
    :return: An updated list of flats dict with guessed postal code.
    """
    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}

    # Lookups which do not depend on the flat are built once, outside of the
    # loop, instead of rescanning the postal codes data for every match.
    city_names = [x.name for x in opendata["postal_codes"]]
    # All the postal codes of a given city name, in data order.
    postal_codes_by_city = {}
    # Coordinates of the first entry found for each postal code.
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        postal_codes_by_city.setdefault(entry.name, []).append(
            entry.postal_code)
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng))

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(("No location field for flat %s, skipping postal "
                         "code lookup."), flat["id"])
            continue

        postal_code = None
        # Try to find a postal code directly, and check it is within the db.
        # Plain conditionals are used rather than ``assert``, as assertions
        # are stripped when Python runs with ``-O``.
        match = re.search(r"[0-9]{5}", location)
        if match is not None and match.group(0) in gps_by_postal_code:
            postal_code = match.group(0)
            LOGGER.info("Found postal code in location field for flat %s: %s.",
                        flat["id"], postal_code)

        # If not found, try to find a city
        if not postal_code:
            # Find all fuzzy-matching cities
            matched_cities = fuzzy_match(location, city_names, limit=None)
            # Find associated postal codes, in the order of the matches
            matched_postal_codes = []
            for matched_city_name, _ in matched_cities:
                matched_postal_codes.extend(
                    postal_codes_by_city.get(matched_city_name, []))

            # Guard against matches without any associated postal code,
            # indexing an empty list would raise IndexError.
            if matched_postal_codes:
                constraint_postal_codes = set(constraint["postal_codes"])
                # If there are some matched postal codes which are also in
                # config, use them preferentially. This avoids incorrectly
                # ignoring some flats in cities with multiple postal codes,
                # see #110. The first one in match order is taken so that
                # the result is deterministic. Otherwise, simply take the
                # first matched postal code.
                postal_code = next(
                    (candidate for candidate in matched_postal_codes
                     if candidate in constraint_postal_codes),
                    matched_postal_codes[0])
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."), flat["id"], postal_code)

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes without known coordinates are ignored
            # instead of aborting the whole processing.
            constraint_gps = [
                gps_by_postal_code[constraint_postal_code]
                for constraint_postal_code in constraint["postal_codes"]
                if constraint_postal_code in gps_by_postal_code
            ]
            if found_gps is None or not constraint_gps:
                LOGGER.warning(
                    ("Unable to compute the distance between postal code %s "
                     "found for flat %s and the constraint postal codes, "
                     "skipping the distance check."),
                    postal_code, flat["id"])
            else:
                distance = min(
                    tools.distance(found_gps, gps) for gps in constraint_gps)
                if distance > distance_threshold:
                    LOGGER.info(
                        ("Postal code %s found for flat %s is off-constraints "
                         "(distance is %dm > %dm). Let's consider it is an "
                         "artifact match and keep the post without this postal "
                         "code."), postal_code, flat["id"], int(distance),
                        int(distance_threshold))
                    postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"])
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    The ``location`` field of the flat is used, falling back on the
    ``full_address`` entry of its ``address`` field. A five-digit postal code
    found in this text is used when it is present in the loaded postal codes
    data; the final postal code, INSEE code and position are then obtained
    from ``guess_location_position``. Results are stored in the
    ``postal_code``, ``insee_code`` and ``position`` keys of
    ``flat["flatisfy"]``.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.
    :return: An updated list of flats dict with guessed postal code.
    """
    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}

    # Coordinates of the first entry found for each postal code. Built once,
    # as it does not depend on the flat.
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng))

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            addr = flat.get("address", None)
            if addr:
                # An address without ``full_address`` is handled as a missing
                # location rather than raising KeyError.
                location = addr.get("full_address")
        if not location:
            # Skip everything if empty location
            LOGGER.info(
                ("No location field for flat %s, skipping postal code lookup. (%s)"),
                flat["id"],
                flat.get("address"),
            )
            continue

        postal_code = None
        insee_code = None
        position = None

        # Try to find a postal code directly, and check it is within the db.
        # Plain conditionals are used rather than ``assert``, as assertions
        # are stripped when Python runs with ``-O``.
        match = re.search(r"[0-9]{5}", location)
        if match is not None and match.group(0) in gps_by_postal_code:
            postal_code = match.group(0)
            LOGGER.debug(
                "Found postal code directly in location field for flat %s: %s.",
                flat["id"],
                postal_code,
            )

        # Then fetch position (and postal_code if it couldn't be found
        # earlier). When a postal code is already known, only the matching
        # entries are passed as candidates.
        cities = opendata["postal_codes"]
        if postal_code:
            cities = [x for x in cities if x.postal_code == postal_code]
        (postal_code, insee_code, position) = guess_location_position(
            location, cities, constraint, postal_code is not None
        )

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes without known coordinates are ignored
            # instead of aborting the whole processing.
            constraint_gps = [
                gps_by_postal_code[constraint_postal_code]
                for constraint_postal_code in constraint["postal_codes"]
                if constraint_postal_code in gps_by_postal_code
            ]
            if found_gps is None or not constraint_gps:
                LOGGER.warning(
                    (
                        "Unable to compute the distance between postal code %s "
                        "found for flat %s and the constraint postal codes, "
                        "skipping the distance check."
                    ),
                    postal_code,
                    flat["id"],
                )
            else:
                distance = min(
                    tools.distance(found_gps, gps) for gps in constraint_gps
                )
                if distance > distance_threshold:
                    LOGGER.info(
                        (
                            "Postal code %s found for flat %s @ %s is off-constraints "
                            "(distance is %dm > %dm). Let's consider it is an "
                            "artifact match and keep the post without this postal "
                            "code."
                        ),
                        postal_code,
                        flat["id"],
                        location,
                        int(distance),
                        int(distance_threshold),
                    )
                    # NOTE(review): insee_code is left untouched here, so it
                    # is still stored below for a discarded postal code.
                    # Confirm whether this is intended.
                    postal_code = None
                    position = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code,
                    postal_code,
                    flat["id"],
                )
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

        if insee_code:
            flat["flatisfy"]["insee_code"] = insee_code

        if position:
            flat["flatisfy"]["position"] = position
        LOGGER.debug(
            "found postal_code=%s insee_code=%s position=%s for flat %s (%s).",
            postal_code,
            insee_code,
            position,
            flat["id"],
            location,
        )

    return flats_list