def validate_config(config, check_with_data):
    """
    Check that the config passed as argument is a valid configuration.

    :param config: A config dictionary to validate.
    :param check_with_data: Whether we should use the available OpenData to
        check the postal codes listed in the constraints.
    :return: ``True`` if the configuration is valid. Otherwise, the source
        text of the line holding the first check that failed (taken from
        the traceback), so that it can be shown to the user.
    """
    def _check_constraints_bounds(bounds):
        """
        Check the bounds for numeric constraints.

        Bounds must be a two-items sequence of ``None`` or non-negative
        numbers, the second one being strictly greater than the first one
        when both are set.
        """
        # Reject non-sequence values (``None``, numbers, ...) through an
        # assertion: ``len()`` would raise an uncaught ``TypeError`` on them.
        assert isinstance(bounds, (list, tuple))
        assert len(bounds) == 2
        assert all(x is None or (isinstance(x, (float, int)) and x >= 0) for x in bounds)  # noqa: E501
        if bounds[0] is not None and bounds[1] is not None:
            assert bounds[1] > bounds[0]

    try:
        # Note: The traceback fetching code only handle single line asserts.
        # Then, we disable line-too-long pylint check and E501 flake8 checks
        # and use long lines whenever needed, in order to have the full assert
        # message in the log output.
        # pylint: disable=locally-disabled,line-too-long
        assert config["passes"] in [0, 1, 2, 3]
        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501

        assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501
        # ``None`` is accepted by the check above; only probe the filesystem
        # for actual paths, as ``os.path.isdir(None)`` raises ``TypeError``.
        assert config["data_directory"] is None or os.path.isdir(config["data_directory"])  # noqa: E501
        assert isinstance(config["search_index"], str)
        assert config["modules_path"] is None or isinstance(config["modules_path"], str)  # noqa: E501

        assert config["database"] is None or isinstance(config["database"], str)  # noqa: E501

        assert isinstance(config["port"], int)
        assert isinstance(config["host"], str)
        assert config["webserver"] is None or isinstance(config["webserver"], str)  # noqa: E501
        assert config["backends"] is None or isinstance(config["backends"], list)  # noqa: E501

        assert isinstance(config["send_email"], bool)
        assert config["smtp_server"] is None or isinstance(config["smtp_server"], str)  # noqa: E501
        assert config["smtp_port"] is None or isinstance(config["smtp_port"], int)  # noqa: E501
        assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)

        # Ensure constraints are ok
        assert config["constraints"]
        for constraint in config["constraints"].values():
            assert "type" in constraint
            assert isinstance(constraint["type"], str)
            assert constraint["type"].upper() in ["RENT", "SALE", "SHARING"]

            assert "minimum_pictures" in constraint
            assert isinstance(constraint["minimum_pictures"], int)
            assert constraint["minimum_pictures"] >= 0

            assert "house_types" in constraint
            assert constraint["house_types"]
            for house_type in constraint["house_types"]:
                assert house_type.upper() in ["APART", "HOUSE", "PARKING", "LAND", "OTHER", "UNKNOWN"]  # noqa: E501

            assert "postal_codes" in constraint
            assert constraint["postal_codes"]
            if check_with_data:
                opendata_postal_codes = [
                    x.postal_code
                    for x in data.load_data(PostalCode, constraint, config)
                ]
                for postal_code in constraint["postal_codes"]:
                    assert postal_code in opendata_postal_codes  # noqa: E501

            assert "area" in constraint
            _check_constraints_bounds(constraint["area"])

            assert "cost" in constraint
            _check_constraints_bounds(constraint["cost"])

            assert "rooms" in constraint
            _check_constraints_bounds(constraint["rooms"])

            assert "bedrooms" in constraint
            _check_constraints_bounds(constraint["bedrooms"])

            assert "time_to" in constraint
            assert isinstance(constraint["time_to"], dict)
            for name, item in constraint["time_to"].items():
                assert isinstance(name, str)
                assert "gps" in item
                assert isinstance(item["gps"], list)
                assert len(item["gps"]) == 2
                assert "time" in item
                _check_constraints_bounds(item["time"])

        return True
    except (AssertionError, KeyError):
        # Report the source line of the innermost failing check.
        _, _, exc_traceback = sys.exc_info()
        return traceback.extract_tb(exc_traceback)[-1][-1]
def guess_stations(flats_list, constraint, config):
    """
    Try to match the station field with a list of available stations nearby.

    For each flat with a ``station`` field, candidate station names are fuzzy
    matched against the loaded public transport data. A matched station is
    kept only if at least one station bearing that name lies closer to the
    flat's postal code than ``config["max_distance_housing_station"]``. The
    result is stored in ``flat["flatisfy"]["matched_stations"]``.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :return: An updated list of flats dict with guessed nearby stations.
    """
    distance_threshold = config["max_distance_housing_station"]

    postal_codes_data = data.load_data(PostalCode, constraint, config)
    stations_data = data.load_data(PublicTransport, constraint, config)

    # Loop-invariant lookups, built once instead of once per flat / station.
    # ``setdefault`` keeps the first coordinates seen for a postal code.
    gps_by_postal_code = {}
    for entry in postal_codes_data:
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng)
        )
    # NOTE(review): the same list is handed to every ``fuzzy_match`` call;
    # this assumes ``fuzzy_match`` does not mutate its choices -- confirm.
    station_names = [x.name for x in stations_data]
    # Multiple stations with the same name exist in a city, hence the list
    # of stations objects for a given station name.
    stations_by_name = {}
    for station_data in stations_data:
        stations_by_name.setdefault(station_data.name, []).append(
            station_data
        )

    for flat in flats_list:
        flat_station = flat.get("station", None)

        if not flat_station:
            # Skip everything if empty station
            LOGGER.info(
                "No stations field for flat %s, skipping stations lookup.",
                flat["id"])
            continue

        # Weboob modules can return several stations in a comma-separated list.
        flat_stations = flat_station.split(',')
        # But some stations containing a comma exist, so let's add the initial
        # value to the list of stations to check if there was one.
        if len(flat_stations) > 1:
            flat_stations.append(flat_station)

        matched_stations = []
        for tentative_station in flat_stations:
            matched_stations += fuzzy_match(
                tentative_station,
                station_names,
                limit=10, threshold=50)

        # Keep only one occurrence of each station
        matched_stations = list(set(matched_stations))

        # Filter out the stations that are obviously too far and not well
        # guessed
        good_matched_stations = []
        postal_code = flat["flatisfy"].get("postal_code", None)
        if not postal_code:
            LOGGER.info(
                "No postal code for flat %s, skipping stations detection.",
                flat["id"])
        elif postal_code not in gps_by_postal_code:
            # Without coordinates for the postal code, no distance can be
            # computed. Skip the detection instead of crashing on it.
            LOGGER.warning(
                "Postal code %s of flat %s is missing from the loaded data, "
                "skipping stations detection.",
                postal_code, flat["id"])
        else:
            # There is a known postal code, check that the matched stations
            # are close to it.
            postal_code_gps = gps_by_postal_code[postal_code]
            for station in matched_stations:
                for station_data in stations_by_name.get(station[0], []):
                    distance = tools.distance(
                        (station_data.lat, station_data.lng),
                        postal_code_gps)
                    if distance < distance_threshold:
                        # If at least one of the coordinates for a given
                        # station is close enough, that's ok and we can add
                        # the station
                        good_matched_stations.append({
                            "key": station[0],
                            "name": station_data.name,
                            "confidence": station[1],
                            "gps": (station_data.lat, station_data.lng)
                        })
                        break
                    LOGGER.info(
                        ("Station %s is too far from flat %s (%dm > %dm), "
                         "discarding this station."),
                        station[0], flat["id"],
                        int(distance), int(distance_threshold))

        if not good_matched_stations:
            # No stations found, log it and continue with next housing
            LOGGER.info("No stations found for flat %s, matching %s.",
                        flat["id"], flat["station"])
            continue

        LOGGER.info("Found stations for flat %s: %s (matching %s).",
                    flat["id"],
                    ", ".join(x["name"] for x in good_matched_stations),
                    flat["station"])

        # If some stations were already filled in and the result is different,
        # display some warning to the user
        if "matched_stations" in flat["flatisfy"]:
            # Do a set comparison, as ordering is not important
            previous_names = {
                station["name"]
                for station in flat["flatisfy"]["matched_stations"]
            }
            new_names = {station["name"] for station in good_matched_stations}
            if previous_names != new_names:
                LOGGER.warning(
                    "Replacing previously fetched stations for flat %s. Found "
                    "stations differ from the previously found ones.",
                    flat["id"])

        flat["flatisfy"]["matched_stations"] = good_matched_stations

    return flats_list
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    A five-digits postal code is first searched in the ``location`` field. If
    none is found (or if it is not in the loaded data), city names are fuzzy
    matched against the location instead. The guessed postal code is stored
    in ``flat["flatisfy"]["postal_code"]``.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.

    :return: An updated list of flats dict with guessed postal code.
    """
    postal_codes_data = data.load_data(PostalCode, constraint, config)

    # Loop-invariant lookups, built once instead of once per flat.
    # ``setdefault`` keeps the first coordinates seen for a postal code.
    gps_by_postal_code = {}
    postal_codes_by_city = {}
    for entry in postal_codes_data:
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng)
        )
        postal_codes_by_city.setdefault(entry.name, []).append(
            entry.postal_code
        )
    # NOTE(review): the same list is handed to every ``fuzzy_match`` call;
    # this assumes ``fuzzy_match`` does not mutate its choices -- confirm.
    city_names = [x.name for x in postal_codes_data]

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(("No location field for flat %s, skipping postal "
                         "code lookup."), flat["id"])
            continue

        postal_code = None
        # Try to find a postal code directly. Plain conditionals are used
        # rather than asserts, which are stripped when running with ``-O``.
        found = re.search(r"[0-9]{5}", location)
        # Check the postal code is within the db
        if found is not None and found.group(0) in gps_by_postal_code:
            postal_code = found.group(0)
            LOGGER.info("Found postal code in location field for flat %s: %s.",
                        flat["id"], postal_code)

        # If not found, try to find a city
        if not postal_code:
            # Find all fuzzy-matching cities
            matched_cities = fuzzy_match(location, city_names, limit=None)
            # Find associated postal codes
            matched_postal_codes = []
            for matched_city_name, _ in matched_cities or []:
                matched_postal_codes.extend(
                    postal_codes_by_city.get(matched_city_name, [])
                )
            if matched_postal_codes:
                # Try to match them with postal codes in config constraint.
                # If there are some matched postal codes which are also in
                # config, use them preferentially. This avoid ignoring
                # incorrectly some flats in cities with multiple postal
                # codes, see #110.
                # The first one in match order is picked, so that the choice
                # is deterministic. Otherwise, simply take the first matched
                # postal code.
                constraint_postal_codes = set(constraint["postal_codes"])
                postal_code = next(
                    (
                        candidate for candidate in matched_postal_codes
                        if candidate in constraint_postal_codes
                    ),
                    matched_postal_codes[0]
                )
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."),
                    flat["id"], postal_code)

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes missing from the data are ignored: a
            # bare ``next()`` on them would surface as a ``RuntimeError``.
            constraint_gps = [
                gps_by_postal_code[constraint_postal_code]
                for constraint_postal_code in constraint["postal_codes"]
                if constraint_postal_code in gps_by_postal_code
            ]
            if found_gps is None or not constraint_gps:
                LOGGER.warning(
                    "Missing coordinates to check postal code %s of flat %s "
                    "against the constraint, keeping it unchecked.",
                    postal_code, flat["id"])
            else:
                distance = min(
                    tools.distance(found_gps, gps) for gps in constraint_gps
                )
                if distance > distance_threshold:
                    LOGGER.info(
                        ("Postal code %s found for flat %s is off-constraints "
                         "(distance is %dm > %dm). Let's consider it is an "
                         "artifact match and keep the post without this postal "
                         "code."),
                        postal_code, flat["id"],
                        int(distance), int(distance_threshold))
                    postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"])
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    A five-digits postal code is first searched in the ``location`` field. If
    none is found (or if it is not in the loaded data), the best
    fuzzy-matching city name is used instead. The guessed postal code is
    stored in ``flat["flatisfy"]["postal_code"]``.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.

    :return: An updated list of flats dict with guessed postal code.
    """
    postal_codes_data = data.load_data(PostalCode, constraint, config)

    # Loop-invariant lookups, built once instead of once per flat.
    # ``setdefault`` keeps the first coordinates seen for a postal code.
    gps_by_postal_code = {}
    for entry in postal_codes_data:
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng)
        )
    # When several entries share a name, the last one wins.
    cities = {x.name: x for x in postal_codes_data}

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(("No location field for flat %s, skipping postal "
                         "code lookup."), flat["id"])
            continue

        postal_code = None
        # Try to find a postal code directly. Plain conditionals are used
        # rather than asserts, which are stripped when running with ``-O``.
        found = re.search(r"[0-9]{5}", location)
        # Check the postal code is within the db
        if found is not None and found.group(0) in gps_by_postal_code:
            postal_code = found.group(0)
            LOGGER.info("Found postal code in location field for flat %s: %s.",
                        flat["id"], postal_code)

        # If not found, try to find a city
        if not postal_code:
            matched_city = fuzzy_match(location, cities.keys(), limit=1)
            if matched_city:
                # Store the matching postal code
                matched_city_name = matched_city[0][0]
                postal_code = cities[matched_city_name].postal_code
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."),
                    flat["id"], postal_code)

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes missing from the data are ignored: a
            # bare ``next()`` on them would surface as a ``RuntimeError``.
            constraint_gps = [
                gps_by_postal_code[constraint_postal_code]
                for constraint_postal_code in constraint["postal_codes"]
                if constraint_postal_code in gps_by_postal_code
            ]
            if found_gps is None or not constraint_gps:
                LOGGER.warning(
                    "Missing coordinates to check postal code %s of flat %s "
                    "against the constraint, keeping it unchecked.",
                    postal_code, flat["id"])
            else:
                distance = min(
                    tools.distance(found_gps, gps) for gps in constraint_gps
                )
                if distance > distance_threshold:
                    LOGGER.info(
                        ("Postal code %s found for flat %s is off-constraints "
                         "(distance is %dm > %dm). Let's consider it is an "
                         "artifact match and keep the post without this postal "
                         "code."),
                        postal_code, flat["id"],
                        int(distance), int(distance_threshold))
                    postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"])
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
def validate_config(config, check_with_data):
    """
    Check that the config passed as argument is a valid configuration.

    :param config: A config dictionary to validate.
    :param check_with_data: Whether we should use the available OpenData to
        check the postal codes listed in the constraints.
    :return: ``True`` if the configuration is valid. Otherwise, the source
        text of the line holding the first check that failed (taken from
        the traceback), so that it can be shown to the user.
    """
    def _check_constraints_bounds(bounds):
        """
        Check the bounds for numeric constraints.

        Bounds must be a two-items list of ``None`` or non-negative numbers,
        the second one being strictly greater than the first one when both
        are set.
        """
        assert isinstance(bounds, list)
        assert len(bounds) == 2
        assert all(x is None or (isinstance(x, (float, int)) and x >= 0) for x in bounds)  # noqa: E501
        if bounds[0] is not None and bounds[1] is not None:
            assert bounds[1] > bounds[0]

    try:
        # Note: The traceback fetching code only handle single line asserts.
        # Then, we disable line-too-long pylint check and E501 flake8 checks
        # and use long lines whenever needed, in order to have the full assert
        # message in the log output.
        # pylint: disable=locally-disabled,line-too-long
        assert config["passes"] in [0, 1, 2, 3]
        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501

        assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501
        # ``None`` is accepted by the check above; only probe the filesystem
        # for actual paths, as ``os.path.isdir(None)`` raises ``TypeError``.
        assert config["data_directory"] is None or os.path.isdir(config["data_directory"])  # noqa: E501
        assert isinstance(config["search_index"], str)
        assert config["modules_path"] is None or isinstance(config["modules_path"], str)  # noqa: E501

        assert config["database"] is None or isinstance(config["database"], str)  # noqa: E501

        assert isinstance(config["port"], int)
        assert isinstance(config["host"], str)
        assert config["webserver"] is None or isinstance(config["webserver"], str)  # noqa: E501
        assert config["backends"] is None or isinstance(config["backends"], list)  # noqa: E501

        assert isinstance(config["send_email"], bool)
        assert config["smtp_server"] is None or isinstance(config["smtp_server"], str)  # noqa: E501
        assert config["smtp_port"] is None or isinstance(config["smtp_port"], int)  # noqa: E501
        assert config["smtp_username"] is None or isinstance(config["smtp_username"], str)  # noqa: E501
        assert config["smtp_password"] is None or isinstance(config["smtp_password"], str)  # noqa: E501
        assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)

        assert isinstance(config["store_personal_data"], bool)
        assert isinstance(config["max_distance_housing_station"], (int, float))
        assert isinstance(config["duplicate_threshold"], int)
        assert isinstance(config["duplicate_image_hash_threshold"], int)

        # API keys
        assert config["navitia_api_key"] is None or isinstance(config["navitia_api_key"], str)  # noqa: E501
        assert config["mapbox_api_key"] is None or isinstance(config["mapbox_api_key"], str)  # noqa: E501

        # Ensure constraints are ok
        assert config["constraints"]
        for constraint in config["constraints"].values():
            assert "type" in constraint
            assert isinstance(constraint["type"], str)
            assert constraint["type"].upper() in POSTS_TYPES.__members__

            assert "minimum_nb_photos" in constraint
            if constraint["minimum_nb_photos"]:
                assert isinstance(constraint["minimum_nb_photos"], int)
                assert constraint["minimum_nb_photos"] >= 0

            assert "description_should_contain" in constraint
            assert isinstance(constraint["description_should_contain"], list)
            if constraint["description_should_contain"]:
                for term in constraint["description_should_contain"]:
                    assert isinstance(term, str)

            assert "description_should_not_contain" in constraint
            assert isinstance(constraint["description_should_not_contain"], list)
            if constraint["description_should_not_contain"]:
                for term in constraint["description_should_not_contain"]:
                    assert isinstance(term, str)

            assert "house_types" in constraint
            assert constraint["house_types"]
            for house_type in constraint["house_types"]:
                assert house_type.upper() in HOUSE_TYPES.__members__

            assert "postal_codes" in constraint
            assert constraint["postal_codes"]
            assert all(isinstance(x, str) for x in constraint["postal_codes"])
            if check_with_data:
                # Ensure data is built into db
                data.preprocess_data(config, force=False)
                # Check postal codes
                opendata_postal_codes = [
                    x.postal_code
                    for x in data.load_data(PostalCode, constraint, config)
                ]
                for postal_code in constraint["postal_codes"]:
                    assert postal_code in opendata_postal_codes  # noqa: E501

            assert "area" in constraint
            _check_constraints_bounds(constraint["area"])

            assert "cost" in constraint
            _check_constraints_bounds(constraint["cost"])

            assert "rooms" in constraint
            _check_constraints_bounds(constraint["rooms"])

            assert "bedrooms" in constraint
            _check_constraints_bounds(constraint["bedrooms"])

            assert "time_to" in constraint
            assert isinstance(constraint["time_to"], dict)
            for name, item in constraint["time_to"].items():
                assert isinstance(name, str)
                assert "gps" in item
                assert isinstance(item["gps"], list)
                assert len(item["gps"]) == 2
                assert "time" in item
                _check_constraints_bounds(item["time"])
                if "mode" in item:
                    # An unknown mode raises ``KeyError``, reported below as
                    # a validation failure.
                    TimeToModes[item["mode"]]

        return True
    except (AssertionError, KeyError):
        # Report the source line of the innermost failing check.
        _, _, exc_traceback = sys.exc_info()
        return traceback.extract_tb(exc_traceback)[-1][-1]
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    The ``location`` field of the flat is used, falling back on the full
    address from its ``address`` field. A five-digits postal code is first
    searched in it, then ``guess_location_position`` is called to get the
    postal code, INSEE code and position. These are stored in
    ``flat["flatisfy"]`` under the ``postal_code``, ``insee_code`` and
    ``position`` keys.

    :param flats_list: A list of flats dict.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.

    :return: An updated list of flats dict with guessed postal code.
    """
    postal_codes_data = data.load_data(PostalCode, constraint, config)

    # Loop-invariant lookups, built once instead of once per flat.
    # ``setdefault`` keeps the first coordinates seen for a postal code.
    gps_by_postal_code = {}
    entries_by_postal_code = {}
    for entry in postal_codes_data:
        gps_by_postal_code.setdefault(
            entry.postal_code, (entry.lat, entry.lng)
        )
        entries_by_postal_code.setdefault(entry.postal_code, []).append(entry)

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            addr = flat.get("address", None)
            if addr:
                location = addr["full_address"]
        if not location:
            # Skip everything if empty location
            LOGGER.info(
                ("No location field for flat %s, skipping postal code lookup. (%s)"),
                flat["id"],
                flat.get("address"),
            )
            continue

        postal_code = None
        insee_code = None
        position = None

        # Try to find a postal code directly. Plain conditionals are used
        # rather than asserts, which are stripped when running with ``-O``.
        found = re.search(r"[0-9]{5}", location)
        # Check the postal code is within the db
        if found is not None and found.group(0) in gps_by_postal_code:
            postal_code = found.group(0)
            LOGGER.debug(
                "Found postal code directly in location field for flat %s: %s.",
                flat["id"],
                postal_code,
            )

        # Then fetch position (and postal_code if it couldn't be found
        # earlier). When a postal code is already known, only the entries
        # bearing it are handed over (as a fresh list).
        cities = postal_codes_data
        if postal_code:
            cities = list(entries_by_postal_code[postal_code])
        (postal_code, insee_code, position) = guess_location_position(
            location, cities, constraint, postal_code is not None
        )

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes missing from the data are ignored: a
            # bare ``next()`` on them would surface as a ``RuntimeError``.
            constraint_gps = [
                gps_by_postal_code[constraint_postal_code]
                for constraint_postal_code in constraint["postal_codes"]
                if constraint_postal_code in gps_by_postal_code
            ]
            if found_gps is None or not constraint_gps:
                LOGGER.warning(
                    "Missing coordinates to check postal code %s of flat %s "
                    "against the constraint, keeping it unchecked.",
                    postal_code,
                    flat["id"],
                )
            else:
                distance = min(
                    tools.distance(found_gps, gps) for gps in constraint_gps
                )
                if distance > distance_threshold:
                    LOGGER.info(
                        (
                            "Postal code %s found for flat %s @ %s is off-constraints "
                            "(distance is %dm > %dm). Let's consider it is an "
                            "artifact match and keep the post without this postal "
                            "code."
                        ),
                        postal_code,
                        flat["id"],
                        location,
                        int(distance),
                        int(distance_threshold),
                    )
                    postal_code = None
                    position = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code,
                    postal_code,
                    flat["id"],
                )
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

        if insee_code:
            flat["flatisfy"]["insee_code"] = insee_code

        if position:
            flat["flatisfy"]["position"] = position
        LOGGER.debug(
            "found postal_code=%s insee_code=%s position=%s for flat %s (%s).",
            postal_code,
            insee_code,
            position,
            flat["id"],
            location,
        )

    return flats_list