Example #1
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True, past_flats=None):
    """
    Filter the available flats list. Then, filter it according to criteria.

    :param config: A config dict.
    :param constraint_name: The constraint name that the ``flats_list`` should
        satisfy.
    :param flats_list: The initial list of flat objects to filter.
    :param fetch_details: Whether additional details should be fetched between
        the two passes.
    :param past_flats: The list of already fetched flats, used to skip
        re-downloading details for known postings.
    :return: A dict mapping flat status and list of flat objects.
    """
    # Add the flatisfy metadata entry and prepare the flat objects
    flats_list = metadata.init(flats_list, constraint_name)

    # Get the associated constraint from config
    try:
        constraint = config["constraints"][constraint_name]
    except KeyError:
        LOGGER.error(
            "Missing constraint %s. Skipping filtering for these posts.",
            constraint_name,
        )
        return {"new": [], "duplicate": [], "ignored": []}

    first_pass_result = collections.defaultdict(list)
    second_pass_result = collections.defaultdict(list)
    third_pass_result = collections.defaultdict(list)
    # Do a first pass with the available infos to try to remove as many
    # unwanted postings as possible
    if config["passes"] > 0:
        first_pass_result = flatisfy.filters.first_pass(flats_list, constraint, config)
    else:
        first_pass_result["new"] = flats_list

    # Load additional infos
    if fetch_details:
        past_ids = {x["id"]: x for x in past_flats} if past_flats else {}
        for i, flat in enumerate(first_pass_result["new"]):
            details = None

            use_cache = past_ids.get(flat["id"])
            if use_cache:
                LOGGER.debug("Skipping details download for %s.", flat["id"])
                details = use_cache
            else:
                if flat["id"].split("@")[1] in ["seloger", "leboncoin"]:
                    try:
                        details = fetch.fetch_details_rate_limited(config, flat["id"])
                    except RateLimitException:
                        time.sleep(60)
                        details = fetch.fetch_details_rate_limited(config, flat["id"])
                else:
                    details = fetch.fetch_details(config, flat["id"])

            first_pass_result["new"][i] = tools.merge_dicts(flat, details)

    # Do a second pass to consolidate all the infos we found and make use of
    # additional infos
    if config["passes"] > 1:
        second_pass_result = flatisfy.filters.second_pass(first_pass_result["new"], constraint, config)
    else:
        second_pass_result["new"] = first_pass_result["new"]

    # Do a third pass to deduplicate better
    if config["passes"] > 2:
        third_pass_result = flatisfy.filters.third_pass(second_pass_result["new"], config)
    else:
        third_pass_result["new"] = second_pass_result["new"]

    return {
        "new": third_pass_result["new"],
        "duplicate": (
            first_pass_result["duplicate"] + second_pass_result["duplicate"] + third_pass_result["duplicate"]
        ),
        "ignored": (first_pass_result["ignored"] + second_pass_result["ignored"] + third_pass_result["ignored"]),
    }
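
A minimal, self-contained sketch of the caching and rate-limit-retry pattern used above. ``RateLimitException``, ``fetch_one`` and the backend names here are hypothetical stand-ins for illustration, not flatisfy's actual API.

import time


class RateLimitException(Exception):
    """Raised by a backend when too many requests are made in a short time."""


def fetch_one(flat_id):
    """Pretend to download the details of a single posting."""
    return {"id": flat_id, "details": {}}


def fetch_with_cache(flat_ids, past_flats=None, rate_limited_backends=("seloger",)):
    past_by_id = {flat["id"]: flat for flat in (past_flats or [])}
    results = []
    for flat_id in flat_ids:
        cached = past_by_id.get(flat_id)
        if cached is not None:
            # Reuse details fetched on a previous run instead of hitting the backend.
            results.append(cached)
            continue
        if flat_id.split("@")[1] in rate_limited_backends:
            try:
                results.append(fetch_one(flat_id))
            except RateLimitException:
                # Back off for a minute, then retry once, as the function above does.
                time.sleep(60)
                results.append(fetch_one(flat_id))
        else:
            results.append(fetch_one(flat_id))
    return results
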
Example #2
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
    """
    Filter the available flats list. Then, filter it according to criteria.

    :param config: A config dict.
    :param constraint_name: The constraint name that the ``flats_list`` should
        satisfy.
    :param flats_list: The initial list of flat objects to filter.
    :param fetch_details: Whether additional details should be fetched between
        the two passes.
    :return: A dict mapping flat status and list of flat objects.
    """
    # Add the flatisfy metadata entry and prepare the flat objects
    flats_list = metadata.init(flats_list, constraint_name)

    # Get the associated constraint from config
    try:
        constraint = config["constraints"][constraint_name]
    except KeyError:
        LOGGER.error(
            "Missing constraint %s. Skipping filtering for these posts.",
            constraint_name)
        return {"new": [], "duplicate": [], "ignored": []}

    first_pass_result = collections.defaultdict(list)
    second_pass_result = collections.defaultdict(list)
    third_pass_result = collections.defaultdict(list)
    # Do a first pass with the available infos to try to remove as many
    # unwanted postings as possible
    if config["passes"] > 0:
        first_pass_result = flatisfy.filters.first_pass(
            flats_list, constraint, config)
    else:
        first_pass_result["new"] = flats_list

    # Load additional infos
    if fetch_details:
        for i, flat in enumerate(first_pass_result["new"]):
            details = fetch.fetch_details(config, flat["id"])
            first_pass_result["new"][i] = tools.merge_dicts(flat, details)

    # Do a second pass to consolidate all the infos we found and make use of
    # additional infos
    if config["passes"] > 1:
        second_pass_result = flatisfy.filters.second_pass(
            first_pass_result["new"], constraint, config)
    else:
        second_pass_result["new"] = first_pass_result["new"]

    # Do a third pass to deduplicate better
    if config["passes"] > 2:
        third_pass_result = flatisfy.filters.third_pass(
            second_pass_result["new"], config)
    else:
        third_pass_result["new"] = second_pass_result["new"]

    return {
        "new": third_pass_result["new"],
        "duplicate": (
            first_pass_result["duplicate"] + second_pass_result["duplicate"] +
            third_pass_result["duplicate"]
        ),
        "ignored": (
            first_pass_result["ignored"] + second_pass_result["ignored"] +
            third_pass_result["ignored"]
        ),
    }
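
A minimal sketch of the pass-gating pattern above: ``config["passes"]`` decides how many filter passes run, a skipped pass simply forwards the previous pass's "new" bucket, and the ``defaultdict(list)`` defaults keep the final aggregation working. The pass functions below are hypothetical placeholders, not flatisfy's real filters.

import collections


def cheap_pass(flats):
    result = collections.defaultdict(list)
    for flat in flats:
        # Postings without a cost go straight to the "ignored" bucket.
        result["new" if flat.get("cost") else "ignored"].append(flat)
    return result


def expensive_pass(flats):
    return {"new": flats, "duplicate": [], "ignored": []}


def run_pipeline(flats, passes=2):
    first = collections.defaultdict(list)
    second = collections.defaultdict(list)
    if passes > 0:
        first = cheap_pass(flats)
    else:
        first["new"] = flats
    if passes > 1:
        second = expensive_pass(first["new"])
    else:
        second["new"] = first["new"]
    return {
        "new": second["new"],
        "duplicate": first["duplicate"] + second["duplicate"],
        "ignored": first["ignored"] + second["ignored"],
    }
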
Example #3
def deep_detect(flats_list):
    """
    Deeper detection of duplicates based on any available data.

    :param flats_list: A list of flats dicts.
    :return: A tuple of the deduplicated list of flat dicts and the list of all
        the flat objects that should be removed and considered as duplicates
        (they were already merged).
    """

    photo_cache = ImageCache()

    LOGGER.info("Running deep duplicates detection.")
    matching_flats = collections.defaultdict(list)
    for i, flat1 in enumerate(flats_list):
        matching_flats[flat1["id"]].append(flat1["id"])
        for j, flat2 in enumerate(flats_list):
            if i <= j:
                continue

            if flat2["id"] in matching_flats[flat1["id"]]:
                continue

            n_common_items = 0
            try:
                # They should have the same area, up to one unit
                assert abs(flat1["area"] - flat2["area"]) < 1
                n_common_items += 1

                # They should be at the same price, up to one unit
                assert abs(flat1["cost"] - flat2["cost"]) < 1
                n_common_items += 1

                # They should have the same number of bedrooms if this was
                # fetched for both
                if flat1["bedrooms"] and flat2["bedrooms"]:
                    assert flat1["bedrooms"] == flat2["bedrooms"]
                    n_common_items += 1

                # They should have the same utilities (included or excluded for
                # both of them), if this was fetched for both
                if flat1["utilities"] and flat2["utilities"]:
                    assert flat1["utilities"] == flat2["utilities"]
                    n_common_items += 1

                # They should have the same number of rooms if it was fetched
                # for both of them
                if flat1["rooms"] and flat2["rooms"]:
                    assert flat1["rooms"] == flat2["rooms"]
                    n_common_items += 1

                # They should have the same postal code, if available
                if (flat1["flatisfy"].get("postal_code", None)
                        and flat2["flatisfy"].get("postal_code", None)):
                    assert (flat1["flatisfy"]["postal_code"] ==
                            flat2["flatisfy"]["postal_code"])
                    n_common_items += 1

                # TODO: Compare texts (one is included in another? fuzzymatch?)

                # They should have the same phone number if it was fetched for
                # both
                flat1_phone = homogeneize_phone_number(flat1["phone"])
                flat2_phone = homogeneize_phone_number(flat2["phone"])
                if flat1_phone and flat2_phone:
                    assert flat1_phone == flat2_phone
                    n_common_items += 10  # Counts much more than the rest

                # They should have more than one photo in common if there
                # are some photos
                if flat1["photos"] and flat2["photos"]:
                    n_common_photos = find_number_common_photos(
                        photo_cache, flat1["photos"], flat2["photos"])
                    assert n_common_photos > 1

                    min_number_photos = min(len(flat1["photos"]),
                                            len(flat2["photos"]))

                    # Either all the photos are the same, or there are at least
                    # three common photos.
                    if n_common_photos == min_number_photos:
                        n_common_items += 15
                    else:
                        n_common_items += 5 * min(n_common_photos, 3)

                # Minimal score to consider they are duplicates
                assert n_common_items >= 15
            except (AssertionError, TypeError):
                # Skip and consider as not duplicates whenever the conditions
                # are not met
                # TypeError occurs when an area or a cost is None, which should
                # not be considered as duplicates
                continue

            # Mark flats as duplicates
            LOGGER.info(("Found duplicates using deep detection: (%s, %s). "
                         "Score is %d."), flat1["id"], flat2["id"],
                        n_common_items)
            matching_flats[flat1["id"]].append(flat2["id"])
            matching_flats[flat2["id"]].append(flat1["id"])

    if photo_cache.total():
        LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
                     photo_cache.hit_rate(), photo_cache.miss_rate())

    seen_ids = []
    duplicate_flats = []
    unique_flats_list = []
    for flat_id in [flat["id"] for flat in flats_list]:
        if flat_id in seen_ids:
            continue

        seen_ids.extend(matching_flats[flat_id])
        to_merge = sorted(
            [flat for flat in flats_list if flat["id"] in matching_flats[flat_id]],
            key=lambda flat: next(
                i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
                if flat["id"].endswith(backend)
            ),
            reverse=True,
        )
        unique_flats_list.append(tools.merge_dicts(*to_merge))
        # The ID of the added merged flat will be the one of the last item
        # in ``matching_flats``. Then, any flat object that was before in
        # the ``matching_flats`` list is to be considered as a duplicate
        # and should have a ``duplicate`` status.
        duplicate_flats.extend(to_merge[:-1])

    return unique_flats_list, duplicate_flats
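
A minimal sketch of the assert-based scoring technique used above: each satisfied criterion adds to a score and the first failing ``assert`` (or a ``TypeError`` on missing data) aborts the comparison, so the pair is treated as non-duplicates. The field names mirror the example; the weights are illustrative.

def duplicate_score(flat1, flat2):
    score = 0
    try:
        # Same area and cost, up to one unit.
        assert abs(flat1["area"] - flat2["area"]) < 1
        score += 1
        assert abs(flat1["cost"] - flat2["cost"]) < 1
        score += 1
        # A shared phone number weighs much more than the other criteria.
        if flat1.get("phone") and flat2.get("phone"):
            assert flat1["phone"] == flat2["phone"]
            score += 10
    except (AssertionError, TypeError):
        return 0
    return score


pair = (
    {"area": 35.0, "cost": 700, "phone": "0102030405"},
    {"area": 35.5, "cost": 700, "phone": "0102030405"},
)
print(duplicate_score(*pair))  # 12: matching area, cost and phone
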
Example #4
def detect(flats_list, key="id", merge=True, should_intersect=False):
    """
    Detect obvious duplicates within a given list of flats.

    There may be duplicates found, as some queries could overlap (especially
    since when asking for a given place, websites tend to return housings in
    nearby locations as well). We need to handle them, by either deleting the
    duplicates (``merge=False``) or merging them together in a single flat
    object.

    :param flats_list: A list of flats dicts.
    :param key: The flat dicts key on which the duplicate detection should be
        done.
    :param merge: Whether the found duplicates should be merged or we should
        only keep one of them.
    :param should_intersect: Set to ``True`` if the values in the flat dicts
        are lists and you want to deduplicate on non-empty intersection
        (typically if they have a common URL).

    :return: A tuple of the deduplicated list of flat dicts and the list of all
        the flat objects that should be removed and considered as duplicates
        (they were already merged).
    """
    # ``seen`` is a dict aggregating the flats by the deduplication key. We
    # basically make buckets of flats for every key value. Flats in the same
    # bucket should be merged together afterwards.
    seen = collections.defaultdict(list)
    for flat in flats_list:
        if should_intersect:
            # We add each value separately. We will add some flats multiple
            # times, but we deduplicate again on id below to compensate.
            for value in flat.get(key, []):
                seen[value].append(flat)
        else:
            seen[flat.get(key, None)].append(flat)

    # Generate the unique flats list based on these buckets
    unique_flats_list = []
    # Keep track of all the flats that were removed by deduplication
    duplicate_flats = []

    for flat_key, matching_flats in seen.items():
        if flat_key is None:
            # If the key is None, it means Weboob could not load the data. In
            # this case, we consider every matching item as being independent
            # of the others, to avoid over-deduplication.
            unique_flats_list.extend(matching_flats)
        else:
            # Sort matching flats by backend precedence
            matching_flats.sort(
                key=lambda flat: next(
                    i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
                    if flat["id"].endswith(backend)
                ),
                reverse=True,
            )

            if len(matching_flats) > 1:
                LOGGER.info("Found duplicates using key \"%s\": %s.", key,
                            [flat["id"] for flat in matching_flats])
            # Apply the requested deduplication policy
            if merge:
                # If a merge is requested, do the merge
                unique_flats_list.append(tools.merge_dicts(*matching_flats))
            else:
                # Otherwise, just keep the most important of them
                unique_flats_list.append(matching_flats[-1])

            # The ID of the added merged flat will be the one of the last item
            # in ``matching_flats``. Then, any flat object that was before in
            # the ``matching_flats`` list is to be considered as a duplicate
            # and should have a ``duplicate`` status.
            duplicate_flats.extend(matching_flats[:-1])

    if should_intersect:
        # We added some flats twice with the above method, let's deduplicate on
        # id.
        unique_flats_list, _ = detect(unique_flats_list,
                                      key="id",
                                      merge=True,
                                      should_intersect=False)

    return unique_flats_list, duplicate_flats
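
A minimal sketch of the bucketing technique behind ``detect``: group flats by a key with ``defaultdict(list)``, keep one entry per bucket and report the rest as duplicates. The precedence list and the plain ``dict.update`` merge are simplified stand-ins for flatisfy's ``BACKENDS_PRECEDENCE`` and ``tools.merge_dicts``.

import collections

BACKENDS_PRECEDENCE = ["leboncoin", "seloger"]  # later entries win


def dedup_by_key(flats, key="url"):
    buckets = collections.defaultdict(list)
    for flat in flats:
        buckets[flat.get(key)].append(flat)

    unique, duplicates = [], []
    for value, group in buckets.items():
        if value is None:
            # Missing key value: keep every flat, never deduplicate on absent data.
            unique.extend(group)
            continue
        # Put the flat from the highest-precedence backend last so its values win.
        group.sort(key=lambda flat: next(
            i for i, backend in enumerate(BACKENDS_PRECEDENCE)
            if flat["id"].endswith(backend)
        ))
        merged = {}
        for flat in group:
            merged.update(flat)
        unique.append(merged)
        duplicates.extend(group[:-1])
    return unique, duplicates


flats = [
    {"id": "1@leboncoin", "url": "http://example.com/a", "cost": 700},
    {"id": "2@seloger", "url": "http://example.com/a", "area": 35},
]
print(dedup_by_key(flats))
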
Example #5
def deep_detect(flats_list, config):
    """
    Deeper detection of duplicates based on any available data.

    :param flats_list: A list of flats dicts.
    :param config: A config dict.
    :return: A tuple of the deduplicated list of flat dicts and the list of all
        the flats objects that should be removed and considered as duplicates
        (they were already merged).
    """
    if config["serve_images_locally"]:
        storage_dir = os.path.join(config["data_directory"], "images")
    else:
        storage_dir = None
    photo_cache = ImageCache(storage_dir=storage_dir)

    LOGGER.info("Running deep duplicates detection.")
    matching_flats = collections.defaultdict(list)
    for i, flat1 in enumerate(flats_list):
        matching_flats[flat1["id"]].append(flat1["id"])
        for j, flat2 in enumerate(flats_list):
            if i <= j:
                continue

            if flat2["id"] in matching_flats[flat1["id"]]:
                continue

            n_common_items = get_duplicate_score(
                flat1, flat2, photo_cache,
                config["duplicate_image_hash_threshold"])

            # Minimal score to consider they are duplicates
            if n_common_items >= config["duplicate_threshold"]:
                # Mark flats as duplicates
                LOGGER.info(
                    "Found duplicates using deep detection: (%s, %s). Score is %d.",
                    flat1["id"],
                    flat2["id"],
                    n_common_items,
                )
                matching_flats[flat1["id"]].append(flat2["id"])
                matching_flats[flat2["id"]].append(flat1["id"])

    if photo_cache.total():
        LOGGER.debug(
            "Photo cache: hits: %d%% / misses: %d%%.",
            photo_cache.hit_rate(),
            photo_cache.miss_rate(),
        )

    seen_ids = []
    duplicate_flats = []
    unique_flats_list = []
    for flat_id in [flat["id"] for flat in flats_list]:
        if flat_id in seen_ids:
            continue

        seen_ids.extend(matching_flats[flat_id])
        to_merge = sorted(
            [
                flat
                for flat in flats_list if flat["id"] in matching_flats[flat_id]
            ],
            key=lambda flat: next(i for (i, backend) in enumerate(
                BACKENDS_BY_PRECEDENCE) if flat["id"].endswith(backend)),
            reverse=True,
        )
        unique_flats_list.append(tools.merge_dicts(*to_merge))
        # The ID of the added merged flat will be the one of the last item
        # in ``matching_flats``. Then, any flat object that was before in
        # the ``matching_flats`` list is to be considered as a duplicate
        # and should have a ``duplicate`` status.
        duplicate_flats.extend(to_merge[:-1])

    return unique_flats_list, duplicate_flats
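
A minimal sketch of the final grouping step above: ``matching_flats`` maps every flat id to the ids detected as its duplicates, each group is merged once, and all but the last (canonical) entry are reported as duplicates. Toy data and a plain ``dict.update`` merge stand in for the real flats and ``tools.merge_dicts``.

import collections

flats = [
    {"id": "1@seloger", "cost": 700},
    {"id": "2@leboncoin", "cost": 700, "area": 35},
    {"id": "3@pap", "cost": 950},
]
matching = collections.defaultdict(list)
matching["1@seloger"] = ["1@seloger", "2@leboncoin"]
matching["2@leboncoin"] = ["2@leboncoin", "1@seloger"]
matching["3@pap"] = ["3@pap"]

seen_ids, unique, duplicates = [], [], []
for flat_id in [flat["id"] for flat in flats]:
    if flat_id in seen_ids:
        continue
    seen_ids.extend(matching[flat_id])
    group = [flat for flat in flats if flat["id"] in matching[flat_id]]
    merged = {}
    for flat in group:
        merged.update(flat)  # the last flat in the group provides the surviving id
    unique.append(merged)
    duplicates.extend(group[:-1])

print(unique)      # the merged 1/2 pair plus the untouched 3@pap entry
print(duplicates)  # the 1@seloger entry that was folded into the merge
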