示例#1
0
# Minimum geocoding score (exclusive) a house-number match must exceed
# for its address identifier to be kept.
_MIN_HOUSENUMBER_SCORE = 0.6


def _to_float_or_none(value):
    """Convert a geocoder field to float, mapping None and "" to None."""
    if value is None or value == "":
        return None
    return float(value)


def geocode():
    """
    Geocode Basol addresses.

    Reads the "basol_filtered" dataset in chunks of 100 rows, sends the
    "adresse" and "code_insee" fields of each row to ``bulk_geocode`` and
    writes every input row to "basol_geocoded", extended with:

    - geocoded_latitude / geocoded_longitude
    - geocoded_result_score / geocoded_result_type
    - adresse_id: the geocoder's "result_id", kept only when the result
      type is ``precisions.HOUSENUMBER`` and the score is strictly greater
      than 0.6; None otherwise
    """

    # input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # write output schema: a new auto-incremented "id" primary key,
    # then the input columns, then the geocoding result columns
    # NOTE(review): presumably primary_key="numerobasol" tells read_dtype
    # how to treat the source key column - confirm in Dataset.read_dtype
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")

    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]

    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:

        for df in basol_filtered.get_dataframes(chunksize=100):

            # NaN is not a usable "missing" marker in the row dicts
            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]

            geocoded = bulk_geocode(payload,
                                    columns=["adresse"],
                                    citycode="code_insee")

            # NOTE(review): assumes bulk_geocode returns one result per
            # payload entry, in the same order; zip silently drops the
            # tail if it returns fewer - confirm
            for row, geocodage in zip(rows, geocoded):
                row["geocoded_latitude"] = \
                    _to_float_or_none(geocodage["latitude"])
                row["geocoded_longitude"] = \
                    _to_float_or_none(geocodage["longitude"])
                score = _to_float_or_none(geocodage["result_score"])
                row["geocoded_result_score"] = score
                row["geocoded_result_type"] = geocodage["result_type"]

                # the score may be missing: comparing None with a float
                # raises TypeError, so test for it explicitly
                is_reliable = (
                    row["geocoded_result_type"] == precisions.HOUSENUMBER
                    and score is not None
                    and score > _MIN_HOUSENUMBER_SCORE
                )
                row["adresse_id"] = \
                    geocodage["result_id"] if is_reliable else None

                writer.write_row_dict(row)
def scrap_adresses():
    """
    Scrape the addresses shown on the Géorisques detail pages.

    Example:

    From the url
    http://www.installationsclassees.developpement-durable.gouv.fr
    /ficheEtablissement.php?champEtablBase=61&champEtablNumero=14605

    we extract => Lieu dit 'Les Murettes' 26300 BEAUREGARD BARET

    For performance reasons, addresses are scraped only for the records
    whose precision is "Centroïde Commune"; every other record gets
    None in the "adresse" column.
    """

    # input dataset
    source_dataset = Dataset("etl", "s3ic_source")

    # output dataset
    scraped_dataset = Dataset("etl", "s3ic_scraped")

    # output schema: the input columns plus the scraped address
    scraped_dataset.write_dtype(
        [*source_dataset.read_dtype(), Column("adresse", String)])

    with scraped_dataset.get_writer() as writer:

        for chunk in source_dataset.get_dataframes(chunksize=100):

            # keep only the rows worth scraping
            is_commune_centroid = chunk["lib_precis"] == "Centroïde Commune"
            has_url = chunk["url_fiche"].notnull()
            to_scrape = chunk.loc[is_commune_centroid & has_url].copy()

            scrapers = [IcpeScraper(url)
                        for url in to_scrape["url_fiche"].tolist()]
            fetch_parallel(scrapers)

            for scraper in scrapers:
                scraper.parse()
                scraper.find_adresse()

            to_scrape["adresse"] = [scraper.adresse for scraper in scrapers]

            def lookup_adresse(row):
                # rows that were not scraped are absent from to_scrape
                try:
                    return to_scrape["adresse"].loc[row.name]
                except KeyError:
                    return None

            chunk["adresse"] = chunk.apply(lookup_adresse, axis=1)

            writer.write_dataframe(chunk)