def geocode():
    """Geocode BASOL addresses.

    Reads the filtered BASOL records in chunks of 100, sends each chunk's
    addresses to the bulk geocoding service, and writes every record back
    with the geocoding result columns appended (latitude, longitude,
    score, result type, and a BAN address id for precise matches).
    """
    # input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # write output schema: surrogate id + input columns + geocoding columns
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")
    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]
    basol_geocoded.write_dtype(output_dtype)

    def _to_float(value):
        # The geocoder may return None or "" for a missing field;
        # map all falsy values to None, otherwise coerce to float.
        return float(value) if value else None

    with basol_geocoded.get_writer() as writer:
        for df in basol_filtered.get_dataframes(chunksize=100):
            # the geocoding payload expects None, not NaN, for missing fields
            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]
            geocoded = bulk_geocode(
                payload,
                columns=["adresse"],
                citycode="code_insee")
            for row, geocodage in zip(rows, geocoded):
                row["geocoded_latitude"] = _to_float(geocodage["latitude"])
                row["geocoded_longitude"] = _to_float(geocodage["longitude"])
                row["geocoded_result_score"] = _to_float(
                    geocodage["result_score"])
                row["geocoded_result_type"] = geocodage["result_type"]
                # Keep the BAN address id only for house-number-precise
                # matches with a good enough score. Guard against a None
                # score before comparing — `None > 0.6` raises TypeError
                # in Python 3.
                score = row["geocoded_result_score"]
                if (row["geocoded_result_type"] == precisions.HOUSENUMBER
                        and score is not None
                        and score > 0.6):
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None
                writer.write_row_dict(row)
def scrap_adresses():
    """Scrape the addresses shown on the Géorisques detail pages.

    Example: from the url
    http://www.installationsclassees.developpement-durable.gouv.fr
    /ficheEtablissement.php?champEtablBase=61&champEtablNumero=14605
    we extract => Lieu dit 'Les Murettes' 26300 BEAUREGARD BARET

    For performance reasons, addresses are scraped only for records
    whose precision is "Centroïde Commune".
    """
    # input dataset
    s3ic_filtered = Dataset("etl", "s3ic_source")

    # output dataset
    s3ic_scraped = Dataset("etl", "s3ic_scraped")

    # output schema: input columns + scraped address
    # (String() instantiated for consistency with the other schemas)
    dtype = s3ic_filtered.read_dtype()
    output_dtype = [*dtype, Column("adresse", String())]
    s3ic_scraped.write_dtype(output_dtype)

    with s3ic_scraped.get_writer() as writer:
        for df in s3ic_filtered.get_dataframes(chunksize=100):
            # scrape only imprecise records that do have a detail-page url
            filtered = df.loc[
                (df["lib_precis"] == "Centroïde Commune")
                & (df["url_fiche"].notnull())].copy()
            urls = filtered["url_fiche"].tolist()
            scrapers = [IcpeScraper(url) for url in urls]
            # fetch all pages concurrently, then parse sequentially
            fetch_parallel(scrapers)
            for scraper in scrapers:
                scraper.parse()
                scraper.find_adresse()
            filtered["adresse"] = [s.adresse for s in scrapers]

            def f(row):
                # scraped address for this row, None when it was not scraped
                try:
                    return filtered["adresse"].loc[row.name]
                except KeyError:
                    return None

            # pass f directly — wrapping it in a lambda was redundant
            df["adresse"] = df.apply(f, axis=1)
            writer.write_dataframe(df)