def filter_departements():
    """
    Keep only the records located in the departments selected
    in the configuration (DEPARTEMENTS).
    """
    # Input dataset
    sis_source = Dataset("etl", "sis_source")
    # Output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    # str.startswith accepts a tuple of prefixes: replaces the manual
    # loop + flag + break over DEPARTEMENTS
    prefixes = tuple(DEPARTEMENTS)

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            if row["code_insee"].startswith(prefixes):
                writer.write_row_dict(row)
def geocode():
    """
    Geocode Basol addresses using the `adresse` and `code_insee` fields.

    Adds geocoded_latitude/longitude, the geocoder score and result
    type, and an `adresse_id` when the match is precise enough
    (HOUSENUMBER with score > 0.6).
    """

    def to_float(value):
        # The geocoder may return None or "" — keep those as None
        return float(value) if value else None

    # Input dataset
    basol_filtered = Dataset("etl", "basol_filtered")
    # Output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # Write output schema
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")
    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]
    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:
        for df in basol_filtered.get_dataframes(chunksize=100):
            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]
            geocoded = bulk_geocode(
                payload,
                columns=["adresse"],
                citycode="code_insee")
            # zip directly — no need to materialize an intermediate list
            for row, geocodage in zip(rows, geocoded):
                row["geocoded_latitude"] = to_float(geocodage["latitude"])
                row["geocoded_longitude"] = to_float(geocodage["longitude"])
                row["geocoded_result_score"] = to_float(
                    geocodage["result_score"])
                row["geocoded_result_type"] = geocodage["result_type"]
                # Keep the address id only for precise, reliable matches
                if row["geocoded_result_type"] == precisions.HOUSENUMBER and \
                        row["geocoded_result_score"] > 0.6:
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None
                writer.write_row_dict(row)
def prepare_code_postal():
    """Build the code_postal table from the raw source dataset."""
    code_postal_source = Dataset("etl", "code_postal_source")
    code_postal = Dataset("etl", "code_postal")

    code_postal.write_dtype([
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("nom_commune", String),
        Column("version", Integer)
    ])

    records = code_postal_source.iter_rows(primary_key="Code_commune_INSEE")
    with code_postal.get_writer() as writer:
        for record in records:
            writer.write_row_dict({
                "code_insee": record["Code_commune_INSEE"],
                "code_postal": record["Code_postal"],
                "nom_commune": record["Nom_commune"]
            })
def join_cadastre():
    """Join basol_cadastre with the cadastre table to attach geometries."""
    # Input datasets
    basol_cadastre = Dataset("etl", "basol_cadastre")
    cadastre = Dataset("etl", "cadastre")
    # Output dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    basol_cadastre_joined.write_dtype([
        *basol_cadastre.read_dtype(),
        Column("geog", Geometry(srid=4326))
    ])

    BasolCadastre = basol_cadastre.reflect()
    Cadastre = cadastre.reflect()

    session = basol_cadastre.get_session()
    join_cond = (BasolCadastre.commune == Cadastre.commune) & \
        (BasolCadastre.section == Cadastre.section) & \
        (BasolCadastre.numero == Cadastre.numero)
    query = session.query(BasolCadastre, Cadastre.geog) \
        .join(Cadastre, join_cond) \
        .yield_per(500)

    with basol_cadastre_joined.get_writer() as writer:
        for (record, geog) in query:
            output_row = row2dict(record)
            output_row["geog"] = geog
            # The output table generates its own primary key
            output_row.pop("id")
            writer.write_row_dict(output_row)

    session.close()
def normalize_precision():
    """
    Normalize the values of the lib_precis column into the
    PARCEL, HOUSENUMBER, MUNICIPALITY nomenclature.
    """
    # Input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")
    # Output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    # Hoisted out of the loop: the mapping is loop-invariant and was
    # rebuilt for every row
    mapping = {
        "Coordonnées précises": precisions.PARCEL,
        "Coordonnée précise": precisions.PARCEL,
        "Valeur Initiale": precisions.PARCEL,
        "Adresse postale": precisions.HOUSENUMBER,
        "Centroïde Commune": precisions.MUNICIPALITY,
        "Inconnu": precisions.MUNICIPALITY
    }

    with s3ic_normalized.get_writer() as writer:
        for row in s3ic_geocoded.iter_rows():
            precision = row.get("precision")
            if precision:
                # Labels not in the mapping become None, as before
                row["precision"] = mapping.get(precision)
            else:
                row["precision"] = precisions.MUNICIPALITY
            writer.write_row_dict(row)
def merge_cadastre():
    """Aggregate all parcelles of a Basol site into one MultiPolygon."""
    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")
    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    basol_cadastre_merged.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))
    ])

    BasolCadastreJoined = basol_cadastre_joined.reflect()
    session = basol_cadastre_joined.get_session()

    # st_union merges the parcelles, st_multi forces a MultiPolygon
    merged_geog = func.st_multi(func.st_union(BasolCadastreJoined.geog))
    records = session.query(BasolCadastreJoined.numerobasol, merged_geog) \
        .group_by(BasolCadastreJoined.numerobasol) \
        .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in records:
            writer.write_row_dict({"numerobasol": numerobasol,
                                   "geog": geog})

    session.close()
def merge_geog():
    """
    Pick the best precision between the initial Lambert II coordinates
    and the geocoded point, for records whose geog was not already set
    from cadastre information.
    """
    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")
    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasolGeocoded = basol_geocoded.reflect()
    session = basol_geocoded.get_session()

    # Initial coordinates, reprojected from Lambert II to WGS84
    lambert2_expr = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasolGeocoded.coordxlambertii,
                BasolGeocoded.coordylambertii),
            LAMBERT2),
        WGS84)
    # Point built from the geocoder output (already WGS84)
    geocoded_expr = func.ST_setSRID(
        func.ST_MakePoint(
            BasolGeocoded.geocoded_longitude,
            BasolGeocoded.geocoded_latitude),
        WGS84)

    records = session.query(BasolGeocoded, lambert2_expr,
                            geocoded_expr).all()

    with basol_geog_merged.get_writer() as writer:
        for (record, lambert2_point, geocoded_point) in records:
            output_row = row2dict(record)
            output_row.update(geog=None,
                              geog_precision=None,
                              geog_source=None)
            if record.l2e_precision == precisions.HOUSENUMBER:
                # The initial coordinates are precise enough
                output_row["geog"] = lambert2_point
                output_row["geog_precision"] = record.l2e_precision
                output_row["geog_source"] = "lambert2"
            elif (record.geocoded_result_type == precisions.HOUSENUMBER) and \
                    (record.geocoded_result_score >= 0.6):
                # Fall back on a good-quality geocoding result
                output_row["geog"] = geocoded_point
                output_row["geog_precision"] = record.geocoded_result_type
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)

    session.close()
def add_parcels():
    """Left join basol_intersected with the merged cadastre geometries."""
    # Input datasets
    basol_intersected = Dataset("etl", "basol_intersected")
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")
    # Output dataset
    basol_with_parcels = Dataset("etl", "basol_with_parcels")

    BasolIntersected = basol_intersected.reflect()
    BasolCadastreMerged = basol_cadastre_merged.reflect()

    basol_with_parcels.write_dtype(basol_intersected.read_dtype())

    session = basol_intersected.get_session()
    records = session \
        .query(BasolIntersected, BasolCadastreMerged.geog) \
        .join(
            BasolCadastreMerged,
            BasolIntersected.numerobasol == BasolCadastreMerged.numerobasol,
            isouter=True) \
        .all()

    with basol_with_parcels.get_writer() as writer:
        for (record, geog) in records:
            if geog is not None:
                # Cadastre geometries take precedence over points
                record.geog_precision = precisions.PARCEL
                record.geog_source = "cadastre"
                record.geog = geog
            writer.write_row_dict(row2dict(record))

    session.close()
def parse_cadastre():
    """Split the cadastre_multi field into one row per parcelle."""
    basol_source = Dataset("etl", "basol_source")
    basol_cadastre = Dataset("etl", "basol_cadastre")

    basol_cadastre.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)
    ])

    with basol_cadastre.get_writer() as writer:
        for row in basol_source.iter_rows(primary_key="numerobasol"):
            raw = row["cadastre_multi"]
            if not raw:
                continue
            for parcelle in transformers.parse_cadastre(raw):
                writer.write_row_dict({
                    "numerobasol": row["numerobasol"],
                    **parcelle
                })
def prepare_sites():
    """
    Add a primary key and keep only a subset of the columns of
    the filtered basias sites table.
    """
    # Input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")
    # Output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # Columns to keep
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    # Transform schema: kept columns, prefixed with an id column
    dtype = basias_sites_filtered.read_dtype()
    output_dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        *[column for column in dtype if column.name in keep]
    ]
    basias_sites_prepared.write_dtype(output_dtype)

    # Transform data
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            # dict comprehension instead of dict() over a generator
            output_row = {key: row[key] for key in row if key in keep}
            writer.write_row_dict(output_row)
def scrap_adresses():
    """
    Scrape the addresses shown on the Géorisques detail pages.

    Example: from the url
    http://www.installationsclassees.developpement-durable.gouv.fr
    /ficheEtablissement.php?champEtablBase=61&champEtablNumero=14605
    we extract => Lieu dit 'Les Murettes' 26300 BEAUREGARD BARET

    For performance reasons, addresses are scraped only for records
    whose precision is "Centroïde Commune".
    """
    # Input dataset
    s3ic_filtered = Dataset("etl", "s3ic_source")
    # Output dataset
    s3ic_scraped = Dataset("etl", "s3ic_scraped")

    dtype = s3ic_filtered.read_dtype()
    output_dtype = [*dtype, Column("adresse", String)]
    s3ic_scraped.write_dtype(output_dtype)

    with s3ic_scraped.get_writer() as writer:
        for df in s3ic_filtered.get_dataframes(chunksize=100):
            filtered = df.loc[
                (df["lib_precis"] == "Centroïde Commune") &
                (df["url_fiche"].notnull())].copy()
            urls = filtered["url_fiche"].tolist()
            scrapers = [IcpeScraper(url) for url in urls]
            fetch_parallel(scrapers)
            for scraper in scrapers:
                scraper.parse()
                scraper.find_adresse()
            filtered["adresse"] = [s.adresse for s in scrapers]

            def lookup_adresse(row):
                # Rows not present in `filtered` keep a null adresse
                try:
                    return filtered["adresse"].loc[row.name]
                except KeyError:
                    return None

            # Pass the function directly — the original wrapped it in a
            # redundant `lambda row: f(row)`
            df["adresse"] = df.apply(lookup_adresse, axis=1)
            writer.write_dataframe(df)
def load_cadastre_for_department(department):
    """
    Download the Etalab cadastre data for the given department and
    load every parcelle into a temporary table.

    :param department: department code, e.g. "26"
    """
    cadastre_temp = Dataset(
        "etl", "cadastre_{dep}_temp".format(dep=department))

    dep_url = "https://cadastre.data.gouv.fr/data/etalab-cadastre" + \
        "/latest/geojson/communes/{dep}/".format(dep=department)

    # Scrape the list of communes from the department index page
    scraper = CadastreCommuneScraper(dep_url)
    with requests.Session() as session:
        scraper.fetch_url(session)
    scraper.parse()
    scraper.find_communes()
    communes = scraper.communes

    # Download each commune archive and load its parcelles
    with cadastre_temp.get_writer() as writer:
        for commune in communes:
            commune_url = "https://cadastre.data.gouv.fr/" + \
                "data/etalab-cadastre/latest/geojson/communes/" + \
                "{dep}/{commune}/cadastre-{commune}-parcelles.json.gz" \
                .format(dep=department, commune=commune)
            with tempfile.NamedTemporaryFile() as temp:
                urlretrieve(commune_url, temp.name)
                with gzip.open(temp.name) as f:
                    data = geojson.loads(f.read())
            for feature in data["features"]:
                properties = feature["properties"]
                geometry = shape(feature["geometry"])
                writer.write_row_dict({
                    "code": feature["id"],
                    "commune": properties["commune"],
                    "prefixe": properties["prefixe"],
                    "section": properties["section"],
                    "numero": properties["numero"],
                    "type": feature["type"],
                    "type_geom": feature["geometry"]["type"],
                    "geog": wkb.dumps(geometry, hex=True, srid=WGS84),
                    "version": 1
                })
def add_geog():
    """
    Join basias_cadastre_parsed with the cadastre table to discard
    invalid parcelles and attach their geometry.

    The cadastre table from the `kelrisks` schema is used instead of
    the `etl` one because there is not enough storage on the server to
    keep four cadastre tables (preprod `etl`, preprod `kelrisks`,
    prod `etl`, prod `kelrisks`). The `etl` cadastre tables are dropped
    once copied to `kelrisks`, so `etl.cadastre` may not exist when
    this DAG runs.
    """
    # Input datasets
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")
    cadastre = Dataset("kelrisks", "cadastre")
    # Output dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")

    BasiasCadastreParsed = basias_cadastre_parsed.reflect()
    Cadastre = cadastre.reflect()

    basias_cadastre_with_geog.write_dtype([
        *basias_cadastre_parsed.read_dtype(),
        Column("geog", Geometry(srid=4326))
    ])

    session = basias_cadastre_parsed.get_session()
    join_cond = (BasiasCadastreParsed.commune == Cadastre.commune) & \
        (BasiasCadastreParsed.section == Cadastre.section) & \
        (BasiasCadastreParsed.numero == Cadastre.numero)
    query = session.query(BasiasCadastreParsed, Cadastre.geog) \
        .join(Cadastre, join_cond) \
        .yield_per(500)

    with basias_cadastre_with_geog.get_writer() as writer:
        for (record, geog) in query:
            output_row = row2dict(record)
            # The output table generates its own primary key
            del output_row["id"]
            output_row["geog"] = geog
            writer.write_row_dict(output_row)

    session.close()
def filter_departements():
    """Keep only Basol records whose departement is in DEPARTEMENTS."""
    basol_source = Dataset("etl", "basol_source")
    basol_filtered = Dataset("etl", "basol_filtered")

    basol_filtered.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        *basol_source.read_dtype(primary_key="numerobasol")
    ])

    with basol_filtered.get_writer() as writer:
        for row in basol_source.iter_rows():
            if row["departement"] not in DEPARTEMENTS:
                continue
            writer.write_row_dict(row)
def load_sis():
    """
    Load SIS data from a csv file containing a `geom` field
    in geojson format.
    """
    sis_source = Dataset("etl", "sis_source")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("id_sis", String),
        Column("numero_affichage", String),
        Column("numero_basol", String),
        Column("adresse", Text),
        Column("lieu_dit", String),
        Column("code_insee", String),
        Column("nom_commune", String),
        Column("nom_departement", String),
        Column("x", Float(8)),
        Column("y", Float(8)),
        Column("surface_m2", Float),
        Column("geog", Geometry(srid=WGS84))
    ]
    sis_source.write_dtype(dtype)

    with sis_source.get_writer() as writer:
        with tempfile.NamedTemporaryFile() as temp:
            url = "https://kelrisks.fra1.digitaloceanspaces.com/sis.zip"
            urlretrieve(url, temp.name)
            with zipfile.ZipFile(temp.name) as zfile:
                with zfile.open("sis/sis.csv") as csvfile:
                    reader = csv.DictReader(
                        io.TextIOWrapper(csvfile),
                        delimiter=",",
                        quotechar="\"")
                    for row in reader:
                        g = geojson.loads(row["geom"])
                        s = shape(g)
                        # Use the WGS84 constant rather than the literal
                        # 4326, for consistency with the declared dtype
                        # and the other loaders
                        row["geog"] = wkb.dumps(s, hex=True, srid=WGS84)
                        writer.write_row_dict(row)
def join_sites_localisation():
    """Left join the basias sites table with the localisation table."""
    # Datasets to join
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")
    # Output dataset
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")

    # Build the merged output schema
    output_dtype = merge_dtype([
        *basias_sites_prepared.read_dtype(),
        *basias_localisation_with_cadastre.read_dtype()
    ])
    basias_sites_localisation_joined.write_dtype(output_dtype)

    # Join the two tables
    BasiasSitesPrepared = basias_sites_prepared.reflect()
    BasiasLocalisation = basias_localisation_with_cadastre.reflect()
    session = basias_sites_prepared.get_session()
    join_query = session \
        .query(BasiasSitesPrepared, BasiasLocalisation) \
        .join(BasiasLocalisation,
              BasiasSitesPrepared.indice_departemental ==
              BasiasLocalisation.indice_departemental,
              isouter=True) \
        .all()

    column_names = [c.name for c in output_dtype]
    with basias_sites_localisation_joined.get_writer() as writer:
        for (site, localisation) in join_query:
            # Start from an all-None row so that unmatched localisation
            # columns stay empty
            output_row = dict.fromkeys(column_names)
            output_row.update(row2dict(site))
            if localisation:
                output_row.update(row2dict(localisation))
            del output_row["id"]
            writer.write_row_dict(output_row)
    session.close()
def normalize_precision():
    """Normalize the precision fields of the geocoded Basol records."""
    # Input dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")
    # Output dataset
    basol_normalized = Dataset("etl", "basol_normalized")

    basol_normalized.write_dtype(basol_geocoded.read_dtype())

    with basol_normalized.get_writer() as writer:
        for record in basol_geocoded.iter_rows():
            writer.write_row_dict(
                transformers.normalize_precision(record))
def add_version():
    """Append a `version` column for compatibility with Spring."""
    # Input dataset
    basol_with_communes = Dataset("etl", "basol_with_commune")
    # Output dataset
    basol_with_version = Dataset("etl", "basol_with_version")

    output_dtype = [
        *basol_with_communes.read_dtype(),
        Column("version", Integer)
    ]
    basol_with_version.write_dtype(output_dtype)

    with basol_with_version.get_writer() as writer:
        for record in basol_with_communes.iter_rows():
            writer.write_row_dict(record)
def add_commune():
    """
    Use the commune outline as geog when no more precise geometry
    was found for a Basias site.

    The SqlAlchemy query is equivalent to

    SELECT *
    FROM etl.basias_sites_localisation_joined A
    LEFT JOIN etl.commune B
    ON A.numero_insee == B.insee
    """
    # Input datasets
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")
    communes = Dataset("etl", "commune")
    # Output dataset
    basias_sites_with_commune = Dataset("etl", "basias_sites_with_commune")

    basias_sites_with_commune.write_dtype(
        basias_sites_localisation_joined.read_dtype())

    BasiasSites = basias_sites_localisation_joined.reflect()
    Commune = communes.reflect()

    session = basias_sites_localisation_joined.get_session()
    records = session.query(BasiasSites, Commune.geog) \
        .join(Commune,
              BasiasSites.numero_insee == Commune.insee,
              isouter=True) \
        .all()

    with basias_sites_with_commune.get_writer() as writer:
        for (site, commune_geog) in records:
            if site.geog is None:
                # Fall back on the commune outline
                site.geog = commune_geog
                site.geog_precision = precisions.MUNICIPALITY
                site.geog_source = "numero_insee"
            writer.write_row_dict(row2dict(site))
    session.close()
def add_commune(): """ Ajoute le contour des communes comme nouvelle valeur pour geog dans le cas où la précision est MUNICIPALITY La reqête SqlAlchemy est équivalente à SELECT * FROM etl.s3ic_with_parcelle A LEFT JOIN etl.commune B ON A.cd_insee == B.insee """ # input dataset s3ic_with_parcelle = Dataset("etl", "s3ic_with_parcelle") communes = Dataset("etl", "commune") # output dataset s3ic_with_commune = Dataset("etl", "s3ic_with_commune") dtype = s3ic_with_parcelle.read_dtype() s3ic_with_commune.write_dtype(dtype) S3icWithParcelle = s3ic_with_parcelle.reflect() Commune = communes.reflect() session = s3ic_with_parcelle.get_session() q = session.query(S3icWithParcelle, Commune.geog) \ .join(Commune, S3icWithParcelle.code_insee == Commune.insee, isouter=True) \ .all() with s3ic_with_commune.get_writer() as writer: for (row, commune) in q: if row.geog_precision == precisions.MUNICIPALITY: row.geog = commune row.precision = precisions.MUNICIPALITY row.geog_source = "code_insee" writer.write_row_dict(row2dict(row)) session.close()
def intersect():
    """Snap each Basol point onto the closest parcelle, if any."""
    # Input datasets
    basol_geog_merged = Dataset("etl", "basol_geog_merged")
    cadastre = Dataset("etl", "cadastre")
    # Output dataset
    basol_intersected = Dataset("etl", "basol_intersected")

    basol_intersected.write_dtype(basol_geog_merged.read_dtype())

    Cadastre = cadastre.reflect()
    BasolGeogMerged = basol_geog_merged.reflect()
    session = basol_geog_merged.get_session()

    # Correlated subquery: closest parcelle within 0.0001 degree
    nearest = session.query(Cadastre.geog) \
        .filter(func.st_dwithin(
            Cadastre.geog,
            BasolGeogMerged.geog,
            0.0001)) \
        .order_by(func.st_distance(
            Cadastre.geog,
            BasolGeogMerged.geog)) \
        .limit(1) \
        .label("nearest")

    records = session.query(BasolGeogMerged, nearest).all()

    with basol_intersected.get_writer() as writer:
        for (record, parcelle_geog) in records:
            if parcelle_geog is not None:
                record.geog = parcelle_geog
            writer.write_row_dict(row2dict(record))

    session.close()
def add_communes():
    """
    Set the commune outline as geog for every record that does not
    already have a more precise geometry.
    """
    # Input datasets
    basol_with_parcels = Dataset("etl", "basol_with_parcels")
    communes = Dataset("etl", "commune")
    # Output dataset
    basol_with_communes = Dataset("etl", "basol_with_commune")

    basol_with_communes.write_dtype(basol_with_parcels.read_dtype())

    BasolWithParcels = basol_with_parcels.reflect()
    Commune = communes.reflect()

    session = basol_with_parcels.get_session()
    records = session.query(BasolWithParcels, Commune.geog) \
        .join(Commune,
              BasolWithParcels.code_insee == Commune.insee,
              isouter=True) \
        .all()

    with basol_with_communes.get_writer() as writer:
        for (record, commune_geog) in records:
            if record.geog is None:
                record.geog = commune_geog
                record.geog_precision = precisions.MUNICIPALITY
                record.geog_source = "code_insee"
            writer.write_row_dict(row2dict(record))

    session.close()
def join_localisation_cadastre():
    """
    Left join the basias localisation table with the merged cadastre
    geometries, preferring the cadastre geometry when available.
    """
    # Input datasets
    basias_localisation_intersected = Dataset(
        "etl", "basias_localisation_intersected")
    basias_cadastre_merged = Dataset("etl", "basias_cadastre_merged")
    # Output dataset
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")

    basias_localisation_with_cadastre.write_dtype(
        basias_localisation_intersected.read_dtype())

    BasiasLocalisationIntersected = basias_localisation_intersected.reflect()
    BasiasCadastreMerged = basias_cadastre_merged.reflect()

    session = basias_localisation_intersected.get_session()
    cond = BasiasCadastreMerged.indice_departemental \
        == BasiasLocalisationIntersected.indice_departemental
    q = session.query(
        BasiasLocalisationIntersected,
        BasiasCadastreMerged.geog) \
        .join(BasiasCadastreMerged, cond, isouter=True) \
        .all()

    with basias_localisation_with_cadastre.get_writer() as writer:
        for (row, cadastre_geog) in q:
            if cadastre_geog is not None:
                # Replace geog with the cadastre geometry
                row.geog = cadastre_geog
                row.geog_precision = precisions.PARCEL
                row.geog_source = "cadastre"
            writer.write_row_dict(row2dict(row))

    # Close the session — it was previously leaked, unlike every
    # sibling recipe in this file
    session.close()
def merge_cadastre_geog():
    """
    Aggregate all parcelles of a given Basias site into a single
    MultiPolygon.
    """
    # Input dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")
    # Output dataset
    basias_cadastre_merged = Dataset("etl", "basias_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("indice_departemental", String),
        Column("geog", Geometry(srid=4326))
    ]
    # The schema was previously written twice in a row — once is enough
    basias_cadastre_merged.write_dtype(dtype)

    BasiasCadastreWithGeog = basias_cadastre_with_geog.reflect()
    session = basias_cadastre_with_geog.get_session()

    select = [
        BasiasCadastreWithGeog.indice_departemental,
        func.st_multi(func.st_union(BasiasCadastreWithGeog.geog))
    ]
    q = session.query(*select) \
        .group_by(BasiasCadastreWithGeog.indice_departemental) \
        .all()

    with basias_cadastre_merged.get_writer() as writer:
        for (indice_departemental, geog) in q:
            row = {"indice_departemental": indice_departemental,
                   "geog": geog}
            writer.write_row_dict(row)

    session.close()
def load_communes():
    """
    Load commune outlines from a local geojson file into the
    etl.commune table.
    """
    communes_filepath = "{data_dir}/communes/communes-20190101.json" \
        .format(data_dir=DATA_DIR)

    communes = Dataset("etl", "commune")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("insee", String),
        Column("nom", String),
        Column("geog", Geometry(srid=4326))
    ]
    communes.write_dtype(dtype)

    with open(communes_filepath, 'r') as f:
        data = geojson.loads(f.read())

    features = data["features"]

    with communes.get_writer() as writer:
        for feature in features:
            s = shape(feature["geometry"])
            # "type_geom" removed from the row: the schema declared
            # above has no such column (it looked copied from the
            # cadastre loader, which does declare it)
            row = {
                "insee": feature["properties"]["insee"],
                "nom": feature["properties"]["nom"],
                "geog": wkb.dumps(s, hex=True, srid=WGS84)
            }
            writer.write_row_dict(row)
def parse_cadastre():
    """
    Extract parcelles from the "numero_de_parcelle" field, which is
    not formatted consistently.

    Example values:
    ZA 128 et 146
    ? 131
    AH 575-574-43-224
    etc
    """
    # Input datasets
    basias_cadastre_filtered = Dataset("etl", "basias_cadastre_filtered")
    basias_localisation_filtered = Dataset("etl",
                                           "basias_localisation_filtered")
    # Output dataset
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")

    BasiasCadastreFiltered = basias_cadastre_filtered.reflect()
    BasiasLocalisationFiltered = basias_localisation_filtered.reflect()
    session = basias_cadastre_filtered.get_session()

    # Write output schema
    basias_cadastre_parsed.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("indice_departemental", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)
    ])

    # Join with the localisation table to retrieve the insee code
    records = session \
        .query(BasiasCadastreFiltered,
               BasiasLocalisationFiltered.numero_insee) \
        .join(
            BasiasLocalisationFiltered,
            BasiasCadastreFiltered.indice_departemental ==
            BasiasLocalisationFiltered.indice_departemental,
            isouter=True) \
        .all()

    with basias_cadastre_parsed.get_writer() as writer:
        for (cadastre, numero_insee) in records:
            row = {**row2dict(cadastre), "numero_insee": numero_insee}
            parcelles = transformers.extract_basias_parcelles_from_row(row)
            for parcelle in parcelles:
                writer.write_row_dict({
                    "indice_departemental": row["indice_departemental"],
                    "commune": parcelle.commune,
                    "section": parcelle.section,
                    "numero": parcelle.numero
                })

    session.close()
def merge_geog():
    """
    Pick the best geographic information among
    - (x_lambert2_etendue, y_lambert2_etendue)
    - (xl2_adresse, yl2_adresse)
    - the geocoded point
    and record the associated precision and source.
    """
    # Input dataset
    basias_localisation_geocoded = Dataset("etl",
                                           "basias_localisation_geocoded")
    # Output dataset
    basias_localisation_geog_merged = Dataset(
        "etl", "basias_localisation_geog_merged")

    basias_localisation_geog_merged.write_dtype([
        *basias_localisation_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasiasLocalisationGeocoded = basias_localisation_geocoded.reflect()
    session = basias_localisation_geocoded.get_session()

    # The three candidate points, all expressed in WGS84
    lambert2_point = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasiasLocalisationGeocoded.x_lambert2_etendue,
                BasiasLocalisationGeocoded.y_lambert2_etendue),
            LAMBERT2),
        WGS84)
    adresse_point = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasiasLocalisationGeocoded.xl2_adresse,
                BasiasLocalisationGeocoded.yl2_adresse),
            LAMBERT2),
        WGS84)
    geocoded_point = func.ST_setSRID(
        func.ST_MakePoint(
            BasiasLocalisationGeocoded.geocoded_longitude,
            BasiasLocalisationGeocoded.geocoded_latitude),
        WGS84)

    records = session.query(
        BasiasLocalisationGeocoded,
        lambert2_point,
        adresse_point,
        geocoded_point).all()

    with basias_localisation_geog_merged.get_writer() as writer:
        for (record, p_lambert2_etendue, p_adresse, p_geocoded) in records:
            output_row = {
                **row2dict(record),
                "geog": None,
                "geog_precision": None,
                "geog_source": None
            }
            if p_lambert2_etendue is not None:
                # Assumed to be the most accurate information
                output_row["geog"] = p_lambert2_etendue
                output_row["geog_precision"] = precisions.PARCEL
                output_row["geog_source"] = "lambert2_etendue"
            elif p_adresse is not None \
                    and record.precision_adresse == "numéro":
                output_row["geog"] = p_adresse
                output_row["geog_precision"] = precisions.HOUSENUMBER
                output_row["geog_source"] = "adresse"
            elif p_geocoded is not None \
                    and record.geocoded_result_score > 0.6 \
                    and record.geocoded_result_type == precisions.HOUSENUMBER:
                output_row["geog"] = p_geocoded
                output_row["geog_precision"] = precisions.HOUSENUMBER
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)

    session.close()
def add_parcelle():
    """
    Project each point onto the closest parcelle, searching within a
    neighbourhood of about 10m (0.0001 degree). When no parcelle is
    found in that neighbourhood the original point is kept.

    The SQL Alchemy query is equivalent to

    SELECT *,
    (SELECT geog
    FROM kelrisks.cadastre AS c
    WHERE st_dwithin(ic.geog, c.geog, 0.0001)
    ORDER BY st_distance(ic.geog, c.geog)
    LIMIT 1) nearest
    FROM etl.s3ic_geog_merged ic

    The cadastre table from the `kelrisks` schema is used instead of
    the `etl` one because there is not enough storage on the server to
    keep four cadastre tables (preprod `etl`, preprod `kelrisks`,
    prod `etl`, prod `kelrisks`). The `etl` cadastre tables are dropped
    once copied to `kelrisks`, so `etl.cadastre` may not exist when
    this DAG runs.
    """
    # Input datasets
    s3ic_geog_merged = Dataset("etl", "s3ic_geog_merged")
    cadastre = Dataset("kelrisks", "cadastre")
    # Output dataset
    s3ic_with_parcelle = Dataset("etl", "s3ic_with_parcelle")

    s3ic_with_parcelle.write_dtype(s3ic_geog_merged.read_dtype())

    Cadastre = cadastre.reflect()
    S3icGeogMerged = s3ic_geog_merged.reflect()
    session = s3ic_geog_merged.get_session()

    nearest = session.query(Cadastre.geog) \
        .filter(func.st_dwithin(
            Cadastre.geog,
            S3icGeogMerged.geog,
            0.0001)) \
        .order_by(func.st_distance(
            Cadastre.geog,
            S3icGeogMerged.geog)) \
        .limit(1) \
        .label("nearest")

    records = session.query(S3icGeogMerged, nearest).yield_per(500)

    with s3ic_with_parcelle.get_writer() as writer:
        for (record, parcelle_geog) in records:
            # Commune centroids are left untouched: snapping them to
            # a single parcelle would not be meaningful
            if parcelle_geog is not None \
                    and record.geog_precision != precisions.MUNICIPALITY:
                record.geog = parcelle_geog
            writer.write_row_dict(row2dict(record))

    session.close()
def merge_geog():
    """
    Merge the geographic information into the normalized fields
    geog, geog_precision and geog_source.

    Geocoding results are preferred when the initial precision is
    MUNICIPALITY-level and the geocoding score is above 0.6.
    """
    # Input dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")
    # Output dataset
    s3ic_merged = Dataset("etl", "s3ic_geog_merged")

    # geom => geog, lib_precis => precision: drop the old columns
    input_dtype = [
        c for c in s3ic_normalized.read_dtype()
        if c.name not in ("geom", "lib_precis")
    ]
    s3ic_merged.write_dtype([
        *input_dtype,
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    S3icNormalized = s3ic_normalized.reflect()
    session = s3ic_normalized.get_session()

    geocoded_point = func.ST_setSRID(
        func.ST_MakePoint(
            S3icNormalized.geocoded_longitude,
            S3icNormalized.geocoded_latitude),
        WGS84)

    records = session.query(S3icNormalized, geocoded_point).all()

    with s3ic_merged.get_writer() as writer:
        for (record, point) in records:
            output_row = {
                **row2dict(record),
                "geog": record.geog,
                "geog_precision": record.precision,
                "geog_source": "initial_data"
            }
            # Prefer the geocoded point when the initial precision is
            # coarse and the geocoding result is precise and reliable
            imprecise = record.precision not in \
                (precisions.HOUSENUMBER, precisions.PARCEL)
            has_point = record.geocoded_latitude and \
                record.geocoded_longitude
            good_score = record.geocoded_result_score and \
                record.geocoded_result_score > 0.6
            precise_type = \
                record.geocoded_result_type == precisions.HOUSENUMBER
            if imprecise and has_point and good_score and precise_type:
                output_row["geog"] = point
                output_row["geog_precision"] = record.geocoded_result_type
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)

    session.close()
def merge():
    """
    Merge the IDF and non-IDF datasets, renaming columns to obtain
    a common schema.
    """
    # Non-IDF data
    s3ic_scraped = Dataset("etl", "s3ic_scraped")
    # IDF data
    s3ic_idf_with_geom = Dataset("etl", "s3ic_idf_with_geom")
    # Output dataset
    s3ic_merged = Dataset("etl", "s3ic_merged")

    s3ic_merged.write_dtype([
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code", String),
        Column("nom", String),
        Column("adresse", String),
        Column("complement_adresse", String),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("commune", String),
        Column("code_naf", String),
        Column("lib_naf", String),
        Column("num_siret", String),
        Column("regime", String),
        Column("lib_regime", String),
        Column("ipcc", String),
        Column("seveso", String),
        Column("lib_seveso", String),
        Column("famille_ic", String),
        Column("url_fiche", String),
        Column("x", BigInteger),
        Column("y", BigInteger),
        Column("precision", String),
        Column("geog", Geometry(srid=WGS84))
    ])

    # Scraped (non-IDF) column names => common schema column names
    renames = {
        "code_s3ic": "code",
        "nom_ets": "nom",
        "cd_insee": "code_insee",
        "cd_postal": "code_postal",
        "nomcommune": "commune",
        "lib_precis": "precision"
    }

    try:
        with s3ic_merged.get_writer() as writer:
            for row in s3ic_scraped.iter_rows():
                del row["id"]
                for old_name, new_name in renames.items():
                    row[new_name] = row.pop(old_name)
                s = to_shape(row.pop("geom"))
                row["geog"] = wkb.dumps(s, hex=True, srid=WGS84)
                writer.write_row_dict(row)
    except exc.NoSuchTableError:
        # If every configured department is in IDF, this table
        # does not exist
        pass

    try:
        with s3ic_merged.get_writer() as writer:
            for row in s3ic_idf_with_geom.iter_rows(primary_key="code"):
                writer.write_row_dict(row)
    except exc.NoSuchTableError:
        # If every configured department is outside IDF, this table
        # does not exist
        pass