def prepare_sites():
    """Add an auto-incremented primary key and keep only selected columns.

    Reads etl.basias_sites_filtered and writes etl.basias_sites_prepared
    whose schema is an ``id`` primary key followed by the kept columns.
    """
    # input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")
    # output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # columns to keep in the output
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    dtype = basias_sites_filtered.read_dtype()

    # transform schema: restrict to kept columns and prepend the id column
    output_dtype = [column for column in dtype if column.name in keep]
    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)
    output_dtype = [id_column, *output_dtype]
    basias_sites_prepared.write_dtype(output_dtype)

    # transform data: project each row onto the kept columns
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            # dict comprehension instead of dict() over a generator (C402)
            output_row = {key: row[key] for key in row if key in keep}
            writer.write_row_dict(output_row)
def filter_departements():
    """Keep only records located in the departements selected in the config.

    Reads etl.sis_source and writes the matching rows, schema unchanged,
    to etl.sis_filtered. A row matches when its ``code_insee`` starts with
    one of the configured departement codes.
    """
    # Input dataset
    sis_source = Dataset("etl", "sis_source")
    # output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    # str.startswith accepts a tuple of prefixes: one C-level call replaces
    # the manual flag-and-break loop; hoisted out of the row loop
    prefixes = tuple(DEPARTEMENTS)

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            if row["code_insee"].startswith(prefixes):
                writer.write_row_dict(row)
def normalize_precision():
    """Normalize lib_precis values to PARCEL / HOUSENUMBER / MUNICIPALITY.

    Reads etl.s3ic_geocoded and writes etl.s3ic_normalized with the same
    schema; only the ``precision`` value of each row is rewritten.
    """
    # input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")
    # output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    # Loop-invariant translation table, built once (the original rebuilt
    # this dict on every row iteration)
    mapping = {
        "Coordonnées précises": precisions.PARCEL,
        "Coordonnée précise": precisions.PARCEL,
        "Valeur Initiale": precisions.PARCEL,
        "Adresse postale": precisions.HOUSENUMBER,
        "Centroïde Commune": precisions.MUNICIPALITY,
        "Inconnu": precisions.MUNICIPALITY,
    }

    with s3ic_normalized.get_writer() as writer:
        for row in s3ic_geocoded.iter_rows():
            precision = row.get("precision")
            if precision:
                # unmapped labels become None, matching the original
                row["precision"] = mapping.get(precision)
            else:
                # missing or empty precision defaults to MUNICIPALITY
                row["precision"] = precisions.MUNICIPALITY
            writer.write_row_dict(row)
def prepare_code_postal():
    """Project the raw postal-code source onto the final code_postal schema."""
    source = Dataset("etl", "code_postal_source")
    target = Dataset("etl", "code_postal")

    # output schema: generated primary key + selected columns
    target.write_dtype([
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("nom_commune", String),
        Column("version", Integer),
    ])

    with target.get_writer() as writer:
        for record in source.iter_rows(primary_key="Code_commune_INSEE"):
            writer.write_row_dict({
                "code_insee": record["Code_commune_INSEE"],
                "code_postal": record["Code_postal"],
                "nom_commune": record["Nom_commune"],
            })
def parse_cadastre():
    """Explode basol's cadastre_multi field into one row per parcelle."""
    source = Dataset("etl", "basol_source")
    target = Dataset("etl", "basol_cadastre")

    target.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String),
    ])

    with target.get_writer() as writer:
        for record in source.iter_rows(primary_key="numerobasol"):
            raw = record["cadastre_multi"]
            if not raw:
                # nothing to parse for this record
                continue
            for parcelle in transformers.parse_cadastre(raw):
                writer.write_row_dict({
                    "numerobasol": record["numerobasol"],
                    **parcelle,
                })
def filter_departements():
    """Keep only basol records whose departement is in the configured list."""
    source = Dataset("etl", "basol_source")
    filtered = Dataset("etl", "basol_filtered")

    # output schema: auto-increment primary key prepended to the source schema
    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)
    filtered.write_dtype([
        id_column,
        *source.read_dtype(primary_key="numerobasol"),
    ])

    with filtered.get_writer() as writer:
        for row in source.iter_rows():
            if row["departement"] not in DEPARTEMENTS:
                continue
            writer.write_row_dict(row)
def add_version():
    """Append a "version" column for compatibility with Spring."""
    # Input dataset (table name is singular: basol_with_commune)
    source = Dataset("etl", "basol_with_commune")
    # Output dataset
    target = Dataset("etl", "basol_with_version")

    schema = list(source.read_dtype())
    schema.append(Column("version", Integer))
    target.write_dtype(schema)

    # rows are copied unchanged; the new column is schema-only here
    with target.get_writer() as writer:
        for row in source.iter_rows():
            writer.write_row_dict(row)
def normalize_precision():
    """Normalize the precision fields of geocoded basol rows."""
    # input dataset
    source = Dataset("etl", "basol_geocoded")
    # output dataset
    target = Dataset("etl", "basol_normalized")

    target.write_dtype(source.read_dtype())

    with target.get_writer() as writer:
        for record in source.iter_rows():
            # delegate the actual normalization to the shared transformer
            writer.write_row_dict(transformers.normalize_precision(record))
def merge(): """ Fusionne les données pour l'IDF et pour le reste de la France en renommant des colonnes pour obtenir un schéma commun """ # Données hors IDF s3ic_scraped = Dataset("etl", "s3ic_scraped") # Données IDF s3ic_idf_with_geom = Dataset("etl", "s3ic_idf_with_geom") # outpt dataset s3ic_merged = Dataset("etl", "s3ic_merged") output_dtype = [ Column("id", BigInteger(), primary_key=True, autoincrement=True), Column("code", String), Column("nom", String), Column("adresse", String), Column("complement_adresse", String), Column("code_insee", String), Column("code_postal", String), Column("commune", String), Column("code_naf", String), Column("lib_naf", String), Column("num_siret", String), Column("regime", String), Column("lib_regime", String), Column("ipcc", String), Column("seveso", String), Column("lib_seveso", String), Column("famille_ic", String), Column("url_fiche", String), Column("x", BigInteger), Column("y", BigInteger), Column("precision", String), Column("geog", Geometry(srid=WGS84)) ] s3ic_merged.write_dtype(output_dtype) try: with s3ic_merged.get_writer() as writer: for row in s3ic_scraped.iter_rows(): del row["id"] row["code"] = row.pop("code_s3ic") row["nom"] = row.pop("nom_ets") row["code_insee"] = row.pop("cd_insee") row["code_postal"] = row.pop("cd_postal") row["commune"] = row.pop("nomcommune") row["precision"] = row.pop("lib_precis") s = to_shape(row.pop("geom")) row["geog"] = wkb.dumps(s, hex=True, srid=WGS84) writer.write_row_dict(row) except exc.NoSuchTableError: # si tous les départements configurés sont en IDF, # cette table n'existe pas pass try: with s3ic_merged.get_writer() as writer: for row in s3ic_idf_with_geom.iter_rows(primary_key="code"): writer.write_row_dict(row) except exc.NoSuchTableError: # si tous les départements configurés sont hors IDF # cette table n'existe pas pass