Example #1
def filter_departements():
    """
    Filter the data to keep only the records
    located in the departements selected
    in the config
    """

    # Input dataset
    sis_source = Dataset("etl", "sis_source")

    # output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            code_insee = row["code_insee"]
            # keep the record if its INSEE code starts with one of
            # the configured departement codes
            if any(code_insee.startswith(dep) for dep in DEPARTEMENTS):
                writer.write_row_dict(row)
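
Every recipe below relies on the same Dataset abstraction, which is not defined in these examples. The sketch below is inferred purely from how Dataset is called; the method signatures and semantics are assumptions, not the actual implementation.

class Dataset:
    """Assumed interface: a thin wrapper around a SQLAlchemy table."""

    def __init__(self, schema, name):
        self.schema = schema  # e.g. "etl" or "kelrisks"
        self.name = name      # table name inside the schema

    def read_dtype(self, primary_key=None):
        """Return the table columns as a list of SQLAlchemy Columns."""

    def write_dtype(self, dtype):
        """(Re)create the table with the given columns."""

    def iter_rows(self, primary_key=None):
        """Iterate over the table rows as plain dicts."""

    def get_dataframes(self, chunksize=100):
        """Yield pandas DataFrames of at most `chunksize` rows."""

    def reflect(self):
        """Return a SQLAlchemy declarative class mapped to the table."""

    def get_session(self):
        """Return a SQLAlchemy session bound to the database."""

    def get_writer(self):
        """Context manager yielding a writer with write_row_dict()
        and write_dataframe() methods."""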
Example #2
def geocode():
    """ Geocode Basol adresses """

    # input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # write output schema
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")

    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]

    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:

        for df in basol_filtered.get_dataframes(chunksize=100):

            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]

            geocoded = bulk_geocode(payload,
                                    columns=["adresse"],
                                    citycode="code_insee")

            zipped = list(zip(rows, geocoded))

            for (row, geocodage) in zipped:
                latitude = geocodage["latitude"]
                row["geocoded_latitude"] = float(latitude) \
                    if latitude else None
                longitude = geocodage["longitude"]
                row["geocoded_longitude"] = float(longitude) \
                    if longitude else None
                result_score = geocodage["result_score"]
                row["geocoded_result_score"] = float(result_score) \
                    if result_score else None
                row["geocoded_result_type"] = geocodage["result_type"]

                if row["geocoded_result_type"] == precisions.HOUSENUMBER and \
                   row["geocoded_result_score"] > 0.6:
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None

                writer.write_row_dict(row)
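
bulk_geocode is an external helper. Given its parameters (columns, citycode) and the fields it returns (latitude, longitude, result_score, result_type, result_id), it very likely wraps the CSV batch endpoint of the BAN geocoder at api-adresse.data.gouv.fr. A minimal sketch under that assumption, without the retry or batching logic a production helper would need:

import csv
import io

import requests

BAN_CSV_URL = "https://api-adresse.data.gouv.fr/search/csv/"

def bulk_geocode(payload, columns, citycode):
    """Geocode a batch of dicts in a single call to the BAN csv endpoint.

    `columns` lists the payload keys holding the address text and
    `citycode` names the key whose INSEE code restricts the search.
    """
    # serialize the payload to an in-memory csv file
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=list(payload[0].keys()))
    writer.writeheader()
    writer.writerows(payload)

    response = requests.post(
        BAN_CSV_URL,
        files={"data": ("addresses.csv", buf.getvalue())},
        data={"columns": columns, "citycode": citycode})
    response.raise_for_status()

    # the endpoint echoes the input rows in order and appends
    # latitude, longitude, result_score, result_type, result_id, ...
    return list(csv.DictReader(io.StringIO(response.text)))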
Example #3
def prepare_code_postal():

    code_postal_source = Dataset("etl", "code_postal_source")
    code_postal = Dataset("etl", "code_postal")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("nom_commune", String),
        Column("version", Integer)
    ]

    code_postal.write_dtype(dtype)

    with code_postal.get_writer() as writer:

        for row in code_postal_source.iter_rows(
                primary_key="Code_commune_INSEE"):

            output_row = {
                "code_insee": row["Code_commune_INSEE"],
                "code_postal": row["Code_postal"],
                "nom_commune": row["Nom_commune"]
            }

            writer.write_row_dict(output_row)
Example #4
def join_cadastre():
    """ Join the table basol_cadastre with cadastre table """

    # Input datasets
    basol_cadastre = Dataset("etl", "basol_cadastre")
    cadastre = Dataset("etl", "cadastre")

    # Output dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    basol_cadastre_dtype = basol_cadastre.read_dtype()
    dtype = [*basol_cadastre_dtype, Column("geog", Geometry(srid=4326))]
    basol_cadastre_joined.write_dtype(dtype)

    BasolCadastre = basol_cadastre.reflect()
    Cadastre = cadastre.reflect()

    session = basol_cadastre.get_session()

    cond = (BasolCadastre.commune == Cadastre.commune) & \
           (BasolCadastre.section == Cadastre.section) & \
           (BasolCadastre.numero == Cadastre.numero)

    q = session.query(BasolCadastre, Cadastre.geog) \
        .join(Cadastre, cond) \
        .yield_per(500)

    with basol_cadastre_joined.get_writer() as writer:

        for (basol, geog) in q:
            output_row = {**row2dict(basol), "geog": geog}
            output_row.pop("id")
            writer.write_row_dict(output_row)

    session.close()
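
row2dict, used above and in most recipes below, is not defined in these examples. It converts a reflected SQLAlchemy object into a plain dict; a minimal sketch of what it presumably does:

from sqlalchemy import inspect

def row2dict(obj):
    """Map a reflected SQLAlchemy object to {column name: value}."""
    return {
        attr.key: getattr(obj, attr.key)
        for attr in inspect(obj).mapper.column_attrs
    }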
Example #5
def normalize_precision():
    """
    This recipe normalizes the values of the
    lib_precis column to the nomenclature
    PARCEL, HOUSENUMBER, MUNICIPALITY
    """

    # input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")

    # output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    with s3ic_normalized.get_writer() as writer:

        mapping = {
            "Coordonnées précises": precisions.PARCEL,
            "Coordonnée précise": precisions.PARCEL,
            "Valeur Initiale": precisions.PARCEL,
            "Adresse postale": precisions.HOUSENUMBER,
            "Centroïde Commune": precisions.MUNICIPALITY,
            "Inconnu": precisions.MUNICIPALITY
        }

        for row in s3ic_geocoded.iter_rows():

            precision = row.get("precision")
            if precision:
                row["precision"] = mapping.get(precision)
            else:
                row["precision"] = precisions.MUNICIPALITY

            writer.write_row_dict(row)
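
The precisions module is never shown. Since its members are compared directly with the result_type values returned by the geocoder, it is presumably just a handful of string constants; the values below are an assumption based on that comparison:

# precisions.py (assumed): HOUSENUMBER and MUNICIPALITY match the
# geocoder's result_type values; PARCEL is specific to this project
HOUSENUMBER = "housenumber"
MUNICIPALITY = "municipality"
PARCEL = "parcel"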
Example #6
def merge_cadastre():
    """ Merge the different parcelles into a MultiPolygon """

    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))
    ]
    basol_cadastre_merged.write_dtype(dtype)

    BasolCadastreJoined = basol_cadastre_joined.reflect()

    session = basol_cadastre_joined.get_session()

    select = [
        BasolCadastreJoined.numerobasol,
        func.st_multi(func.st_union(BasolCadastreJoined.geog))
    ]

    q = session.query(*select) \
               .group_by(BasolCadastreJoined.numerobasol) \
               .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in q:
            row = {"numerobasol": numerobasol, "geog": geog}
            writer.write_row_dict(row)

    session.close()
Example #7
def merge_geog():
    """
    Choose the best precision between the initial
    coordinates and the geocoded coordinates when
    geog is not set from cadastre information
    """

    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")

    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasolGeocoded = basol_geocoded.reflect()

    session = basol_geocoded.get_session()

    point_lambert2 = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(BasolGeocoded.coordxlambertii,
                              BasolGeocoded.coordylambertii), LAMBERT2), WGS84)

    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(BasolGeocoded.geocoded_longitude,
                          BasolGeocoded.geocoded_latitude), WGS84)

    q = session.query(BasolGeocoded, point_lambert2, point_geocoded).all()

    with basol_geog_merged.get_writer() as writer:

        for (row, point_lambert2, point_geocoded) in q:

            output_row = {
                **row2dict(row), "geog": None,
                "geog_precision": None,
                "geog_source": None
            }

            if row.l2e_precision == precisions.HOUSENUMBER:

                output_row["geog"] = point_lambert2
                output_row["geog_precision"] = row.l2e_precision
                output_row["geog_source"] = "lambert2"

            elif (row.geocoded_result_type == precisions.HOUSENUMBER) and \
                 (row.geocoded_result_score >= 0.6):
                output_row["geog"] = point_geocoded
                output_row["geog_precision"] = row.geocoded_result_type
                output_row["geog_source"] = "geocodage"

            writer.write_row_dict(output_row)

    session.close()
Example #8
def add_parcels():
    """ join table basol_intersected with basol_cadastre_merged """

    # input datasets
    basol_intersected = Dataset("etl", "basol_intersected")
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    # output dataset
    basol_with_parcels = Dataset("etl", "basol_with_parcels")

    BasolIntersected = basol_intersected.reflect()
    BasolCadastreMerged = basol_cadastre_merged.reflect()

    dtype = basol_intersected.read_dtype()
    basol_with_parcels.write_dtype(dtype)

    session = basol_intersected.get_session()

    cond = (BasolIntersected.numerobasol == BasolCadastreMerged.numerobasol)
    q = session.query(BasolIntersected, BasolCadastreMerged.geog) \
               .join(BasolCadastreMerged, cond, isouter=True) \
               .all()

    with basol_with_parcels.get_writer() as writer:
        for (row, geog) in q:
            if geog is not None:
                row.geog_precision = precisions.PARCEL
                row.geog_source = "cadastre"
                row.geog = geog
            writer.write_row_dict(row2dict(row))

    session.close()
Example #9
def parse_cadastre():

    basol_source = Dataset("etl", "basol_source")
    basol_cadastre = Dataset("etl", "basol_cadastre")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)
    ]

    basol_cadastre.write_dtype(dtype)

    with basol_cadastre.get_writer() as writer:
        for row in basol_source.iter_rows(primary_key="numerobasol"):
            cadastre_multi = row["cadastre_multi"]
            if cadastre_multi:
                parcelles = transformers.parse_cadastre(cadastre_multi)
                for parcelle in parcelles:
                    output_row = {
                        "numerobasol": row["numerobasol"],
                        **parcelle
                    }
                    writer.write_row_dict(output_row)
Example #10
def prepare_sites():
    """
    This recipe adds a primary key
    and keeps only a subset of the columns
    """

    # input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")

    # output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # columns to keep
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    dtype = basias_sites_filtered.read_dtype()

    # transform schema
    output_dtype = [column for column in dtype if column.name in keep]

    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)

    output_dtype = [id_column, *output_dtype]

    basias_sites_prepared.write_dtype(output_dtype)

    # transform data
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            output_row = {key: value for key, value in row.items()
                          if key in keep}
            writer.write_row_dict(output_row)
Example #11
def scrap_adresses():
    """
    Scrape the addresses found on the Géorisques detail pages

    Example:

    From the url
    http://www.installationsclassees.developpement-durable.gouv.fr
    /ficheEtablissement.php?champEtablBase=61&champEtablNumero=14605

    we extract => Lieu dit 'Les Murettes' 26300 BEAUREGARD BARET

    For performance reasons, we only scrape the addresses of the
    records whose precision is "Centroïde Commune"
    """

    # input dataset
    s3ic_filtered = Dataset("etl", "s3ic_source")

    # output dataset
    s3ic_scraped = Dataset("etl", "s3ic_scraped")

    dtype = s3ic_filtered.read_dtype()

    output_dtype = [*dtype, Column("adresse", String)]

    s3ic_scraped.write_dtype(output_dtype)

    with s3ic_scraped.get_writer() as writer:

        for df in s3ic_filtered.get_dataframes(chunksize=100):

            filtered = df.loc[(df["lib_precis"] == "Centroïde Commune")
                              & (df["url_fiche"].notnull())].copy()

            urls = filtered["url_fiche"].tolist()
            scrapers = [IcpeScraper(url) for url in urls]
            fetch_parallel(scrapers)

            for scraper in scrapers:
                scraper.parse()
                scraper.find_adresse()

            filtered["adresse"] = [s.adresse for s in scrapers]

            # map the scraped addresses back onto the full chunk;
            # rows that were not scraped get None
            def get_adresse(row):
                try:
                    return filtered["adresse"].loc[row.name]
                except KeyError:
                    return None

            df["adresse"] = df.apply(get_adresse, axis=1)

            writer.write_dataframe(df)
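
IcpeScraper and fetch_parallel are external helpers. Fetching is network bound, so a plausible sketch of fetch_parallel is a simple thread pool, assuming each scraper exposes the fetch_url(session) method seen on CadastreCommuneScraper in the next example:

from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_parallel(scrapers, max_workers=10):
    """Fetch all scraper urls concurrently."""
    with requests.Session() as session, \
            ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(scraper.fetch_url, session)
                   for scraper in scrapers]
        for future in futures:
            # propagate any fetch error to the caller
            future.result()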
Example #12
def load_cadastre_for_department(department):

    cadastre_temp = Dataset(
        "etl", "cadastre_{dep}_temp".format(dep=department))

    dep_url = "https://cadastre.data.gouv.fr/data/etalab-cadastre" + \
        "/latest/geojson/communes/{dep}/".format(dep=department)

    # Find the list of communes in the web page
    scraper = CadastreCommuneScraper(dep_url)
    with requests.Session() as session:
        scraper.fetch_url(session)
    scraper.parse()
    scraper.find_communes()
    communes = scraper.communes

    # Iterate over each commune and load data into temporary table
    with cadastre_temp.get_writer() as writer:

        for commune in communes:

            with tempfile.NamedTemporaryFile() as temp:

                commune_url = "https://cadastre.data.gouv.fr/" + \
                    "data/etalab-cadastre/latest/geojson/communes/" + \
                    "{dep}/{commune}/cadastre-{commune}-parcelles.json.gz" \
                    .format(dep=department, commune=commune)

                urlretrieve(commune_url, temp.name)

                with gzip.open(temp.name) as f:

                    data = geojson.loads(f.read())

                    features = data["features"]

                    for feature in features:

                        s = shape(feature["geometry"])

                        row = {
                            "code": feature["id"],
                            "commune": feature["properties"]["commune"],
                            "prefixe": feature["properties"]["prefixe"],
                            "section": feature["properties"]["section"],
                            "numero": feature["properties"]["numero"],
                            "type": feature["type"],
                            "type_geom": feature["geometry"]["type"],
                            "geog": wkb.dumps(s, hex=True, srid=WGS84),
                            "version": 1
                        }

                        writer.write_row_dict(row)
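
CadastreCommuneScraper is not shown either. The url it receives points at a plain directory listing with one sub-directory per commune, so a rough sketch only needs to pull the INSEE codes out of the link targets. The method names mirror the calls above; the parsing itself is an assumption:

import re

class CadastreCommuneScraper:

    def __init__(self, url):
        self.url = url
        self.html = None
        self.links = []
        self.communes = []

    def fetch_url(self, session):
        response = session.get(self.url)
        response.raise_for_status()
        self.html = response.text

    def parse(self):
        # the directory listing links to one sub-directory per commune
        self.links = re.findall(r'href="([^"]+)/"', self.html)

    def find_communes(self):
        # keep only INSEE codes such as "26300" or "2A004"
        self.communes = [
            link for link in self.links
            if re.fullmatch(r"[0-9][0-9AB][0-9]{3}", link)
        ]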
Example #13
def add_geog():
    """
    This recipe joins with the cadastre table
    to clean up invalid parcelles.

    We use the cadastre table from the kelrisks schema
    rather than the one from the etl schema because there
    is not enough storage on the server for 4 cadastre tables
    (preprod `etl`, preprod `kelrisks`, prod `etl`, prod `kelrisks`).
    We therefore have to drop the cadastre tables from the
    `etl` schema after copying them into the `kelrisks` schema,
    so the `etl.cadastre` table does not necessarily exist
    when this DAG runs.
    """

    # Input dataset
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")
    cadastre = Dataset("kelrisks", "cadastre")

    # Output dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")

    BasiasCadastreParsed = basias_cadastre_parsed.reflect()
    Cadastre = cadastre.reflect()

    basias_cadastre_with_geog.write_dtype([
        *basias_cadastre_parsed.read_dtype(),
        Column("geog", Geometry(srid=4326))
    ])

    session = basias_cadastre_parsed.get_session()

    cond = (BasiasCadastreParsed.commune == Cadastre.commune) & \
           (BasiasCadastreParsed.section == Cadastre.section) & \
           (BasiasCadastreParsed.numero == Cadastre.numero)

    q = session.query(BasiasCadastreParsed, Cadastre.geog) \
        .join(Cadastre, cond) \
        .yield_per(500)

    with basias_cadastre_with_geog.get_writer() as writer:

        for (row, geog) in q:
            output_row = {**row2dict(row), "geog": None}
            del output_row["id"]
            if geog is not None:
                output_row["geog"] = geog

            writer.write_row_dict(output_row)

    session.close()
Example #14
def filter_departements():

    basol_source = Dataset("etl", "basol_source")
    basol_filtered = Dataset("etl", "basol_filtered")

    basol_filtered.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        *basol_source.read_dtype(primary_key="numerobasol")
    ])

    with basol_filtered.get_writer() as writer:
        for row in basol_source.iter_rows():
            if row["departement"] in DEPARTEMENTS:
                writer.write_row_dict(row)
Example #15
def load_sis():
    """
    Load the SIS data from a csv file containing
    a geom field in geojson format
    """

    sis_source = Dataset("etl", "sis_source")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("id_sis", String),
        Column("numero_affichage", String),
        Column("numero_basol", String),
        Column("adresse", Text),
        Column("lieu_dit", String),
        Column("code_insee", String),
        Column("nom_commune", String),
        Column("nom_departement", String),
        Column("x", Float(8)),
        Column("y", Float(8)),
        Column("surface_m2", Float),
        Column("geog", Geometry(srid=WGS84))
    ]

    sis_source.write_dtype(dtype)

    with sis_source.get_writer() as writer:

        with tempfile.NamedTemporaryFile() as temp:

            url = "https://kelrisks.fra1.digitaloceanspaces.com/sis.zip"
            urlretrieve(url, temp.name)

            with zipfile.ZipFile(temp.name) as zfile:

                with zfile.open("sis/sis.csv") as csvfile:

                    reader = csv.DictReader(io.TextIOWrapper(csvfile),
                                            delimiter=",",
                                            quotechar="\"")

                    for row in reader:

                        # replace the geojson "geom" field (not in the
                        # output schema) with a WKB geography
                        g = geojson.loads(row.pop("geom"))
                        s = shape(g)
                        row["geog"] = wkb.dumps(s, hex=True, srid=WGS84)

                        writer.write_row_dict(row)
Example #16
def join_sites_localisation():
    """
    Join the sites table with the localisation table
    """

    # datasets to join
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")

    # output dataset
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")

    # transform types
    output_dtype = [
        *basias_sites_prepared.read_dtype(),
        *basias_localisation_with_cadastre.read_dtype()
    ]

    output_dtype = merge_dtype(output_dtype)

    basias_sites_localisation_joined.write_dtype(output_dtype)

    # transform data
    BasiasSitesPrepared = basias_sites_prepared.reflect()
    BasiasLocalisation = basias_localisation_with_cadastre.reflect()

    session = basias_sites_prepared.get_session()

    join_query = session \
        .query(BasiasSitesPrepared, BasiasLocalisation) \
        .join(BasiasLocalisation,
              BasiasSitesPrepared.indice_departemental ==
              BasiasLocalisation.indice_departemental,
              isouter=True) \
        .all()

    with basias_sites_localisation_joined.get_writer() as writer:
        for (site, localisation) in join_query:
            output_row = {c.name: None for c in output_dtype}
            output_row = {**output_row, **row2dict(site)}
            if localisation:
                output_row = {**output_row, **row2dict(localisation)}
            del output_row["id"]
            writer.write_row_dict(output_row)

    session.close()
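
merge_dtype is not defined in these examples. Both input dtypes carry an "id" column (and share "indice_departemental"), so it presumably deduplicates the concatenated column list by name; a minimal sketch under that assumption:

def merge_dtype(columns):
    """Deduplicate a list of SQLAlchemy Columns by name,
    keeping the first occurrence of each."""
    merged = []
    seen = set()
    for column in columns:
        if column.name not in seen:
            seen.add(column.name)
            merged.append(column)
    return merged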
Example #17
def normalize_precision():
    """ normalize precision fields """

    # input dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # output dataset
    basol_normalized = Dataset("etl", "basol_normalized")

    dtype = basol_geocoded.read_dtype()
    basol_normalized.write_dtype(dtype)

    with basol_normalized.get_writer() as writer:
        for row in basol_geocoded.iter_rows():
            normalized = transformers.normalize_precision(row)
            writer.write_row_dict(normalized)
Example #18
def add_version():
    """ Add a version column for compatibility with Spring """

    # Input dataset
    basol_with_communes = Dataset("etl", "basol_with_commune")

    # Output dataset
    basol_with_version = Dataset("etl", "basol_with_version")

    basol_with_version.write_dtype(
        [*basol_with_communes.read_dtype(),
         Column("version", Integer)])

    with basol_with_version.get_writer() as writer:
        for row in basol_with_communes.iter_rows():
            writer.write_row_dict(row)
Example #19
def add_commune():
    """
    Add the commune outline as a new value for geog
    when the precision is MUNICIPALITY

    The SqlAlchemy query is equivalent to

    SELECT *
    FROM etl.basias_sites_localisation_joined A
    LEFT JOIN etl.commune B
    ON A.numero_insee = B.insee
    """

    # input dataset
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")
    communes = Dataset("etl", "commune")

    # output dataset
    basias_sites_with_commune = Dataset("etl", "basias_sites_with_commune")

    dtype = basias_sites_localisation_joined.read_dtype()
    basias_sites_with_commune.write_dtype(dtype)

    BasiasSites = basias_sites_localisation_joined.reflect()
    Commune = communes.reflect()

    session = basias_sites_localisation_joined.get_session()

    q = session.query(BasiasSites, Commune.geog) \
               .join(Commune,
                     BasiasSites.numero_insee == Commune.insee,
                     isouter=True) \
               .all()

    with basias_sites_with_commune.get_writer() as writer:

        for (row, commune) in q:

            if row.geog is None:
                row.geog = commune
                row.geog_precision = precisions.MUNICIPALITY
                row.geog_source = "numero_insee"

            writer.write_row_dict(row2dict(row))

    session.close()
Example #20
def add_commune():
    """
    Add the commune outline as a new value for geog
    when the precision is MUNICIPALITY

    The SqlAlchemy query is equivalent to

    SELECT *
    FROM etl.s3ic_with_parcelle A
    LEFT JOIN etl.commune B
    ON A.code_insee = B.insee
    """

    # input dataset
    s3ic_with_parcelle = Dataset("etl", "s3ic_with_parcelle")
    communes = Dataset("etl", "commune")

    # output dataset
    s3ic_with_commune = Dataset("etl", "s3ic_with_commune")

    dtype = s3ic_with_parcelle.read_dtype()
    s3ic_with_commune.write_dtype(dtype)

    S3icWithParcelle = s3ic_with_parcelle.reflect()
    Commune = communes.reflect()

    session = s3ic_with_parcelle.get_session()

    q = session.query(S3icWithParcelle, Commune.geog) \
               .join(Commune,
                     S3icWithParcelle.code_insee == Commune.insee,
                     isouter=True) \
               .all()

    with s3ic_with_commune.get_writer() as writer:

        for (row, commune) in q:

            if row.geog_precision == precisions.MUNICIPALITY:
                row.geog = commune
                row.precision = precisions.MUNICIPALITY
                row.geog_source = "code_insee"

            writer.write_row_dict(row2dict(row))

    session.close()
Example #21
def intersect():
    """
    Find the closest parcelle to the point and set it as
    new geography
    """

    # Input dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")
    cadastre = Dataset("etl", "cadastre")

    # Output dataset
    basol_intersected = Dataset("etl", "basol_intersected")

    dtype = basol_geog_merged.read_dtype()
    basol_intersected.write_dtype(dtype)

    Cadastre = cadastre.reflect()

    BasolGeogMerged = basol_geog_merged.reflect()
    session = basol_geog_merged.get_session()

    stmt = session.query(Cadastre.geog) \
                  .filter(func.st_dwithin(
                      Cadastre.geog,
                      BasolGeogMerged.geog,
                      0.0001)) \
                  .order_by(func.st_distance(
                      Cadastre.geog,
                      BasolGeogMerged.geog)) \
                  .limit(1) \
                  .label("nearest")

    q = session.query(BasolGeogMerged, stmt).all()

    with basol_intersected.get_writer() as writer:
        for (row, cadastre_geog) in q:
            if cadastre_geog is not None:
                row.geog = cadastre_geog
            writer.write_row_dict(row2dict(row))

    session.close()
Example #22
def add_communes():
    """
    set commune geog for all records where geog is not set with
    a better precision
    """

    # input dataset
    basol_with_parcels = Dataset("etl", "basol_with_parcels")
    communes = Dataset("etl", "commune")

    # output dataset
    basol_with_communes = Dataset("etl", "basol_with_commune")

    dtype = basol_with_parcels.read_dtype()
    basol_with_communes.write_dtype(dtype)

    BasolWithParcels = basol_with_parcels.reflect()
    Commune = communes.reflect()

    session = basol_with_parcels.get_session()

    q = session.query(BasolWithParcels, Commune.geog) \
               .join(Commune,
                     BasolWithParcels.code_insee == Commune.insee,
                     isouter=True) \
               .all()

    with basol_with_communes.get_writer() as writer:

        for (row, commune) in q:

            if row.geog is None:
                row.geog = commune
                row.geog_precision = precisions.MUNICIPALITY
                row.geog_source = "code_insee"

            writer.write_row_dict(row2dict(row))

    session.close()
Example #23
def join_localisation_cadastre():
    """
    Join the localisation table with the cadastre table
    """

    # Input datasets
    basias_localisation_intersected = Dataset(
        "etl", "basias_localisation_intersected")
    basias_cadastre_merged = Dataset("etl", "basias_cadastre_merged")

    # Output dataset
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")

    basias_localisation_with_cadastre.write_dtype(
        basias_localisation_intersected.read_dtype())

    BasiasLocalisationIntersected = basias_localisation_intersected.reflect()
    BasiasCadastreMerged = basias_cadastre_merged.reflect()

    session = basias_localisation_intersected.get_session()

    cond = BasiasCadastreMerged.indice_departemental \
        == BasiasLocalisationIntersected.indice_departemental

    q = session.query(
        BasiasLocalisationIntersected,
        BasiasCadastreMerged.geog) \
        .join(BasiasCadastreMerged, cond, isouter=True) \
        .all()

    with basias_localisation_with_cadastre.get_writer() as writer:
        for (row, cadastre_geog) in q:
            if cadastre_geog is not None:
                # replace geog with cadastre geog
                row.geog = cadastre_geog
                row.geog_precision = precisions.PARCEL
                row.geog_source = "cadastre"
            writer.write_row_dict(row2dict(row))

    session.close()
Example #24
def merge_cadastre_geog():
    """
    Aggregate the different parcelles of a given
    site into a MultiPolygon
    """

    # Input dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")

    # Output dataset
    basias_cadastre_merged = Dataset("etl", "basias_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("indice_departemental", String),
        Column("geog", Geometry(srid=4326))
    ]
    basias_cadastre_merged.write_dtype(dtype)

    BasiasCadastreWithGeog = basias_cadastre_with_geog.reflect()

    session = basias_cadastre_with_geog.get_session()

    select = [
        BasiasCadastreWithGeog.indice_departemental,
        func.st_multi(func.st_union(BasiasCadastreWithGeog.geog))
    ]

    q = session.query(*select) \
               .group_by(BasiasCadastreWithGeog.indice_departemental) \
               .all()

    with basias_cadastre_merged.get_writer() as writer:
        for (indice_departemental, geog) in q:
            row = {"indice_departemental": indice_departemental, "geog": geog}
            writer.write_row_dict(row)

    session.close()
Example #25
def load_communes():

    communes_filepath = "{data_dir}/communes/communes-20190101.json" \
        .format(data_dir=DATA_DIR)

    communes = Dataset("etl", "commune")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("insee", String),
        Column("nom", String),
        Column("geog", Geometry(srid=4326))
    ]

    communes.write_dtype(dtype)

    with open(communes_filepath, 'r') as f:

        data = geojson.loads(f.read())

        features = data["features"]

        with communes.get_writer() as writer:

            for feature in features:

                s = shape(feature["geometry"])

                row = {
                    "insee": feature["properties"]["insee"],
                    "nom": feature["properties"]["nom"],
                    "geog": wkb.dumps(s, hex=True, srid=WGS84)
                }

                writer.write_row_dict(row)
Example #26
def parse_cadastre():
    """
    This recipe extracts parcelles from the
    "numero_de_parcelle" field, which is not
    formatted consistently

    Example values:
    ZA 128 et 146
    ? 131
    AH 575-574-43-224 etc
    """

    # input dataset
    basias_cadastre_filtered = Dataset("etl", "basias_cadastre_filtered")
    basias_localisation_filtered = Dataset("etl",
                                           "basias_localisation_filtered")

    # output dataset
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")

    BasiasCadastreFiltered = basias_cadastre_filtered.reflect()
    BasiasLocalisationFiltered = basias_localisation_filtered.reflect()

    session = basias_cadastre_filtered.get_session()

    # write output schema
    output_dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("indice_departemental", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)
    ]

    basias_cadastre_parsed.write_dtype(output_dtype)

    # Join with the basias_localisation_filtered table to
    # retrieve the insee code
    q = session \
        .query(BasiasCadastreFiltered, BasiasLocalisationFiltered.numero_insee) \
        .join(
            BasiasLocalisationFiltered,
            BasiasCadastreFiltered.indice_departemental ==
            BasiasLocalisationFiltered.indice_departemental,
            isouter=True) \
        .all()

    with basias_cadastre_parsed.get_writer() as writer:
        for (cadastre, numero_insee) in q:
            row = {**row2dict(cadastre), "numero_insee": numero_insee}
            parcelles = transformers.extract_basias_parcelles_from_row(row)
            for parcelle in parcelles:
                output_row = {
                    "indice_departemental": row["indice_departemental"],
                    "commune": parcelle.commune,
                    "section": parcelle.section,
                    "numero": parcelle.numero
                }
                writer.write_row_dict(output_row)

    session.close()
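
transformers.extract_basias_parcelles_from_row is external. Based on the example values in the docstring above, a rough sketch could read a leading section code and collect every parcel number that follows. The field names come from this recipe; the real transformer certainly handles more formats:

import re
from collections import namedtuple

Parcelle = namedtuple("Parcelle", ["commune", "section", "numero"])

def extract_basias_parcelles_from_row(row):
    """Parse values like "ZA 128 et 146" or "AH 575-574-43-224"."""
    text = row.get("numero_de_parcelle") or ""
    commune = row.get("numero_insee")
    # leading section code, e.g. "ZA"; unknown sections ("? 131")
    # are skipped entirely
    match = re.match(r"\s*([A-Z]{1,2})\b", text)
    if not match:
        return []
    section = match.group(1)
    return [Parcelle(commune, section, numero)
            for numero in re.findall(r"\d+", text)]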
Example #27
def merge_geog():
    """
    Select the best geographic information among
    - (x_lambert2_etendue, y_lambert2_etendue)
    - (xl2_adresse, yl2_adresse)
    - the geocoded point

    and add the associated precision and the source of the info
    """

    # Input dataset
    basias_localisation_geocoded = Dataset("etl",
                                           "basias_localisation_geocoded")

    # Output dataset
    basias_localisation_geog_merged = Dataset(
        "etl", "basias_localisation_geog_merged")

    basias_localisation_geog_merged.write_dtype([
        *basias_localisation_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasiasLocalisationGeocoded = basias_localisation_geocoded.reflect()

    session = basias_localisation_geocoded.get_session()

    point_lambert2_etendue = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(BasiasLocalisationGeocoded.x_lambert2_etendue,
                              BasiasLocalisationGeocoded.y_lambert2_etendue),
            LAMBERT2), WGS84)

    point_adresse = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(BasiasLocalisationGeocoded.xl2_adresse,
                              BasiasLocalisationGeocoded.yl2_adresse),
            LAMBERT2), WGS84)

    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(BasiasLocalisationGeocoded.geocoded_longitude,
                          BasiasLocalisationGeocoded.geocoded_latitude), WGS84)

    q = session.query(BasiasLocalisationGeocoded, point_lambert2_etendue,
                      point_adresse, point_geocoded).all()

    with basias_localisation_geog_merged.get_writer() as writer:
        for (row, p_lambert2_etendue, p_adresse, p_geocoded) in q:
            output_row = {
                **row2dict(row), "geog": None,
                "geog_precision": None,
                "geog_source": None
            }
            if p_lambert2_etendue is not None:
                # this data is assumed to be parcel-accurate
                output_row["geog"] = p_lambert2_etendue
                output_row["geog_precision"] = precisions.PARCEL
                output_row["geog_source"] = "lambert2_etendue"
            elif p_adresse is not None and row.precision_adresse == "numéro":
                output_row["geog"] = p_adresse
                output_row["geog_precision"] = precisions.HOUSENUMBER
                output_row["geog_source"] = "adresse"
            elif p_geocoded is not None and row.geocoded_result_score > 0.6 \
                    and row.geocoded_result_type == precisions.HOUSENUMBER:
                output_row["geog"] = p_geocoded
                output_row["geog_precision"] = precisions.HOUSENUMBER
                output_row["geog_source"] = "geocodage"

            writer.write_row_dict(output_row)

    session.close()
Example #28
def add_parcelle():
    """
    Snap the geometric points to the nearest parcelle.
    We restrict the search to a neighborhood of roughly
    10m = 0.0001 degree.

    If no parcelle is found in this neighborhood we keep
    the original point.

    The SQL Alchemy query is equivalent to

    SELECT *,
        (SELECT geog
            FROM kelrisks.cadastre AS c
            WHERE st_dwithin(ic.geog, c.geog, 0.0001)
            ORDER BY st_distance(ic.geog, c.geog)
            LIMIT 1) nearest
    FROM etl.s3ic_geog_merged ic

    We use the cadastre table from the kelrisks schema
    rather than the one from the etl schema because there
    is not enough storage on the server for 4 cadastre tables
    (preprod `etl`, preprod `kelrisks`, prod `etl`, prod `kelrisks`).
    We therefore have to drop the cadastre tables from the
    `etl` schema after copying them into the `kelrisks` schema,
    so the `etl.cadastre` table does not necessarily exist
    when this DAG runs.
    """

    # Input dataset
    s3ic_geog_merged = Dataset("etl", "s3ic_geog_merged")
    cadastre = Dataset("kelrisks", "cadastre")

    # Output dataset
    s3ic_with_parcelle = Dataset("etl", "s3ic_with_parcelle")

    dtype = s3ic_geog_merged.read_dtype()
    s3ic_with_parcelle.write_dtype(dtype)

    Cadastre = cadastre.reflect()

    S3icGeogMerged = s3ic_geog_merged.reflect()
    session = s3ic_geog_merged.get_session()

    stmt = session.query(Cadastre.geog) \
                  .filter(func.st_dwithin(
                      Cadastre.geog,
                      S3icGeogMerged.geog,
                      0.0001)) \
                  .order_by(func.st_distance(
                      Cadastre.geog,
                      S3icGeogMerged.geog)) \
                  .limit(1) \
                  .label("nearest")

    q = session.query(S3icGeogMerged, stmt).yield_per(500)

    with s3ic_with_parcelle.get_writer() as writer:
        for (row, cadastre_geog) in q:
            if cadastre_geog is not None \
               and row.geog_precision != precisions.MUNICIPALITY:
                row.geog = cadastre_geog
            writer.write_row_dict(row2dict(row))

    session.close()
Example #29
def merge_geog():
    """
    Merge the geographic information and produce the
    normalized fields geog, geog_precision, geog_source

    We choose to favor the geocoding information when the
    initial precision is MUNICIPALITY and the geocoding
    score is greater than 0.6
    """

    # input dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    # output dataset
    s3ic_merged = Dataset("etl", "s3ic_geog_merged")

    dtype = s3ic_normalized.read_dtype()

    # geom => geog
    # lib_precis => precision
    dtype = [c for c in dtype if c.name not in ("geom", "lib_precis")]

    output_dtype = [
        *dtype,
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ]

    s3ic_merged.write_dtype(output_dtype)

    S3icNormalized = s3ic_normalized.reflect()

    session = s3ic_normalized.get_session()

    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(S3icNormalized.geocoded_longitude,
                          S3icNormalized.geocoded_latitude), WGS84)

    q = session.query(S3icNormalized, point_geocoded).all()

    with s3ic_merged.get_writer() as writer:

        for (row, point) in q:

            output_row = {
                **row2dict(row), "geog": row.geog,
                "geog_precision": row.precision,
                "geog_source": "initial_data"
            }

            c1 = row.precision not in \
                (precisions.HOUSENUMBER, precisions.PARCEL)
            c2 = row.geocoded_latitude and row.geocoded_longitude
            c3 = row.geocoded_result_score and row.geocoded_result_score > 0.6
            c4 = row.geocoded_result_type == precisions.HOUSENUMBER

            if c1 and c2 and c3 and c4:
                output_row["geog"] = point
                output_row["geog_precision"] = row.geocoded_result_type
                output_row["geog_source"] = "geocodage"

            writer.write_row_dict(output_row)

    session.close()
Example #30
def merge():
    """
    Merge the data for IDF and for the rest of France,
    renaming columns to obtain a common schema
    """

    # Data outside IDF
    s3ic_scraped = Dataset("etl", "s3ic_scraped")

    # IDF data
    s3ic_idf_with_geom = Dataset("etl", "s3ic_idf_with_geom")

    # output dataset
    s3ic_merged = Dataset("etl", "s3ic_merged")

    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code", String),
        Column("nom", String),
        Column("adresse", String),
        Column("complement_adresse", String),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("commune", String),
        Column("code_naf", String),
        Column("lib_naf", String),
        Column("num_siret", String),
        Column("regime", String),
        Column("lib_regime", String),
        Column("ipcc", String),
        Column("seveso", String),
        Column("lib_seveso", String),
        Column("famille_ic", String),
        Column("url_fiche", String),
        Column("x", BigInteger),
        Column("y", BigInteger),
        Column("precision", String),
        Column("geog", Geometry(srid=WGS84))
    ]

    s3ic_merged.write_dtype(output_dtype)

    try:
        with s3ic_merged.get_writer() as writer:

            for row in s3ic_scraped.iter_rows():
                del row["id"]
                row["code"] = row.pop("code_s3ic")
                row["nom"] = row.pop("nom_ets")
                row["code_insee"] = row.pop("cd_insee")
                row["code_postal"] = row.pop("cd_postal")
                row["commune"] = row.pop("nomcommune")
                row["precision"] = row.pop("lib_precis")

                s = to_shape(row.pop("geom"))
                row["geog"] = wkb.dumps(s, hex=True, srid=WGS84)

                writer.write_row_dict(row)
    except exc.NoSuchTableError:
        # if all the configured departements are in IDF,
        # this table does not exist
        pass

    try:
        with s3ic_merged.get_writer() as writer:
            for row in s3ic_idf_with_geom.iter_rows(primary_key="code"):
                writer.write_row_dict(row)
    except exc.NoSuchTableError:
        # if all the configured departements are outside IDF,
        # this table does not exist
        pass