def insert_simple_item(kind, resources): timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if kind not in controle_kind: raise Exception("kind unknown") splitted_kind = kind.split('_') level = controle_kind[kind] resource_type = splitted_kind[0] if resource_type not in resource_kinds: raise Exception("resource unknown") conn = db_init.db_connect() cur = conn.cursor() cur.execute("INSERT INTO controles.metadata VALUES (DEFAULT, %s, %s, %s, %s, %s) RETURNING id;", (timestamp, kind, level, len(resources), "FR")) meta_id = cur.fetchone()[0] if len(resources) == 0: conn.commit() return cur.execute("INSERT INTO controles.item VALUES (DEFAULT, %s) RETURNING id;", (meta_id,)) item_id = cur.fetchone()[0] for res in resources: cur.execute("INSERT INTO controles.resource VALUES (DEFAULT, %s, %s, %s, %s)", (res[0], res[1], resource_type, item_id)) conn.commit()
def execute_simple(kind, sql): conn = db_init.db_connect() cur = conn.cursor() cur.execute(sql) resources = cur.fetchall() insert_simple_item(kind, resources)
def check_housenumber_same_ordinal(): conn = db_init.db_connect() cur = conn.cursor() cur.execute("SELECT hn1.pk, hn1.version, hn2.pk, hn2.version FROM housenumber as hn1, housenumber as hn2 WHERE hn1.pk != hn2.pk AND hn1.parent_id =hn2.parent_id AND hn1.number = hn2.number AND upper(hn1.ordinal) = substring(upper(hn2.ordinal) from 1 for 1);") resources = cur.fetchall() insert_double_item("housenumber_same_ordinal", resources)
def check_group_kind(): conn = db_init.db_connect() cur = conn.cursor() script_path = os.path.dirname(__file__) way_file_path = os.path.join(script_path, 'way.txt') area_file_path = os.path.join(script_path, 'area.txt') way_arr = open(way_file_path, 'r').read().splitlines() ways = '' for way in way_arr: ways += way + '|' ways = ways[:-1] areas = '' area_arr = open(area_file_path, 'r').read().splitlines() for area in area_arr: areas += area + '|' areas = areas[:-1] way_schem = '('+ways+') %' area_schem = '('+areas+') %' cur.execute("""SELECT pk, version FROM "group" where (name SIMILAR TO %s AND name NOT SIMILAR TO %s AND kind != 'way') OR (name SIMILAR TO %s AND name NOT SIMILAR TO %s AND kind != 'area')""", (way_schem, area_schem, area_schem, way_schem)) resources = cur.fetchall() insert_simple_item("group_kind", resources)
def check_pile(): conn = db_init.db_connect() cur = conn.cursor() cur.execute("drop table if exists position90;") cur.execute( """create table position90 as select p.*, st_transform(p.center,2154) as c2154 from position p, housenumber h, "group" g, municipality m where p.housenumber_id=h.pk and h.parent_id=g.pk and g.municipality_id=m.pk and m.insee like '90%';""") cur.execute( "CREATE INDEX position90_c2154 ON position90 USING gist (c2154);") conn.commit() # cur.execute( # """SELECT p1.pk, p1.version, p2.pk, p2.version # FROM position p1, position p2, housenumber h1, housenumber h2 # WHERE p1.housenumber_id=h1.pk AND p2.housenumber_id=h2.pk # AND p1.source_kind=p2.source_kind # AND st_distance(st_transform(p1.center,2154), st_transform(p2.center,2154))<5 # AND NOT h1.pk=h2.pk;""" # ) cur.execute( "SELECT p1.pk, p1.version, p2.pk, p2.version,'' FROM position90 p1, position90 p2, housenumber h1, housenumber h2 WHERE p1.housenumber_id=h1.pk AND p2.housenumber_id=h2.pk AND p1.source_kind=p2.source_kind AND p1.c2154 && st_buffer(p2.c2154,5) AND st_distance(p1.c2154, p2.c2154)<5 AND NOT h1.pk=h2.pk;" ) resources = cur.fetchall() insert_multiple_item("housenumber_pile", resources)
def check_group_name_format(): conn = db_init.db_connect() cur = conn.cursor() sql = """[!"$%&()*+,/:;<=>?[\]^_|~#]+""" cur.execute("""select pk, version from "group" where name ~* %s;""", (sql,)) resources = cur.fetchall() insert_simple_item("group_name_format", resources)
def check_housenumber_ordinal_format(): conn = db_init.db_connect() cur = conn.cursor() sql = """[!"$%&()*+,./:;<=>?[\]^-_|~#]+""" cur.execute("select pk, version from housenumber where ordinal ~* %s;", (sql,)) resources = cur.fetchall() insert_simple_item("housenumber_ordinal_format", resources)
def sum_stat_table(cur=db_connect()[0]): cols = [ "population", "poverty_rate", "pct_renter_occupied", "median_gross_rent", "median_household_income", "median_property_value", "rent_burden", "pct_white", "pct_af_am", "pct_hispanic", "pct_am_ind", "pct_asian", "pct_nh_pi", "pct_multiple", "pct_other", "renter_occupied_households", "eviction_filings", "evictions", "eviction_rate", "eviction_filing_rate" ] cur.execute( '''SELECT avg(population), avg(poverty_rate), avg(pct_renter_occupied), avg(median_gross_rent), avg(median_household_income), avg(median_property_value), avg(rent_burden), avg(pct_white), avg(pct_af_am), avg(pct_hispanic), avg(pct_am_ind), avg(pct_asian), avg(pct_nh_pi), avg(pct_multiple), avg(pct_other), avg(renter_occupied_households), avg(eviction_filings), avg(evictions), avg(eviction_rate), avg(eviction_filing_rate) FROM evictions.blockgroup;''' ) avg_row = list(cur.fetchone()) cur.execute( '''SELECT min(population), min(poverty_rate), min(pct_renter_occupied), min(median_gross_rent), min(median_household_income), min(median_property_value), min(rent_burden), min(pct_white), min(pct_af_am), min(pct_hispanic), min(pct_am_ind), min(pct_asian), min(pct_nh_pi), min(pct_multiple), min(pct_other), min(renter_occupied_households), min(eviction_filings), min(evictions), min(eviction_rate), min(eviction_filing_rate) FROM evictions.blockgroup;''' ) min_row = list(cur.fetchone()) cur.execute( '''SELECT max(population), max(poverty_rate), max(pct_renter_occupied), max(median_gross_rent), max(median_household_income), max(median_property_value), max(rent_burden), max(pct_white), max(pct_af_am), max(pct_hispanic), max(pct_am_ind), max(pct_asian), max(pct_nh_pi), max(pct_multiple), max(pct_other), max(renter_occupied_households), max(eviction_filings), max(evictions), max(eviction_rate), max(eviction_filing_rate) FROM evictions.blockgroup;''' ) max_row = list(cur.fetchone()) cur.execute( '''SELECT stddev(population), stddev(poverty_rate), stddev(pct_renter_occupied), stddev(median_gross_rent), stddev(median_household_income), stddev(median_property_value), stddev(rent_burden), stddev(pct_white), stddev(pct_af_am), stddev(pct_hispanic), stddev(pct_am_ind), stddev(pct_asian), stddev(pct_nh_pi), stddev(pct_multiple), stddev(pct_other), stddev(renter_occupied_households), stddev(eviction_filings), stddev(evictions), stddev(eviction_rate), stddev(eviction_filing_rate) FROM evictions.blockgroup;''' ) stddev_row = list(cur.fetchone()) df = pd.DataFrame( OrderedDict({ "variable": cols, "avg": avg_row, "min": min_row, "max": max_row, "std": stddev_row })) df.to_csv("sum_stat_table.csv") return df
def check_group_same_name(): conn = db_init.db_connect() cur = conn.cursor() cur.execute("""select g1.pk, g1.version, g2.pk, g2.version, '' FROM "group" as g1, "group" as g2 where g1.municipality_id = g2.municipality_id and g1.pk > g2.pk and dmetaphone_alt(g1.name)=dmetaphone_alt(g2.name) and levenshtein(g1.name, g2.name)::float/LEAST(length(g1.name), length(g2.name)) + (1-similarity(g1.name, g2.name)::float ) < 0.44""" ) resources = cur.fetchall() insert_double_item("group_same_name", resources)
def outlier_table(cur=db_connect()[0]): cols = [ "population", "poverty_rate", "pct_renter_occupied", "median_gross_rent", "median_household_income", "median_property_value", "rent_burden", "pct_white", "pct_af_am", "pct_hispanic", "pct_am_ind", "pct_asian", "pct_nh_pi", "pct_multiple", "pct_other", "renter_occupied_households", "eviction_filings", "evictions", "eviction_rate", "eviction_filing_rate" ] table = [] for col in cols: cur.execute("ROLLBACK;") #drop temp table if it exists already try: cur.execute("DROP TABLE tmp;") except: pass #create tmp table for vairable tmp = '''CREATE TEMP TABLE tmp as SELECT avg({}) as mean, stddev({})*3 as out_there FROM evictions.blockgroup;'''.format(col, col) #identify count and average of high and low outliers query_high = "SELECT count({}), avg({}) FROM evictions.blockgroup WHERE {} > (select mean + out_there from tmp);".format( col, col, col) query_low = "SELECT count({}), avg({}) FROM evictions.blockgroup WHERE {} < (select mean - out_there from tmp);".format( col, col, col) #execute queries and build output table cur.execute(tmp) cur.execute(query_high) row = [col] + list(cur.fetchone()) cur.execute(query_low) row += list(cur.fetchone()) table.append(row) df = pd.DataFrame(table) df.columns = ["variable", "count high", "avg high", "count low", "avg low"] df.to_csv("outlier_table.csv") return df
def check_housenumber_outside_municipality(): conn = db_init.db_connect() cur = conn.cursor() cur.execute( """SELECT hn.pk, hn.version, 'num : ' || hn.number || ' // parent : ' || hn.parent_id || ' // insee : ' || mu.insee FROM housenumber hn INNER JOIN postcode pc ON hn.postcode_id=pc.pk INNER JOIN municipality mu ON pc.municipality_id=mu.pk INNER JOIN surface_commune sc ON mu.insee=sc.code_insee_cdc INNER JOIN position po ON hn.pk=po.housenumber_id WHERE sc.gcms_detruit=FALSE AND hn.postcode_id IS NOT NULL AND ST_Within(ST_Transform(po.center, 2154), sc.geometrie)=FALSE UNION SELECT hn.pk, hn.version, 'num : ' || hn.number || ' // parent : ' || hn.parent_id || ' // insee : ' || mu.insee FROM housenumber hn INNER JOIN "group" gp ON hn.postcode_id=gp.pk INNER JOIN municipality mu ON gp.municipality_id=mu.pk INNER JOIN surface_commune sc ON mu.insee=sc.code_insee_cdc INNER JOIN position po ON hn.pk=po.housenumber_id WHERE sc.gcms_detruit=FALSE AND hn.postcode_id IS NULL AND ST_Within(ST_Transform(po.center, 2154), sc.geometrie)=FALSE; """) resources = cur.fetchall() insert_simple_item("housenumber_outside_municipality", resources)
def insert_multiple_item(kind, resources): timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if kind not in controle_kind: raise Exception("kind unknown") splitted_kind = kind.split('_') level = controle_kind[kind] resource_type = splitted_kind[0] if resource_type not in resource_kinds: raise Exception("resource unknown") conn = db_init.db_connect() cur = conn.cursor() cur.execute( "INSERT INTO controles.metadata VALUES (DEFAULT, %s, %s, %s, %s, %s) RETURNING id;", (timestamp, kind, level, len(resources), 'FR')) meta_id = cur.fetchone()[0] g = nx.Graph() for res in resources: g.add_edge(res[0], res[2]) cc = nx.connected_components(g) for cci in cc: print("composante connexe {}".format(cci))
def check_housenumber_missing_ordinal(): timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') kind = 'housenumber_missing_ordinal' if kind not in controle_kind: raise Exception("kind unknown") splitted_kind = kind.split('_') level = controle_kind[kind] resource_type = splitted_kind[0] if resource_type not in resource_kinds: raise Exception("resource unknown") conn = db_init.db_connect() cur = conn.cursor() cur.execute("""CREATE TEMPORARY TABLE IF NOT EXISTS temp_pos ( ordinal VARCHAR(16), pos INT ); INSERT INTO temp_pos (ordinal, pos) VALUES('', 1); INSERT INTO temp_pos (ordinal, pos) VALUES('A', 2); INSERT INTO temp_pos (ordinal, pos) VALUES('B', 3); INSERT INTO temp_pos (ordinal, pos) VALUES('C', 4); INSERT INTO temp_pos (ordinal, pos) VALUES('D', 5); INSERT INTO temp_pos (ordinal, pos) VALUES('E', 6); INSERT INTO temp_pos (ordinal, pos) VALUES('F', 7); INSERT INTO temp_pos (ordinal, pos) VALUES('G', 8); INSERT INTO temp_pos (ordinal, pos) VALUES('H', 9); INSERT INTO temp_pos (ordinal, pos) VALUES('I', 10); INSERT INTO temp_pos (ordinal, pos) VALUES('J', 11); INSERT INTO temp_pos (ordinal, pos) VALUES('K', 12); INSERT INTO temp_pos (ordinal, pos) VALUES('L', 13); INSERT INTO temp_pos (ordinal, pos) VALUES('BIS', 2); INSERT INTO temp_pos (ordinal, pos) VALUES('TER', 3); INSERT INTO temp_pos (ordinal, pos) VALUES('QUATER', 4); INSERT INTO temp_pos (ordinal, pos) VALUES('QUINQUIES', 5); INSERT INTO temp_pos (ordinal, pos) VALUES('SEXTO', 6); SELECT number, parent_id FROM ( SELECT pk, version, number, parent_id, pos, rank() OVER (PARTITION BY number, parent_id ORDER BY pos ASC) AS posit FROM ( SELECT hn.pk, hn.version, hn.number, hn.parent_id, hn.ordinal, tp.pos FROM housenumber hn JOIN temp_pos tp ON coalesce(hn.ordinal, '') = tp.ordinal ) AS t1 ) AS t2 WHERE posit!= pos GROUP BY number, parent_id; """) resources = cur.fetchall() cur.execute( "INSERT INTO controles.metadata VALUES (DEFAULT, %s, %s, %s, %s, %s) RETURNING id;", (timestamp, kind, level, len(resources), 'FR')) meta_id = cur.fetchone()[0] for res in resources: cur.execute( "INSERT INTO controles.item VALUES (DEFAULT, %s, %s) RETURNING id;", (meta_id, '')) item_id = cur.fetchone()[0] cur.execute( "SELECT hn.pk, hn.version, coalesce(hn.ordinal, '') FROM housenumber hn JOIN temp_pos tp ON coalesce(hn.ordinal, '')=tp.ordinal WHERE number=%s AND parent_id=%s ORDER BY tp.pos;", (res[0], res[1])) housenumbers = cur.fetchall() comment = '' for hn in housenumbers: comment = comment + hn[2] + '-' cur.execute( "INSERT INTO controles.resource VALUES (DEFAULT, %s, %s, %s, %s);", (hn[0], hn[1], resource_type, item_id)) cur.execute("UPDATE controles.item SET comment=%s WHERE id=%s;", (comment, item_id)) conn.commit()