def cleanUS(self, tbl, dbBase=None, locVar=["City", "State", "Zipcode"], reset=False):
    """Fill the lat, lng, State and CD columns of ``tbl`` from a ZIP / congressional-district database.

    Opens ``SQLite(db=self.db, tbl=self.tbl)``, attaches ``dbBase`` and indexes
    the location columns.  When the ``lat1`` column is missing (or ``reset`` is
    true) three ``s.merge`` calls are issued against the attached tables:

    * ``lat1``/``lng1``/``CD1``/``State1`` from ``congdistZip``, joined on the
      record's own zipcode column (``locVar[2]``);
    * ``<zipcode>2`` from ``USCities``, joined on city and state;
    * ``lat2``/``lng2``/``CD2``/``State2`` from ``congdistZip``, joined on that
      city/state-derived ``<zipcode>2``.

    The ``lat``, ``lng`` and ``CD`` columns are then added and blanked.  If any
    row still has an empty ``lat``, three update passes copy the candidate
    values into ``lat``, ``lng``, ``State`` and ``CD`` (later passes overwrite
    earlier ones).

    Args:
        tbl: Name of the table used in the raw SQL statements.  Note that the
            ``SQLite`` wrapper itself is opened with ``self.tbl``.
        dbBase: Path of the reference database to attach; defaults to
            ``/home/ron/disambig/geo/CD_ZIP.sqlite3``.
        locVar: Column names for city, state and zipcode, in that order.
        reset: Force the merge and the blanking of lat/lng/CD even if ``lat1``
            already exists.

    NOTE(review): the original file's indentation was lost; the ``s.add`` calls
    and the blanking ``UPDATE`` are assumed to sit inside the ``lat1``/``reset``
    branch (otherwise the later "any empty lat" check would always be true).
    NOTE(review): the update statements hard-code the column name ``State``
    rather than using ``locVar[1]`` -- confirm before passing a custom locVar.
    """
    #Congressional District + Zipcode
    if dbBase is None:
        dbBase = "/home/ron/disambig/geo/CD_ZIP.sqlite3"

    # Private copy: the default list is shared between calls and is handed to
    # s.index(), so never pass the shared object itself around.
    locVar = list(locVar)
    locStr = ", ".join(locVar)
    locQStr = "=? AND ".join(locVar) + "=?"

    s = SQLite(db=self.db, tbl=self.tbl)
    try:
        s.attach(dbBase)
        s.index(locVar)
        if 'lat1' not in s.columns(output=False) or reset:
            s.merge(key=[['lat1', 'latitude'], ['lng1', 'longitude'], ['CD1', 'CD'], ['State1', 'State']],
                    on=[[locVar[2], 'Zipcode']],
                    tableFrom='congdistZip', db='db')
            s.merge(key=[[locVar[2] + "2", 'Zipcode']],
                    on=[[locVar[0], 'City'], [locVar[1], 'State']],
                    tableFrom='USCities', db='db')
            s.merge(key=[['lat2', 'latitude'], ['lng2', 'longitude'], ['CD2', 'CD'], ['State2', 'State']],
                    on=[[locVar[2] + '2', 'Zipcode']],
                    tableFrom='congdistZip', db='db')
            s.add('lat', '')
            s.add('lng', '')
            s.add('CD', '')
            s.c.execute("UPDATE %s SET lat='', lng='', CD=''" % tbl)

        ## HERE ARE MY ASSUMPTIONS TO PUT LNG/LAT INTO LOU'S GRANT FILE -- 944,549 total records
        ## 1. City, State match is more precise than Zipcode match (sometimes Zip is just wrong..)
        ##    Use that as default -- (206,369) 21.8%
        ## 2. If City, State match doesn't happen, then I default to Zipcode match ... small (3,998) 0.4%
        ## 3. If CD, State match -- use Zipcode centroid. (693,922) 73.5%
        ## 4. 1-3 not capturing anything BUT city is filled (I did a quick scan, these all basically
        ##    look foreign, see CSV) (7,217) 0.7%
        ## 5. Organization labeled as "UNKNOWN" (without City, State) - (30,640) 3.3%
        ## 6. Blanks
        ##    a) create frequency table of Standized_Organization with Zipcodes.
        ##    b) check if organization exists in database. If so, align it with most frequent
        ##       Zipcode combo (1,356) 0.1%
        ## 7. Blank (non 6) (1007) 0.1%
        ## 8. UNKNOWN, Blank or "Foreign" -- Delete for now, although I have the CSV output saved
        ##    as blankCode.csv (38,864)
        ## 9. Remaining records: (905,685) 95.8%
        if s.c.execute("SELECT count(*) FROM %s WHERE lat='' or lat is null" % tbl).fetchone()[0] > 0:
            # Each pass is (candidate columns, optional HAVING clause); they run
            # in order, so a later pass overwrites what an earlier one wrote.
            passes = (
                # Update everything to reflect 2nd (city/state-derived) values.
                ("lat2, lng2, State2, CD2", ""),
                # If State,CD!= Take Lat1, Lng1 ... I trust the City, State
                # combo more overall (not the Zipcode).
                ("lat1, lng1, State1, CD1", " HAVING CD2='' or CD2 is null"),
                # If State,CD= Take Lat1, Lng1.
                ("lat1, lng1, State1, CD1", " HAVING State1=State"),
            )
            update_sql = "UPDATE %s SET lat=?, lng=?, State=?, CD=? WHERE %s" % (tbl, locQStr)
            for columns, having in passes:
                g = s.c.execute(
                    "SELECT %s, %s FROM %s GROUP BY %s%s"
                    % (columns, locStr, tbl, locStr, having)).fetchall()
                if g:
                    s.c.executemany(update_sql, g)
    finally:
        # Release the connection even when a merge or query fails.
        s.close()
def setKey(self, db, table="main"):
    """Write the grouped key into ``table`` of database ``db``.

    Reads ``(self.fld, <self.uqKey>2)`` pairs from the ``grp`` table reached
    through ``self.s`` into a dictionary, then opens ``SQLite(db)``, registers
    that lookup as the one-argument SQL function ``OrgDct`` and sets column
    ``self.uqKey`` of every row of ``table`` to ``OrgDct(<self.fld>)``.
    Values with no entry in the lookup are mapped to the empty string.

    Args:
        db: Database handed to ``SQLite(...)`` for the table being updated.
        table: Name of the table to update (default ``"main"``).
    """
    source = self.s
    source.open()
    pairs = source.c.execute("SELECT %s, %s2 FROM grp" % (self.fld, self.uqKey)).fetchall()
    OrgDct = dict(pairs)
    source.close()

    def OrgDctIt(x):
        # Unknown values fall back to an empty string.
        return OrgDct.get(x, "")

    target = SQLite(db)
    target.conn.create_function("OrgDct", 1, OrgDctIt)
    statement = "UPDATE %s SET %s=OrgDct(%s)" % (table, self.uqKey, self.fld)
    target.c.execute(statement)
    target.conn.commit()
    target.close()
def merge(self, keys, db=None, tbl="main"):
    """Collapse several keys into one by rewriting them to the smallest key.

    If the first key is shorter than 13 characters, every key is normalised to
    its first character followed by the remainder as a zero-padded 12-digit
    integer (e.g. ``"A5"`` -> ``"A000000000005"``).  The minimum of the keys is
    the canonical key; in the ``grp`` table reached through ``self.s`` every
    row whose ``<self.uqKey>2`` column equals one of the keys is set to it.
    When ``db`` is given, column ``self.uqKey`` of ``tbl`` in ``SQLite(db)`` is
    rewritten the same way.

    Args:
        keys: Sequence of key strings to merge.  An empty sequence is a no-op.
        db: Optional second database whose ``tbl`` is updated as well.
        tbl: Table name inside ``db`` (default ``"main"``).

    Raises:
        ValueError: if a short key's tail is not an integer (from ``int``).
    """
    if not keys:
        # Nothing to merge; previously this failed with IndexError on keys[0].
        return

    if len(keys[0]) < 13:
        keys = ["%s%0.12d" % (x[0], int(x[1:])) for x in keys]
    k1 = min(keys)
    # Key values are bound as parameters so quotes inside a key cannot break
    # the statement; rewriting k1 to itself would be a no-op and is skipped.
    pairs = [(k1, k) for k in keys if k != k1]

    s = self.s
    s.open()
    try:
        # Column names cannot be bound, so only the identifier is interpolated.
        s.c.executemany(
            "UPDATE grp SET %s2=? WHERE %s2=?" % (self.uqKey, self.uqKey), pairs)
        s.conn.commit()
    finally:
        s.close()

    if db is not None:
        t = SQLite(db)
        try:
            t.c.executemany(
                "UPDATE %s SET %s=? WHERE %s=?" % (tbl, self.uqKey, self.uqKey), pairs)
            t.conn.commit()
        finally:
            t.close()
def cleanUS(self, tbl, dbBase=None, locVar=["City", "State", "Zipcode"], reset=False):
    """Fill the lat, lng, State and CD columns of ``tbl`` from a ZIP / congressional-district database.

    Opens ``SQLite(db=self.db, tbl=self.tbl)``, attaches ``dbBase`` and indexes
    the location columns.  When the ``lat1`` column is missing (or ``reset`` is
    true) three ``s.merge`` calls are issued against the attached tables:

    * ``lat1``/``lng1``/``CD1``/``State1`` from ``congdistZip``, joined on the
      record's own zipcode column (``locVar[2]``);
    * ``<zipcode>2`` from ``USCities``, joined on city and state;
    * ``lat2``/``lng2``/``CD2``/``State2`` from ``congdistZip``, joined on that
      city/state-derived ``<zipcode>2``.

    The ``lat``, ``lng`` and ``CD`` columns are then added and blanked.  If any
    row still has an empty ``lat``, three update passes copy the candidate
    values into ``lat``, ``lng``, ``State`` and ``CD`` (later passes overwrite
    earlier ones).

    Args:
        tbl: Name of the table used in the raw SQL statements.  Note that the
            ``SQLite`` wrapper itself is opened with ``self.tbl``.
        dbBase: Path of the reference database to attach; defaults to
            ``/home/ron/disambig/geo/CD_ZIP.sqlite3``.
        locVar: Column names for city, state and zipcode, in that order.
        reset: Force the merge and the blanking of lat/lng/CD even if ``lat1``
            already exists.

    NOTE(review): the original file's indentation was lost; the ``s.add`` calls
    and the blanking ``UPDATE`` are assumed to sit inside the ``lat1``/``reset``
    branch (otherwise the later "any empty lat" check would always be true).
    NOTE(review): the update statements hard-code the column name ``State``
    rather than using ``locVar[1]`` -- confirm before passing a custom locVar.
    """
    #Congressional District + Zipcode
    if dbBase is None:
        dbBase = "/home/ron/disambig/geo/CD_ZIP.sqlite3"

    # Private copy: the default list is shared between calls and is handed to
    # s.index(), so never pass the shared object itself around.
    locVar = list(locVar)
    locStr = ", ".join(locVar)
    locQStr = "=? AND ".join(locVar) + "=?"

    s = SQLite(db=self.db, tbl=self.tbl)
    try:
        s.attach(dbBase)
        s.index(locVar)
        if 'lat1' not in s.columns(output=False) or reset:
            s.merge(key=[['lat1', 'latitude'], ['lng1', 'longitude'], ['CD1', 'CD'], ['State1', 'State']],
                    on=[[locVar[2], 'Zipcode']],
                    tableFrom='congdistZip', db='db')
            s.merge(key=[[locVar[2] + "2", 'Zipcode']],
                    on=[[locVar[0], 'City'], [locVar[1], 'State']],
                    tableFrom='USCities', db='db')
            s.merge(key=[['lat2', 'latitude'], ['lng2', 'longitude'], ['CD2', 'CD'], ['State2', 'State']],
                    on=[[locVar[2] + '2', 'Zipcode']],
                    tableFrom='congdistZip', db='db')
            s.add('lat', '')
            s.add('lng', '')
            s.add('CD', '')
            s.c.execute("UPDATE %s SET lat='', lng='', CD=''" % tbl)

        ## HERE ARE MY ASSUMPTIONS TO PUT LNG/LAT INTO LOU'S GRANT FILE -- 944,549 total records
        ## 1. City, State match is more precise than Zipcode match (sometimes Zip is just wrong..)
        ##    Use that as default -- (206,369) 21.8%
        ## 2. If City, State match doesn't happen, then I default to Zipcode match ... small (3,998) 0.4%
        ## 3. If CD, State match -- use Zipcode centroid. (693,922) 73.5%
        ## 4. 1-3 not capturing anything BUT city is filled (I did a quick scan, these all basically
        ##    look foreign, see CSV) (7,217) 0.7%
        ## 5. Organization labeled as "UNKNOWN" (without City, State) - (30,640) 3.3%
        ## 6. Blanks
        ##    a) create frequency table of Standized_Organization with Zipcodes.
        ##    b) check if organization exists in database. If so, align it with most frequent
        ##       Zipcode combo (1,356) 0.1%
        ## 7. Blank (non 6) (1007) 0.1%
        ## 8. UNKNOWN, Blank or "Foreign" -- Delete for now, although I have the CSV output saved
        ##    as blankCode.csv (38,864)
        ## 9. Remaining records: (905,685) 95.8%
        if s.c.execute("SELECT count(*) FROM %s WHERE lat='' or lat is null" % tbl).fetchone()[0] > 0:
            # Each pass is (candidate columns, optional HAVING clause); they run
            # in order, so a later pass overwrites what an earlier one wrote.
            passes = (
                # Update everything to reflect 2nd (city/state-derived) values.
                ("lat2, lng2, State2, CD2", ""),
                # If State,CD!= Take Lat1, Lng1 ... I trust the City, State
                # combo more overall (not the Zipcode).
                ("lat1, lng1, State1, CD1", " HAVING CD2='' or CD2 is null"),
                # If State,CD= Take Lat1, Lng1.
                ("lat1, lng1, State1, CD1", " HAVING State1=State"),
            )
            update_sql = "UPDATE %s SET lat=?, lng=?, State=?, CD=? WHERE %s" % (tbl, locQStr)
            for columns, having in passes:
                g = s.c.execute(
                    "SELECT %s, %s FROM %s GROUP BY %s%s"
                    % (columns, locStr, tbl, locStr, having)).fetchall()
                if g:
                    s.c.executemany(update_sql, g)
    finally:
        # Release the connection even when a merge or query fails.
        s.close()