import csv
import datetime
import logging
import os
import pprint
import re

import yaml
from amazon.api import AmazonAPI  # python-amazon-simple-product-api (assumed client)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# init_logging, Mysql, imitate_user, build_search_url, strip_final_slash,
# get_base_url and get_page are project-local helpers; their import paths are
# not shown here.

# Assumed minimal PhantomJS capabilities; extend with a user agent if needed.
dcap = dict(DesiredCapabilities.PHANTOMJS)


class WalmartProdSearch(object):
    def __init__(self):
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("WalmartProdSearch object initialized and logging enabled")
        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "walmart_config.yml"), "r") as fh:
            settings = yaml.safe_load(fh)  # config is plain data, so safe_load
        self.db = Mysql(settings['db_config'])
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                          service_args=['--ignore-ssl-errors=true',
                                                        '--ssl-protocol=any'])
        self.driver.set_window_size(1024, 768)
        self.shipping_rate = 0.75  # $rate/lb  # TODO: shift this to AZ class
        self.outfile = "../data/test.csv"
        self.depth_limit = settings['depth_limit']
        self.debug = settings['debug']
        self.fieldnames = ('net', 'roi', 'title', 'price', 'az_price', 'weight',
                           'az_sales_rank', 'az_match', 'url', 'img', 'az_url',
                           'az_asin', 'item_id')
        self.site_url = settings['site_url']
        self.page_url = settings['page_url']
        self.base_url = strip_final_slash(get_base_url(self.site_url))
        #self.az = AZ()

    def destroy(self):
        """Destroy all objects and clean up."""
        #self.driver.service.process.send_signal(signal.SIGTERM)
        self.logger.info("Database connection closed...")
        self.db.exit()
        self.logger.info("Walmart object cleanly destroyed...")
        self.driver.quit()

    def scrape(self, pc=None, change_url=None):
        """
        :param change_url: the changing part of the wider site url, if there
            are multiple sections to hit.
        :param pc: an integer indicating where to start with a paginated url.
        """
        self.run = True  # initialization of a site/section
        if pc is not None:
            self.pc = pc
        elif not hasattr(self, 'pc'):
            self.pc = 0  # ensure a counter exists on the first call
        while self.run:
            url = self.next_page_url(build_search_url(self.site_url, change_url))
            try:
                page = self.get_page(url)
            except Exception:
                self.logger.error("Error with %s and skipped" % url)
                continue
            self.get_list(page)
        if change_url is None:
            self.logger.info("Site %s finished" % self.site_url)
        else:
            self.logger.info("Section %s finished" % change_url)

    def init_output(self):
        if not os.path.exists(self.outfile):
            with open(self.outfile, "w", encoding='utf-8') as fh:
                outwriter = csv.DictWriter(fh, fieldnames=self.fieldnames, delimiter="\t")
                outwriter.writeheader()

    def _persist_new_match(self, item_id):
        """
        Take a walmart item id and find the match_cw row with that item id.
        If none exists, insert a new row, as this is a new product.

        :param item_id:
        """
        with self.db.con.cursor() as cursor:
            # Find the selected product from walmart in the existing product table
            select_sql = "SELECT pk_id FROM walmart_product WHERE item_id=%s"
            cursor.execute(select_sql, (item_id,))
            ret = cursor.fetchone()
            wal_id = ret['pk_id']
            # Find the corresponding row in match_cw
            select_sql = "SELECT pk_id FROM match_cw WHERE wal_pk_id=%s"
            cursor.execute(select_sql, (wal_id,))
            ret = cursor.fetchone()
            if ret is None:
                # didn't get a match row so make one
                insert_sql = "INSERT INTO match_cw (wal_pk_id, last_update) VALUES (%s, now())"
                try:
                    cursor.execute(insert_sql, (wal_id,))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to insert new walmart product into match.')
            else:
                self.logger.info('Product %s is already in match table.' % wal_id)

    def process_output(self, data):
        with self.db.con.cursor() as cursor:
            # Try to find the selected product from walmart in the existing product table
            select_sql = "SELECT pk_id, price FROM walmart_product WHERE item_id=%s"
            cursor.execute(select_sql, (data['item_id'],))
            ret = cursor.fetchone()
            if ret is None:
                # If none found, it's a new product, so push it in
                insert_sql = ("INSERT INTO walmart_product "
                              "(price, url, img, item_id, title, last_changed, last_read) "
                              "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                try:
                    cursor.execute(insert_sql, (data['price'].strip(),
                                                data['url'].strip(),
                                                data['img'].strip(),
                                                data['item_id'].strip(),
                                                data['title'].strip(),
                                                datetime.datetime.now(),
                                                datetime.datetime.now()))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to insert new walmart product.')
            elif data['price'] != ret['price']:
                # Already known and the price changed, so update price and timestamps
                update_sql = ("UPDATE walmart_product "
                              "SET price=%s, last_read=now(), last_changed=now(), "
                              "title=%s, url=%s, img=%s "
                              "WHERE pk_id=%s")
                try:
                    cursor.execute(update_sql, (data['price'].strip(),
                                                data['title'].strip(),
                                                data['url'].strip(),
                                                data['img'].strip(),
                                                ret['pk_id']))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to update walmart product.')
            else:
                # Already known and the price is unchanged, so just refresh the timestamp
                update_sql = "UPDATE walmart_product SET last_read=now() WHERE pk_id=%s"
                cursor.execute(update_sql, (ret['pk_id'],))
                self.db.con.commit()
        # insert into the match_cw table accordingly
        self._persist_new_match(data['item_id'].strip())
        # with open(self.outfile, 'a', encoding='utf-8') as fh:
        #     outwriter = csv.DictWriter(fh, fieldnames=self.fieldnames, delimiter="\t")
        #     outwriter.writerow(data)

    def get_dollar_amount(self, f):
        if isinstance(f, str):
            f = f.replace(",", "", 1)
            return round(float(re.match(r'\$?(\d+[.]\d\d)', f.strip()).group(1)), 2)
        else:
            return f

    def get_net(self, data):
        az_price = data['az_price']
        if az_price == 0.0:
            return round(0.0, 2)
        price = self.get_dollar_amount(data['price'])
        if data['weight'] in ("Weight not fetched", "Not Available"):
            weight = 0.0
        else:
            weight = float(data['weight'])
        try:
            net = az_price - (price * 1.08 + az_price * 0.3 + weight * self.shipping_rate)
        except Exception:
            net = 0.0
        try:
            net = round(net, 2)
        except Exception:
            self.logger.error("Bad net value for %s - price:%s, az_price:%s, weight:%s"
                              % (data['title'], data['price'], data['az_price'], data['weight']))
            net = 0.0
        return net

    def get_roi(self, data):
        net = self.get_dollar_amount(data['net'])
        price = self.get_dollar_amount(data['price'])
        return round(net / price, 2)

    def get_list(self, page):
        """
        Take a search results page from Walmart and parse out items to save.
        Has error checking for no search results or an empty set.

        :param page: bs4 object returned from get_page
        """
        imitate_user(0.5)
        if page.find(string=re.compile(r'We found 0 results')):
            self.run = False
            return
        elif not page.find("ul", {"class": "tile-list-grid"}):
            self.run = False
            return
        else:
            entries = page.find("ul", {"class": "tile-list-grid"})
            for e in entries:
                if len(e) == 1:
                    continue
                elif e.name == "script":
                    continue
                else:
                    entry = {}
                    try:
                        entry['title'] = e.find("a", {"class": "js-product-title"}).get_text().strip()
                    except Exception:
                        continue
                    href = e.find("a", {"class": "js-product-title"}).attrs['href']
                    if 'http://' in href:
                        entry['url'] = href
                    else:
                        entry['url'] = "".join((self.base_url, href))
                    try:
                        entry['price'] = e.find("span", {"class": "price-display"}).get_text().replace('$', '')
                    except Exception:
                        continue
                    entry['img'] = e.find("img", {"class": "product-image"}).attrs['data-default-image']
                    # a dict literal keeps only one "class" key, so match on tile-grid-unit
                    entry['item_id'] = e.find("div", {"class": "tile-grid-unit"}).attrs['data-item-id']
                    #entry['az_price'], entry['weight'], entry['az_sales_rank'], entry['az_match'], \
                    #    entry['az_url'], entry['az_asin'] = self.az.find_best_match(entry['title'], 'Toys')
                    #entry['net'] = self.get_net(entry)
                    #entry['roi'] = self.get_roi(entry)
                    self.process_output(entry)

    def next_page_url(self, url):
        self.pc += 1
        imitate_user(0.5)
        next_url = url
        if self.page_url:
            next_url += self.page_url
            next_url += str(self.pc)
        if self.pc >= self.depth_limit:
            self.run = False  # depth limit reached
        return next_url

    def get_page(self, url):
        try:
            self.logger.info("Getting %s" % url)
            self.driver.get(url)
            # self.driver.get_cookies()
        except ValueError:
            imitate_user(2)
            try:
                self.driver.get(url)
            except Exception:
                raise
        except Exception as e:
            self.logger.error("Error getting %s: %s", url, e)
        # try:
        #     wait = WebDriverWait(self.driver, 3)
        #     wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div")))
        # except Exception as e:
        #     self.logger.error("WebDriverWait error")
        page = BeautifulSoup(self.driver.page_source, "lxml")
        return page

    def _is_recent(self, t):
        """
        Check whether a timestamp is within the default time period.

        :param t: timestamp
        :return: True if t is within the default period.
        """
        return (datetime.datetime.now() - t) < datetime.timedelta(days=1)

    def get_target(self):
        """
        Find the next node at walmart to search for products. If no target has
        gone more than 24 hours since being searched, return None. Otherwise,
        take the first returned and update its timestamp to now to reset its
        priority for search.

        :return: 'node' is the cat component of the url to hit at walmart.
        """
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT node, last_read FROM walmart_node ORDER BY last_read ASC"
            cursor.execute(select_sql)
            ret = cursor.fetchone()
            if self._is_recent(ret['last_read']):
                self.logger.info("All product information is up to date - exiting.")
                return None
            else:
                update_sql = "UPDATE walmart_node SET last_read=now() WHERE node=%s"
                cursor.execute(update_sql, (ret['node'],))
                self.db.con.commit()
                return ret['node']
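
# A minimal usage sketch, not part of the original class: drain the
# walmart_node queue until every node has been read within the last day. It
# assumes walmart_config.yml and the walmart_* tables referenced above exist;
# the node string is passed through as the changing url section.
#
# Worked example for get_net/get_roi: with az_price=20.00, price=10.00 and
# weight=2 lb, net = 20 - (10*1.08 + 20*0.30 + 2*0.75) = 20 - 18.30 = 1.70,
# and roi = 1.70 / 10.00 = 0.17.
if __name__ == "__main__":
    wps = WalmartProdSearch()
    try:
        node = wps.get_target()
        while node is not None:
            wps.scrape(pc=0, change_url=node)  # pages each node up to depth_limit
            node = wps.get_target()
    finally:
        wps.destroy()  # release the DB connection and the PhantomJS process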
class Populate(object):
    """Builds the census_2010_schema with various tables.

    loads: census_2010 zips and tract ids.
    loads: US geo details for all states, counties, cities with zip, lat/long.
    """

    def __init__(self):
        """Constructor for Populate"""
        self.acsdb = Mysql("../Pydb/mysql_config.yml")

    def destroy(self):
        self.acsdb.exit()

    def load_tracts(self):
        tracts = {}  # {track_id: track_name}
        with open("data/TRACT_ZIP_122015.csv") as fh:
            r_count = 0
            reader = csv.reader(fh)
            for r in reader:
                r_count += 1
                if r_count < 2:  # skip header
                    continue
                tracts[r[0]] = self.get_tract_name(r[0])
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM census_tract_2010"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("census_tract_2010 table already exists. Skipping...")
                return
            for t in tracts:
                update_sql = "INSERT INTO `census_tract_2010` (`track_id`, `track_name`) VALUES (%s, %s)"
                try:
                    cursor.execute(update_sql, (t, tracts[t]))
                except Exception:
                    pass  # duplicate rows are skipped silently
            self.acsdb.con.commit()

    @staticmethod
    def get_tract_name(tract_id):
        """Pulls the Census Tract Name out of up to the last 6 digits of the
        track_id. This is not finished: it takes 020100 and yields 0201.00,
        when it should drop the initial 0, and for suffixes with no
        information it should drop the 00.

        :param tract_id: the full tract id from the input file
        """
        base = re.search(r'\d+(\d{4})(\d\d)', tract_id)
        name = ".".join((base.group(1), base.group(2)))
        return name

    def load_zip_tract_crosswalk(self):
        data = []
        with open("data/TRACT_ZIP_122015.csv") as fh:
            reader = csv.reader(fh)
            index = 0
            for r in reader:
                index += 1
                if index < 2:  # skip header
                    continue
                # TRACT, ZIP, RES_RATIO, BUS_RATIO, OTH_RATIO, TOT_RATIO
                data.append([r[0], r[1], r[2], r[3], r[4], r[5]])
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip_tract_cw"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("zip_tract_cw table already exists. Skipping...")
                return
            for r in data:
                zip_id_sql = "SELECT `pk_id` FROM `zip` WHERE `zipcode`=%s"
                cursor.execute(zip_id_sql, (r[1],))
                zip_pk_id = cursor.fetchone()
                if zip_pk_id is None:
                    print(r[1])
                track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` WHERE `track_id`=%s"
                cursor.execute(track_id_sql, (r[0],))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    print(r[0])
                    continue
                insert_sql = ("INSERT INTO `zip_tract_cw` "
                              "(`track_pk_id`, `zip_pk_id`, `res_ratio`, "
                              "`bus_ratio`, `oth_ratio`, `tot_ratio`) "
                              "VALUES (%s, %s, %s, %s, %s, %s)")
                try:
                    cursor.execute(insert_sql, (track_pk_id['pk_id'], zip_pk_id['pk_id'],
                                                r[2], r[3], r[4], r[5]))
                    self.acsdb.con.commit()
                except Exception as e:
                    print("tract %s attempted to map into zip %s and not found - %s"
                          % (r[0], r[1], e))

    def load_geo_details(self):
        geo = {}
        with open("data/US/US.txt") as fh:
            reader = csv.reader(fh, delimiter='\t')
            for r in reader:
                geo[r[1]] = [{'city': r[2]}, {'county': r[5]}, {'state': r[3]},
                             {'lat': r[9]}, {'lon': r[10]}]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("Zip table already exists. Skipping...")
                return
            for g in geo:
                update_sql = ("INSERT INTO `zip` (`city`, `county`, `state`, `lat`, `lon`, `zipcode`) "
                              "VALUES (%s, %s, %s, %s, %s, %s)")
                try:
                    cursor.execute(update_sql, (geo[g][0]['city'], geo[g][1]['county'],
                                                geo[g][2]['state'], geo[g][3]['lat'],
                                                geo[g][4]['lon'], g))
                except Exception:
                    pass  # bad rows are skipped silently
            self.acsdb.con.commit()

    def load_S2503(self):
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # first two rows of the csv are not data
                    continue
                # name = re.search(r'[\w .]+', r[2]).group(0)
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                # estimates live in every other column from r[3] through r[79]
                data[r[1]] = [
                    {'HC01_VC01': float(r[3])}, {'HC02_VC01': float(r[5])}, {'HC03_VC01': float(r[7])},
                    {'HC01_VC03': float(r[9])}, {'HC02_VC03': float(r[11])}, {'HC03_VC03': float(r[13])},
                    {'HC01_VC04': float(r[15])}, {'HC02_VC04': float(r[17])}, {'HC03_VC04': float(r[19])},
                    {'HC01_VC05': float(r[21])}, {'HC02_VC05': float(r[23])}, {'HC03_VC05': float(r[25])},
                    {'HC01_VC06': float(r[27])}, {'HC02_VC06': float(r[29])}, {'HC03_VC06': float(r[31])},
                    {'HC01_VC07': float(r[33])}, {'HC02_VC07': float(r[35])}, {'HC03_VC07': float(r[37])},
                    {'HC01_VC08': float(r[39])}, {'HC02_VC08': float(r[41])}, {'HC03_VC08': float(r[43])},
                    {'HC01_VC09': float(r[45])}, {'HC02_VC09': float(r[47])}, {'HC03_VC09': float(r[49])},
                    {'HC01_VC10': float(r[51])}, {'HC02_VC10': float(r[53])}, {'HC03_VC10': float(r[55])},
                    {'HC01_VC11': float(r[57])}, {'HC02_VC11': float(r[59])}, {'HC03_VC11': float(r[61])},
                    {'HC01_VC12': float(r[63])}, {'HC02_VC12': float(r[65])}, {'HC03_VC12': float(r[67])},
                    {'HC01_VC13': float(r[69])}, {'HC02_VC13': float(r[71])}, {'HC03_VC13': float(r[73])},
                    {'HC01_VC14': float(r[75])}, {'HC02_VC14': float(r[77])}, {'HC03_VC14': float(r[79])}]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM S2503_ACS"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("S2503_ACS table already exists. Skipping...")
                return
            if not data:
                return
            # column order mirrors the single-key dict list built above
            cols = [next(iter(d)) for d in next(iter(data.values()))]
            insert_sql = ("INSERT INTO `S2503_ACS` (" +
                          ", ".join("`%s`" % c for c in cols) +
                          ", `track_pk_id`) VALUES (" +
                          ", ".join(["%s"] * (len(cols) + 1)) + ")")
            for r in data:
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (r,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                # each entry is a single-key dict; unwrap the values in SQL column order
                params = [next(iter(d.values())) for d in data[r]] + [track_pk_id['pk_id']]
                try:
                    cursor.execute(insert_sql, params)
                except Exception as e:
                    print(e)
            self.acsdb.con.commit()

    def load_S2503_moe(self):
        # TODO: Should refactor this to fold MOE into the data load... Needs new ddl as well
        """
        Same general method as load_S2503, but this loads the MOE for each
        variable in the dataset.
        """
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:
                    continue
                # name = re.search(r'[\w .]+', r[2]).group(0)
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                # margins of error live in every other column from r[4] through r[80]
                data[r[1]] = [
                    {'HC01_VC01_MOE': float(r[4])}, {'HC02_VC01_MOE': float(r[6])}, {'HC03_VC01_MOE': float(r[8])},
                    {'HC01_VC03_MOE': float(r[10])}, {'HC02_VC03_MOE': float(r[12])}, {'HC03_VC03_MOE': float(r[14])},
                    {'HC01_VC04_MOE': float(r[16])}, {'HC02_VC04_MOE': float(r[18])}, {'HC03_VC04_MOE': float(r[20])},
                    {'HC01_VC05_MOE': float(r[22])}, {'HC02_VC05_MOE': float(r[24])}, {'HC03_VC05_MOE': float(r[26])},
                    {'HC01_VC06_MOE': float(r[28])}, {'HC02_VC06_MOE': float(r[30])}, {'HC03_VC06_MOE': float(r[32])},
                    {'HC01_VC07_MOE': float(r[34])}, {'HC02_VC07_MOE': float(r[36])}, {'HC03_VC07_MOE': float(r[38])},
                    {'HC01_VC08_MOE': float(r[40])}, {'HC02_VC08_MOE': float(r[42])}, {'HC03_VC08_MOE': float(r[44])},
                    {'HC01_VC09_MOE': float(r[46])}, {'HC02_VC09_MOE': float(r[48])}, {'HC03_VC09_MOE': float(r[50])},
                    {'HC01_VC10_MOE': float(r[52])}, {'HC02_VC10_MOE': float(r[54])}, {'HC03_VC10_MOE': float(r[56])},
                    {'HC01_VC11_MOE': float(r[58])}, {'HC02_VC11_MOE': float(r[60])}, {'HC03_VC11_MOE': float(r[62])},
                    {'HC01_VC12_MOE': float(r[64])}, {'HC02_VC12_MOE': float(r[66])}, {'HC03_VC12_MOE': float(r[68])},
                    {'HC01_VC13_MOE': float(r[70])}, {'HC02_VC13_MOE': float(r[72])}, {'HC03_VC13_MOE': float(r[74])},
                    {'HC01_VC14_MOE': float(r[76])}, {'HC02_VC14_MOE': float(r[78])}, {'HC03_VC14_MOE': float(r[80])}]
        with self.acsdb.con.cursor() as cursor:
            # guard checks S2503_ACS, the table this method updates
            test_sql = "SELECT HC01_VC01_MOE FROM S2503_ACS"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("S2503_ACS table already provisioned with MOEs. Skipping...")
                return
            if not data:
                return
            cols = [next(iter(d)) for d in next(iter(data.values()))]
            update_sql = ("UPDATE `S2503_ACS` SET " +
                          ", ".join("`%s`=%%s" % c for c in cols) +
                          " WHERE pk_id=%s")
            for r in data:
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (r,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                get_pk_id_sql = "SELECT pk_id FROM S2503_ACS WHERE track_pk_id=%s"
                cursor.execute(get_pk_id_sql, (track_pk_id['pk_id'],))
                ret = cursor.fetchone()
                if ret is None:  # no S2503 row for this tract to receive MOEs
                    continue
                params = [next(iter(d.values())) for d in data[r]] + [ret['pk_id']]
                try:
                    cursor.execute(update_sql, params)
                except Exception as e:
                    print(e)
            self.acsdb.con.commit()

    def load_S2501(self):
        data = {}
        with open("data/ACS_14_5YR_S2501.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # csv file first two rows are not data
                    continue
                # name = re.search(r'[\w .]+', r[2]).group(0)
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                data[r[1]] = [
                    {'HC01_VC01': float(r[3])}, {'HC01_VC01_MOE': float(r[4])},
                    {'HC02_VC01': float(r[5])}, {'HC02_VC01_MOE': float(r[6])},
                    {'HC03_VC01': float(r[7])}, {'HC03_VC01_MOE': float(r[8])},
                    {'HC01_VC03': float(r[9])}, {'HC01_VC03_MOE': float(r[10])},
                    {'HC02_VC03': float(r[11])}, {'HC02_VC03_MOE': float(r[12])},
                    {'HC03_VC03': float(r[13])}, {'HC03_VC03_MOE': float(r[14])},
                    {'HC01_VC04': float(r[15])}, {'HC01_VC04_MOE': float(r[16])},
                    {'HC02_VC04': float(r[17])}, {'HC02_VC04_MOE': float(r[18])},
                    {'HC03_VC04': float(r[19])}, {'HC03_VC04_MOE': float(r[20])},
                    {'HC01_VC05': float(r[21])}, {'HC01_VC05_MOE': float(r[22])},
                    {'HC02_VC05': float(r[23])}, {'HC02_VC05_MOE': float(r[24])},
                    {'HC03_VC05': float(r[25])}, {'HC03_VC05_MOE': float(r[26])},
                    {'HC01_VC06': float(r[27])}, {'HC01_VC06_MOE': float(r[28])},
                    {'HC02_VC06': float(r[29])}, {'HC02_VC06_MOE': float(r[30])},
                    {'HC03_VC06': float(r[31])}, {'HC03_VC06_MOE': float(r[32])},
                    {'HC01_VC14': float(r[51])}, {'HC01_VC14_MOE': float(r[52])},
                    {'HC02_VC14': float(r[53])}, {'HC02_VC14_MOE': float(r[54])},
                    {'HC03_VC14': float(r[55])}, {'HC03_VC14_MOE': float(r[56])},
                    {'HC01_VC15': float(r[57])}, {'HC01_VC15_MOE': float(r[58])},
                    {'HC02_VC15': float(r[59])}, {'HC02_VC15_MOE': float(r[60])},
                    {'HC03_VC15': float(r[61])}, {'HC03_VC15_MOE': float(r[62])},
                    {'HC01_VC19': float(r[81])}, {'HC01_VC19_MOE': float(r[82])},
                    {'HC02_VC19': float(r[83])}, {'HC02_VC19_MOE': float(r[84])},
                    {'HC03_VC19': float(r[85])}, {'HC03_VC19_MOE': float(r[86])},
                    {'HC01_VC39': float(r[183])}, {'HC01_VC39_MOE': float(r[184])},
                    {'HC02_VC39': float(r[185])}, {'HC02_VC39_MOE': float(r[186])},
                    {'HC03_VC39': float(r[187])}, {'HC03_VC39_MOE': float(r[188])}]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM S2501_ACS"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("S2501_ACS table already exists. Skipping...")
                return
            if not data:
                return
            # column order mirrors the single-key dict list built above
            cols = [next(iter(d)) for d in next(iter(data.values()))]
            insert_sql = ("INSERT INTO `S2501_ACS` (" +
                          ", ".join("`%s`" % c for c in cols) +
                          ", `track_pk_id`) VALUES (" +
                          ", ".join(["%s"] * (len(cols) + 1)) + ")")
            for r in data:
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (r,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                params = [next(iter(d.values())) for d in data[r]] + [track_pk_id['pk_id']]
                try:
                    cursor.execute(insert_sql, params)
                except Exception as e:
                    print(e)
            self.acsdb.con.commit()
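
# A sketch of the refactor flagged in the load_S2503_moe TODO, not part of the
# original class: estimates sit at odd offsets r[3]..r[79] and each margin of
# error follows at the next even offset, so both dicts can be derived in one
# loop instead of two hand-written lists. s2503_row_to_values is a hypothetical
# helper name; it assumes the same cleaned row format load_S2503 produces.
def s2503_row_to_values(r):
    """Return ({col: estimate}, {col_MOE: margin}) for one cleaned S2503 row."""
    vcs = ['VC01'] + ['VC%02d' % v for v in range(3, 15)]  # VC01, VC03..VC14
    vals, moes = {}, {}
    offset = 3
    for vc in vcs:
        for hc in ('HC01', 'HC02', 'HC03'):
            vals['%s_%s' % (hc, vc)] = float(r[offset])
            moes['%s_%s_MOE' % (hc, vc)] = float(r[offset + 1])
            offset += 2
    return vals, moes

# Note on load order when populating from scratch: load_geo_details (zip) and
# load_tracts (census_tract_2010) must both run before load_zip_tract_crosswalk,
# which looks up pk_ids in both tables, and before the S25xx loaders.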
class AZ(object):
    def __init__(self):
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Amazon object initialized")
        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "az_config.yml"), "r") as fh:
            settings = yaml.safe_load(fh)
        self.db = Mysql(settings['db_config'])
        self.access_key = settings['access_key_id']
        self.secret_key = settings['secret_key_id']
        self.associate_tag = settings['associate_tag']
        self.default_weight = settings['default_weight']
        self.az_price = None
        self.az_asin = None
        self.az_sales_rank = None
        self.az_url = None
        self.az_match = None
        self.amazon = AmazonAPI(self.access_key, self.secret_key, self.associate_tag)

    def destroy(self):
        self.logger.info("Database connection closed...")
        self.db.exit()
        self.logger.info("Amazon object destroyed")

    def find_best_match(self, title, cat='All'):
        # TODO: consider using cat='Blended' for default
        """
        Populates self.product, which must be initialized via _init_prod_dict()
        first (match_products does this).

        :param title: string containing the source site product title
        :param cat: string containing the category for searchindex on amazon
        :return: product dict with price, weight, sales_rank, offer_url, and
            T if match occurred on title search, K if match occurred on
            keyword search, N if no match occurred.
        """
        self._find_by_title(title, cat)
        if self.product['az_price'] != 0:
            self.product['az_match'] = 'T'
        if self.product['az_price'] == 0:
            self._find_by_key(title, cat)
            if self.product['az_price'] != 0:
                self.product['az_match'] = 'K'
        if self.product['az_price'] == 0:
            # we didn't find any match so clean remaining attrs
            (self.product['az_sales_rank'],
             self.product['az_match'],
             self.product['az_url']) = (0, 'N', '')
        self._get_attrs()
        #return self.az_price, self.az_weight, self.az_sales_rank, self.az_match, \
        #    self.az_url, self.az_asin  # consider returning a status flag

    def _find_by_title(self, title, cat):
        lowest = 0.0
        try:
            products = self.amazon.search(Title=title, SearchIndex=cat)
            for i, p in enumerate(products):
                price = p.price_and_currency[0]
                if lowest == 0.0:
                    lowest = price
                    self.product['asin'] = p.asin
                elif price < lowest:
                    lowest = price
                    self.product['asin'] = p.asin
        except Exception:
            self.logger.warning("%s had no match in amazon" % title)
        self.product['az_price'] = lowest

    def _find_by_key(self, title, cat):
        lowest = 0.0
        try:
            products = self.amazon.search(Keywords=title, SearchIndex=cat)
            for i, p in enumerate(products):
                price = p.price_and_currency[0]
                if lowest == 0.0:
                    lowest = price
                    self.product['asin'] = p.asin
                elif price < lowest:
                    lowest = price
                    self.product['asin'] = p.asin
        except Exception:
            self.logger.warning("%s had no match in amazon" % title)
        self.product['az_price'] = lowest

    def _get_attrs(self):
        (r, w, u, i) = (None, None, None, None)
        if self.product.get('asin', None) is not None:
            try:
                p = self.amazon.lookup(ItemId=self.product['asin'])
                r = int(p.sales_rank)
                i = p.images[0].LargeImage.URL.text
                w = p.get_attribute('ItemDimensions.Weight')
                u = p.offer_url
                self._get_meta_rate(p.reviews[1])
            except Exception:
                pass
            if r is None:
                r = 0
            if i is None:
                i = ''
            if w is None:
                w = self.product['az_weight']  # this should not be in hundredth-pounds
            else:
                w = float(w) * 0.01  # it comes from amazon in hundredth-pounds, seemingly
        else:
            # there was no product found
            (r, w, u, i) = (0, 0, '', '')
        self.product['az_sales_rank'] = r
        self.product['az_weight'] = w
        self.product['az_url'] = u
        self.product['img'] = i

    def _get_weight(self):
        w = None
        if self.product['asin'] is not None:
            try:
                p = self.amazon.lookup(ItemId=self.product['asin'])
                w = p.get_attribute('ItemDimensions.Weight')
            except Exception:
                pass
            if w is None:
                w = self.product['az_weight']
            else:
                w = float(w) * 0.01  # it comes from amazon in hundredth-pounds, seemingly
        else:
            w = 0
        return w

    def _get_meta_rate(self, url):
        rank_page = get_page(url, 1)  # project-local helper returning parsed page(s)
        if rank_page[0].find(string=re.compile(r'There are no customer reviews for this item')):
            self.product['num_reviews'] = 0
            self.product['avg_review'] = 0
        else:
            try:
                rating = rank_page[0].find("span", {"class": "asinReviewsSummary"}).img.attrs['title']
                self.product['num_reviews'] = rating
            except Exception:
                self.product['num_reviews'] = 0
            try:
                reviews = rank_page[0].find(string=re.compile(r'\d+ customer reviews'))
                # TODO: strip out text
                self.product['avg_review'] = reviews
            except Exception:
                self.product['avg_review'] = 0

    def _init_prod_dict(self):
        self.product = {}
        self.product['az_weight'] = self.default_weight
        self.product['az_price'] = 0.0
        self.product['avg_review'] = 0
        self.product['num_reviews'] = 0

    def match_products(self):
        """
        Look for unmatched products in match_cw. On finding a new product,
        attempt to match it with an amazon product. When successful, update
        the amazon product row and the match_cw row.
        """
        new_prods = self._get_unmatched_products()
        for k in new_prods:
            self._init_prod_dict()
            self.product['name'] = new_prods[k]['title']
            self.find_best_match(self.product['name'])
            if self.product.get('asin', None) is None:
                continue
            self._persist_product(self.product, k)

    def _get_unmatched_products(self):
        """
        Find all the empty products in match_cw. Empty means a null az_pk_id
        in a product row, which means it was written by another product find
        and had no match.

        :return: dict {wal_pk_id: {title, match_pk_id}}
        """
        new_products = {}
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id, wal_pk_id FROM match_cw WHERE az_pk_id is NULL"
            try:
                cursor.execute(select_sql, ())
                ret = cursor.fetchall()
                new_products = {e['wal_pk_id']: {'match_pk_id': e['pk_id']} for e in ret}
            except Exception as e:
                self.logger.exception(e)
            select_sql = "SELECT title FROM walmart_product WHERE pk_id=%s"
            for e in new_products:
                cursor.execute(select_sql, (e,))
                ret = cursor.fetchone()
                if ret is not None:
                    new_products[e]['title'] = ret['title']
        return new_products

    def _persist_match(self, az_pk_id, wal_pk_id):
        """
        Write az_pk_id to the existing match_cw row.

        :param az_pk_id: the az_pk_id to write to the match_cw row
        :param wal_pk_id: the wal_pk_id that identifies the row to update
        """
        if az_pk_id is None:
            return
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id FROM match_cw WHERE wal_pk_id=%s"
            cursor.execute(select_sql, (wal_pk_id,))
            ret = cursor.fetchone()
            if ret is None:
                self.logger.warning("There was a match issue with wal_pk_id %s" % wal_pk_id)
                return  # there was no row for that walmart product
            update_sql = ("UPDATE match_cw "
                          "SET az_pk_id=%s, last_update=now() "
                          "WHERE pk_id=%s")
            cursor.execute(update_sql, (az_pk_id, ret['pk_id']))
            self.db.con.commit()

    def _update_match(self, match):
        """
        TODO: this needs to update things like roi, etc. possibly pull it out...

        :param match:
        """
        if match is None:
            return
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id FROM match_cw WHERE az_pk_id=%s"
            cursor.execute(select_sql, (match['pk_id'],))
            ret = cursor.fetchone()
            if ret is None:
                self.logger.warning("There was a match issue with az_pk_id %s" % match['pk_id'])
                return  # there was no row for that amazon product
            update_sql = ("UPDATE match_cw "
                          "SET az_pk_id=%s, wal_pk_id=%s, net=%s, roi=%s, "
                          "match_score=%s, match_meth=%s, last_update=now() "
                          "WHERE pk_id=%s")
            cursor.execute(update_sql, (match['az_pk_id'], match['wal_pk_id'],
                                        match['net'], match['roi'],
                                        match['match_score'], match['match_meth'],
                                        ret['pk_id']))
            self.db.con.commit()

    def _persist_product(self, product, wal_pk_id):
        """
        Insert or update a row for the product in the az table, then update
        the corresponding row in match_cw with the az_pk_id. Expects to find
        az_product with asin. If a match_cw_id is passed in the product dict,
        then there is a row in match_cw that needs to be updated with the
        product az_pk_id.

        :param product: dict with values to insert/update in az_product.
        :param wal_pk_id: pk_id from original search on no amazon info in row
        """
        if 'name' not in product:
            self.logger.warning("Attempt to persist empty product!")
            return
        with self.db.con.cursor() as cursor:
            (az_id, az_price) = (0, 0.0)  # wider-scope vars to track through the entire method
            select_sql = "SELECT pk_id, price FROM az_product WHERE asin=%s"
            cursor.execute(select_sql, (product['asin'],))
            ret = cursor.fetchone()
            if ret is None:
                # no row in az_product with that asin, so insert one
                insert_sql = ("INSERT INTO az_product "
                              "(asin, avg_rate, img, num_reviews, price, sales_rank, "
                              "shipping_cost, title, upc, url, weight, last_read, last_changed) "
                              "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now(), now())")
                try:
                    cursor.execute(insert_sql, (product['asin'],
                                                product['avg_review'],
                                                product['img'],
                                                product['num_reviews'],
                                                product.get('az_price', 0.0),
                                                product['az_sales_rank'],
                                                product.get('shipping_cost', 1.00),  # no shipping cost or upc
                                                product['name'],
                                                product.get('upc', ''),
                                                product['az_url'],
                                                product['az_weight']))
                    self.db.con.commit()
                    # now go back and get the pk_id of the row just inserted
                    select_sql = "SELECT pk_id, price FROM az_product WHERE asin=%s"
                    cursor.execute(select_sql, (product['asin'],))
                    ret = cursor.fetchone()
                    az_id = ret.get('pk_id', None)
                    if az_id is None:
                        self.logger.warning("Error in inserting new az_product.")
                        return
                    az_price = ret.get('price', 0.0)
                except Exception:
                    self.logger.exception('Failed to insert new az product: %s.' % product['asin'])
                    return
            elif product['az_price'] != ret['price']:
                # product exists and the price changed, so update it;
                # all eleven placeholders need matching parameters
                update_sql = ("UPDATE az_product "
                              "SET asin=%s, avg_rate=%s, img=%s, num_reviews=%s, "
                              "price=%s, sales_rank=%s, shipping_cost=%s, "
                              "last_read=now(), last_changed=now(), "
                              "title=%s, url=%s, weight=%s "
                              "WHERE pk_id=%s")
                try:
                    cursor.execute(update_sql, (product['asin'],
                                                product['avg_review'],
                                                product['img'],
                                                product['num_reviews'],
                                                product['az_price'],
                                                product['az_sales_rank'],
                                                product.get('shipping_cost', 1.00),
                                                product['name'],
                                                product['az_url'],
                                                product['az_weight'],
                                                ret['pk_id']))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to update az product: %s.' % product['asin'])
            else:
                # product exists but the price is unchanged, so simply refresh the timestamp
                update_sql = "UPDATE az_product SET last_read=now() WHERE pk_id=%s"
                cursor.execute(update_sql, (ret['pk_id'],))
                self.db.con.commit()
            # now place the match in match_cw
            if product.get('az_match'):
                self._persist_match(ret['pk_id'], wal_pk_id)
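
# A minimal usage sketch, not part of the original class: sweep match_cw for
# walmart rows that still lack an amazon match. find_best_match expects
# _init_prod_dict() to have run first; match_products handles that internally
# and records the match method (T/K/N) on each product it persists.
if __name__ == "__main__":
    az = AZ()
    try:
        az.match_products()
    finally:
        az.destroy()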
class Model(object):
    """Derives per-zip rental features from the ACS tables."""

    def __init__(self):
        """Constructor for Model"""
        self.db = Mysql("Pydb/mysql_config.yml")
        self.tracts = {}

    def destroy(self):
        self.db.exit()

    def get_tracts(self):
        """
        Builds up a data set of all tracts, including the ratio to compute to
        each zip.
        """
        self._get_zips()
        self._build_tracts()
        self._add_median_incomes()
        self._add_housing()
        self._add_occupancy_features()

    def _get_zips(self):
        """
        Collects all the Nassau county pk_ids of zips, sorted by city asc.
        """
        with self.db.con.cursor() as cursor:
            select_sql = ("SELECT `pk_id`, `zipcode` FROM zip WHERE `county`=%s "
                          "ORDER BY `city` ASC")
            cursor.execute(select_sql, ("Nassau",))
            ret = cursor.fetchall()
            for r in ret:
                self.tracts[r['zipcode']] = {'zip_pk_id': r['pk_id']}

    def _build_tracts(self):
        """
        Builds up tract information in the dataset, adding a track_pk_id and
        the ratio of residences of that tract to be counted in the chosen zip.
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                select_sql = ("SELECT `track_pk_id`, `res_ratio` FROM `zip_tract_cw` "
                              "WHERE `zip_pk_id`=%s")
                cursor.execute(select_sql, (self.tracts[z]['zip_pk_id'],))
                ret = cursor.fetchall()
                for r in ret:
                    self.tracts[z][r['track_pk_id']] = {'res_ratio': r['res_ratio']}

    def _add_median_incomes(self):
        """
        Builds median incomes for each tract, by occupied, owner-occupied,
        renter.
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k, v in self.tracts[z].items():
                    if k == 'zip_pk_id':
                        continue
                    select_sql = ("SELECT `HC01_VC14`, `HC02_VC14`, `HC03_VC14` FROM `S2503_ACS` "
                                  "WHERE `track_pk_id`=%s")
                    cursor.execute(select_sql, (k,))
                    ret = cursor.fetchone()
                    if ret is not None:
                        self.tracts[z][k]['HC01_VC14'] = ret['HC01_VC14']
                        self.tracts[z][k]['HC02_VC14'] = ret['HC02_VC14']
                        self.tracts[z][k]['HC03_VC14'] = ret['HC03_VC14']

    def _add_housing(self):
        """
        Builds housing totals for occupied, owner-occupied, renter in each
        tract.
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k, v in self.tracts[z].items():
                    if k == 'zip_pk_id':
                        continue
                    select_sql = ("SELECT `HC01_VC01`, `HC01_VC01_MOE`, "
                                  "`HC02_VC01`, `HC02_VC01_MOE`, "
                                  "`HC03_VC01`, `HC03_VC01_MOE` FROM `S2503_ACS` "
                                  "WHERE `track_pk_id`=%s")
                    cursor.execute(select_sql, (k,))
                    ret = cursor.fetchone()
                    if ret:
                        self.tracts[z][k]['HC01_VC01'] = ret['HC01_VC01']
                        self.tracts[z][k]['HC02_VC01'] = ret['HC02_VC01']
                        self.tracts[z][k]['HC03_VC01'] = ret['HC03_VC01']
                        self.tracts[z][k]['HC01_VC01_MOE'] = ret['HC01_VC01_MOE']
                        self.tracts[z][k]['HC02_VC01_MOE'] = ret['HC02_VC01_MOE']
                        self.tracts[z][k]['HC03_VC01_MOE'] = ret['HC03_VC01_MOE']
                    else:
                        # TODO: Move print to a warn in logger
                        print("_add_housing failed for track pk: {}".format(k))

    def _add_occupancy_features(self):
        """
        Builds occupancy characteristics for occupied, owner-occupied, renter
        in each tract.

        :return: adds occupancy as sub-dict in self.tracts[z][tract_pk_id]
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k, v in self.tracts[z].items():
                    if k == 'zip_pk_id':
                        continue
                    select_sql = ("SELECT HC01_VC01, HC01_VC01_MOE, HC02_VC01, HC02_VC01_MOE, "
                                  "HC03_VC01, HC03_VC01_MOE, HC01_VC03, HC01_VC03_MOE, "
                                  "HC02_VC03, HC02_VC03_MOE, HC03_VC03, HC03_VC03_MOE, "
                                  "HC01_VC04, HC01_VC04_MOE, HC02_VC04, HC02_VC04_MOE, "
                                  "HC03_VC04, HC03_VC04_MOE, HC01_VC05, HC01_VC05_MOE, "
                                  "HC02_VC05, HC02_VC05_MOE, HC03_VC05, HC03_VC05_MOE, "
                                  "HC01_VC06, HC01_VC06_MOE, HC02_VC06, HC02_VC06_MOE, "
                                  "HC03_VC06, HC03_VC06_MOE, HC01_VC14, HC01_VC14_MOE, "
                                  "HC02_VC14, HC02_VC14_MOE, HC03_VC14, HC03_VC14_MOE, "
                                  "HC01_VC15, HC01_VC15_MOE, HC02_VC15, HC02_VC15_MOE, "
                                  "HC03_VC15, HC03_VC15_MOE, HC01_VC19, HC01_VC19_MOE, "
                                  "HC02_VC19, HC02_VC19_MOE, HC03_VC19, HC03_VC19_MOE, "
                                  "HC01_VC39, HC01_VC39_MOE, HC02_VC39, HC02_VC39_MOE, "
                                  "HC03_VC39, HC03_VC39_MOE "
                                  "FROM S2501_ACS WHERE track_pk_id=%s")
                    cursor.execute(select_sql, (k,))
                    ret = cursor.fetchone()
                    if ret:
                        self.tracts[z][k]['occupancy'] = ret
                    else:
                        self.tracts[z][k]['occupancy'] = {}
                        # TODO: move output to logger
                        print("add_occupancy_features failed for track pk: {}".format(k))

    def build_rental_housing_densities(self):
        """
        Calculates the density of renters/total occupied units for each zip,
        weighting the renter/occupier counts by each tract's contribution to
        the current zip.

        First, get the raw data for the h1-4 renter numbers and the renter
        total units, as well as the total occupied units in the tract. Get
        res_ratio, the fraction of that tract which contributes to the current
        zip. Then normalize all of the h1-4 counts by the total renters to put
        the comparisons on the same scale. If the normalization is non-zero,
        calculate the normalized renter density for that group. Check whether
        this is the max for the zip and store it if so. Calculate the
        contribution from this tract and add the proportional element to the
        zip total. Finally, calculate and store the avg for the zip.
        """
        for z in self.tracts:
            t_rent = 0
            t_h1_rent = 0
            t_h2_rent = 0
            t_h3_rent = 0
            t_h4_rent = 0
            t_occ = 0
            rent_pop = 0
            self.tracts[z]['max_renter_density'] = self.tracts[z].get('max_renter_density', 0.0)
            self.tracts[z]['max_renter_income'] = self.tracts[z].get('max_renter_income', 0.0)
            self.tracts[z]['max_h1_renter_density'] = self.tracts[z].get('max_h1_renter_density', 0.0)
            self.tracts[z]['max_h2_renter_density'] = self.tracts[z].get('max_h2_renter_density', 0.0)
            self.tracts[z]['max_h3_renter_density'] = self.tracts[z].get('max_h3_renter_density', 0.0)
            self.tracts[z]['max_h4_renter_density'] = self.tracts[z].get('max_h4_renter_density', 0.0)
            self.tracts[z]['renter_pop_est'] = self.tracts[z].get('renter_pop_est', 0.0)
            for k, v in self.tracts[z].items():
                if isinstance(k, int):  # tract pk_ids are ints; feature keys are strings
                    renters = v.get('HC03_VC01', 0)
                    h1_renters = v['occupancy'].get('HC03_VC03', 0)
                    h2_renters = v['occupancy'].get('HC03_VC04', 0)
                    h3_renters = v['occupancy'].get('HC03_VC05', 0)
                    h4_renters = v['occupancy'].get('HC03_VC06', 0)
                    occupied = v.get('HC01_VC01', 1)
                    if v.get('res_ratio', 1) > 0:
                        res_ratio = v.get('res_ratio', 1)
                    else:
                        res_ratio = 1
                    if renters > 0:
                        current_den = renters / occupied
                    else:
                        current_den = 0.0
                    t_h_renters = h1_renters + h2_renters + h3_renters + h4_renters
                    if t_h_renters > 0:
                        ren_norm = renters / t_h_renters
                    else:
                        ren_norm = 1
                    h1_renters_norm = h1_renters * ren_norm
                    h2_renters_norm = h2_renters * ren_norm
                    h3_renters_norm = h3_renters * ren_norm
                    h4_renters_norm = h4_renters * ren_norm
                    if h1_renters_norm > 0:
                        current_h1_den = h1_renters_norm / occupied
                    else:
                        current_h1_den = 0.0
                    if h2_renters_norm > 0:
                        current_h2_den = h2_renters_norm / occupied
                    else:
                        current_h2_den = 0.0
                    if h3_renters_norm > 0:
                        current_h3_den = h3_renters_norm / occupied
                    else:
                        current_h3_den = 0.0
                    if h4_renters_norm > 0:
                        current_h4_den = h4_renters_norm / occupied
                    else:
                        current_h4_den = 0.0
                    if self.tracts[z]['max_renter_density'] < current_den:
                        self.tracts[z]['max_renter_density'] = round(current_den, 2)
                        self.tracts[z]['max_renter_income'] = round(v.get('HC03_VC14', 0.0), 0)
                    if self.tracts[z]['max_h1_renter_density'] < current_h1_den:
                        self.tracts[z]['max_h1_renter_density'] = round(current_h1_den, 2)
                    if self.tracts[z]['max_h2_renter_density'] < current_h2_den:
                        self.tracts[z]['max_h2_renter_density'] = round(current_h2_den, 2)
                    if self.tracts[z]['max_h3_renter_density'] < current_h3_den:
                        self.tracts[z]['max_h3_renter_density'] = round(current_h3_den, 2)
                    if self.tracts[z]['max_h4_renter_density'] < current_h4_den:
                        self.tracts[z]['max_h4_renter_density'] = round(current_h4_den, 2)
                    t_rent += renters * res_ratio
                    t_h1_rent += h1_renters_norm * res_ratio
                    t_h2_rent += h2_renters_norm * res_ratio
                    t_h3_rent += h3_renters_norm * res_ratio
                    t_h4_rent += h4_renters_norm * res_ratio
                    t_occ += occupied * res_ratio
                    # household sizes 1-4 weight the renter population estimate
                    rent_pop += h1_renters_norm * res_ratio + \
                        h2_renters_norm * res_ratio * 2 + \
                        h3_renters_norm * res_ratio * 3 + \
                        h4_renters_norm * res_ratio * 4
            if t_rent == 0:
                self.tracts[z]['avg_renter_density'] = 0.0
            else:
                self.tracts[z]['avg_renter_density'] = round(t_rent / t_occ, 2)
            self.tracts[z]['renter_pop_est'] = round(rent_pop, 0)
            if t_h1_rent == 0:
                self.tracts[z]['avg_h1_renter_density'] = 0.0
            else:
                self.tracts[z]['avg_h1_renter_density'] = round(t_h1_rent / t_occ, 2)
            if t_h2_rent == 0:
                self.tracts[z]['avg_h2_renter_density'] = 0.0
            else:
                self.tracts[z]['avg_h2_renter_density'] = round(t_h2_rent / t_occ, 2)
            if t_h3_rent == 0:
                self.tracts[z]['avg_h3_renter_density'] = 0.0
            else:
                self.tracts[z]['avg_h3_renter_density'] = round(t_h3_rent / t_occ, 2)
            if t_h4_rent == 0:
                self.tracts[z]['avg_h4_renter_density'] = 0.0
            else:
                self.tracts[z]['avg_h4_renter_density'] = round(t_h4_rent / t_occ, 2)
        #self.make_output(output)
        #self.make_output(self.tracts, meth='pp')

    def build_incomes(self):
        """
        Calculates the avg median renter income in each zip, weighting the
        renter count by each tract's contribution to the current zip.
        """
        for z in self.tracts:
            t_rent_income = 0
            t_pop = 0
            self.tracts[z]['max_renter_density'] = self.tracts[z].get('max_renter_density', 0.0)
            self.tracts[z]['max_renter_income'] = self.tracts[z].get('max_renter_income', 0)
            for k, v in self.tracts[z].items():
                if isinstance(k, int):
                    income = v.get('HC03_VC14', 0)
                    pop = v.get('HC03_VC01', 0)
                    res_ratio = v.get('res_ratio', 1)
                    if res_ratio == 0:
                        res_ratio = 1
                    t_rent_income += income * pop * res_ratio
                    t_pop += pop * res_ratio
            if t_pop == 0:
                self.tracts[z]['avg_renter_income'] = 0.0
            else:
                self.tracts[z]['avg_renter_income'] = round(t_rent_income / t_pop, 0)

    def build_model(self):
        """
        Builds a derived dataset of the relevant features from the ACS info
        pulled or built in the other methods. The purpose is specific to the
        data needed for this particular output.
        """
        output = []
        for z in self.tracts:
            output.append([z,
                           self.tracts[z].get('avg_renter_density', 0.0),
                           self.tracts[z].get('max_renter_density', 0.0),
                           self.tracts[z].get('avg_renter_income', 0),
                           self.tracts[z].get('max_renter_income', 0),
                           self.tracts[z].get('avg_h1_renter_density', 0),
                           self.tracts[z].get('max_h1_renter_density', 0),
                           self.tracts[z].get('avg_h2_renter_density', 0),
                           self.tracts[z].get('max_h2_renter_density', 0),
                           self.tracts[z].get('avg_h3_renter_density', 0),
                           self.tracts[z].get('max_h3_renter_density', 0),
                           self.tracts[z].get('avg_h4_renter_density', 0),
                           self.tracts[z].get('max_h4_renter_density', 0),
                           self.tracts[z].get('renter_pop_est', 0)])
        self.make_output(output, filename='model')

    def make_output(self, data, meth=None, filename=None):
        if meth == 'pp':
            pp = pprint.PrettyPrinter()
            pp.pprint(data)
            return
        elif filename is None:
            filename = 'output'
        dir_name = '../data/output'
        filename_suffix = '.csv'
        path = os.path.join(dir_name, filename + filename_suffix)
        with open(path, 'w') as fh:
            writer = csv.writer(fh)
            for line in data:
                writer.writerow(line)
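
# A minimal usage sketch, not part of the original class: get_tracts() must run
# first because every build_* method reads self.tracts, and build_model() ends
# by writing ../data/output/model.csv through make_output(). For scale: a tract
# with 120 renter households out of 300 occupied units and res_ratio 0.5
# contributes 60 renters and 150 occupied units to its zip, a density of 0.40.
if __name__ == "__main__":
    m = Model()
    try:
        m.get_tracts()
        m.build_rental_housing_densities()
        m.build_incomes()
        m.build_model()
    finally:
        m.destroy()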

class Populate(object):
    """Builds the census_2010_schema with various tables.
    loads: census_2010 zips and tract ids.
    loads: US geo details for all states, counties, cities with zip, lat/long.
    """

    def __init__(self):
        """Constructor for Populate"""
        self.acsdb = Mysql("../Pydb/mysql_config.yml")

    def destroy(self):
        self.acsdb.exit()

    def load_tracts(self):
        tracts = {}  # {track_id: track_name}
        with open("data/TRACT_ZIP_122015.csv") as fh:
            r_count = 0
            reader = csv.reader(fh)
            for r in reader:
                r_count += 1
                if r_count < 2:  # skip header row
                    continue
                tracts[r[0]] = self.get_tract_name(r[0])
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM census_tract_2010"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("census_tract_2010 table already populated. Skipping...")
                return
            for t in tracts:
                update_sql = "INSERT INTO `census_tract_2010` (`track_id`, `track_name`) VALUES (%s, %s)"
                try:
                    cursor.execute(update_sql, (t, tracts[t]))
                except Exception:
                    pass  # ignore duplicate tract ids
            self.acsdb.con.commit()

    @staticmethod
    def get_tract_name(tract_id):
        """Pulls the Census Tract Name out of the last six digits of the
        track_id. This is not finished: it takes 020100 and yields 0201.00,
        but it should also drop the leading 0 and, for suffixes carrying no
        information, drop the trailing 00.

        :param tract_id: the full tract id from the input file
        """
        base = re.search(r'\d+(\d{4})(\d\d)', tract_id)
        name = ".".join((base.group(1), base.group(2)))
        return name

    def load_zip_tract_crosswalk(self):
        data = []
        with open("data/TRACT_ZIP_122015.csv") as fh:
            reader = csv.reader(fh)
            index = 0
            for r in reader:
                index += 1
                if index < 2:  # skip header
                    continue
                # TRACT, ZIP, RES_RATIO, BUS_RATIO, OTH_RATIO, TOT_RATIO
                data.append([r[0], r[1], r[2], r[3], r[4], r[5]])
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip_tract_cw"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("zip_tract_cw table already populated. Skipping...")
                return
            for r in data:
                zip_id_sql = "SELECT `pk_id` FROM `zip` WHERE `zipcode`=%s"
                cursor.execute(zip_id_sql, (r[1],))
                zip_pk_id = cursor.fetchone()
                if zip_pk_id is None:
                    print(r[1])
                    continue  # zip not in the zip table; can't map this row
                track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` WHERE `track_id`=%s"
                cursor.execute(track_id_sql, (r[0],))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    print(r[0])
                    continue
                insert_sql = "INSERT INTO `zip_tract_cw` " \
                             "(`track_pk_id`, " \
                             "`zip_pk_id`, " \
                             "`res_ratio`, " \
                             "`bus_ratio`, " \
                             "`oth_ratio`, " \
                             "`tot_ratio`) " \
                             "VALUES (%s, %s, %s, %s, %s, %s)"
                try:
                    cursor.execute(insert_sql,
                                   (track_pk_id['pk_id'], zip_pk_id['pk_id'],
                                    r[2], r[3], r[4], r[5]))
                    self.acsdb.con.commit()
                except Exception as e:
                    print("tract %s attempted to map into zip %s and not found - %s"
                          % (r[0], r[1], e))
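
    # Note (added): judging by its name, TRACT_ZIP_122015.csv appears to be
    # the HUD USPS tract-to-zip crosswalk, where RES_RATIO is the share of a
    # tract's residential addresses that fall inside the paired zip; this is
    # the same res_ratio weight the density and income averages above
    # multiply by.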
Skipping...") return for g in geo: update_sql = "INSERT INTO `zip` (`city`, `county`, `state`, `lat`, `lon`, `zipcode`) " \ "VALUES (%s, %s, %s, %s, %s, %s)" try: cursor.execute(update_sql, (geo[g][0]['city'], geo[g][1]['county'], geo[g][2]['state'], geo[g][3]['lat'], geo[g][4]['lon'], g)) except: pass self.acsdb.con.commit() def load_S2503(self): data = {} with open("data/ACS_14_5YR_S2503.csv") as fh: reader = csv.reader(fh) r_count = 0 for r in reader: r_count += 1 if r_count < 3: continue # name = re.search(r'[\w .]+', r[2]).group(0) r = [e.replace('-', '0') for e in r] r = [e.replace('(X)', '0') for e in r] r = [e.replace('250,000+', '250000') for e in r] r = [e.replace('***', '0') for e in r] r = [e.replace('**', '0') for e in r] data[r[1]] = [{ 'HC01_VC01': float(r[3]) }, { 'HC02_VC01': float(r[5]) }, { 'HC03_VC01': float(r[7]) }, { 'HC01_VC03': float(r[9]) }, { 'HC02_VC03': float(r[11]) }, { 'HC03_VC03': float(r[13]) }, { 'HC01_VC04': float(r[15]) }, { 'HC02_VC04': float(r[17]) }, { 'HC03_VC04': float(r[19]) }, { 'HC01_VC05': float(r[21]) }, { 'HC02_VC05': float(r[23]) }, { 'HC03_VC05': float(r[25]) }, { 'HC01_VC06': float(r[27]) }, { 'HC02_VC06': float(r[29]) }, { 'HC03_VC06': float(r[31]) }, { 'HC01_VC07': float(r[33]) }, { 'HC02_VC07': float(r[35]) }, { 'HC03_VC07': float(r[37]) }, { 'HC01_VC08': float(r[39]) }, { 'HC02_VC08': float(r[41]) }, { 'HC03_VC08': float(r[43]) }, { 'HC01_VC09': float(r[45]) }, { 'HC02_VC09': float(r[47]) }, { 'HC03_VC09': float(r[49]) }, { 'HC01_VC10': float(r[51]) }, { 'HC02_VC10': float(r[53]) }, { 'HC03_VC10': float(r[55]) }, { 'HC01_VC11': float(r[57]) }, { 'HC02_VC11': float(r[59]) }, { 'HC03_VC11': float(r[61]) }, { 'HC01_VC12': float(r[63]) }, { 'HC02_VC12': float(r[65]) }, { 'HC03_VC12': float(r[67]) }, { 'HC01_VC13': float(r[69]) }, { 'HC02_VC13': float(r[71]) }, { 'HC03_VC13': float(r[73]) }, { 'HC01_VC14': float(r[75]) }, { 'HC02_VC14': float(r[77]) }, { 'HC03_VC14': float(r[79]) }] with self.acsdb.con.cursor() as cursor: test_sql = "SELECT * FROM S2503_ACS" cursor.execute(test_sql, ()) ret = cursor.fetchone() if ret is not None: print("S2503_ACS table already exists. 
Skipping...") return for r in data: get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s" cursor.execute(get_track_id_sql, (r)) track_pk_id = cursor.fetchone() if track_pk_id is None: continue update_sql = "INSERT INTO `S2503_ACS` " \ "(`HC01_VC01`, `HC02_VC01`, `HC03_VC01`, " \ "`HC01_VC03`, `HC02_VC03`, `HC03_VC03`, " \ "`HC01_VC04`, `HC02_VC04`, `HC03_VC04`, " \ "`HC01_VC05`, `HC02_VC05`, `HC03_VC05`, " \ "`HC01_VC06`, `HC02_VC06`, `HC03_VC06`, " \ "`HC01_VC07`, `HC02_VC07`, `HC03_VC07`, " \ "`HC01_VC08`, `HC02_VC08`, `HC03_VC08`, " \ "`HC01_VC09`, `HC02_VC09`, `HC03_VC09`, " \ "`HC01_VC10`, `HC02_VC10`, `HC03_VC10`, " \ "`HC01_VC11`, `HC02_VC11`, `HC03_VC11`, " \ "`HC01_VC12`, `HC02_VC12`, `HC03_VC12`, " \ "`HC01_VC13`, `HC02_VC13`, `HC03_VC13`, " \ "`HC01_VC14`, `HC02_VC14`, `HC03_VC14`, " \ "`track_pk_id`) " \ "VALUES " \ "(%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s, %s, %s, " \ "%s)" try: cursor.execute( update_sql, (data[r][0]['HC01_VC01'], data[r][1]['HC02_VC01'], data[r][2]['HC03_VC01'], data[r][3]['HC01_VC03'], data[r][4]['HC02_VC03'], data[r][5]['HC03_VC03'], data[r][6]['HC01_VC04'], data[r][7]['HC02_VC04'], data[r][8]['HC03_VC04'], data[r][9]['HC01_VC05'], data[r][10]['HC02_VC05'], data[r][11]['HC03_VC05'], data[r][12]['HC01_VC06'], data[r][13]['HC02_VC06'], data[r][14]['HC03_VC06'], data[r][15]['HC01_VC07'], data[r][16]['HC02_VC07'], data[r][17]['HC03_VC07'], data[r][18]['HC01_VC08'], data[r][19]['HC02_VC08'], data[r][20]['HC03_VC08'], data[r][21]['HC01_VC09'], data[r][22]['HC02_VC09'], data[r][23]['HC03_VC09'], data[r][24]['HC01_VC10'], data[r][25]['HC02_VC10'], data[r][26]['HC03_VC10'], data[r][27]['HC01_VC11'], data[r][28]['HC02_VC11'], data[r][29]['HC03_VC11'], data[r][30]['HC01_VC12'], data[r][31]['HC02_VC12'], data[r][32]['HC03_VC12'], data[r][33]['HC01_VC13'], data[r][34]['HC02_VC13'], data[r][35]['HC03_VC13'], data[r][36]['HC01_VC14'], data[r][37]['HC02_VC14'], data[r][38]['HC03_VC14'], track_pk_id['pk_id'])) except Exception as e: print(e) self.acsdb.con.commit() def load_S2503_moe( self ): # TODO: Should refactor this to fold MOE into data load...Needs new ddl as well """ Same general method as load_S2503 but this loads the MOE for each variable in the dataset. 

    # TODO: Should refactor this to fold MOE into the data load...needs new ddl as well
    def load_S2503_moe(self):
        """
        Same general method as load_S2503 but this loads the MOE
        (margin of error) for each variable in the dataset.
        :return:
        """
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # first two rows are headers, not data
                    continue
                # name = re.search(r'[\w .]+', r[2]).group(0)
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                # MOEs sit one column after each estimate: r[4], r[6], ..., r[80]
                data[r[1]] = [
                    {'HC01_VC01_MOE': float(r[4])}, {'HC02_VC01_MOE': float(r[6])},
                    {'HC03_VC01_MOE': float(r[8])}, {'HC01_VC03_MOE': float(r[10])},
                    {'HC02_VC03_MOE': float(r[12])}, {'HC03_VC03_MOE': float(r[14])},
                    {'HC01_VC04_MOE': float(r[16])}, {'HC02_VC04_MOE': float(r[18])},
                    {'HC03_VC04_MOE': float(r[20])}, {'HC01_VC05_MOE': float(r[22])},
                    {'HC02_VC05_MOE': float(r[24])}, {'HC03_VC05_MOE': float(r[26])},
                    {'HC01_VC06_MOE': float(r[28])}, {'HC02_VC06_MOE': float(r[30])},
                    {'HC03_VC06_MOE': float(r[32])}, {'HC01_VC07_MOE': float(r[34])},
                    {'HC02_VC07_MOE': float(r[36])}, {'HC03_VC07_MOE': float(r[38])},
                    {'HC01_VC08_MOE': float(r[40])}, {'HC02_VC08_MOE': float(r[42])},
                    {'HC03_VC08_MOE': float(r[44])}, {'HC01_VC09_MOE': float(r[46])},
                    {'HC02_VC09_MOE': float(r[48])}, {'HC03_VC09_MOE': float(r[50])},
                    {'HC01_VC10_MOE': float(r[52])}, {'HC02_VC10_MOE': float(r[54])},
                    {'HC03_VC10_MOE': float(r[56])}, {'HC01_VC11_MOE': float(r[58])},
                    {'HC02_VC11_MOE': float(r[60])}, {'HC03_VC11_MOE': float(r[62])},
                    {'HC01_VC12_MOE': float(r[64])}, {'HC02_VC12_MOE': float(r[66])},
                    {'HC03_VC12_MOE': float(r[68])}, {'HC01_VC13_MOE': float(r[70])},
                    {'HC02_VC13_MOE': float(r[72])}, {'HC03_VC13_MOE': float(r[74])},
                    {'HC01_VC14_MOE': float(r[76])}, {'HC02_VC14_MOE': float(r[78])},
                    {'HC03_VC14_MOE': float(r[80])}
                ]
        with self.acsdb.con.cursor() as cursor:
            # look for a row that already carries a MOE value in the table this
            # method updates; assumes the MOE columns are NULL until provisioned
            test_sql = "SELECT HC01_VC01_MOE FROM S2503_ACS WHERE HC01_VC01_MOE IS NOT NULL"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("S2503_ACS table already provisioned with MOEs. Skipping...")
                return
            for r in data:
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (r,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                get_pk_id_sql = "SELECT pk_id FROM S2503_ACS WHERE track_pk_id=%s"
                cursor.execute(get_pk_id_sql, (track_pk_id['pk_id'],))
                ret = cursor.fetchone()
                if ret is None:
                    continue  # no S2503_ACS row was loaded for this tract
                update_sql = "UPDATE `S2503_ACS` SET " \
                             "`HC01_VC01_MOE`=%s, `HC02_VC01_MOE`=%s, `HC03_VC01_MOE`=%s, " \
                             "`HC01_VC03_MOE`=%s, `HC02_VC03_MOE`=%s, `HC03_VC03_MOE`=%s, " \
                             "`HC01_VC04_MOE`=%s, `HC02_VC04_MOE`=%s, `HC03_VC04_MOE`=%s, " \
                             "`HC01_VC05_MOE`=%s, `HC02_VC05_MOE`=%s, `HC03_VC05_MOE`=%s, " \
                             "`HC01_VC06_MOE`=%s, `HC02_VC06_MOE`=%s, `HC03_VC06_MOE`=%s, " \
                             "`HC01_VC07_MOE`=%s, `HC02_VC07_MOE`=%s, `HC03_VC07_MOE`=%s, " \
                             "`HC01_VC08_MOE`=%s, `HC02_VC08_MOE`=%s, `HC03_VC08_MOE`=%s, " \
                             "`HC01_VC09_MOE`=%s, `HC02_VC09_MOE`=%s, `HC03_VC09_MOE`=%s, " \
                             "`HC01_VC10_MOE`=%s, `HC02_VC10_MOE`=%s, `HC03_VC10_MOE`=%s, " \
                             "`HC01_VC11_MOE`=%s, `HC02_VC11_MOE`=%s, `HC03_VC11_MOE`=%s, " \
                             "`HC01_VC12_MOE`=%s, `HC02_VC12_MOE`=%s, `HC03_VC12_MOE`=%s, " \
                             "`HC01_VC13_MOE`=%s, `HC02_VC13_MOE`=%s, `HC03_VC13_MOE`=%s, " \
                             "`HC01_VC14_MOE`=%s, `HC02_VC14_MOE`=%s, `HC03_VC14_MOE`=%s " \
                             "WHERE " \
                             "pk_id=%s"
                try:
                    cursor.execute(update_sql, (
                        data[r][0]['HC01_VC01_MOE'], data[r][1]['HC02_VC01_MOE'],
                        data[r][2]['HC03_VC01_MOE'], data[r][3]['HC01_VC03_MOE'],
                        data[r][4]['HC02_VC03_MOE'], data[r][5]['HC03_VC03_MOE'],
                        data[r][6]['HC01_VC04_MOE'], data[r][7]['HC02_VC04_MOE'],
                        data[r][8]['HC03_VC04_MOE'], data[r][9]['HC01_VC05_MOE'],
                        data[r][10]['HC02_VC05_MOE'], data[r][11]['HC03_VC05_MOE'],
                        data[r][12]['HC01_VC06_MOE'], data[r][13]['HC02_VC06_MOE'],
                        data[r][14]['HC03_VC06_MOE'], data[r][15]['HC01_VC07_MOE'],
                        data[r][16]['HC02_VC07_MOE'], data[r][17]['HC03_VC07_MOE'],
                        data[r][18]['HC01_VC08_MOE'], data[r][19]['HC02_VC08_MOE'],
                        data[r][20]['HC03_VC08_MOE'], data[r][21]['HC01_VC09_MOE'],
                        data[r][22]['HC02_VC09_MOE'], data[r][23]['HC03_VC09_MOE'],
                        data[r][24]['HC01_VC10_MOE'], data[r][25]['HC02_VC10_MOE'],
                        data[r][26]['HC03_VC10_MOE'], data[r][27]['HC01_VC11_MOE'],
                        data[r][28]['HC02_VC11_MOE'], data[r][29]['HC03_VC11_MOE'],
                        data[r][30]['HC01_VC12_MOE'], data[r][31]['HC02_VC12_MOE'],
                        data[r][32]['HC03_VC12_MOE'], data[r][33]['HC01_VC13_MOE'],
                        data[r][34]['HC02_VC13_MOE'], data[r][35]['HC03_VC13_MOE'],
                        data[r][36]['HC01_VC14_MOE'], data[r][37]['HC02_VC14_MOE'],
                        data[r][38]['HC03_VC14_MOE'], ret['pk_id']))
                except Exception as e:
                    print(e)
            self.acsdb.con.commit()
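
    # Note (added): in the raw ACS exports each estimate column is immediately
    # followed by its margin of error, which is why load_S2503/load_S2503_moe
    # read alternating indices (r[3]/r[4], r[5]/r[6], ...) and why load_S2501
    # below can pull both estimate and MOE in one pass.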

    def load_S2501(self):
        data = {}
        with open("data/ACS_14_5YR_S2501.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # csv file first two rows are not data
                    continue
                # name = re.search(r'[\w .]+', r[2]).group(0)
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                # estimate/MOE pairs, with gaps where columns are skipped
                data[r[1]] = [
                    {'HC01_VC01': float(r[3])}, {'HC01_VC01_MOE': float(r[4])},
                    {'HC02_VC01': float(r[5])}, {'HC02_VC01_MOE': float(r[6])},
                    {'HC03_VC01': float(r[7])}, {'HC03_VC01_MOE': float(r[8])},
                    {'HC01_VC03': float(r[9])}, {'HC01_VC03_MOE': float(r[10])},
                    {'HC02_VC03': float(r[11])}, {'HC02_VC03_MOE': float(r[12])},
                    {'HC03_VC03': float(r[13])}, {'HC03_VC03_MOE': float(r[14])},
                    {'HC01_VC04': float(r[15])}, {'HC01_VC04_MOE': float(r[16])},
                    {'HC02_VC04': float(r[17])}, {'HC02_VC04_MOE': float(r[18])},
                    {'HC03_VC04': float(r[19])}, {'HC03_VC04_MOE': float(r[20])},
                    {'HC01_VC05': float(r[21])}, {'HC01_VC05_MOE': float(r[22])},
                    {'HC02_VC05': float(r[23])}, {'HC02_VC05_MOE': float(r[24])},
                    {'HC03_VC05': float(r[25])}, {'HC03_VC05_MOE': float(r[26])},
                    {'HC01_VC06': float(r[27])}, {'HC01_VC06_MOE': float(r[28])},
                    {'HC02_VC06': float(r[29])}, {'HC02_VC06_MOE': float(r[30])},
                    {'HC03_VC06': float(r[31])}, {'HC03_VC06_MOE': float(r[32])},
                    {'HC01_VC14': float(r[51])}, {'HC01_VC14_MOE': float(r[52])},
                    {'HC02_VC14': float(r[53])}, {'HC02_VC14_MOE': float(r[54])},
                    {'HC03_VC14': float(r[55])}, {'HC03_VC14_MOE': float(r[56])},
                    {'HC01_VC15': float(r[57])}, {'HC01_VC15_MOE': float(r[58])},
                    {'HC02_VC15': float(r[59])}, {'HC02_VC15_MOE': float(r[60])},
                    {'HC03_VC15': float(r[61])}, {'HC03_VC15_MOE': float(r[62])},
                    {'HC01_VC19': float(r[81])}, {'HC01_VC19_MOE': float(r[82])},
                    {'HC02_VC19': float(r[83])}, {'HC02_VC19_MOE': float(r[84])},
                    {'HC03_VC19': float(r[85])}, {'HC03_VC19_MOE': float(r[86])},
                    {'HC01_VC39': float(r[183])}, {'HC01_VC39_MOE': float(r[184])},
                    {'HC02_VC39': float(r[185])}, {'HC02_VC39_MOE': float(r[186])},
                    {'HC03_VC39': float(r[187])}, {'HC03_VC39_MOE': float(r[188])}
                ]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM S2501_ACS"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("S2501_ACS table already populated. Skipping...")
                return
            for r in data:
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (r,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                update_sql = "INSERT INTO `S2501_ACS` " \
                             "(`HC01_VC01`, `HC01_VC01_MOE`, `HC02_VC01`, `HC02_VC01_MOE`, `HC03_VC01`, `HC03_VC01_MOE`, " \
                             "`HC01_VC03`, `HC01_VC03_MOE`, `HC02_VC03`, `HC02_VC03_MOE`, `HC03_VC03`, `HC03_VC03_MOE`, " \
                             "`HC01_VC04`, `HC01_VC04_MOE`, `HC02_VC04`, `HC02_VC04_MOE`, `HC03_VC04`, `HC03_VC04_MOE`, " \
                             "`HC01_VC05`, `HC01_VC05_MOE`, `HC02_VC05`, `HC02_VC05_MOE`, `HC03_VC05`, `HC03_VC05_MOE`, " \
                             "`HC01_VC06`, `HC01_VC06_MOE`, `HC02_VC06`, `HC02_VC06_MOE`, `HC03_VC06`, `HC03_VC06_MOE`, " \
                             "`HC01_VC14`, `HC01_VC14_MOE`, `HC02_VC14`, `HC02_VC14_MOE`, `HC03_VC14`, `HC03_VC14_MOE`, " \
                             "`HC01_VC15`, `HC01_VC15_MOE`, `HC02_VC15`, `HC02_VC15_MOE`, `HC03_VC15`, `HC03_VC15_MOE`, " \
                             "`HC01_VC19`, `HC01_VC19_MOE`, `HC02_VC19`, `HC02_VC19_MOE`, `HC03_VC19`, `HC03_VC19_MOE`, " \
                             "`HC01_VC39`, `HC01_VC39_MOE`, `HC02_VC39`, `HC02_VC39_MOE`, `HC03_VC39`, `HC03_VC39_MOE`, " \
                             "`track_pk_id`) " \
                             "VALUES " \
                             "(%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s)"
                try:
                    cursor.execute(update_sql, (
                        data[r][0]['HC01_VC01'], data[r][1]['HC01_VC01_MOE'],
                        data[r][2]['HC02_VC01'], data[r][3]['HC02_VC01_MOE'],
                        data[r][4]['HC03_VC01'], data[r][5]['HC03_VC01_MOE'],
                        data[r][6]['HC01_VC03'], data[r][7]['HC01_VC03_MOE'],
                        data[r][8]['HC02_VC03'], data[r][9]['HC02_VC03_MOE'],
                        data[r][10]['HC03_VC03'], data[r][11]['HC03_VC03_MOE'],
                        data[r][12]['HC01_VC04'], data[r][13]['HC01_VC04_MOE'],
                        data[r][14]['HC02_VC04'], data[r][15]['HC02_VC04_MOE'],
                        data[r][16]['HC03_VC04'], data[r][17]['HC03_VC04_MOE'],
                        data[r][18]['HC01_VC05'], data[r][19]['HC01_VC05_MOE'],
                        data[r][20]['HC02_VC05'], data[r][21]['HC02_VC05_MOE'],
                        data[r][22]['HC03_VC05'], data[r][23]['HC03_VC05_MOE'],
                        data[r][24]['HC01_VC06'], data[r][25]['HC01_VC06_MOE'],
                        data[r][26]['HC02_VC06'], data[r][27]['HC02_VC06_MOE'],
                        data[r][28]['HC03_VC06'], data[r][29]['HC03_VC06_MOE'],
                        data[r][30]['HC01_VC14'], data[r][31]['HC01_VC14_MOE'],
                        data[r][32]['HC02_VC14'], data[r][33]['HC02_VC14_MOE'],
                        data[r][34]['HC03_VC14'], data[r][35]['HC03_VC14_MOE'],
                        data[r][36]['HC01_VC15'], data[r][37]['HC01_VC15_MOE'],
                        data[r][38]['HC02_VC15'], data[r][39]['HC02_VC15_MOE'],
                        data[r][40]['HC03_VC15'], data[r][41]['HC03_VC15_MOE'],
                        data[r][42]['HC01_VC19'], data[r][43]['HC01_VC19_MOE'],
                        data[r][44]['HC02_VC19'], data[r][45]['HC02_VC19_MOE'],
                        data[r][46]['HC03_VC19'], data[r][47]['HC03_VC19_MOE'],
                        data[r][48]['HC01_VC39'], data[r][49]['HC01_VC39_MOE'],
                        data[r][50]['HC02_VC39'], data[r][51]['HC02_VC39_MOE'],
                        data[r][52]['HC03_VC39'], data[r][53]['HC03_VC39_MOE'],
                        track_pk_id['pk_id']))
                except Exception as e:
                    print(e)
            self.acsdb.con.commit()
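
# Minimal usage sketch (an assumption; the original module does not show a
# driver). The loaders are order-sensitive: the crosswalk and ACS tables key
# off rows created by load_tracts and load_geo_details, so those run first.
if __name__ == '__main__':
    p = Populate()
    try:
        p.load_tracts()
        p.load_geo_details()
        p.load_zip_tract_crosswalk()
        p.load_S2503()
        p.load_S2503_moe()
        p.load_S2501()
    finally:
        p.destroy()  # close the database connection even on failure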