示例#1
0
class WalmartProdSearch(object):
    """Scrapes Walmart search-result pages and persists product data to MySQL.

    Drives a PhantomJS browser through paginated search-result urls, parses
    each page with BeautifulSoup, and upserts the products it finds into the
    `walmart_product` and `match_cw` tables.
    """

    def __init__(self):
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("WalmartProdSearch object initialized and logging enabled")

        # Config lives next to this module so loading works regardless of cwd.
        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "walmart_config.yml"), "r") as fh:
            # safe_load: the config is plain data; never construct arbitrary objects
            settings = yaml.safe_load(fh)

        self.db = Mysql(settings['db_config'])

        self.driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
        self.driver.set_window_size(1024, 768)
        self.shipping_rate = 0.75  # $rate/lb  # TODO: shift this to AZ class
        self.outfile = "../data/test.csv"
        self.depth_limit = settings['depth_limit']  # max pages per site/section
        self.debug = settings['debug']
        self.fieldnames = ('net', 'roi', 'title', 'price', 'az_price', 'weight',
                           'az_sales_rank', 'az_match', 'url', 'img', 'az_url', 'az_asin',
                           'item_id')
        self.site_url = settings['site_url']
        self.page_url = settings['page_url']
        self.base_url = strip_final_slash(get_base_url(self.site_url))
        # page counter; previously unset until scrape(pc=...) which made
        # next_page_url raise AttributeError when scrape() was called bare
        self.pc = 0
        #self.az = AZ()

    def destroy(self):
        """
        method to destroy all objects and clean up.
        :return:
        """
        self.logger.info("Database connection closed...")
        self.db.exit()
        self.logger.info("Walmart object cleanly destroyed...")
        self.driver.quit()

    def scrape(self, pc=None, change_url=None):
        """
        Crawl one site/section page by page until get_list() or the depth
        limit flips self.run off.

        :param pc: integer indicating where to start with a paginated url.
        :param change_url: the changing part of wider site url, if there
            are multiple sections to hit.
        """
        self.run = True  # initialization of a site/section.
        if pc is not None:
            self.pc = pc
        while self.run is True:
            url = self.next_page_url(build_search_url(self.site_url, change_url))
            try:
                page = self.get_page(url)
            except Exception:
                # log the full traceback, then move on to the next page
                self.logger.exception("Error with %s and skipped" % url)
                continue
            self.get_list(page)
        if change_url is None:
            self.logger.info("Site %s finished" % self.site_url)
        else:
            self.logger.info("Section %s finished" % change_url)

    def init_output(self):
        """Create the tab-separated output file with a header row if absent."""
        if not os.path.exists(self.outfile):
            with open(self.outfile, "w", encoding='utf-8') as fh:
                outwriter = csv.DictWriter(fh,
                                           fieldnames=self.fieldnames,
                                           delimiter="\t")
                outwriter.writeheader()

    def _persist_new_match(self, item_id):
        """
        Method takes walmart item id and finds the match_cw row with that item id.
        If none, inserts a new row as this is a new product.
        :param item_id: walmart item id string
        :return:
        """
        with self.db.con.cursor() as cursor:
            # Find the selected product from walmart in the existing product TABLE
            select_sql = "SELECT pk_id " \
                         "FROM walmart_product " \
                         "WHERE item_id=%s"
            cursor.execute(select_sql, (item_id,))
            ret = cursor.fetchone()
            if ret is None:
                # product row should exist by now (process_output inserts it first)
                self.logger.error("No walmart_product row for item %s" % item_id)
                return
            wal_id = ret['pk_id']
            # Find the corresponding row in match_cw
            select_sql = "SELECT pk_id " \
                         "FROM match_cw " \
                         "WHERE wal_pk_id=%s"
            cursor.execute(select_sql, (wal_id,))
            ret = cursor.fetchone()
            # didn't get a match row so make one
            if ret is None:
                insert_sql = "INSERT INTO match_cw " \
                             "(wal_pk_id, last_update) " \
                             "VALUES " \
                             "(%s, now())"
                try:
                    cursor.execute(insert_sql, (wal_id,))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to insert new walmart product into match.')
            else:
                self.logger.info('Product %s is already in match table.' % (wal_id))

    def process_output(self, data):
        """
        Upsert one scraped product into walmart_product and ensure a
        corresponding match_cw row exists.

        :param data: dict with at least price/url/img/item_id/title keys.
        """
        with self.db.con.cursor() as cursor:
            # Try to find the selected product from walmart in the existing product TABLE
            select_sql = "SELECT pk_id, price " \
                         "FROM walmart_product " \
                         "WHERE item_id=%s"
            cursor.execute(select_sql, (data['item_id'],))
            ret = cursor.fetchone()
            # If none found, it's a new product so push it in
            if ret is None:
                insert_sql = "INSERT INTO walmart_product " \
                             "(price, url, img, item_id, " \
                             "title, last_changed, " \
                             "last_read) " \
                             "VALUES " \
                             "(%s, %s, %s, %s, %s, %s, %s)"
                try:
                    cursor.execute(insert_sql, (data['price'].strip(),
                                                data['url'].strip(),
                                                # bug fix: was data['img'.strip()], which
                                                # subscripted with the (unchanged) key and
                                                # never stripped the value
                                                data['img'].strip(),
                                                data['item_id'].strip(),
                                                data['title'].strip(),
                                                datetime.datetime.now(),
                                                datetime.datetime.now()))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to insert new walmart product.')
            # Otherwise, it's already been found, so update the price and time stamp
            elif data['price'] != ret['price']:
                update_sql = "UPDATE walmart_product " \
                             "SET price=%s, last_read=now(), " \
                             "last_changed=now(), title=%s, " \
                             "url=%s, img=%s " \
                             "WHERE pk_id=%s"
                try:
                    cursor.execute(update_sql, (data['price'].strip(),
                                                data['title'].strip(),
                                                data['url'].strip(),
                                                data['img'].strip(),
                                                ret['pk_id']))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to update walmart product.')
            # Otherwise, it's already been found and price is same so update time stamp
            else:
                # bug fix: "now()" and "WHERE" concatenated with no separating
                # space, producing invalid SQL "now()WHERE pk_id=%s"
                update_sql = "UPDATE walmart_product " \
                             "SET last_read=now() " \
                             "WHERE pk_id=%s"
                cursor.execute(update_sql, (ret['pk_id'],))
                self.db.con.commit()
            # insert into the match_cw table accordingly
            self._persist_new_match(data['item_id'].strip())

    def get_dollar_amount(self, f):
        """
        Normalize a price to a float rounded to cents.

        :param f: either a number (returned unchanged) or a string like "$1,234.56".
        """
        if isinstance(f, str):
            # drop ALL thousands separators (was only the first one)
            f = f.replace(",", "")
            return round(float(re.match(r'\$?(\d+[.]\d\d)', f.strip()).group(1)), 2)
        else:
            return f

    def get_net(self, data):
        """
        Estimate net margin: amazon price minus cost basis (walmart price
        plus 8% tax), a 30% amazon fee, and shipping by weight.
        Returns 0.0 when inputs are missing or malformed.
        """
        az_price = data['az_price']
        if az_price == 0.0:
            return round(0.0, 2)
        price = self.get_dollar_amount(data['price'])
        if data['weight'] == "Weight not fetched" or data['weight'] == "Not Available":
            weight = 0.0
        else:
            weight = float(data['weight'])
        try:
            net = (az_price - (price*1.08 + az_price*0.3 + weight*self.shipping_rate))
        except Exception:
            net = 0.0
        try:
            net = round(net, 2)
        except Exception:
            self.logger.error("Bad net value for %s - price:%s, az_price:%s, weight:%s" %
                              (data['title'], data['price'], data['az_price'], data['weight']))
            net = 0.0
        return net

    def get_roi(self, data):
        """Return net/price rounded to two places; 0.0 for a zero price."""
        net = self.get_dollar_amount(data['net'])
        price = self.get_dollar_amount(data['price'])
        try:
            return round(net / price, 2)
        except ZeroDivisionError:
            # a free item has no meaningful roi
            return 0.0

    def get_list(self, page):
        """
        method takes search results page from Walmart and parses out items to save.
        Has error checking for no search results or an empty set.
        :param page: bs4 object returned from get_page
        :return:
        """
        imitate_user(0.5)
        if page.find(string=re.compile(r'We found 0 results')):
            self.run = False
            return
        entries = page.find("ul", {"class": "tile-list-grid"})
        if not entries:
            self.run = False
            return
        for e in entries:
            # skip bare navigable strings and script children of the tile list
            if len(e) == 1 or e.name == "script":
                continue
            entry = {}
            anchor = e.find("a", {"class": "js-product-title"})
            if anchor is None:
                continue
            entry['title'] = anchor.get_text().strip()
            href = anchor.attrs['href']
            if 'http://' in href:
                entry['url'] = href
            else:
                entry['url'] = "".join((self.base_url, href))
            price_el = e.find("span", {"class": "price-display"})
            if price_el is None:
                continue
            entry['price'] = price_el.get_text().replace('$', '')
            try:
                entry['img'] = e.find("img", {"class": "product-image"}).attrs['data-default-image']
                # bug fix: the original dict literal passed "class" twice, so
                # only "tile-grid-unit" ever took effect; match it explicitly
                entry['item_id'] = e.find("div", {"class": "tile-grid-unit"}).attrs['data-item-id']
            except AttributeError:
                # malformed tile: skip it rather than abort the whole page
                continue
            #entry['az_price'], entry['weight'], entry['az_sales_rank'], entry['az_match'], entry['az_url'], entry['az_asin'] = self.az.find_best_match(entry['title'], 'Toys')
            #entry['net'] = self.get_net(entry)
            #entry['roi'] = self.get_roi(entry)
            self.process_output(entry)

    def next_page_url(self, url):
        """
        Append the pagination suffix and next page counter to url.
        Flips self.run off once the configured depth limit is reached.
        """
        self.pc += 1
        imitate_user(0.5)
        next_url = url
        if self.page_url:
            next_url += self.page_url
            next_url += str(self.pc)
        # >= (was ==) so a start counter above the limit still terminates
        if self.pc >= self.depth_limit:
            self.run = False  # recursion limit reached
        return next_url

    def get_page(self, url):
        """
        Fetch url with the PhantomJS driver and return it parsed by BeautifulSoup.
        Retries once after a short pause on ValueError.
        """
        try:
            self.logger.info("Getting %s" % url)
            self.driver.get(url)
        except ValueError:
            # transient driver hiccup: pause briefly and retry once;
            # a second failure propagates to the caller
            imitate_user(2)
            self.driver.get(url)
        except Exception as e:
            # bug fix: was logger.error(url, e) which treats url as a
            # %-format string and e as its argument
            self.logger.error("Error getting %s: %s", url, e)
        page = BeautifulSoup(self.driver.page_source, "lxml")
        return page

    def _is_recent(self, t):
        """
        private method to check whether timestamp is within default time period.
        :param t: timestamp
        :return: boolean of whether it is within default period, true is within period.
        """
        return (datetime.datetime.now() - t) < datetime.timedelta(days=1)

    def get_target(self):
        """
        method to find the next node at walmart to search for products. If there is no
        target that has been more than 24 hours since being searched, will return None.
        Otherwise, takes the first returned and updates the timestamp to now to reset
        its priority for search.
        :return: 'node' is the cat component of the url to hit at walmart.
        """
        with self.db.con.cursor() as cursor:
            select_sql = "select node, last_read from walmart_node ORDER BY last_read ASC"
            cursor.execute(select_sql)
            ret = cursor.fetchone()
            if ret is None:
                # empty walmart_node table previously crashed on ret['last_read']
                self.logger.info("No walmart_node rows to target - exiting.")
                return None
            if self._is_recent(ret['last_read']):
                self.logger.info("All product information is up to date - exiting.")
                return None
            update_sql = "update walmart_node set last_read=now() where node=%s"
            cursor.execute(update_sql, (ret['node'],))
            self.db.con.commit()
            return ret['node']
示例#2
0
class Populate():
    """Builds the census_2010_schema with various tables.

    loads: census_2010 zips and tract ids.
    loads: US geo details for all states, counties, cities with zip, lat/long.
    """

    def __init__(self):
        """Open the ACS database connection shared by all loader methods."""
        self.acsdb = Mysql("../Pydb/mysql_config.yml")

    def destroy(self):
        self.acsdb.exit()

    def load_tracts(self):
        """Populate census_tract_2010 from the HUD TRACT->ZIP crosswalk file.

        Reads each tract id, derives its display name via get_tract_name and
        inserts one row per distinct tract. Skips entirely if the table
        already has rows. Insert failures are reported instead of being
        swallowed by the previous bare ``except: pass``.
        """
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM census_tract_2010"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("census_tract_2010 table already exists. Skipping...")
                return
            # only read the (large) csv once we know we actually need it
            tracts = {}  # {tract_id : tract_name}
            with open("data/TRACT_ZIP_122015.csv") as fh:
                reader = csv.reader(fh)
                next(reader, None)  # skip the header row
                for row in reader:
                    tracts[row[0]] = self.get_tract_name(row[0])
            insert_sql = "INSERT INTO `census_tract_2010` (`track_id`, `track_name`) VALUES (%s, %s)"
            for tract_id, tract_name in tracts.items():
                try:
                    cursor.execute(insert_sql, (tract_id, tract_name))
                except Exception as e:
                    # surface the failure but keep loading the rest
                    print("failed to insert tract %s - %s" % (tract_id, e))
                else:
                    # commit only rows that actually executed
                    self.acsdb.con.commit()

    @staticmethod
    def get_tract_name(tract_id):
        """Pulls out the Census Tract Name from up to the last 6 digits in the track_id.
        This is not finished since it is taking 020100 and yielding 0201.00 and it should
        drop the initial 0 and for suffixes with no information it should drop 00.

        :param tract_id: this is the full tract id from the input file
        """
        base = re.search(r'\d+(\d{4})(\d\d)', tract_id)
        name = ".".join((base.group(1), base.group(2)))
        return name

    def load_zip_tract_crosswalk(self):
        """Load zip_tract_cw from the HUD csv, linking tracts to zipcodes.

        Resolves each (tract, zip) pair to the pk ids of the already-loaded
        census_tract_2010 and zip tables and inserts the allocation ratios.
        Skips entirely if zip_tract_cw already has rows.
        """
        data = []
        with open("data/TRACT_ZIP_122015.csv") as fh:
            reader = csv.reader(fh)
            next(reader, None)  # skip header
            for r in reader:
                # TRACT,ZIP,RES_RATIO,BUS_RATIO,OTH_RATIO,TOT_RATIO
                data.append([r[0], r[1], r[2], r[3], r[4], r[5]])
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip_tract_cw"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("zip_tract_cw table already exists. Skipping...")
                return
            for r in data:
                zip_id_sql = "SELECT `pk_id` FROM `zip` WHERE `zipcode`=%s"
                cursor.execute(zip_id_sql, (r[1],))
                zip_pk_id = cursor.fetchone()
                if zip_pk_id is None:
                    print(r[1])
                    # bug fix: previously fell through and the insert below
                    # crashed on zip_pk_id['pk_id'] (None is not subscriptable)
                    continue
                track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` WHERE `track_id`=%s"
                cursor.execute(track_id_sql, (r[0],))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    print(r[0])
                    continue
                insert_sql = "INSERT INTO `zip_tract_cw` " \
                             "(`track_pk_id`, " \
                             "`zip_pk_id`, " \
                             "`res_ratio`, " \
                             "`bus_ratio`, " \
                             "`oth_ratio`, " \
                             "`tot_ratio`) " \
                             "VALUES (%s, %s, %s, %s, %s, %s)"
                try:
                    cursor.execute(insert_sql, (track_pk_id['pk_id'], zip_pk_id['pk_id'], r[2], r[3], r[4], r[5]))
                    self.acsdb.con.commit()
                except Exception as e:
                    print("tract %s attempted to map into zip %s and not found - %s" % (r[0], r[1], e))

    def load_geo_details(self):
        """Load city/county/state/lat/lon per zipcode from the GeoNames US.txt dump.

        Skips entirely if the zip table already has rows. Insert failures are
        reported instead of being swallowed by the previous bare
        ``except: pass``, and commits only happen for successful inserts.
        """
        geo = {}
        with open("data/US/US.txt") as fh:
            reader = csv.reader(fh, delimiter='\t')
            for r in reader:
                # keyed by zipcode; positional list preserved from original layout
                geo[r[1]] = [{'city': r[2]}, {'county': r[5]}, {'state': r[3]}, {'lat': r[9]}, {'lon': r[10]}]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("Zip table already exists. Skipping...")
                return
            insert_sql = "INSERT INTO `zip` (`city`, `county`, `state`, `lat`, `lon`, `zipcode`) " \
                         "VALUES (%s, %s, %s, %s, %s, %s)"
            for zipcode, details in geo.items():
                try:
                    cursor.execute(insert_sql, (details[0]['city'],
                                                details[1]['county'],
                                                details[2]['state'],
                                                details[3]['lat'],
                                                details[4]['lon'],
                                                zipcode))
                except Exception as e:
                    print("failed to insert zip %s - %s" % (zipcode, e))
                else:
                    self.acsdb.con.commit()

    def load_S2503(self):
        """Load the ACS S2503 (financial characteristics) estimates per tract.

        The csv carries, for each tract, estimate columns HC01/HC02/HC03 for
        variables VC01 and VC03..VC14: estimates sit in columns 3, 5, 7, ...
        with their margins of error in the even columns between them (loaded
        separately by load_S2503_moe). The hand-written 39-entry literal is
        generated mechanically here; ordering and column mapping are
        unchanged (key i -> column 3 + 2*i).
        """
        # keys ordered VC01,VC03..VC14 x HC01,HC02,HC03 to match the csv layout
        variables = ['01', '03', '04', '05', '06', '07', '08', '09',
                     '10', '11', '12', '13', '14']
        keys = ['HC%s_VC%s' % (hc, vc) for vc in variables for hc in ('01', '02', '03')]
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # first two rows are headers, not data
                    continue
                # normalize the ACS placeholder tokens to numeric zero
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                data[r[1]] = [{k: float(r[3 + 2 * i])} for i, k in enumerate(keys)]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM S2503_ACS"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("S2503_ACS table already exists. Skipping...")
                return
            columns = keys + ['track_pk_id']
            insert_sql = "INSERT INTO `S2503_ACS` (%s) VALUES (%s)" % (
                ", ".join("`%s`" % c for c in columns),
                ", ".join(["%s"] * len(columns)))
            for tract_id, entries in data.items():
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (tract_id,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                # entries is position-aligned with keys, so this preserves order
                values = [d[k] for d, k in zip(entries, keys)] + [track_pk_id['pk_id']]
                try:
                    cursor.execute(insert_sql, values)
                except Exception as e:
                    print(e)
                else:
                    self.acsdb.con.commit()

    def load_S2503_moe(self):  # TODO: Should refactor this to fold MOE into data load...Needs new ddl as well
        """
        Same general method as load_S2503 but this loads the MOE for each
        variable in the dataset. MOE values sit one column after each
        estimate (key i -> column 4 + 2*i). The 39-entry literal is generated
        mechanically; ordering and column mapping are unchanged.
        :return:
        """
        variables = ['01', '03', '04', '05', '06', '07', '08', '09',
                     '10', '11', '12', '13', '14']
        keys = ['HC%s_VC%s_MOE' % (hc, vc) for vc in variables for hc in ('01', '02', '03')]
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # first two rows are headers, not data
                    continue
                # normalize the ACS placeholder tokens to numeric zero
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                data[r[1]] = [{k: float(r[4 + 2 * i])} for i, k in enumerate(keys)]
        with self.acsdb.con.cursor() as cursor:
            # NOTE(review): this guard queries S2501_ACS while the updates
            # below target S2503_ACS - looks like a copy/paste slip, but the
            # existing behavior is kept; confirm intent before changing.
            test_sql = "SELECT HC01_VC01_MOE FROM S2501_ACS"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("S2501_ACS table already provisioned with MOEs. Skipping...")
                return
            set_clause = ", ".join("`%s`=%%s" % k for k in keys)
            update_sql = "UPDATE `S2503_ACS` SET %s WHERE pk_id=%%s" % set_clause
            for tract_id, entries in data.items():
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                cursor.execute(get_track_id_sql, (tract_id,))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                get_pk_id_sql = "SELECT pk_id FROM S2503_ACS WHERE track_pk_id=%s"
                cursor.execute(get_pk_id_sql, (track_pk_id['pk_id'],))
                target = cursor.fetchone()
                if target is None:
                    # bug fix: previously fell through and raised on None['pk_id']
                    continue
                values = [d[k] for d, k in zip(entries, keys)] + [target['pk_id']]
                try:
                    cursor.execute(update_sql, values)
                except Exception as e:
                    print(e)
                else:
                    self.acsdb.con.commit()

    def load_S2501(self):
        """Load the ACS 5-year S2501 (occupancy characteristics) csv into the
        `S2501_ACS` table, one row per census tract.

        The whole load is skipped when `S2501_ACS` already contains any row.
        Tract ids with no matching `census_tract_2010` entry are ignored.
        :return:
        """
        data = {}
        with open("data/ACS_14_5YR_S2501.csv") as fh:
            reader = csv.reader(fh)
            r_count = 0
            for r in reader:
                r_count += 1
                if r_count < 3:  # csv file first two rows are not data
                    continue
                # name = re.search(r'[\w .]+', r[2]).group(0)
                # Scrub ACS placeholder tokens so every cell parses as float:
                # '-', '(X)', '***', '**' become 0 and the top-coded
                # '250,000+' becomes a plain 250000.
                r = [e.replace('-', '0') for e in r]
                r = [e.replace('(X)', '0') for e in r]
                r = [e.replace('250,000+', '250000') for e in r]
                r = [e.replace('***', '0') for e in r]
                r = [e.replace('**', '0') for e in r]
                # r[1] is the tract id key; the hard-coded indices below are
                # the fixed column positions of each ACS variable in this csv
                # layout -- TODO confirm against the file header row.
                data[r[1]] = [{'HC01_VC01': float(r[3])},
                              {'HC01_VC01_MOE': float(r[4])},
                              {'HC02_VC01': float(r[5])},
                              {'HC02_VC01_MOE': float(r[6])},
                              {'HC03_VC01': float(r[7])},
                              {'HC03_VC01_MOE': float(r[8])},
                              {'HC01_VC03': float(r[9])},
                              {'HC01_VC03_MOE': float(r[10])},
                              {'HC02_VC03': float(r[11])},
                              {'HC02_VC03_MOE': float(r[12])},
                              {'HC03_VC03': float(r[13])},
                              {'HC03_VC03_MOE': float(r[14])},
                              {'HC01_VC04': float(r[15])},
                              {'HC01_VC04_MOE': float(r[16])},
                              {'HC02_VC04': float(r[17])},
                              {'HC02_VC04_MOE': float(r[18])},
                              {'HC03_VC04': float(r[19])},
                              {'HC03_VC04_MOE': float(r[20])},
                              {'HC01_VC05': float(r[21])},
                              {'HC01_VC05_MOE': float(r[22])},
                              {'HC02_VC05': float(r[23])},
                              {'HC02_VC05_MOE': float(r[24])},
                              {'HC03_VC05': float(r[25])},
                              {'HC03_VC05_MOE': float(r[26])},
                              {'HC01_VC06': float(r[27])},
                              {'HC01_VC06_MOE': float(r[28])},
                              {'HC02_VC06': float(r[29])},
                              {'HC02_VC06_MOE': float(r[30])},
                              {'HC03_VC06': float(r[31])},
                              {'HC03_VC06_MOE': float(r[32])},
                              {'HC01_VC14': float(r[51])},
                              {'HC01_VC14_MOE': float(r[52])},
                              {'HC02_VC14': float(r[53])},
                              {'HC02_VC14_MOE': float(r[54])},
                              {'HC03_VC14': float(r[55])},
                              {'HC03_VC14_MOE': float(r[56])},
                              {'HC01_VC15': float(r[57])},
                              {'HC01_VC15_MOE': float(r[58])},
                              {'HC02_VC15': float(r[59])},
                              {'HC02_VC15_MOE': float(r[60])},
                              {'HC03_VC15': float(r[61])},
                              {'HC03_VC15_MOE': float(r[62])},
                              {'HC01_VC19': float(r[81])},
                              {'HC01_VC19_MOE': float(r[82])},
                              {'HC02_VC19': float(r[83])},
                              {'HC02_VC19_MOE': float(r[84])},
                              {'HC03_VC19': float(r[85])},
                              {'HC03_VC19_MOE': float(r[86])},
                              {'HC01_VC39': float(r[183])},
                              {'HC01_VC39_MOE': float(r[184])},
                              {'HC02_VC39': float(r[185])},
                              {'HC02_VC39_MOE': float(r[186])},
                              {'HC03_VC39': float(r[187])},
                              {'HC03_VC39_MOE': float(r[188])}]
        with self.acsdb.con.cursor() as cursor:
            # Any existing row means the table was already loaded once.
            test_sql = "SELECT * FROM S2501_ACS"
            cursor.execute(test_sql, ())
            ret = cursor.fetchone()
            if ret is not None:
                print("S2501_ACS table already exists. Skipping...")
                return
            for r in data:
                # Map the csv tract id to its census_tract_2010 primary key.
                get_track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` AS c WHERE `track_id`=%s"
                # NOTE(review): (r) is not a tuple; this relies on the driver
                # accepting a single scalar parameter.
                cursor.execute(get_track_id_sql, (r))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    continue
                # Despite the name, update_sql is an INSERT of the full row.
                update_sql = "INSERT INTO `S2501_ACS` " \
                             "(`HC01_VC01`, `HC01_VC01_MOE`, `HC02_VC01`, `HC02_VC01_MOE`, `HC03_VC01`, `HC03_VC01_MOE`, " \
                             "`HC01_VC03`, `HC01_VC03_MOE`, `HC02_VC03`, `HC02_VC03_MOE`, `HC03_VC03`, `HC03_VC03_MOE`, " \
                             "`HC01_VC04`, `HC01_VC04_MOE`, `HC02_VC04`, `HC02_VC04_MOE`, `HC03_VC04`, `HC03_VC04_MOE`, " \
                             "`HC01_VC05`, `HC01_VC05_MOE`, `HC02_VC05`, `HC02_VC05_MOE`, `HC03_VC05`, `HC03_VC05_MOE`, " \
                             "`HC01_VC06`, `HC01_VC06_MOE`, `HC02_VC06`, `HC02_VC06_MOE`, `HC03_VC06`, `HC03_VC06_MOE`, " \
                             "`HC01_VC14`, `HC01_VC14_MOE`, `HC02_VC14`, `HC02_VC14_MOE`, `HC03_VC14`, `HC03_VC14_MOE`, " \
                             "`HC01_VC15`, `HC01_VC15_MOE`, `HC02_VC15`, `HC02_VC15_MOE`, `HC03_VC15`, `HC03_VC15_MOE`, " \
                             "`HC01_VC19`, `HC01_VC19_MOE`, `HC02_VC19`, `HC02_VC19_MOE`, `HC03_VC19`, `HC03_VC19_MOE`, " \
                             "`HC01_VC39`, `HC01_VC39_MOE`, `HC02_VC39`, `HC02_VC39_MOE`, `HC03_VC39`, `HC03_VC39_MOE`, " \
                             "`track_pk_id`) " \
                             "VALUES " \
                             "(%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, " \
                             "%s)"
                try:
                    # The positional list below must stay in lockstep with the
                    # column list of the INSERT above (54 values + pk).
                    cursor.execute(update_sql, (data[r][0]['HC01_VC01'], data[r][1]['HC01_VC01_MOE'], data[r][2]['HC02_VC01'], data[r][3]['HC02_VC01_MOE'], data[r][4]['HC03_VC01'], data[r][5]['HC03_VC01_MOE'],
                                                data[r][6]['HC01_VC03'], data[r][7]['HC01_VC03_MOE'], data[r][8]['HC02_VC03'], data[r][9]['HC02_VC03_MOE'], data[r][10]['HC03_VC03'], data[r][11]['HC03_VC03_MOE'],
                                                data[r][12]['HC01_VC04'], data[r][13]['HC01_VC04_MOE'], data[r][14]['HC02_VC04'], data[r][15]['HC02_VC04_MOE'], data[r][16]['HC03_VC04'], data[r][17]['HC03_VC04_MOE'],
                                                data[r][18]['HC01_VC05'], data[r][19]['HC01_VC05_MOE'], data[r][20]['HC02_VC05'], data[r][21]['HC02_VC05_MOE'], data[r][22]['HC03_VC05'], data[r][23]['HC03_VC05_MOE'],
                                                data[r][24]['HC01_VC06'], data[r][25]['HC01_VC06_MOE'], data[r][26]['HC02_VC06'], data[r][27]['HC02_VC06_MOE'], data[r][28]['HC03_VC06'], data[r][29]['HC03_VC06_MOE'],
                                                data[r][30]['HC01_VC14'], data[r][31]['HC01_VC14_MOE'], data[r][32]['HC02_VC14'], data[r][33]['HC02_VC14_MOE'], data[r][34]['HC03_VC14'], data[r][35]['HC03_VC14_MOE'],
                                                data[r][36]['HC01_VC15'], data[r][37]['HC01_VC15_MOE'], data[r][38]['HC02_VC15'], data[r][39]['HC02_VC15_MOE'], data[r][40]['HC03_VC15'], data[r][41]['HC03_VC15_MOE'],
                                                data[r][42]['HC01_VC19'], data[r][43]['HC01_VC19_MOE'], data[r][44]['HC02_VC19'], data[r][45]['HC02_VC19_MOE'], data[r][46]['HC03_VC19'], data[r][47]['HC03_VC19_MOE'],
                                                data[r][48]['HC01_VC39'], data[r][49]['HC01_VC39_MOE'], data[r][50]['HC02_VC39'], data[r][51]['HC02_VC39_MOE'], data[r][52]['HC03_VC39'], data[r][53]['HC03_VC39_MOE'],
                                                track_pk_id['pk_id']))
                except Exception as e:
                    # Best-effort load: log the failure and keep going.
                    print(e)
                self.acsdb.con.commit()
# Example #3
class AZ(object):
    """Amazon-side product matcher.

    Finds the cheapest Amazon listing for a product title (title search
    first, keyword search as fallback), pulls the listing's attributes
    (sales rank, weight, image, offer URL, review metadata) and persists
    matches into the `az_product` and `match_cw` MySQL tables.
    """

    def __init__(self):
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Amazon object initialized")

        # Credentials and defaults come from az_config.yml next to this
        # module. safe_load: a config file never needs arbitrary YAML tags.
        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "az_config.yml"), "r") as fh:
            settings = yaml.safe_load(fh)

        self.db = Mysql(settings['db_config'])

        self.access_key = settings['access_key_id']
        self.secret_key = settings['secret_key_id']
        self.associate_tag = settings['associate_tag']
        self.default_weight = settings['default_weight']  # fallback weight when amazon has none
        # Attributes of the most recent match (kept for API compatibility).
        self.az_price = None
        self.az_asin = None
        self.az_sales_rank = None
        self.az_url = None
        self.az_match = None

        self.amazon = AmazonAPI(self.access_key, self.secret_key, self.associate_tag)

    def destroy(self):
        """Close the database connection and log the teardown."""
        self.logger.info("Database connection closed...")
        self.db.exit()
        self.logger.info("Amazon object destroyed")

    def find_best_match(self, title, cat='All'):  # TODO: consider using cat='Blended' for default
        """Fill self.product with the best Amazon match for *title*.

        Expects self.product to have been primed via _init_prod_dict().

        :param title: string containing the source site product title
        :param cat: string containing the category for searchindex on amazon
        :return: None; self.product carries price, weight, sales_rank,
            offer_url, and az_match 'T' (matched on title search), 'K'
            (matched on keyword search) or 'N' (no match).
        """
        self._find_by_title(title, cat)
        if self.product['az_price'] != 0:
            self.product['az_match'] = 'T'
        if self.product['az_price'] == 0:
            # No priced hit on the title search; retry as a keyword search.
            self._find_by_key(title, cat)
            if self.product['az_price'] != 0:
                self.product['az_match'] = 'K'
        if self.product['az_price'] == 0:  # we didn't find any match so clean remaining attrs
            (self.product['az_sales_rank'], self.product['az_match'], self.product['az_url']) = (0, 'N', '')
        self._get_attrs()

    def _find_by_title(self, title, cat):
        """Record the cheapest listing returned by an exact-Title search."""
        self._record_lowest(title, Title=title, SearchIndex=cat)

    def _find_by_key(self, title, cat):
        """Record the cheapest listing returned by a Keywords search."""
        self._record_lowest(title, Keywords=title, SearchIndex=cat)

    def _record_lowest(self, title, **search_kwargs):
        """Shared search body for _find_by_title/_find_by_key.

        Stores the lowest price seen in self.product['az_price'] (0.0 when
        nothing matched) and the winning ASIN in self.product['asin'].
        """
        lowest = 0.0
        try:
            for p in self.amazon.search(**search_kwargs):
                price = p.price_and_currency[0]
                # Keep the cheapest priced listing seen so far.
                if lowest == 0.0 or price < lowest:
                    lowest = price
                    self.product['asin'] = p.asin
        except Exception:
            # The amazon client raises when a search yields nothing usable;
            # treat that as "no match" rather than an error.
            self.logger.warning("%s had no match in amazon", title)
        self.product['az_price'] = lowest

    def _get_attrs(self):
        """Look up the matched ASIN and fill sales rank, weight, image and
        offer URL (plus review metadata) into self.product."""
        (r, w, u, i) = (None, None, None, None)
        if self.product.get('asin', None) is not None:
            try:
                p = self.amazon.lookup(ItemId=self.product['asin'])
                r = int(p.sales_rank)
                i = p.images[0].LargeImage.URL.text
                w = p.get_attribute('ItemDimensions.Weight')
                u = p.offer_url
                self._get_meta_rate(p.reviews[1])
            except Exception:
                # Lookup failures leave the None defaults, patched below.
                pass
            if r is None:
                r = 0
            if i is None:
                i = ''
            if w is None:
                w = self.product['az_weight']  # this should not be in hundreth-pounds
            else:
                w = float(w)*0.01  # its coming from amazon in hundreth-pounds seemingly
        else:  # there was no product found.
            (r, w, u, i) = (0, 0, '', '')
        self.product['az_sales_rank'] = r
        self.product['az_weight'] = w
        self.product['az_url'] = u
        self.product['img'] = i

    def _get_weight(self):
        """Return the matched product's weight in pounds, falling back to the
        search-time default (or 0 when there is no match at all)."""
        w = None
        if self.product['asin'] is not None:
            try:
                p = self.amazon.lookup(ItemId=self.product['asin'])
                w = p.get_attribute('ItemDimensions.Weight')
            except Exception:
                pass
            if w is None:
                w = self.product['az_weight']
            else:
                w = float(w)*0.01  # its coming from amazon in hundreth-pounds seemingly
        else:
            w = 0
        return w

    def _get_meta_rate(self, url):
        """Scrape review count and average rating from the reviews page.

        :param url: reviews URL taken from the amazon lookup result.
        """
        rank_page = get_page(url, 1)
        if rank_page[0].find(string=re.compile(r'There are no customer reviews for this item')):
            self.product['num_reviews'] = 0
            self.product['avg_review'] = 0
        else:
            # NOTE(review): the rating text lands in num_reviews and the
            # "N customer reviews" string in avg_review -- these two look
            # swapped; confirm downstream use before changing.
            try:
                rating = rank_page[0].find("span", {"class":"asinReviewsSummary"}).img.attrs['title']
                self.product['num_reviews'] = rating
            except Exception:
                self.product['num_reviews'] = 0
            try:
                reviews = rank_page[0].find(string=re.compile(r'\d+ customer reviews'))  # TODO: strip out text
                self.product['avg_review'] = reviews
            except Exception:
                self.product['avg_review'] = 0

    def _init_prod_dict(self):
        """Reset self.product to its pre-search defaults."""
        self.product = {}
        self.product['az_weight'] = self.default_weight
        self.product['az_price'] = 0.0
        self.product['avg_review'] = 0
        self.product['num_reviews'] = 0

    def match_products(self):
        """Match every unmatched walmart product against amazon.

        Method looks for unmatched products in match_cw. When it finds a new
        product, attempts to match with amazon product. When successful, will
        then update amazon product row and match_cw row.
        :return:
        """
        new_prods = self._get_unmatched_products()
        for wal_pk in new_prods:
            self._init_prod_dict()
            title = new_prods[wal_pk].get('title')
            if title is None:
                continue  # walmart row vanished; nothing to match against
            self.product['name'] = title
            self.find_best_match(self.product['name'])
            if self.product.get('asin', None) is None:
                continue  # no amazon hit at all; leave the row unmatched
            self._persist_product(self.product, wal_pk)

    def _get_unmatched_products(self):
        """Find all the empty products in match_cw.

        Empty means a NULL az_pk_id in a match_cw product row, i.e. the row
        was written by a walmart scrape and has no amazon match yet.
        :return: dict {wal_pk_id: {title, match_pk_id}}
        """
        new_products = {}
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id, wal_pk_id " \
                         "FROM match_cw " \
                         "WHERE az_pk_id is NULL"
            try:
                cursor.execute(select_sql, ())
                ret = cursor.fetchall()
                new_products = {e['wal_pk_id']: {'match_pk_id': e['pk_id']} for e in ret}
            except Exception as e:
                self.logger.exception(e)
            select_sql = "SELECT title " \
                         "FROM walmart_product " \
                         "WHERE pk_id=%s"
            for wal_pk in new_products:
                cursor.execute(select_sql, (wal_pk,))
                ret = cursor.fetchone()
                if ret is not None:
                    new_products[wal_pk]['title'] = ret['title']
        return new_products

    def _persist_match(self, az_pk_id, wal_pk_id):
        """Write az_pk_id onto the existing match_cw row for wal_pk_id.

        :param az_pk_id: this is the az_pk_id to write to match_cw row
        :param wal_pk_id: this is the wal_pk_id that identifies the row to update
        :return:
        """
        if az_pk_id is None:
            return
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id " \
                         "FROM match_cw " \
                         "WHERE wal_pk_id=%s"
            cursor.execute(select_sql, (wal_pk_id,))
            ret = cursor.fetchone()
            if ret is None:
                # No match_cw row for that walmart product.
                # BUG FIX: previously logged az_pk_id in a message naming wal_pk_id.
                self.logger.warning("There was a match issue with wal_pk_id %s", wal_pk_id)
                return
            update_sql = "UPDATE match_cw " \
                         "SET az_pk_id=%s,  " \
                         "last_update=now() " \
                         "WHERE pk_id=%s"
            cursor.execute(update_sql, (az_pk_id,
                                        ret['pk_id']))
            self.db.con.commit()

    def _update_match(self, match):
        """Refresh an existing match_cw row with new match economics.

        TODO: this needs to update things like roi, etc. possibly pull it out...
        :param match: dict carrying pk_id, az_pk_id, wal_pk_id, net, roi,
            match_score and match_meth.
        :return:
        """
        if match is None:
            return
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id " \
                         "FROM match_cw " \
                         "WHERE az_pk_id=%s"
            cursor.execute(select_sql, (match['pk_id'],))
            ret = cursor.fetchone()
            if ret is None:
                self.logger.warning("There was a match issue with az_pk_id %s", match['pk_id'])
                return
            update_sql = "UPDATE match_cw " \
                         "SET az_pk_id=%s, wal_pk_id=%s, " \
                         "net=%s, roi=%s, match_score=%s, " \
                         "match_meth=%s, last_update=now() " \
                         "WHERE pk_id=%s"
            cursor.execute(update_sql, (match['az_pk_id'],
                                        match['wal_pk_id'],
                                        match['net'],
                                        match['roi'],
                                        match['match_score'],
                                        match['match_meth'],
                                        match['pk_id']))
            self.db.con.commit()

    def _persist_product(self, product, wal_pk_id):
        """Insert or update the az_product row for *product* and link it
        into match_cw.

        Expects to find product['asin'] set by a successful match. When the
        asin is new a row is inserted; when it exists and the price changed
        the row is refreshed; otherwise only last_read is touched.
        :param product: dict with values to insert/update in az_product.
        :param wal_pk_id: pk_id from original search on no amazon info in row
        :return:
        """
        if 'name' not in product:
            self.logger.warning("Attempt to persist empty product!")
            return
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT pk_id, price " \
                         "FROM az_product " \
                         "WHERE asin=%s"
            cursor.execute(select_sql, (product['asin'],))
            ret = cursor.fetchone()
            # No az_product row with that asin yet: insert one.
            if ret is None:
                insert_sql = "INSERT INTO az_product " \
                             "(asin, avg_rate, img, num_reviews, " \
                             "price, sales_rank, shipping_cost, " \
                             "title, upc, url, weight, " \
                             "last_read, last_changed) " \
                             "VALUES " \
                             "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now(), now())"
                try:
                    cursor.execute(insert_sql, (product['asin'],
                                                product['avg_review'],
                                                product['img'],
                                                product['num_reviews'],
                                                product.get('az_price', 0.0),
                                                product['az_sales_rank'],
                                                product.get('shipping_cost', 1.00),  # no shipping cost or upc
                                                product['name'],
                                                product.get('upc', ''),
                                                product['az_url'],
                                                product['az_weight']))
                    self.db.con.commit()
                    # Re-read the row to learn the pk_id just assigned.
                    cursor.execute(select_sql, (product['asin'],))
                    ret = cursor.fetchone()
                    if ret.get('pk_id', None) is None:
                        self.logger.warning("Error in inserting new az_product.")
                        return
                except Exception:
                    self.logger.exception('Failed to insert new az product: %s.' % product['asin'])
                    return
            # Row exists and the price changed: refresh the whole record.
            elif product['az_price'] != ret['price']:
                update_sql = "UPDATE az_product " \
                             "SET asin=%s, avg_rate=%s, " \
                             "img=%s, num_reviews=%s, " \
                             "price=%s, sales_rank=%s, " \
                             "shipping_cost=%s, last_read=now(), " \
                             "last_changed=now(), title=%s, " \
                             "url=%s, weight=%s " \
                             "WHERE pk_id=%s"
                try:
                    # BUG FIX: previously only five parameters were bound for
                    # the eleven placeholders above, so this UPDATE always
                    # raised and was silently swallowed.
                    cursor.execute(update_sql, (product['asin'],
                                                product['avg_review'],
                                                product['img'],
                                                product['num_reviews'],
                                                product['az_price'],
                                                product['az_sales_rank'],
                                                product.get('shipping_cost', 1.00),
                                                product['name'],
                                                product['az_url'],
                                                product['az_weight'],
                                                ret['pk_id']))
                    self.db.con.commit()
                except Exception:
                    self.logger.exception('Failed to update az product: %s.' % product['asin'])
            # Row exists, price unchanged: just refresh the read timestamp.
            else:
                # BUG FIX: the concatenated SQL read "now()WHERE" (missing
                # space), which made this statement invalid.
                update_sql = "UPDATE az_product " \
                             "SET last_read=now() " \
                             "WHERE pk_id=%s"
                cursor.execute(update_sql, (ret['pk_id'],))
                self.db.con.commit()
            # Finally link the amazon row to the walmart row in match_cw.
            if product.get('az_match'):
                self._persist_match(ret['pk_id'], wal_pk_id)
# Example #4
# File: model.py  Project: smehan/py-gmap
class Model():
    """

    """

    def __init__(self):
        """Open the MySQL connection and start with an empty tract dataset."""
        self.tracts = {}
        self.db = Mysql("Pydb/mysql_config.yml")

    def destroy(self):
        self.db.exit()

    def get_tracts(self):
        """
        Builds up a data set of all tracts, including ratio to compute to each zip.
        :return:
        """
        self._get_zips()
        self._build_tracts()
        self._add_median_incomes()
        self._add_housing()
        self._add_occupancy_features()

    def _get_zips(self):
        """
        returns all the Nassau county pk_ids of zips, sorted by city asc
        :param self:
        :return:
        """
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT `pk_id`,`zipcode` FROM zip WHERE `county`=%s " \
                         "ORDER BY `city` ASC"
            cursor.execute(select_sql, ("Nassau"))
            ret = cursor.fetchall()
            for r in ret:
                self.tracts[r['zipcode']] = {'zip_pk_id': r['pk_id']}

    def _build_tracts(self):
        """
        builds up tract information in dataset, adding a track_pk_id, the ratio of
        residences of that tract to be counted in the chosen zip.
        :return:
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                select_sql = "SELECT `track_pk_id`, `res_ratio` FROM `zip_tract_cw` " \
                             "WHERE `zip_pk_id`=%s"
                cursor.execute(select_sql, (self.tracts[z]['zip_pk_id']))
                ret = cursor.fetchall()
                for r in ret:
                    self.tracts[z][r['track_pk_id']] = {'res_ratio': r['res_ratio']}

    def _add_median_incomes(self):
        """
        builds median incomes for each tract, by occupied, owner-occupied, renter.
        :return:
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k, v in self.tracts[z].items():
                    if k == 'zip_pk_id':
                        continue
                    select_sql = "SELECT `HC01_VC14`, `HC02_VC14`, `HC03_VC14` FROM `S2503_ACS` " \
                                 "WHERE `track_pk_id`=%s"
                    cursor.execute(select_sql, (k))
                    ret = cursor.fetchone()
                    if ret is not None:
                        self.tracts[z][k]['HC01_VC14'] = ret['HC01_VC14']
                        self.tracts[z][k]['HC02_VC14'] = ret['HC02_VC14']
                        self.tracts[z][k]['HC03_VC14'] = ret['HC03_VC14']

    def _add_housing(self):
        """
        builds housing totals for occupied, owner-occupied, renter in each tract.
        :return:
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k, v in self.tracts[z].items():
                    if k == 'zip_pk_id':
                        continue
                    select_sql = "SELECT `HC01_VC01`, `HC01_VC01_MOE`, " \
                                 "`HC02_VC01`, `HC02_VC01_MOE`, " \
                                 "`HC03_VC01`, `HC03_VC01_MOE` FROM `S2503_ACS` " \
                                 "WHERE `track_pk_id`=%s"
                    cursor.execute(select_sql, (k))
                    ret = cursor.fetchone()
                    if ret:
                        self.tracts[z][k]['HC01_VC01'] = ret['HC01_VC01']
                        self.tracts[z][k]['HC02_VC01'] = ret['HC02_VC01']
                        self.tracts[z][k]['HC03_VC01'] = ret['HC03_VC01']
                        self.tracts[z][k]['HC01_VC01_MOE'] = ret['HC01_VC01_MOE']
                        self.tracts[z][k]['HC02_VC01_MOE'] = ret['HC02_VC01_MOE']
                        self.tracts[z][k]['HC03_VC01_MOE'] = ret['HC03_VC01_MOE']
                    else:  # TODO: Move print to a warn in logger
                        print("_add_housing failed for track pk: {}".format(k))

    def _add_occupancy_features(self):
        """
        builds occupancy characteristics for occupied, owner-occupied, renter in each tract.
        :return: adds occupancy as sub-dict in self.tracts[z][tract_pk_id]
        """
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k, v in self.tracts[z].items():
                    if k == 'zip_pk_id':
                        continue
                    select_sql = "SELECT HC01_VC01, HC01_VC01_MOE, " \
                                 "HC02_VC01, HC02_VC01_MOE, " \
                                 "HC03_VC01, HC03_VC01_MOE, " \
                                 "HC01_VC03, HC01_VC03_MOE, " \
                                 "HC02_VC03, HC02_VC03_MOE, " \
                                 "HC03_VC03, HC03_VC03_MOE, " \
                                 "HC01_VC04, HC01_VC04_MOE, " \
                                 "HC02_VC04, HC02_VC04_MOE, " \
                                 "HC03_VC04, HC03_VC04_MOE, " \
                                 "HC01_VC05, HC01_VC05_MOE, " \
                                 "HC02_VC05, HC02_VC05_MOE, " \
                                 "HC03_VC05, HC03_VC05_MOE, " \
                                 "HC01_VC06, HC01_VC06_MOE, " \
                                 "HC02_VC06, HC02_VC06_MOE, " \
                                 "HC03_VC06, HC03_VC06_MOE, " \
                                 "HC01_VC14, HC01_VC14_MOE, " \
                                 "HC02_VC14, HC02_VC14_MOE, " \
                                 "HC03_VC14, HC03_VC14_MOE, " \
                                 "HC01_VC15, HC01_VC15_MOE, " \
                                 "HC02_VC15, HC02_VC15_MOE, " \
                                 "HC03_VC15, HC03_VC15_MOE, " \
                                 "HC01_VC19, HC01_VC19_MOE, " \
                                 "HC02_VC19, HC02_VC19_MOE, " \
                                 "HC03_VC19, HC03_VC19_MOE, " \
                                 "HC01_VC39, HC01_VC39_MOE, " \
                                 "HC02_VC39, HC02_VC39_MOE, " \
                                 "HC03_VC39, HC03_VC39_MOE " \
                                 "FROM S2501_ACS " \
                                 "WHERE track_pk_id=%s"
                    cursor.execute(select_sql, (k))
                    ret = cursor.fetchone()
                    if ret:
                        self.tracts[z][k]['occupancy'] = ret
                    else:
                        self.tracts[z][k]['occupancy'] = {}
                        print("add_occupancy_features failed for track pk".format(k))  # TODO: move output to logger

    def build_rental_housing_densities(self):
        """
        Calculates the density of renters/total occupied units for each zip.

        For every tract contributing to a zip: read the raw renter counts for
        household sizes h1-h4 plus total renters and total occupied units, and
        the tract's res_ratio (the fraction of the tract contributing to the
        current zip; 0 or missing is treated as 1). The h1-h4 counts are
        rescaled so they sum to the total renter count, putting all four on
        the same scale. Per-tract densities are tracked as zip-level maxima;
        ratio-weighted totals are accumulated and converted to zip-level
        averages at the end, along with an estimated renter population.

        Fix over the original: a tract reporting 0 occupied units no longer
        raises ZeroDivisionError (0 is treated as 1, mirroring the existing
        res_ratio handling).
        :return:
        """
        h_keys = ('HC03_VC03', 'HC03_VC04', 'HC03_VC05', 'HC03_VC06')
        for z in self.tracts:
            zd = self.tracts[z]
            zd.setdefault('max_renter_density', 0.0)
            zd.setdefault('max_renter_income', 0.0)
            for i in (1, 2, 3, 4):
                zd.setdefault('max_h%d_renter_density' % i, 0.0)
            zd.setdefault('renter_pop_est', 0.0)
            t_rent = 0
            t_occ = 0
            rent_pop = 0
            t_h_rent = [0, 0, 0, 0]  # ratio-weighted h1..h4 renter totals
            for k, v in zd.items():
                if not isinstance(k, int):
                    continue  # non-int keys are zip-level attributes, not tracts
                renters = v.get('HC03_VC01', 0)
                h_renters = [v['occupancy'].get(key, 0) for key in h_keys]
                occupied = v.get('HC01_VC01', 1)
                if occupied <= 0:
                    occupied = 1  # guard: avoid ZeroDivisionError on empty tracts
                res_ratio = v.get('res_ratio', 1)
                if res_ratio <= 0:
                    res_ratio = 1
                current_den = renters / occupied if renters > 0 else 0.0
                total_h = sum(h_renters)
                # rescale h1-h4 so they sum to the reported total renter count
                ren_norm = renters / total_h if total_h > 0 else 1
                h_norm = [h * ren_norm for h in h_renters]
                h_dens = [h / occupied if h > 0 else 0.0 for h in h_norm]
                if zd['max_renter_density'] < current_den:
                    zd['max_renter_density'] = round(current_den, 2)
                    zd['max_renter_income'] = round(v.get('HC03_VC14', 0.0), 0)
                for i, den in enumerate(h_dens, 1):
                    key = 'max_h%d_renter_density' % i
                    if zd[key] < den:
                        zd[key] = round(den, 2)
                t_rent += renters * res_ratio
                for i in range(4):
                    t_h_rent[i] += h_norm[i] * res_ratio
                t_occ += occupied * res_ratio
                # population estimate: hN households contribute N people each
                rent_pop += sum(i * h * res_ratio
                                for i, h in enumerate(h_norm, 1))
            if t_rent == 0:
                zd['avg_renter_density'] = 0.0
            else:
                zd['avg_renter_density'] = round(t_rent / t_occ, 2)
                zd['renter_pop_est'] = round(rent_pop, 0)
            for i in (1, 2, 3, 4):
                total = t_h_rent[i - 1]
                zd['avg_h%d_renter_density' % i] = (
                    round(total / t_occ, 2) if total != 0 else 0.0)

    def build_incomes(self):
        """
        Calculates the avg median renter income in each zip by weighting each
        tract's median renter income by its renter count and by the fraction
        of the tract contributing to the zip (res_ratio).
        :return:
        """
        for z in self.tracts:
            zip_data = self.tracts[z]
            # make sure the max_* keys exist even if densities were not built
            zip_data['max_renter_density'] = zip_data.get('max_renter_density', 0.0)
            zip_data['max_renter_income'] = zip_data.get('max_renter_income', 0)
            weighted_income = 0
            weighted_pop = 0
            for tract_key, tract in zip_data.items():
                if not isinstance(tract_key, int):
                    continue  # only int keys are tract pk ids
                median_income = tract.get('HC03_VC14', 0)
                renter_pop = tract.get('HC03_VC01', 0)
                weight = tract.get('res_ratio', 1)
                if weight == 0:
                    weight = 1
                weighted_income += median_income * renter_pop * weight
                weighted_pop += renter_pop * weight
            if weighted_pop == 0:
                zip_data['avg_renter_income'] = 0.0
            else:
                zip_data['avg_renter_income'] = round(weighted_income / weighted_pop, 0)

    def build_model(self):
        """
        Builds up a derived dataset of relevant features from ACS info pulled
        or built in other methods, one row per zip, and writes it out as the
        'model' CSV. Purpose is specific to data needed for this particular
        output.
        :return:
        """
        # (feature key, default) pairs, in output-column order
        spec = (('avg_renter_density', 0.0), ('max_renter_density', 0.0),
                ('avg_renter_income', 0), ('max_renter_income', 0),
                ('avg_h1_renter_density', 0), ('max_h1_renter_density', 0),
                ('avg_h2_renter_density', 0), ('max_h2_renter_density', 0),
                ('avg_h3_renter_density', 0), ('max_h3_renter_density', 0),
                ('avg_h4_renter_density', 0), ('max_h4_renter_density', 0),
                ('renter_pop_est', 0))
        rows = [[z] + [self.tracts[z].get(name, default)
                       for name, default in spec]
                for z in self.tracts]
        self.make_output(rows, filename='model')

    def make_output(self, data, meth=None, filename=None):
        """Emit data either to stdout (pretty-printed) or to a CSV file.

        :param data: iterable of rows for CSV output (any object when meth == 'pp')
        :param meth: 'pp' pretty-prints data to stdout and returns immediately
        :param filename: CSV basename without extension; defaults to 'output'
        :return: None
        """
        if meth == 'pp':
            pp = pprint.PrettyPrinter()
            pp.pprint(data)
            return
        if filename is None:
            filename = 'output'
        dir_name = '../data/output'
        filename_suffix = '.csv'
        path = os.path.join(dir_name, filename + filename_suffix)
        # newline='' is required by the csv module; plain 'w' yields blank
        # rows between records on Windows
        with open(path, 'w', newline='') as fh:
            writer = csv.writer(fh)
            writer.writerows(data)
Example #5
0
File: model.py  Project: smehan/py-gmap
class Model:
    """Builds zip-level rental-market features from ACS tract-level data.

    self.tracts maps zipcode -> {'zip_pk_id': <int>, <tract_pk_id>: {...}}.
    Each tract sub-dict accumulates income/housing columns, a nested
    'occupancy' dict of S2501 columns, and 'res_ratio' — the fraction of the
    tract's residences that fall inside the zip.
    """

    def __init__(self):
        """Constructor for Model"""
        self.db = Mysql("Pydb/mysql_config.yml")  # project MySQL wrapper
        self.tracts = {}

    def destroy(self):
        """Close the database connection."""
        self.db.exit()

    def get_tracts(self):
        """
        Builds up a data set of all tracts, including ratio to compute to each zip.
        :return:
        """
        self._get_zips()
        self._build_tracts()
        self._add_median_incomes()
        self._add_housing()
        self._add_occupancy_features()

    def _get_zips(self):
        """
        returns all the Nassau county pk_ids of zips, sorted by city asc
        :return:
        """
        with self.db.con.cursor() as cursor:
            select_sql = "SELECT `pk_id`,`zipcode` FROM zip WHERE `county`=%s " \
                         "ORDER BY `city` ASC"
            # query parameters belong in a tuple; ("Nassau") is just a string
            cursor.execute(select_sql, ("Nassau",))
            for r in cursor.fetchall():
                self.tracts[r['zipcode']] = {'zip_pk_id': r['pk_id']}

    def _build_tracts(self):
        """
        builds up tract information in dataset, adding a track_pk_id and the
        ratio of residences of that tract to be counted in the chosen zip.
        :return:
        """
        select_sql = "SELECT `track_pk_id`, `res_ratio` FROM `zip_tract_cw` " \
                     "WHERE `zip_pk_id`=%s"
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                cursor.execute(select_sql, (self.tracts[z]['zip_pk_id'],))
                for r in cursor.fetchall():
                    self.tracts[z][r['track_pk_id']] = {
                        'res_ratio': r['res_ratio']
                    }

    def _add_median_incomes(self):
        """
        builds median incomes for each tract, by occupied, owner-occupied, renter.
        :return:
        """
        select_sql = "SELECT `HC01_VC14`, `HC02_VC14`, `HC03_VC14` FROM `S2503_ACS` " \
                     "WHERE `track_pk_id`=%s"
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k in self.tracts[z]:
                    if k == 'zip_pk_id':
                        continue
                    cursor.execute(select_sql, (k,))
                    ret = cursor.fetchone()
                    if ret is not None:
                        for col in ('HC01_VC14', 'HC02_VC14', 'HC03_VC14'):
                            self.tracts[z][k][col] = ret[col]

    def _add_housing(self):
        """
        builds housing totals for occupied, owner-occupied, renter in each tract.
        :return:
        """
        select_sql = "SELECT `HC01_VC01`, `HC01_VC01_MOE`, " \
                     "`HC02_VC01`, `HC02_VC01_MOE`, " \
                     "`HC03_VC01`, `HC03_VC01_MOE` FROM `S2503_ACS` " \
                     "WHERE `track_pk_id`=%s"
        cols = ('HC01_VC01', 'HC02_VC01', 'HC03_VC01',
                'HC01_VC01_MOE', 'HC02_VC01_MOE', 'HC03_VC01_MOE')
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k in self.tracts[z]:
                    if k == 'zip_pk_id':
                        continue
                    cursor.execute(select_sql, (k,))
                    ret = cursor.fetchone()
                    if ret:
                        for col in cols:
                            self.tracts[z][k][col] = ret[col]
                    else:  # TODO: Move print to a warn in logger
                        print("_add_housing failed for track pk: {}".format(k))

    def _add_occupancy_features(self):
        """
        builds occupancy characteristics for occupied, owner-occupied, renter in each tract.
        :return: adds occupancy as sub-dict in self.tracts[z][tract_pk_id]
        """
        # Build the "HCxx_VCyy, HCxx_VCyy_MOE" column list instead of the
        # original 28-line string literal; order matches the original SELECT.
        pairs = []
        for vc in ('VC01', 'VC03', 'VC04', 'VC05', 'VC06',
                   'VC14', 'VC15', 'VC19', 'VC39'):
            for hc in ('HC01', 'HC02', 'HC03'):
                col = "{}_{}".format(hc, vc)
                pairs.append("{0}, {0}_MOE".format(col))
        select_sql = ("SELECT " + ", ".join(pairs) +
                      " FROM S2501_ACS WHERE track_pk_id=%s")
        with self.db.con.cursor() as cursor:
            for z in self.tracts:
                for k in self.tracts[z]:
                    if k == 'zip_pk_id':
                        continue
                    cursor.execute(select_sql, (k,))
                    ret = cursor.fetchone()
                    if ret:
                        self.tracts[z][k]['occupancy'] = ret
                    else:
                        self.tracts[z][k]['occupancy'] = {}
                        # BUG FIX: original message had no {} placeholder, so
                        # .format(k) silently dropped the tract pk
                        print("add_occupancy_features failed for track pk: {}".
                              format(k))  # TODO: move output to logger

    def build_rental_housing_densities(self):
        """
        calculates the density of renters/total occupied units for each zip.
        multiplies number of renters/occupiers by the weight of each tract
        contributing to the current zip.
        Firstly, get raw data for h1-4 renter numbers, renters total units as
        well as total occupied units in tract. Get res_ratio, the fraction of
        that tract which is contributing to the current zip. Then normalize
        all of the h1-4 with the total renters to facilitate comparisons on
        same scale. Track per-tract densities as zip-level maxima, accumulate
        ratio-weighted totals, and finally store zip-level averages.
        A tract reporting 0 occupied units is treated as 1 to avoid
        ZeroDivisionError (mirrors the res_ratio handling).
        :return:
        """
        h_keys = ('HC03_VC03', 'HC03_VC04', 'HC03_VC05', 'HC03_VC06')
        for z in self.tracts:
            zd = self.tracts[z]
            zd.setdefault('max_renter_density', 0.0)
            zd.setdefault('max_renter_income', 0.0)
            for i in (1, 2, 3, 4):
                zd.setdefault('max_h%d_renter_density' % i, 0.0)
            zd.setdefault('renter_pop_est', 0.0)
            t_rent = 0
            t_occ = 0
            rent_pop = 0
            t_h_rent = [0, 0, 0, 0]  # ratio-weighted h1..h4 renter totals
            for k, v in zd.items():
                if not isinstance(k, int):
                    continue  # non-int keys are zip-level attributes
                renters = v.get('HC03_VC01', 0)
                h_renters = [v['occupancy'].get(key, 0) for key in h_keys]
                occupied = v.get('HC01_VC01', 1)
                if occupied <= 0:
                    occupied = 1  # guard against ZeroDivisionError
                res_ratio = v.get('res_ratio', 1)
                if res_ratio <= 0:
                    res_ratio = 1
                current_den = renters / occupied if renters > 0 else 0.0
                total_h = sum(h_renters)
                ren_norm = renters / total_h if total_h > 0 else 1
                h_norm = [h * ren_norm for h in h_renters]
                h_dens = [h / occupied if h > 0 else 0.0 for h in h_norm]
                if zd['max_renter_density'] < current_den:
                    zd['max_renter_density'] = round(current_den, 2)
                    zd['max_renter_income'] = round(v.get('HC03_VC14', 0.0), 0)
                for i, den in enumerate(h_dens, 1):
                    key = 'max_h%d_renter_density' % i
                    if zd[key] < den:
                        zd[key] = round(den, 2)
                t_rent += renters * res_ratio
                for i in range(4):
                    t_h_rent[i] += h_norm[i] * res_ratio
                t_occ += occupied * res_ratio
                # hN households contribute N people each to the estimate
                rent_pop += sum(i * h * res_ratio
                                for i, h in enumerate(h_norm, 1))
            if t_rent == 0:
                zd['avg_renter_density'] = 0.0
            else:
                zd['avg_renter_density'] = round(t_rent / t_occ, 2)
                zd['renter_pop_est'] = round(rent_pop, 0)
            for i in (1, 2, 3, 4):
                total = t_h_rent[i - 1]
                zd['avg_h%d_renter_density' % i] = (
                    round(total / t_occ, 2) if total != 0 else 0.0)

    def build_incomes(self):
        """
        calculates the avg median renter income in each zip.
        multiplies number of renters by the weight of each tract contributing
        to the current zip.
        :return:
        """
        for z in self.tracts:
            t_rent_income = 0
            t_pop = 0
            self.tracts[z]['max_renter_density'] = self.tracts[z].get(
                'max_renter_density', 0.0)
            self.tracts[z]['max_renter_income'] = self.tracts[z].get(
                'max_renter_income', 0)
            for k, v in self.tracts[z].items():
                if isinstance(k, int):
                    income = v.get('HC03_VC14', 0)
                    pop = v.get('HC03_VC01', 0)
                    res_ratio = v.get('res_ratio', 1)
                    if res_ratio == 0:
                        res_ratio = 1
                    t_rent_income += income * pop * res_ratio
                    t_pop += pop * res_ratio
            if t_pop == 0:
                self.tracts[z]['avg_renter_income'] = 0.0
            else:
                self.tracts[z]['avg_renter_income'] = round(
                    t_rent_income / t_pop, 0)

    def build_model(self):
        """
        Builds up a derived dataset of relevant features from ACS info pulled
        or built in other methods, one row per zip, written to the 'model' CSV.
        :return:
        """
        spec = (('avg_renter_density', 0.0), ('max_renter_density', 0.0),
                ('avg_renter_income', 0), ('max_renter_income', 0),
                ('avg_h1_renter_density', 0), ('max_h1_renter_density', 0),
                ('avg_h2_renter_density', 0), ('max_h2_renter_density', 0),
                ('avg_h3_renter_density', 0), ('max_h3_renter_density', 0),
                ('avg_h4_renter_density', 0), ('max_h4_renter_density', 0),
                ('renter_pop_est', 0))
        output = [[z] + [self.tracts[z].get(name, default)
                         for name, default in spec]
                  for z in self.tracts]
        self.make_output(output, filename='model')

    def make_output(self, data, meth=None, filename=None):
        """Emit data either to stdout (pretty-printed) or to a CSV file.

        :param data: iterable of rows for CSV output (any object when meth == 'pp')
        :param meth: 'pp' pretty-prints data to stdout and returns immediately
        :param filename: CSV basename without extension; defaults to 'output'
        :return: None
        """
        if meth == 'pp':
            pp = pprint.PrettyPrinter()
            pp.pprint(data)
            return
        if filename is None:
            filename = 'output'
        dir_name = '../data/output'
        filename_suffix = '.csv'
        path = os.path.join(dir_name, filename + filename_suffix)
        # newline='' is required by the csv module to avoid blank rows on Windows
        with open(path, 'w', newline='') as fh:
            writer = csv.writer(fh)
            writer.writerows(data)
Example #6
0
class Populate():
    """Builds the census_2010_schema with various tables.

    loads: census_2010 zips and tract ids.
    loads: US geo details for all states, counties, cities with zip, lat/long.
    """
    def __init__(self):
        """Constructor for Populate"""
        # Connection to the census schema; Mysql is a project wrapper that
        # presumably reads connection settings from the YAML file — confirm.
        self.acsdb = Mysql("../Pydb/mysql_config.yml")

    def destroy(self):
        """Close the ACS database connection; call when done with this object."""
        self.acsdb.exit()

    def load_tracts(self):
        """Populate census_tract_2010 from the tract/zip crosswalk CSV.

        Reads tract ids from data/TRACT_ZIP_122015.csv, derives each tract's
        display name, and inserts (track_id, track_name) rows. Skips the whole
        load if the table already contains any row.
        :return: None
        """
        tracts = {}  # {track_id: track_name}
        with open("data/TRACT_ZIP_122015.csv") as fh:
            reader = csv.reader(fh)
            next(reader, None)  # skip header row
            for r in reader:
                tracts[r[0]] = self.get_tract_name(r[0])
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM census_tract_2010"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("census_tract_2010 table already exists. Skipping...")
                return
            update_sql = "INSERT INTO `census_tract_2010` (`track_id`, `track_name`) VALUES (%s, %s)"
            for t in tracts:
                try:
                    cursor.execute(update_sql, (t, tracts[t]))
                except Exception as e:
                    # was a bare `except: pass` — keep best-effort semantics
                    # but log the failure instead of hiding it
                    print("load_tracts: insert failed for track_id %s - %s" % (t, e))
                self.acsdb.con.commit()

    @staticmethod
    def get_tract_name(tract_id):
        """Derive the Census Tract Name from the trailing six digits of track_id.

        Known limitation (unchanged from the original): '020100' yields
        '0201.00', whereas a fully-correct name would also drop the leading
        zero and an all-zero '00' suffix.

        :param tract_id: full tract id string from the input file
        :return: name string of the form '<4 digits>.<2 digits>'
        """
        match = re.search(r'\d+(\d{4})(\d\d)', tract_id)
        prefix = match.group(1)
        suffix = match.group(2)
        return "{}.{}".format(prefix, suffix)

    def load_zip_tract_crosswalk(self):
        """Populate zip_tract_cw, linking tracts to zips with allocation ratios.

        Reads the HUD-style crosswalk CSV, resolves each (tract, zip) pair to
        its pk ids, and inserts the row with its res/bus/oth/tot ratios.
        Skips the whole load if the table already contains any row.
        :return: None
        """
        data = []
        with open("data/TRACT_ZIP_122015.csv") as fh:
            reader = csv.reader(fh)
            next(reader, None)  # skip header row
            for r in reader:
                data.append([
                    r[0], r[1], r[2], r[3], r[4], r[5]
                ])  # TRACT,ZIP,RES_RATIO,BUS_RATIO,OTH_RATIO,TOT_RATIO
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip_tract_cw"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("zip_tract_cw table already exists. Skipping...")
                return
            for r in data:
                zip_id_sql = "SELECT `pk_id` FROM `zip` WHERE `zipcode`=%s"
                cursor.execute(zip_id_sql, (r[1],))
                zip_pk_id = cursor.fetchone()
                if zip_pk_id is None:
                    print(r[1])
                    # BUG FIX: original fell through here, so the insert below
                    # raised TypeError on zip_pk_id['pk_id'] (masked by the
                    # broad except with a misleading message)
                    continue
                track_id_sql = "SELECT `pk_id` FROM `census_tract_2010` WHERE `track_id`=%s"
                cursor.execute(track_id_sql, (r[0],))
                track_pk_id = cursor.fetchone()
                if track_pk_id is None:
                    print(r[0])
                    continue
                insert_sql = "INSERT INTO `zip_tract_cw` " \
                             "(`track_pk_id`, " \
                             "`zip_pk_id`, " \
                             "`res_ratio`, " \
                             "`bus_ratio`, " \
                             "`oth_ratio`, " \
                             "`tot_ratio`) " \
                             "VALUES (%s, %s, %s, %s, %s, %s)"
                try:
                    cursor.execute(insert_sql,
                                   (track_pk_id['pk_id'], zip_pk_id['pk_id'],
                                    r[2], r[3], r[4], r[5]))
                    self.acsdb.con.commit()
                except Exception as e:
                    print(
                        "tract %s attempted to map into zip %s and not found - %s"
                        % (r[0], r[1], e))

    def load_geo_details(self):
        """Populate the zip table with city/county/state/lat/lon per zipcode.

        Parses the tab-delimited data/US/US.txt dump (GeoNames-style column
        layout, per the indices used below — confirm against the source file).
        Skips the whole load if the zip table already contains any row.
        :return: None
        """
        geo = {}
        with open("data/US/US.txt") as fh:
            reader = csv.reader(fh, delimiter='\t')
            for r in reader:
                # keep the original list-of-single-key-dicts layout, keyed by
                # positional index below
                geo[r[1]] = [{'city': r[2]}, {'county': r[5]},
                             {'state': r[3]}, {'lat': r[9]}, {'lon': r[10]}]
        with self.acsdb.con.cursor() as cursor:
            test_sql = "SELECT * FROM zip"
            cursor.execute(test_sql, ())
            if cursor.fetchone() is not None:
                print("Zip table already exists. Skipping...")
                return
            update_sql = "INSERT INTO `zip` (`city`, `county`, `state`, `lat`, `lon`, `zipcode`) " \
                         "VALUES (%s, %s, %s, %s, %s, %s)"
            for g in geo:
                try:
                    cursor.execute(update_sql,
                                   (geo[g][0]['city'], geo[g][1]['county'],
                                    geo[g][2]['state'], geo[g][3]['lat'],
                                    geo[g][4]['lon'], g))
                except Exception as e:
                    # was a bare `except: pass` — keep best-effort semantics
                    # but log the failure instead of hiding it
                    print("load_geo_details: insert failed for zip %s - %s" % (g, e))
                self.acsdb.con.commit()

    def load_S2503(self):
        """
        Load ACS table S2503 estimates from data/ACS_14_5YR_S2503.csv into
        the `S2503_ACS` table, one row per census tract.

        The CSV carries three HC series (HC01/HC02/HC03) per variable for
        variables VC01 and VC03..VC14.  The estimate values sit in the odd
        columns r[3], r[5], ..., r[79]; the even columns between them hold
        the margins of error, which load_S2503_moe handles.  The whole load
        is skipped when `S2503_ACS` already contains data.
        :return:
        """
        # Column order must match both the CSV layout and the S2503_ACS ddl:
        # for each variable, the HC01/HC02/HC03 series in sequence.
        columns = ['HC%02d_VC%02d' % (hc, vc)
                   for vc in [1] + list(range(3, 15))
                   for hc in (1, 2, 3)]
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            for r_count, r in enumerate(reader, 1):
                if r_count < 3:  # csv file first two rows are headers, not data
                    continue
                # Normalize the census suppression/top-coding markers so every
                # cell parses as a float.  Order matters: '***' must be
                # replaced before '**'.
                for token, repl in (('-', '0'), ('(X)', '0'),
                                    ('250,000+', '250000'),
                                    ('***', '0'), ('**', '0')):
                    r = [e.replace(token, repl) for e in r]
                # r[1] is the tract id; estimates occupy every other column
                # starting at r[3].
                data[r[1]] = [float(r[3 + 2 * i]) for i in range(len(columns))]
        with self.acsdb.con.cursor() as cursor:
            cursor.execute("SELECT * FROM S2503_ACS", ())
            if cursor.fetchone() is not None:
                # The check is for existing rows, not table existence.
                print("S2503_ACS table already populated. Skipping...")
                return
            get_track_sql = ("SELECT `pk_id` FROM `census_tract_2010` "
                             "WHERE `track_id`=%s")
            # Column names come from our own literal list above, so building
            # the statement by string formatting is safe; the values still go
            # through parameter binding.
            insert_sql = ("INSERT INTO `S2503_ACS` (%s, `track_pk_id`) "
                          "VALUES (%s)"
                          % (", ".join("`%s`" % c for c in columns),
                             ", ".join(["%s"] * (len(columns) + 1))))
            for tract, values in data.items():
                cursor.execute(get_track_sql, (tract,))
                track_pk = cursor.fetchone()
                if track_pk is None:
                    continue  # tract is outside our census_tract_2010 set
                try:
                    cursor.execute(insert_sql, values + [track_pk['pk_id']])
                except Exception as e:
                    # Best-effort load: report the failing tract, keep going.
                    print("S2503 insert failed for tract %s: %s" % (tract, e))
                self.acsdb.con.commit()

    def load_S2503_moe(self):
        # TODO: Should refactor this to fold MOE into data load...Needs new ddl as well
        """
        Same general method as load_S2503 but this loads the MOE for each
        variable in the dataset, updating the rows load_S2503 inserted into
        `S2503_ACS`.

        Bug fix: the provisioning check previously queried `S2501_ACS` (a
        copy/paste slip) and only tested for the presence of any row, a
        condition a plain load_S2503 run already satisfies.  It now checks
        `S2503_ACS` for a non-NULL MOE value.
        :return:
        """
        # Same column sequence as load_S2503, with the _MOE suffix; MOEs sit
        # in the even CSV columns r[4], r[6], ..., r[80].
        moe_columns = ['HC%02d_VC%02d_MOE' % (hc, vc)
                       for vc in [1] + list(range(3, 15))
                       for hc in (1, 2, 3)]
        data = {}
        with open("data/ACS_14_5YR_S2503.csv") as fh:
            reader = csv.reader(fh)
            for r_count, r in enumerate(reader, 1):
                if r_count < 3:  # csv file first two rows are headers, not data
                    continue
                # Same marker normalization as load_S2503; '***' before '**'.
                for token, repl in (('-', '0'), ('(X)', '0'),
                                    ('250,000+', '250000'),
                                    ('***', '0'), ('**', '0')):
                    r = [e.replace(token, repl) for e in r]
                data[r[1]] = [float(r[4 + 2 * i])
                              for i in range(len(moe_columns))]
        with self.acsdb.con.cursor() as cursor:
            # Already provisioned?  A non-NULL MOE means a prior run finished;
            # freshly inserted load_S2503 rows leave the MOE columns NULL.
            cursor.execute("SELECT `pk_id` FROM `S2503_ACS` "
                           "WHERE `HC01_VC01_MOE` IS NOT NULL LIMIT 1", ())
            if cursor.fetchone() is not None:
                print("S2503_ACS table already provisioned with MOEs. "
                      "Skipping...")
                return
            get_track_sql = ("SELECT `pk_id` FROM `census_tract_2010` "
                             "WHERE `track_id`=%s")
            # Column names come from our own literal list; values are bound.
            update_sql = ("UPDATE `S2503_ACS` SET "
                          + ", ".join("`%s`=%%s" % c for c in moe_columns)
                          + " WHERE pk_id=%s")
            for tract, moes in data.items():
                cursor.execute(get_track_sql, (tract,))
                track_pk = cursor.fetchone()
                if track_pk is None:
                    continue  # tract is outside our census_tract_2010 set
                cursor.execute(
                    "SELECT pk_id FROM S2503_ACS WHERE track_pk_id=%s",
                    (track_pk['pk_id'],))
                target = cursor.fetchone()
                if target is None:
                    continue  # no estimate row to attach the MOEs to
                try:
                    cursor.execute(update_sql, moes + [target['pk_id']])
                except Exception as e:
                    # Best-effort load: report the failing tract, keep going.
                    print("S2503 MOE update failed for tract %s: %s"
                          % (tract, e))
                self.acsdb.con.commit()

    def load_S2501(self):
        """
        Load ACS table S2501 from data/ACS_14_5YR_S2501.csv into the
        `S2501_ACS` table, one row per census tract, estimates and margins
        of error together.

        Each variable occupies a six-column group in the CSV: the HC01,
        HC02 and HC03 series, each immediately followed by its MOE.  Only a
        subset of the file's variables is loaded, so the group start columns
        are listed explicitly below.  The load is skipped when `S2501_ACS`
        already contains data.
        :return:
        """
        # (vc label, first CSV column of that variable's six-column group).
        groups = (('01', 3), ('03', 9), ('04', 15), ('05', 21), ('06', 27),
                  ('14', 51), ('15', 57), ('19', 81), ('39', 183))
        columns = []
        offsets = []
        for vc, base in groups:
            for hc in (1, 2, 3):
                columns.append('HC%02d_VC%s' % (hc, vc))
                columns.append('HC%02d_VC%s_MOE' % (hc, vc))
                offsets.append(base + 2 * (hc - 1))       # estimate column
                offsets.append(base + 2 * (hc - 1) + 1)   # its MOE column
        data = {}
        with open("data/ACS_14_5YR_S2501.csv") as fh:
            reader = csv.reader(fh)
            for r_count, r in enumerate(reader, 1):
                if r_count < 3:  # csv file first two rows are not data
                    continue
                # Normalize the census suppression/top-coding markers so every
                # cell parses as a float; '***' must be replaced before '**'.
                for token, repl in (('-', '0'), ('(X)', '0'),
                                    ('250,000+', '250000'),
                                    ('***', '0'), ('**', '0')):
                    r = [e.replace(token, repl) for e in r]
                # r[1] is the tract id.
                data[r[1]] = [float(r[i]) for i in offsets]
        with self.acsdb.con.cursor() as cursor:
            cursor.execute("SELECT * FROM S2501_ACS", ())
            if cursor.fetchone() is not None:
                # The check is for existing rows, not table existence.
                print("S2501_ACS table already populated. Skipping...")
                return
            get_track_sql = ("SELECT `pk_id` FROM `census_tract_2010` "
                             "WHERE `track_id`=%s")
            # Column names come from our own literal list above, so building
            # the statement by string formatting is safe; the values still go
            # through parameter binding.
            insert_sql = ("INSERT INTO `S2501_ACS` (%s, `track_pk_id`) "
                          "VALUES (%s)"
                          % (", ".join("`%s`" % c for c in columns),
                             ", ".join(["%s"] * (len(columns) + 1))))
            for tract, values in data.items():
                cursor.execute(get_track_sql, (tract,))
                track_pk = cursor.fetchone()
                if track_pk is None:
                    continue  # tract is outside our census_tract_2010 set
                try:
                    cursor.execute(insert_sql, values + [track_pk['pk_id']])
                except Exception as e:
                    # Best-effort load: report the failing tract, keep going.
                    print("S2501 insert failed for tract %s: %s" % (tract, e))
                self.acsdb.con.commit()