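# The constructor below reads config.yml from the module's directory. An
# illustrative shape, inferred from the keys accessed in __init__ (the values
# here are placeholders, not the project's real configuration):
#
#   site_url: "http://www.walmart.com/search/?query=toys"
#   page_url: "&page="
#   depth_limit: 25
#   toys:
#     - "&cat_id=4171"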
import csv
import logging
import os
import re

import yaml
from bs4 import BeautifulSoup
from selenium import webdriver

# Project helpers referenced below (init_logging, build_search_url, get_base_url,
# strip_final_slash, imitate_user, AZ) and the PhantomJS capabilities dict `dcap`
# are assumed to be defined or imported elsewhere in the original module.


class WalmartScraper(object):

    def __init__(self):
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Job started and logging enabled")
        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yml"), "r") as fh:
            settings = yaml.safe_load(fh)
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                          service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
        self.driver.set_window_size(1024, 768)
        self.shipping_rate = 0.75  # $rate/lb  TODO: shift this to AZ class
        self.outfile = "../data/test.csv"
        self.fieldnames = ('net', 'roi', 'name', 'price', 'az_price', 'weight',
                           'az_sales_rank', 'az_match', 'url', 'img', 'az_url', 'az_asin')
        self.url_cats = settings['toys']
        self.site_url = settings['site_url']
        self.page_url = settings['page_url']
        self.base_url = strip_final_slash(get_base_url(self.site_url))
        self.az = AZ()
        self.depth_limit = settings['depth_limit']

    def destroy(self):
        """Destroy all objects and clean up."""
        # self.driver.service.process.send_signal(signal.SIGTERM)
        self.logger.info("Walmart object cleanly destroyed...")
        self.driver.quit()

    def scrape(self, pc=None, change_url=None):
        """
        :param pc: integer indicating where to start with a paginated URL.
        :param change_url: the changing part of the wider site URL, if there are
            multiple sections to hit.
        """
        self.run = True
        # Initialization of a site/section; keep any carried-over page counter,
        # otherwise start from zero.
        self.pc = pc if pc is not None else getattr(self, 'pc', 0)
        while self.run:
            url = self.next_page_url(build_search_url(self.site_url, change_url))
            try:
                page = self.get_page(url)
            except Exception as e:
                self.logger.error("Error with %s and skipped (%s)" % (url, e))
                continue
            self.get_list(page)
        if change_url is None:
            self.logger.info("Site %s finished" % self.site_url)
        else:
            self.logger.info("Section %s finished" % change_url)

    def init_output(self):
        if not os.path.exists(self.outfile):
            with open(self.outfile, "w", encoding='utf-8', newline='') as fh:
                outwriter = csv.DictWriter(fh, fieldnames=self.fieldnames, delimiter="\t")
                outwriter.writeheader()

    def process_output(self, data):
        with open(self.outfile, 'a', encoding='utf-8', newline='') as fh:
            outwriter = csv.DictWriter(fh, fieldnames=self.fieldnames, delimiter="\t")
            outwriter.writerow(data)

    def get_dollar_amount(self, f):
        # Strings such as "$1,234.99" are normalized to a float; already-numeric
        # values are passed through unchanged.
        if isinstance(f, str):
            f = f.replace(",", "", 1)
            return round(float(re.match(r'\$?(\d+[.]\d\d)', f.strip()).group(1)), 2)
        else:
            return f

    def get_net(self, data):
        az_price = data['az_price']
        if az_price == 0.0:
            return 0.0
        price = self.get_dollar_amount(data['price'])
        if data['weight'] == "Weight not fetched" or data['weight'] == "Not Available":
            weight = 0.0
        else:
            weight = float(data['weight'])
        try:
            # Net = Amazon price less the purchase price marked up 8%, a 30% cut
            # of the Amazon price, and weight-based shipping.
            net = az_price - (price * 1.08 + az_price * 0.3 + weight * self.shipping_rate)
        except Exception:
            net = 0.0
        try:
            net = round(net, 2)
        except Exception:
            self.logger.error("Bad net value for %s - price:%s, az_price:%s, weight:%s"
                              % (data['name'], data['price'], data['az_price'], data['weight']))
            net = 0.0
        return net

    def get_roi(self, data):
        net = self.get_dollar_amount(data['net'])
        price = self.get_dollar_amount(data['price'])
        if not price:
            return 0.0  # avoid dividing by a zero or unparsed price
        return round(net / price, 2)
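    # Worked example of the arithmetic in get_net/get_roi above, using
    # illustrative numbers only (not values from any real listing):
    #   price = 10.00, az_price = 19.99, weight = 1.5 lb
    #   net = 19.99 - (10.00 * 1.08 + 19.99 * 0.30 + 1.5 * 0.75)
    #       = 19.99 - 17.92 = 2.07
    #   roi = 2.07 / 10.00 = 0.21, i.e. roughly a 21% return on the purchase price.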
    def get_list(self, page):
        """
        Takes a search results page from Walmart and parses out the items to save.
        Has error checking for no search results or an empty set.

        :param page: bs4 object returned from get_page
        :return:
        """
        if page.find(string=re.compile(r'We found 0 results')):
            self.run = False
            return
        elif not page.find("ul", {"class": "tile-list-grid"}):
            self.run = False
            return
        else:
            entries = page.find("ul", {"class": "tile-list-grid"})
            for e in entries:
                if len(e) == 1:
                    continue
                elif e.name == "script":
                    continue
                else:
                    imitate_user(0.05)
                    entry = {}
                    try:
                        entry['name'] = e.find("a", {"class": "js-product-title"}).get_text().strip()
                    except Exception:
                        continue
                    if 'http://' in e.find("a", {"class": "js-product-title"}).attrs['href']:
                        entry['url'] = e.find("a", {"class": "js-product-title"}).attrs['href']
                    else:
                        entry['url'] = "".join((self.base_url,
                                                e.find("a", {"class": "js-product-title"}).attrs['href']))
                    try:
                        entry['price'] = e.find("span", {"class": "price-display"}).get_text()
                    except Exception:
                        continue
                    entry['img'] = e.find("img", {"class": "product-image"}).attrs['data-default-image']
                    (entry['az_price'], entry['weight'], entry['az_sales_rank'], entry['az_match'],
                     entry['az_url'], entry['az_asin']) = self.az.find_best_match(entry['name'], 'Toys')
                    entry['net'] = self.get_net(entry)
                    entry['roi'] = self.get_roi(entry)
                    self.process_output(entry)

    def next_page_url(self, url):
        self.pc += 1
        imitate_user(0.5)
        next_url = url
        if self.page_url:
            next_url += self.page_url
        next_url += str(self.pc)
        if self.pc >= self.depth_limit:
            self.run = False  # pagination depth limit reached
        return next_url

    def get_page(self, url):
        try:
            self.logger.info("Getting %s" % url)
            self.driver.get(url)
            # self.driver.get_cookies()
        except ValueError:
            # Pause briefly and retry once.
            imitate_user(2)
            self.driver.get(url)
        except Exception as e:
            self.logger.error("Error getting %s: %s", url, e)
        # try:
        #     wait = WebDriverWait(self.driver, 3)
        #     wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div")))
        # except Exception as e:
        #     self.logger.error("WebDriverWait error")
        page = BeautifulSoup(self.driver.page_source, "lxml")
        return page
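

# A minimal usage sketch, not part of the original module: it assumes the keys
# read in __init__ ('toys', 'site_url', 'page_url', 'depth_limit') exist in
# config.yml and that 'toys' holds an iterable of section URL fragments that
# can be passed to scrape() as change_url.
if __name__ == "__main__":
    scraper = WalmartScraper()
    scraper.init_output()                      # create the TSV with headers if missing
    try:
        for section in scraper.url_cats:       # one scrape pass per configured section
            scraper.scrape(pc=0, change_url=section)
    finally:
        scraper.destroy()                      # always release the PhantomJS driver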