def __init__(self, review, stars, save_data=True):
    """Store one review (newlines removed) and its star rating.

    When save_data is true, a DataSaver is attached and the object is
    persisted immediately through save_object().
    """
    super().__init__()
    # Collapse the review text onto a single line.
    flat = review.replace("\n", "")
    self.review = flat.replace("\r", "")
    self.stars = stars
    if not save_data:
        return
    self.data_saver = DataSaver()
    self.save_object()
class ProductPage(object):
    """One page of product listings; can scrape the Product entries on it."""

    def __init__(self, name, url_base, href, save_data=True):
        self.name = name
        self.url_base = url_base
        self.href = href
        if save_data:
            # Persist the page record as soon as it is created.
            self.data_saver = DataSaver()
            self.save_object()
        self.elements = []

    def search_by(self):
        # Placeholder; intentionally does nothing.
        return

    def save_object(self):
        """Append (name, url) of this page to the persistent store."""
        self.data_saver.product_page_append(self.name, self.get_url())

    def get_url(self):
        """Full URL of this listing page."""
        return self.url_base + self.href

    def get_products(self):
        """Scrape every product link on this page into Product objects.

        Side effect: stores the result in self.elements as well.
        """
        scraper = FirefoxScrapping()
        scraper.open_url(self.get_url())
        soup = BeautifulSoup(scraper.get_html_content(), 'html.parser')
        anchors = soup.find_all("a", class_="a-link-normal a-text-normal", href=True)
        found = []
        for anchor in anchors:
            link = anchor["href"]
            spans = anchor.find_all("span")
            # First span (when present) carries the product title.
            title = spans[0].text if spans else ""
            found.append(Product(title, self.url_base[:-1], link))
        self.elements = found
        return found
def __init__(self, name, url_base, href, save_data=True):
    """Record a listing-page location; optionally persist it on creation."""
    self.name, self.url_base, self.href = name, url_base, href
    if save_data:
        # Write the record out right away.
        self.data_saver = DataSaver()
        self.save_object()
    self.elements = []
class ProductCategory(object):
    """A sub-category of an Amazon Category; resolves its product page(s)."""

    def __init__(self, name, href, url_base):
        self.name = name
        print("\t" * 2 + self.name)
        self.href = href
        self.url_base = url_base
        self.data_saver = DataSaver()
        self.save_object()
        self.elements = []
        try:
            self.set_products()
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except:
            # Best-effort scraping: report and carry on.
            print(f"Hubo un problema con {self.name}")
        self.data_saver.save_product_page()

    def save_object(self):
        """Append (name, url) of this category to the persistent store."""
        self.data_saver.product_category_append(self.name, self.get_url())

    def get_url(self):
        """Full URL of this sub-category."""
        return self.url_base + self.href

    def set_product_page(self, all_products):
        """Open the all-products page and register it as a ProductPage."""
        driver = self.browser
        target = self.url_base + all_products
        driver.open_url(target)
        soup = BeautifulSoup(driver.get_html_content(), 'html.parser')
        sidebar = soup.find_all("div", id="s-refinements")[0]
        # NOTE(review): this lookup's result is unused, but its IndexError
        # (when the element is missing) aborts the method — kept on purpose.
        department = sidebar.find_all(
            "li", class_="a-spacing-micro s-navigation-indent-1")[0]
        title = soup.find_all(
            "span", class_="a-size-base a-color-base a-text-bold")[1].text
        self.elements.append(ProductPage(title, self.url_base, all_products))
        return

    def set_products(self):
        """Find the footer's 'all results' link and register its page."""
        start_url = self.get_url()
        driver = FirefoxScrapping()
        self.browser = driver
        driver.open_url(start_url)
        soup = BeautifulSoup(driver.get_html_content(), 'html.parser')
        footers = soup.find_all(
            "div", class_="a-box a-text-center apb-browse-searchresults-footer")
        if not footers:
            # No footer: the current page already holds every product.
            self.set_product_page(self.href)
            return
        footer = footers[0]
        first_anchor = footer.find_all('a', href=True)[0]
        self.set_product_page(first_anchor["href"])

    def get_elements(self):
        return self.elements
class Category(object):
    """
    This class is for better control when searching for products on Amazon.
    Searches all categories (sub departments) on amazon.com.mx and creates
    ProductCategory objects for each one.
    """

    def __init__(self, name, html_code, href, url_base):
        self.name = name
        print("\t" + self.name)
        self.html_code = html_code
        self.href = href
        self.url_base = url_base
        self.data_saver = DataSaver()
        self.save_object()
        self.elements = []
        try:
            self.set_product_category()
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except:
            # Best-effort scraping: report and carry on.
            print(f"Hubo un problema con {self.name}")
        self.data_saver.save_product_category()

    def save_object(self):
        """Append (name, url) of this category to the persistent store."""
        self.data_saver.category_append(self.name, self.get_url())

    def get_url(self):
        # href already starts with '/', so drop the trailing '/' of the base.
        return self.url_base[:-1] + self.href

    def get_product_section(self, sections, section_name="Departamento"):
        """Return the sidebar section whose header matches section_name, or None."""
        for candidate in sections:
            header = candidate.find_all(
                "div", class_="a-section a-spacing-small")[0]
            labels = header.find_all("span")
            if labels and labels[0].text == section_name:
                return candidate
        return None

    def set_product_category(self):
        """Scrape the sidebar department list into ProductCategory objects."""
        driver = FirefoxScrapping()
        driver.open_url(self.get_url())
        soup = BeautifulSoup(driver.get_html_content(), 'html.parser')
        sidebar = soup.find_all("div", id="s-refinements")[0]
        sections = sidebar.find_all("div", class_="a-section a-spacing-none")
        # Skip the first item: it is the section header, not a category.
        entries = self.get_product_section(sections).find_all(
            "span", class_="a-list-item")[1:]
        for entry in entries:
            links = entry.find_all('a', href=True)
            if links:
                link = links[0]["href"]
                label = entry.find_all('span')[0].text
                self.elements.append(ProductCategory(label, link, self.url_base))

    def get_elements(self):
        return self.elements
class Review(object):
    """A single customer review: one-line text plus a star rating."""

    def __init__(self, review, stars, save_data=True):
        super().__init__()
        # Collapse the review text onto a single line.
        flat = review.replace("\n", "")
        self.review = flat.replace("\r", "")
        self.stars = stars
        if not save_data:
            return
        self.data_saver = DataSaver()
        self.save_object()

    def save_object(self):
        """Append (review, stars) to the persistent store."""
        self.data_saver.review_append(self.review, self.stars)
def __init__(self, name, url_base, href, save_data=True):
    """Record a product location, scrape its reviews, optionally persist it.

    Any scraping failure (other than Ctrl-C) is swallowed on purpose:
    the product is simply kept without reviews.
    """
    super().__init__()
    self.name, self.url_base, self.href = name, url_base, href
    try:
        self.reviews = self.get_review_s()
        self.create_reviews()
        if save_data:
            self.data_saver = DataSaver()
            self.save_object()
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except:
        # Deliberate best-effort: ignore scraping errors.
        pass
def __init__(self, name, html_code, url_base):
    """Record a department, persist it, then scrape its categories."""
    self.name = name
    print(self.name)
    self.html_code = html_code
    self.url_base = url_base
    self.data_saver = DataSaver()
    self.save_object()
    self.elements = []
    try:
        self.set_elements()
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except:
        # Best-effort scraping: report and carry on.
        print(f"Hubo un problema con {self.name}")
    self.data_saver.save_category()
def __init__(self, name, href, url_base):
    """Record a product category, persist it, then scrape its products."""
    self.name = name
    print("\t" * 2 + self.name)
    self.href = href
    self.url_base = url_base
    self.data_saver = DataSaver()
    self.save_object()
    self.elements = []
    try:
        self.set_products()
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except:
        # Best-effort scraping: report and carry on.
        print(f"Hubo un problema con {self.name}")
    self.data_saver.save_product_page()
def get_all_categories(self, not_included=(
        "Amazon Prime Video",
        "Amazon Music",
        "Echo y Alexa",
        "Amazon Fire TV",
        "E-readers y eBooks Kindle",
)):
    """Scrape the Amazon site directory and build one Department per group.

    Parameters:
        not_included: iterable of group titles to skip (defaults to the
            non-shopping Amazon services). Fix: the default was a mutable
            list, a classic shared-default pitfall; an immutable tuple is
            backward compatible since only membership (`in`) is used.

    Returns:
        list[Department]: all scraped departments; also stored on
        self.amazon_tree. Side effect: triggers DataSaver.save_department().
    """
    url = self.url_base + "gp/site-directory?ref_=nav_em__allcategories_0_1_1_30"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find_all(id="shopAllLinks")
    categories = table[0].find_all("div", class_="popover-grouping")
    all_categories = []
    for categ in categories:
        # The <h2> of each grouping is the department title.
        topic = categ.find_all("h2")[0]
        if topic.text not in not_included:
            all_categories.append(
                Department(topic.text, categ, self.url_base))
    self.amazon_tree = all_categories
    data_saver = DataSaver()
    data_saver.save_department()
    return all_categories
class Department(object):
    """
    This class is for better control when searching for products on Amazon.
    Searches all departments on amazon.com.mx and creates Category
    (sub department) objects for each one.
    """

    def __init__(self, name, html_code, url_base):
        self.name = name
        print(self.name)
        self.html_code = html_code
        self.url_base = url_base
        self.data_saver = DataSaver()
        self.save_object()
        self.elements = []
        try:
            self.set_elements()
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except:
            # Best-effort scraping: report and carry on.
            print(f"Hubo un problema con {self.name}")
        self.data_saver.save_category()

    def save_object(self):
        """Append (name, base url) of this department to the persistent store."""
        self.data_saver.department_append(self.name, self.url_base)

    def set_elements(self):
        """Build a Category from every <li> inside this department's HTML."""
        markup = self.html_code
        for item in markup.find_all("li"):
            link = item.find_all('a', href=True)[0]["href"]
            self.elements.append(
                Category(item.text, item, link, self.url_base))

    def get_elements(self):
        return self.elements

    def append_element(self, element):
        self.elements.append(element)

    def __str__(self):
        return self.name
class Product(object):
    """A single Amazon product; scrapes its customer reviews via selenium."""

    def __init__(self, name, url_base, href, save_data=True):
        super().__init__()
        self.name = name
        self.url_base = url_base
        self.href = href
        try:
            # Scrape reviews first; only persist if everything succeeded.
            self.reviews = self.get_review_s()
            self.create_reviews()
            if save_data:
                self.data_saver = DataSaver()
                self.save_object()
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except:
            # Deliberate best-effort: any scraping failure leaves the
            # product without reviews instead of crashing the crawl.
            pass

    def get_url(self):
        """Full URL of the product page."""
        return self.url_base + self.href

    def save_object(self):
        """Append (name, url) of this product to the persistent store."""
        self.data_saver.product_append(self.name, self.url_base + self.href)

    def create_reviews(self):
        """Wrap each scraped (text, stars) pair in a Review object."""
        self.elements = []
        for review in self.reviews:
            self.elements.append(Review(review[0], review[1]))

    def get_review_s(self):
        """Scrape up to ~12 reviews for this product with selenium.

        Returns a list of (text, stars) tuples. Stars is an int, except the
        Spanish-page fallback which appends the string "-1" when the star
        element is missing — NOTE(review): mixed int/str sentinel, callers
        must tolerate both.

        NOTE(review): the `driver.close()` calls sit after `return` and are
        unreachable, so the browser window leaks — confirm and fix upstream.
        The absolute XPaths are tied to a specific amazon.com.mx layout.
        """
        from selenium import webdriver
        from selenium.webdriver.common.keys import Keys
        import time
        my_list2 = []
        driver = FirefoxScrapping().driver
        driver.get(self.get_url())
        time.sleep(2)
        try:
            # Expand the global reviews section if the link exists.
            driver.find_element_by_xpath('//*[@id="reviews-medley-global-expand-head"]/div[2]/div/span/a').click()
        except:
            # Spanish-only page or no comments at all.
            pass
        try:
            # Click "see all reviews" (Spanish-layout page).
            driver.find_element_by_xpath('//*[@id="reviews-medley-footer"]/div[2]/a').click()
            time.sleep(2)
            for i in range(1,13):
                try:
                    temp=driver.find_element_by_xpath(f'/html/body/div[1]/div[3]/div[1]/div[1]/div/div[1]/div[5]/div[3]/div/div[{i}]/div/div/div[4]/span/span').text
                    try:
                        # textContent looks like "4.0 de 5 estrellas".
                        stars=driver.find_element_by_xpath(f'/html/body/div[1]/div[3]/div[1]/div[1]/div/div[1]/div[5]/div[3]/div/div[{i}]/div/div/div[2]/a[1]/i').get_attribute("textContent")
                        stars2=int(float(stars.split()[0]))
                        my_list2.append((temp,stars2))
                    except:
                        # Star element missing: keep the text with "-1" sentinel.
                        my_list2.append((temp,"-1"))
                    print("elemento en español")
                    if temp=='':
                        # Empty text means the review used the alternate
                        # span[2] layout; re-read with that XPath variant.
                        my_list2.pop()
                        print("elemento vacio en lista de reviews")
                        temp=driver.find_element_by_xpath(f'/html/body/div[1]/div[3]/div[1]/div[1]/div/div[1]/div[5]/div[3]/div/div[{i}]/div/div/div[4]/span/span[2]').text
                        stars=driver.find_element_by_xpath(f'/html/body/div[1]/div[3]/div[1]/div[1]/div/div[1]/div[5]/div[3]/div/div[{i}]/div/div/div[2]/i').get_attribute("textContent")
                        stars2=int(float(stars.split()[0]))
                        my_list2.append((temp,stars2))
                except:
                    # Review slot i missing or layout changed; skip it.
                    print("Excepción fin")
            return my_list2
            driver.close()  # unreachable — see docstring note
        except NoSuchElementException as exception:
            try:
                # English-layout page: different footer link and slot range.
                print("\nEsta en ingles!!\n")
                button = driver.find_element_by_xpath('//*[@id="cr-pagination-footer-0"]/a')
                driver.execute_script("arguments[0].click();", button)
                time.sleep(2)
                for i in range(4,16):
                    try:
                        temp=driver.find_element_by_xpath(f'/html/body/div[1]/div[3]/div[1]/div[1]/div/div[1]/div[5]/div[3]/div/div[{i}]/div/div/div[4]/span/span[2]').text
                        stars=driver.find_element_by_xpath(f'/html/body/div[1]/div[3]/div[1]/div[1]/div/div[1]/div[5]/div[3]/div/div[{i}]/div/div/div[2]/i').get_attribute("textContent")
                        stars2=int(float(stars.split()[0]))
                        my_list2.append((temp,stars2))
                        if temp=='':
                            # Empty review text: discard the entry.
                            my_list2.pop()
                    except:
                        pass
                return my_list2
                driver.close()  # unreachable — see docstring note
            except NoSuchElementException as exception:
                # No reviews in either layout.
                print("NO REVIEW xD")
                return []