import re

from bs4 import BeautifulSoup

# Project-local helpers (Product, check_page, search_table, Rake, SmartStopList) are
# assumed to be imported elsewhere in this module.


def extract_product(html_content, url):
    # String buffer reused for each lookup, plus a list of collected errors.
    string_buffer = ""
    errs = list()

    # Parse the page so the product information can be extracted.
    parser = BeautifulSoup(html_content, "html.parser")

    # Check whether the page is a product page; if not, skip it.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    # New Product object.
    product = Product()

    # Set URL.
    product.SetUrl(url)

    # Find Brand. Note: some products show an image instead of a brand name.
    #truth, string_buffer = search_table(parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    #if truth:
    #    product.SetBrand(string_buffer)
    #else:
    #    string_buffer = parser.find("a", attrs={"id": "brand"})
    #    if string_buffer != None:
    #        product.SetBrand(string_buffer.get_text().strip())
    #    else:
    #        errs.append("Could not find Brand")

    # Find Title, falling back to the secondary title span and then to the URL slug.
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    string_buffer_2 = parser.find("span", attrs={"id": "btAsinTitle"})
    if string_buffer is not None:
        product.SetTitle(string_buffer.get_text().strip())
    elif string_buffer_2 is not None:
        product.SetTitle(string_buffer_2.get_text().strip())
    elif url is not None:
        # str.strip() removes a set of characters, not a prefix, so drop the prefix explicitly.
        product.SetTitle(url.replace("https://www.amazon.com/", "").split("/dp")[0])
        print("Title: ", product.title)
    else:
        errs.append("Could not find Title")
        #return (False, errs)

    # Find Image.
    #string_buffer = parser.find("img", attrs={"id": "landingImage"})
    #if string_buffer != None:
    #    string_buffer = string_buffer.get("data-old-hires")
    #    if len(string_buffer) < 2:
    #        string_buffer = parser.find("img", attrs={"id": "landingImage"}).get("data-a-dynamic-image")
    #        m = re.search('https://(.+?).jpg', string_buffer)
    #        if m:
    #            string_buffer = m.group(1)
    #            string_buffer = "https://{}.jpg".format(string_buffer)
    #    #print("Img Url: " + string_buffer)
    #    product.SetImage(string_buffer)
    #else:
    #    errs.append("Could not find Image")

    # Set ASIN.
    product.SetSourceID(asin)
    #print("Product after setting ASIN: ", product)

    # Find price: prefer the sale price, then the regular price.
    string_buffer = parser.find("span", attrs={"id": "priceblock_saleprice"})
    string_buffer_2 = parser.find("span", attrs={"id": "priceblock_ourprice"})
    if string_buffer is not None:
        product.SetPrice(string_buffer.get_text())
    elif string_buffer_2 is not None:
        product.SetPrice(string_buffer_2.get_text())
    else:
        errs.append("Could not find Price")
        #return (False, errs)

    # Find rating (review-count element; keep the leading number).
    string_buffer = parser.find("span", attrs={"id": "acrCustomerReviewText"})
    if string_buffer is not None:
        product.SetRating(string_buffer.get_text().split()[0])
    #print("Product after setting rating: ", product)

    # Return the product only if all required fields were filled in.
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
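# For illustration only: check_page() and search_table() are project helpers defined
# elsewhere and their implementations are not shown in this section. The hypothetical
# sketch below shows one way a search_table-style lookup could satisfy the
# (found, text) contract used by the brand lookup above and in the variant that
# follows; it is an assumption, not the project's actual helper.
def _search_table_sketch(parser, attrs, label):
    # Locate the spec table, e.g. {"id": "productDetails_techSpec_section_1"}.
    table = parser.find("table", attrs=attrs)
    if table is None:
        return (False, "")
    # Return the value cell of the first row whose header contains the label.
    for row in table.find_all("tr"):
        header = row.find("th")
        value = row.find("td")
        if header is not None and value is not None and label in header.get_text():
            return (True, value.get_text().strip())
    return (False, "")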
def extract_product(html_content, url):
    # String buffer reused for each lookup, plus a list of collected errors.
    string_buffer = ""
    errs = list()

    # Parse the page so the product information can be extracted.
    parser = BeautifulSoup(html_content, "html.parser")

    # Check whether the page is a product page; if not, skip it.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    # New Product object.
    product = Product()

    # New keyword ranker.
    keyword = Rake(SmartStopList.words())

    # Set URL.
    product.SetUrl(url)

    # Find Brand. Note: some products show an image instead of a brand name.
    truth, string_buffer = search_table(
        parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(string_buffer)
    else:
        string_buffer = parser.find("a", attrs={"id": "brand"})
        if string_buffer is not None:
            product.SetBrand(string_buffer.get_text().strip())
        else:
            errs.append("Could not find Brand")

    # Find Title; without a title the record is unusable, so bail out early.
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    if string_buffer is not None:
        product.SetTitle(string_buffer.get_text().strip())
    else:
        errs.append("Could not find Title")
        return (False, errs)

    # Find Image: prefer the hi-res URL, fall back to the dynamic-image attribute.
    string_buffer = parser.find("img", attrs={"id": "landingImage"})
    if string_buffer is not None:
        # Default to "" so a missing attribute does not crash the length check.
        string_buffer = string_buffer.get("data-old-hires", "")
        if len(string_buffer) < 2:
            string_buffer = parser.find("img", attrs={
                "id": "landingImage"
            }).get("data-a-dynamic-image", "")
            m = re.search(r'https://(.+?)\.jpg', string_buffer)
            if m:
                string_buffer = m.group(1)
                string_buffer = "https://{}.jpg".format(string_buffer)
        #print("Img Url: " + string_buffer)
        product.SetImage(string_buffer)
    else:
        errs.append("Could not find Image")

    # Find small blob (feature bullets).
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
    if string_buffer is not None:
        string_buffer = string_buffer.find("ul")
        try:
            string_buffer = string_buffer.find_all("li")
            if string_buffer is not None:
                string_buffer_2 = ""
                for span in string_buffer:
                    string_buffer_3 = span.find("span")
                    if string_buffer_3 is not None:
                        string_buffer_3 = string_buffer_3.get_text()
                        try:
                            string_buffer_2 = "{} {}".format(
                                string_buffer_2, string_buffer_3.strip())
                        except Exception:
                            pass
                saved_buffer = string_buffer_2.strip()
                # Calculate keywords for the bullet text.
                keywords_1 = keyword.run(saved_buffer)
                product.SetSmallBlog(keywords_1)
        except Exception:
            errs.append("Error finding li")
    else:
        errs.append("Could not find small section keywords")

    # Find large blob (product description).
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "productDescription"})
    if string_buffer is not None:
        string_buffer = string_buffer.find("p")
        if string_buffer is not None:
            string_buffer = string_buffer.get_text()
            saved_buffer = string_buffer.strip()
            # Calculate keywords for the description text.
            keywords_2 = keyword.run(saved_buffer)
            product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")

    # Set ASIN.
    product.SetSourceID(asin)

    #TODO: Perform price save!

    # Return the product only if all required fields were filled in.
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
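# For illustration only: a minimal driver showing how extract_product() might be
# invoked on fetched page HTML. The requests dependency, the User-Agent header and
# the example URL/ASIN are assumptions, not part of this module.
if __name__ == "__main__":
    import requests

    example_url = "https://www.amazon.com/dp/B000000000"  # hypothetical ASIN
    response = requests.get(example_url, headers={"User-Agent": "Mozilla/5.0"})
    product, errs = extract_product(response.text, example_url)
    if product:
        print("Extracted product:", product)
    else:
        print("Extraction failed:", errs)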