예제 #1
0
def extract_product(html_content, url):
    """Build a Product from a product page's HTML.

    Args:
        html_content: Markup handed to BeautifulSoup's "html.parser".
        url: Page URL. Stored on the product and, when no title element is
            present, used to derive a fallback title from its path.

    Returns:
        A 2-tuple ``(product, errs)`` when ``product.FormCompleted()`` is
        truthy, otherwise ``(False, errs)``. ``errs`` is a list of
        human-readable strings describing the fields that were not found.
    """
    errs = list()

    #Read page and read to extract product information
    parser = BeautifulSoup(html_content, "html.parser")

    #Check if the page is a product, if not skip page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    #New Product as a object
    product = Product()

    #Find URL
    product.SetUrl(url)

    #NOTE: Brand extraction is disabled in this variant (it used to read the
    #"productDetails_techSpec_section_1" table, then the "a#brand" link).

    #Find Title: prefer "productTitle", then "btAsinTitle", then the URL.
    title_tag = parser.find("span", attrs={"id": "productTitle"})
    if title_tag is None:
        title_tag = parser.find("span", attrs={"id": "btAsinTitle"})
    if title_tag is not None:
        product.SetTitle(title_tag.get_text().strip())
    elif url is not None:
        #str.strip() takes a *set of characters*, not a prefix, so the old
        #url.strip("https://www.amazon.com/") also ate leading/trailing
        #characters of the slug itself. Remove the literal prefix instead.
        prefix = "https://www.amazon.com/"
        slug = url[len(prefix):] if url.startswith(prefix) else url
        product.SetTitle(slug.split("/dp")[0])
        print("Title: ", product.title)
    else:
        errs.append("Could not find Title")

    #NOTE: Image extraction is disabled in this variant (it used to read the
    #"landingImage" element's data-old-hires / data-a-dynamic-image).

    #Find ASIN (second value returned by check_page)
    product.SetSourceID(asin)

    #Find price: sale price takes precedence over the regular price.
    price_tag = parser.find("span", attrs={"id": "priceblock_saleprice"})
    if price_tag is None:
        price_tag = parser.find("span", attrs={"id": "priceblock_ourprice"})
    if price_tag is not None:
        product.SetPrice(price_tag.get_text())
    else:
        errs.append("Could not find Price")

    #Find rating: first whitespace-separated token of the review text.
    rating_tag = parser.find("span", attrs={"id": "acrCustomerReviewText"})
    if rating_tag is not None:
        rating_tokens = rating_tag.get_text().split()
        #An empty element would make [0] raise IndexError; treat it as absent.
        if rating_tokens:
            product.SetRating(rating_tokens[0])

    #Hand the product back only when it reports itself complete
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
def extract_product(html_content, url):
    """Build a Product (brand, title, image, keyword blobs) from page HTML.

    Args:
        html_content: Markup handed to BeautifulSoup's "html.parser".
        url: Page URL, stored on the product unchanged.

    Returns:
        ``(product, errs)`` when ``product.FormCompleted()`` is truthy,
        otherwise ``(False, errs)``. A page rejected by ``check_page`` or a
        page without a "productTitle" element returns ``(False, errs)``
        immediately. ``errs`` is a list of strings naming what was missing.
    """
    errs = list()

    #Read page and read to extract product information
    parser = BeautifulSoup(html_content, "html.parser")

    #Check if the page is a product, if not skip page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    #New Product as a object
    product = Product()
    #New Keyword rank
    keyword = Rake(SmartStopList.words())

    #Find URL
    product.SetUrl(url)

    #Find Brand: Note: Some products have an image for the brand
    truth, brand = search_table(
        parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(brand)
    else:
        brand_link = parser.find("a", attrs={"id": "brand"})
        if brand_link is not None:
            product.SetBrand(brand_link.get_text().strip())
        else:
            errs.append("Could not find Brand")

    #Find Title (mandatory: give up on the page without one)
    title_tag = parser.find("span", attrs={"id": "productTitle"})
    if title_tag is None:
        errs.append("Could not find Title")
        return (False, errs)
    product.SetTitle(title_tag.get_text().strip())

    #Find Image
    image_url = None
    image_tag = parser.find("img", attrs={"id": "landingImage"})
    if image_tag is not None:
        image_url = image_tag.get("data-old-hires")
        #.get() returns None for a missing attribute; the previous
        #len(None) raised TypeError instead of trying the fallback.
        if image_url is None or len(image_url) < 2:
            dynamic_images = image_tag.get("data-a-dynamic-image")
            if dynamic_images:
                #Dot is escaped so only a literal ".jpg" suffix matches.
                m = re.search(r"https://(.+?)\.jpg", dynamic_images)
                if m:
                    image_url = "https://{}.jpg".format(m.group(1))
                else:
                    #Preserved from the original: with no .jpg URL inside,
                    #the raw attribute value is stored as-is.
                    image_url = dynamic_images
            else:
                image_url = None
    if image_url:
        product.SetImage(image_url)
    else:
        errs.append("Could not find Image")

    #Find Small Blob
    #TODO: Need to perform keyword analysis
    bullets_div = parser.find("div", attrs={"id": "feature-bullets"})
    bullet_list = bullets_div.find("ul") if bullets_div is not None else None
    if bullet_list is None:
        #Previously this message was attached to the try's `else` clause and
        #was therefore appended on *success*; a missing section was only
        #reported through a swallowed AttributeError.
        errs.append("Could not find small section keywords")
    else:
        try:
            pieces = []
            for item in bullet_list.find_all("li"):
                span = item.find("span")
                if span is not None:
                    pieces.append(span.get_text().strip())
            saved_buffer = " ".join(pieces).strip()
            #Calculating Key Words
            keywords_1 = keyword.run(saved_buffer)
            product.SetSmallBlog(keywords_1)
        except Exception:
            #Best effort, as before: a failure here must not abort the page.
            errs.append("Error finding li")

    #Find Large Blob
    #TODO: Need to perform keyword analysis
    description_div = parser.find("div", attrs={"id": "productDescription"})
    paragraph = None
    if description_div is not None:
        paragraph = description_div.find("p")
    if paragraph is not None:
        saved_buffer = paragraph.get_text().strip()
        #Calculating Key Words
        keywords_2 = keyword.run(saved_buffer)
        product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")

    #Find ASIN (second value returned by check_page)
    product.SetSourceID(asin)

    #TODO: Perform price save!

    #Hand the product back only when it reports itself complete
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)