# I found that there is a page with 10 reviews about this book: # http://www.amazon.fr/product-reviews/2266219154/ # So we want to parse the book id from the first link and mine its reviews page: id = a.split("/")[-2] reviews = "http://www.amazon.fr/product-reviews/" + id + "/" print reviews # We can use Chrome's Developer Tools to inspect the HTML of the review page. # It turns out the reviews are contained in a <table id="productReviews"> element. # This table has one row and two columns. # Each <div> in the first column is a review. # If the table is absent, it means there are no reviews for this book. reviews = URL(reviews).download(cached=True, throttle=20) # throttle = delay between crawls reviews = DOM(reviews).by_id("productReviews") if reviews is not None: for review in reviews.by_tag("div"): # We use a try-except statement to brute-force it: # The <div>'s in the table do not have a class to search for, # and there may be other <div>'s in-between, which end up in the except-block. try: # The star rating is <span class="swSprite s_star_5_0 " title="5.0 etoiles sur 5">. score = review.by_class("swSprite")[0] score = score.attributes["title"] score = score.split(" ")[0] score = float(score) # The review is contained as plain text in the <div>. text = "" for child in review.children: if child.type == "text": text += child.source + " "
# http://www.amazon.fr/product-reviews/2266219154/ # So we want to parse the book id from the first link and mine its reviews page: id = a.split("/")[-2] reviews = "http://www.amazon.fr/product-reviews/" + id + "/" print reviews # We can use Chrome's Developer Tools to inspect the HTML of the review page. # It turns out the reviews are contained in a <table id="productReviews"> element. # This table has one row and two columns. # Each <div> in the first column is a review. # If the table is absent, it means there are no reviews for this book. reviews = URL(reviews).download( cached=True, throttle=20) # throttle = delay between crawls reviews = DOM(reviews).by_id("productReviews") if reviews is not None: for review in reviews.by_tag("div"): # We use a try-except statement to brute-force it: # The <div>'s in the table do not have a class to search for, # and there may be other <div>'s in-between, which end up in the except-block. try: # The star rating is <span class="swSprite s_star_5_0 " title="5.0 etoiles sur 5">. score = review.by_class("swSprite")[0] score = score.attributes["title"] score = score.split(" ")[0] score = float(score) # The review is contained as plain text in the <div>. text = "" for child in review.children: if child.type == "text": text += child.source + " "