    def process_reviews(self, rs, item_id, id_db, id_list):
        """
        Inputs: an Amazon Reviews object, the item id, the review ids already
        stored in the db, and the review ids scraped so far in this run.
        Output: the number of new reviews processed, plus parallel lists of
        review sentences, review ids, ratings, and per-review sentence counts.
        """

        count = 0
        contents = []
        review_ids = []
        ratings = []
        review_sentence_num = []
        check_db = len(id_db) > 0  # only check against the db when ids were provided

        for r in rs.full_reviews():
            try:
                if self.debug:
                    logging.debug(
                        "{} | {} | {}".format(
                            r.id, r.date, self._encode_safe(r.text)))
                if r.text == "None":
                    continue
                if r.id in id_list:
                    print "Review already in current scraped list, skipping"
                    continue
                if check_db and r.id in id_db:
                    print "Scraped review skipped: already in db"
                    continue
                count += 1
                id_list.append(r.id)
                sentence_list = getSentencesFromReview(self._encode_safe(r.text))
                if sentence_list:
                    print "First sentence: " + sentence_list[0]
                contents.extend(sentence_list)
                review_sentence_num.append(len(sentence_list))
                review_ids.append(r.id)
                # the API returns ratings normalized to [0, 1]; scale back to 5 stars
                ratings.append(float(r.rating) * 5)
            except Exception:
                logging.warn('Encoding problem with review {}'.format(r.id))

        return count, contents, review_ids, ratings, review_sentence_num
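
    @staticmethod
    def _ending_sentence_offsets(review_sentence_num):
        # Hypothetical helper, not part of the original pipeline: a minimal
        # sketch of how the per-review sentence counts returned above can be
        # folded into the cumulative "review_ending_sentence" offsets that the
        # db upsert functions below expect, e.g. [3, 2, 4] -> [3, 5, 9].
        offsets = []
        total = 0
        for num in review_sentence_num:
            total += num
            offsets.append(total)
        return offsets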
    def process_reviews_simple(self, rs):
        """
        Input: an Amazon Reviews object.
        Output: the number of reviews seen and a flat list of their sentences.
        """

        count = 0
        contents = []
        for r in rs.full_reviews():
            count += 1
            try:
                if self.debug:
                    logging.debug(
                        "{} | {} | {}".format(
                            r.id, r.date, self._encode_safe(r.text)))
                if r.text != "None":
                    sentence_list = getSentencesFromReview(self._encode_safe(r.text))
                    if sentence_list:
                        print "First sentence: " + sentence_list[0]
                    contents.extend(sentence_list)
            except Exception:
                logging.warn('Encoding problem with review {}'.format(r.id))
        
        return count, contents
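
# Hypothetical usage sketch (the scraper instance and Reviews object are
# assumed, not defined here): the lists returned by process_reviews line up
# index-by-index and feed the MongoDB upsert helpers below.
#
#     count, contents, review_ids, ratings, sentence_nums = \
#         scraper.process_reviews(reviews, item_id, id_db, id_list)
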
def upsert_all_reviews_bulk(review_file_path, meta_dict):
	"""Based on the fact that a product's review is consecutive, this function bulk upsert all the reviews for one product"""
	reviewParser = parse(review_file_path)
	client, db = connect_to_db()
	db_product_collection = db.product_collection
	db_product_collection.create_index([("product_id", ASCENDING)])

	i = 0
	print "building product_collection in database"
	product_id = None  # sentinel so the first review always starts a new product
	for review in reviewParser:
		i += 1
		if i % 1000 == 0:
			print i
		# fields of the incoming review:
		product_id_new = review['asin']
		contents_new = getSentencesFromReview(review['reviewText'])
		num_sentence = len(contents_new)
		review_id_new = review['reviewerID']
		rating_new = review['overall']

		# Same product as the previous review: extend the accumulated fields
		if product_id_new == product_id:
			contents = contents + contents_new
			review_ids.append(review_id_new)
			ratings.append(rating_new)
			review_ending_sentence.append(num_sentence + review_ending_sentence[-1])
			num_reviews += 1

		# New product: upsert the previous product, then re-initialize the accumulators
		else:
			if i > 1:
				upsert_new_product(db_product_collection, product_id, product_name, category, contents, review_ids, ratings, review_ending_sentence, num_reviews, ft_senIdx, ft_score)
			product_id = product_id_new
			product_name = []
			category = []
			if product_id in meta_dict:
				product = meta_dict[product_id]			
				if 'product_name' in product:
					product_name = product['product_name']
				if 'category' in product:
					category = product['category']		
			contents = contents_new
			review_ids = [review_id_new]
			ratings = [rating_new]
			review_ending_sentence = [num_sentence]
			num_reviews = 1
			ft_score = {}
			ft_senIdx = {}

	# flush the last product, which the loop above never upserts
	if i > 0:
		upsert_new_product(db_product_collection, product_id, product_name, category, contents, review_ids, ratings, review_ending_sentence, num_reviews, ft_senIdx, ft_score)

	client.close()
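
# Hypothetical usage sketch: parse() is assumed to yield standard Amazon review
# dicts ({"asin": ..., "reviewText": ..., "reviewerID": ..., "overall": ...}),
# and meta_dict maps an asin to {"product_name": ..., "category": ...}; the
# file name below is illustrative.
#
#     upsert_all_reviews_bulk("reviews_Electronics.json.gz", meta_dict)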
def upsert_review_for_product_id(review, db_product_collection, meta_dict):
	"""For each review, if it belongs to the category indicated by "category_name", add it to the product_collection in db"""
	product_id = review['asin']
	query_res = list(db_product_collection.find({"product_id": product_id}))

	contents_new = getSentencesFromReview(review['reviewText'])
	num_sentence = len(contents_new)
	review_id_new = review['reviewerID']
	rating_new = review['overall']

	isfound = 0
	if product_id in meta_dict:
		product = meta_dict[product_id]
		product_name = product.get('product_name', [])
		category = product.get('category', [])
		if len(category) > 0:
			isfound = 1
			# product already in db: extend its stored fields
			if len(query_res) > 0:
				contents = query_res[0]["contents"] + contents_new
				review_ids = query_res[0]["review_ids"]
				ratings = query_res[0]["ratings"]
				review_ids.append(review_id_new)
				ratings.append(rating_new)
				review_ending_sentence_list = query_res[0]["review_ending_sentence"]
				review_ending_sentence_list.append(num_sentence + review_ending_sentence_list[-1])
				num_reviews = query_res[0]["num_reviews"] + 1

				update_field = {
					"contents": contents,
					"review_ids": review_ids,
					"ratings": ratings,
					"review_ending_sentence": review_ending_sentence_list,
					"num_reviews": num_reviews,
					"category": category
				}
			
			# product not yet in db: create the document from scratch
			else:
				contents = contents_new
				review_ids = [review_id_new]
				ratings = [rating_new]
				review_ending_sentence_list = [num_sentence]
				num_reviews = 1
				update_field = {
					"contents": contents,
					"product_name": product_name,
					"review_ids": review_ids,
					"ratings": ratings,
					"review_ending_sentence": review_ending_sentence_list,
					"num_reviews": num_reviews,
					"category": category,
					"ft_score": {},
					"ft_senIdx": {}
				}

			query = {"product_id": product_id}
			db_product_collection.update(query, {"$set": update_field}, upsert=True)

	return isfound
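
# For reference, a sketch of the product document shape written above (field
# values are illustrative, not real data):
#
#     {
#         "product_id": "B00EXAMPLE",
#         "product_name": "...",
#         "category": [...],
#         "contents": ["first sentence", "second sentence"],
#         "review_ids": ["A1REVIEWERID"],
#         "ratings": [5.0],
#         "review_ending_sentence": [2],
#         "num_reviews": 1,
#         "ft_score": {},
#         "ft_senIdx": {}
#     }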
def scrape_reviews_hard(productID, prod_review_ids_db, max_scrape_loop=1, current_loop=0):
    '''
    Scrapes the product page directly, without needing a userID or an
    AmazonScrape object. It can only reach the five top-ranked reviews.
    '''
    if current_loop > max_scrape_loop:
        return None, [], [], [], [], []
    else:
        try: 
            current_loop += 1
            doc = getWebPage(productID)
            XPATH_NAME = '//h1[@id="title"]//text()'
            XPATH_RATINGS = '//div[contains(@id, "rev-dpReviewsMostHelpfulAUI")]/div/div/a/i/span//text()'           
            XPATH_REVIEWS_IDS = '//div[contains(@id, "rev-dpReviewsMostHelpfulAUI")]/a[2]/@id'
            
            RAW_NAME = doc.xpath(XPATH_NAME)
            RAW_RATINGS = doc.xpath(XPATH_RATINGS)
            ratings = [int(float(x[:3])) for x in RAW_RATINGS]  # e.g. "4.0 out of 5 stars" -> 4
            RAW_REVIEWS_IDS = doc.xpath(XPATH_REVIEWS_IDS)
            
            product_name = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
            review_ids = [x[:x.index(".")] for x in RAW_REVIEWS_IDS]  # keep only the id before the first "."
            
            contents = []
            review_sentence_num = []
            ind_new_review = []
            for index in range(len(review_ids)):
                review_id = review_ids[index]
                if review_id in prod_review_ids_db:
                    print "scraped review skipped by backup scraper: already in db"
                    continue
                ind_new_review.append(index)
                XPATH_REVIEW_BODY = '//div[contains(@id, "revData-dpReviewsMostHelpfulAUI-%s")]/div//text()' % review_id
                RAW_REVIEW_BODY = doc.xpath(XPATH_REVIEW_BODY)

                review_content = ""
                for RAW_REVIEW in RAW_REVIEW_BODY:
                    review = RAW_REVIEW.strip().encode('utf-8').decode('utf-8')
                    review_content += (review + " ")
                review_sentences = getSentencesFromReview(review_content)
                print "First sentence: {0}".format(review_sentences[0])
                sentence_num = len(review_sentences)
                review_sentence_num.append(sentence_num)
                contents.extend(review_sentences)

            if len(ind_new_review) > 0:
                print('new reviews available from scrape_reviews_hard')
                review_ids = [review_ids[j] for j in ind_new_review]
                ratings = [ratings[j] for j in ind_new_review]
            else:
                review_ids = []
                ratings = []

            # cumulative sentence offsets marking where each review ends
            review_ending_sentence = []
            total = 0
            for num in review_sentence_num:
                total += num
                review_ending_sentence.append(total)

            scraped_pages_new = [1]

            return product_name, contents, review_ids, ratings, review_ending_sentence, scraped_pages_new
        except Exception:
            # back off briefly before retrying
            time.sleep(int(random.random() * 1.5 + 1) + random.random())
            print 'scraper failed, retrying (attempt %d)' % current_loop
            return scrape_reviews_hard(productID, prod_review_ids_db, max_scrape_loop, current_loop)
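
# Hypothetical usage sketch: prod_review_ids_db holds the review ids already
# stored for this product, so only unseen reviews come back.
#
#     name, sentences, ids, stars, ending, pages = scrape_reviews_hard(
#         "B00EXAMPLE", set(known_review_ids))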