def run_query(query, location):
    """
    Execute search query to Yelp.

    :param query: Search terms
    :param location: Geographical location to search for
    :return: A list of Businesses; empty list on error or no results.
    """
    FUNC_NAME = inspect.currentframe().f_code.co_name
    if location == "":
        log_error(MODULE_NAME, FUNC_NAME, 'No location input provided.')
        return []
    client = get_yelp_api_client()
    if not client:
        return []
    # Yelp takes search term query in params kwargs,
    # and location directly as a param in search fxn.
    params = {
        'term': query,
    }
    try:
        search = Search(client)
        response = search.search(location, **params)
    except Exception as ex:
        # BUG FIX: previously this fell through after logging and then
        # referenced `response`, which is undefined when search() raises,
        # turning any API failure into a NameError.
        log_exception(MODULE_NAME, FUNC_NAME, ex)
        return []
    # Normalize a falsy businesses attribute to an empty list.
    return response.businesses or []
def enqueue_fetch_reviews(self, business_id, num_reviews=0):
    """Fetch business reviews for a specific business.

    :param business_id: Id of the Business to fetch reviews for.
    :param num_reviews: Number of reviews to fetch (0 fetches all).
    :return: True, so the queue marks the task done either way.
    """
    print('self task for ' + business_id + ' is: ' + str(self))
    if Business.objects.filter(id=business_id).exists():
        business = Business.objects.get(id=business_id)
        main.engine.search_businesses.get_business_reviews(business, num_reviews, self)
    else:
        # BUG FIX: inspect.current_frame does not exist; the correct API
        # is inspect.currentframe(), so this branch used to raise
        # AttributeError instead of logging.
        log_error(MODULE_NAME, inspect.currentframe().f_code.co_name,
                  '%s | This business does not exists.' % business_id)
    return True
def enqueue_fetch_reviews(self, business_id, num_reviews=0):
    """Fetch business reviews for a specific business.

    :param business_id: Id of the Business to fetch reviews for.
    :param num_reviews: Number of reviews to fetch (0 fetches all).
    :return: True, so the queue marks the task done either way.
    """
    print('self task for ' + business_id + ' is: ' + str(self))
    if Business.objects.filter(id=business_id).exists():
        business = Business.objects.get(id=business_id)
        main.engine.search_businesses.get_business_reviews(
            business, num_reviews, self)
    else:
        # BUG FIX: inspect.current_frame is not a real attribute of the
        # inspect module (AttributeError); use inspect.currentframe().
        log_error(MODULE_NAME, inspect.currentframe().f_code.co_name,
                  '%s | This business does not exists.' % business_id)
    return True
def search_for_businesses(query="", location="", debug=False):
    """
    Search for businesses that match against search terms and return a
    list of businesses.

    :param: query: Search terms
    :param: location: Geographical location to search in or near
    :return: A list of Businesses
    """
    FUNC_NAME = inspect.currentframe().f_code.co_name
    if debug:
        location = 'San Francisco'
    elif location == "":
        log_error(MODULE_NAME, FUNC_NAME, 'No location input in search')
        return []
    log(MODULE_NAME, FUNC_NAME,
        'query: "%s", location: "%s"' % (query, location))
    try:
        results = run_query(query, location)
    except Exception as ex:
        log_exception(MODULE_NAME, FUNC_NAME, ex)
        results = []
    # First 10 entries. No pagination yet so KISS.
    results = results[:10]
    for biz in results:
        # Flag whether reviews are already cached for this business.
        biz.has_reviews = Review.objects.filter(business_id=biz.id).exists()
        save_business(biz.id, biz.name, biz.image_url, biz.url,
                      biz.review_count, biz.rating)
    return results
def search_for_businesses(query="", location="", debug=False):
    """
    Search for businesses that match against search terms and return a
    list of businesses.

    :param: query: Search terms
    :param: location: Geographical location to search in or near
    :return: A list of Businesses
    """
    func_name = inspect.currentframe().f_code.co_name
    if debug:
        location = 'San Francisco'
    if not debug and location == "":
        log_error(MODULE_NAME, func_name, 'No location input in search')
        return []
    log(MODULE_NAME, func_name,
        'query: "%s", location: "%s"' % (query, location))
    businesses = []
    try:
        businesses = run_query(query, location)
    except Exception as ex:
        log_exception(MODULE_NAME, func_name, ex)
    # First 10 entries. No pagination yet so KISS.
    top_hits = businesses[:10]
    for hit in top_hits:
        # Mark whether we already hold cached reviews for this hit.
        hit.has_reviews = Review.objects.filter(
            business_id=hit.id).exists()
        save_business(hit.id, hit.name, hit.image_url, hit.url,
                      hit.review_count, hit.rating)
    return top_hits
def get_review_graph_data(business, debug=False):
    """
    Calculates the ylp rating along with graphing data and returns them.

    :param business: Business to get rating data for
    :param debug: Debug mode is on if True
    :return: A tuple that returns: timeline list of ylpline_ratings,
        Yelp review ratings, the overall smoothed rating, recent trend
        sparkline, 6-month sparkline, 12-month sparkline, and 24-month
        sparkline. Returns None when no business or no reviews exist.
    """
    FUNC_NAME = inspect.currentframe().f_code.co_name
    if debug:
        # BUG FIX: was Business.objects.filter(...), which returns a
        # QuerySet rather than a Business instance, breaking the
        # Review.objects.filter(business=...) lookup below.
        business = Business.objects.get(id='whitewater-excitement-lotus-2')
    elif not business:
        # BUG FIX: previously formatted business.id here, which raises
        # AttributeError exactly when business is None/falsy.
        log_error(MODULE_NAME, FUNC_NAME, 'No business input provided')
        return

    reviews = Review.objects.filter(business=business).order_by('publish_date')
    if not reviews:
        # BUG FIX: guard against IndexError on reviews[0] for a business
        # that has no stored reviews yet.
        log_error(MODULE_NAME, FUNC_NAME,
                  '%s | No reviews to graph' % business.id)
        return

    today = datetime.today().date()

    def _cutoffs(unit_days, count):
        # Bucket boundaries, newest window first:
        # today-unit, today-2*unit, ...
        return [today - timedelta(days=unit_days * (k + 1))
                for k in range(count)]

    spark_cutoffs = _cutoffs(18, 5)    # recent-trend sparkline
    cutoffs_6mo = _cutoffs(45, 4)      # ~6 months in 4 windows
    cutoffs_12mo = _cutoffs(92, 4)     # ~12 months in 4 windows
    cutoffs_24mo = _cutoffs(183, 4)    # ~24 months in 4 windows

    spark_buckets = [[] for _ in spark_cutoffs]
    buckets_6mo = [[] for _ in cutoffs_6mo]
    buckets_12mo = [[] for _ in cutoffs_12mo]
    buckets_24mo = [[] for _ in cutoffs_24mo]

    def _bucket(cutoffs, buckets, publish_date, rating):
        # Drop the rating into the first (most recent) window it falls
        # into; ratings older than the oldest cutoff are ignored.
        for idx, cutoff in enumerate(cutoffs):
            if publish_date > cutoff:
                buckets[idx].append(rating)
                return

    epoch = datetime(1970, 1, 1)

    def _ms_since_epoch(publish_date):
        # Graph takes x-axis time in milliseconds since epoch.
        publish_datetime = datetime(publish_date.year, publish_date.month,
                                    publish_date.day)
        return (publish_datetime - epoch).total_seconds() * 1000

    ylpline_ratings = []  # Detail graph - ylpline ratings
    review_ratings = []   # Detail graph - review ratings

    # Seed the exponential smoothing with the oldest review. NOTE: the
    # seed review is charted but (matching prior behavior) never bucketed
    # into any sparkline window.
    first_review = reviews[0]
    actual_rating = float(first_review.rating)
    smooth_rating = actual_rating
    publish_since_epoch = _ms_since_epoch(first_review.publish_date)
    ylpline_ratings.append([publish_since_epoch, smooth_rating])
    review_ratings.append([publish_since_epoch, actual_rating])
    prev_smooth_rating = smooth_rating

    # Completed index 0 review above. Now repeat the algorithm over the
    # rest of the collection of reviews.
    for review in reviews[1:]:
        publish_date = review.publish_date
        publish_since_epoch = _ms_since_epoch(publish_date)
        actual_rating = float(review.rating)
        # Exponential smoothing of the rating series.
        smooth_rating = float(prev_smooth_rating + SMOOTH_FACTOR *
                              (actual_rating - prev_smooth_rating))
        _bucket(spark_cutoffs, spark_buckets, publish_date, smooth_rating)
        _bucket(cutoffs_6mo, buckets_6mo, publish_date, smooth_rating)
        _bucket(cutoffs_12mo, buckets_12mo, publish_date, smooth_rating)
        _bucket(cutoffs_24mo, buckets_24mo, publish_date, smooth_rating)
        ylpline_ratings.append([publish_since_epoch,
                                round(smooth_rating, 2)])
        review_ratings.append([publish_since_epoch, actual_rating])
        prev_smooth_rating = smooth_rating

    # get_sparkline expects buckets ordered oldest-to-newest.
    sparkline = get_sparkline(list(reversed(spark_buckets)))
    sparkline_6mo = get_sparkline(list(reversed(buckets_6mo)))
    sparkline_12mo = get_sparkline(list(reversed(buckets_12mo)))
    sparkline_24mo = get_sparkline(list(reversed(buckets_24mo)))
    return ylpline_ratings, review_ratings, smooth_rating, sparkline, \
        sparkline_6mo, sparkline_12mo, sparkline_24mo
def get_business_reviews(business, num_reviews=0, task=None, debug=False):
    """
    Fetches business reviews for a single business and saves it to the
    database.

    This is a computationally intensive function that makes multiple
    requests to the Yelp website and collect reviews for a business. Yelp
    restricts the number of reviews per page, so multiple threads must
    call multiple requests to collect all business reviews. If a business
    has 500 reviews, and Yelp only shows 20 reviews per page, 25 http
    requests must be made to Yelp. In the background of each thread, it
    parses the response HTML content to get the relevant review data for
    consumption.

    Warning: You must not set the # of max thread workers to be
    unreasonably high. This will cause an out of memory error in
    production and cause the production server to crash and restart.

    :param business: The business to fetch reviews for.
    :param num_reviews: If provided, will only get the number of reviews,
        most recent first. Otherwise, will fetch all reviews of a
        business.
    :param task: Optional Celery-style task whose state is updated with
        fetch/processing progress (0-100).
    :param debug: Debug mode is on if True.
    :return: Nothing. Reviews are saved into the database.
    """
    FUNC_NAME = inspect.currentframe().f_code.co_name
    do_not_store_latest_pull = False
    print("task for " + business.id + " is: " + str(task))
    if debug:
        business = Business.objects.get(id='girl-and-the-goat-chicago')
    if not business:
        return
    # Debug hook, intentionally disabled:
    # if task:
    #     fake_fetch(task)
    #     return
    latest_review_date = None
    todays_date = datetime.today().date()
    # Don't bother fetching if we fetched recently already.
    if business.latest_review_pull and business.latest_review_pull + \
            timedelta(days=DAYS_TO_DELAY_FETCHING) >= todays_date:
        print("Hitting too recent to fetch")
        log(MODULE_NAME, FUNC_NAME, '%s | Hitting too recent to fetch' %
            business.id)
        return
    if Review.objects.filter(business_id=business.id).exists():
        latest_review_date = Review.objects.filter(
            business_id=business.id).latest('publish_date').publish_date
    if num_reviews <= 0:
        num_reviews = get_num_reviews_for_business(business)
    # Ceiling division: one request per page of reviews.
    num_requests = num_reviews // NUM_REVIEWS_PER_PAGE
    if num_reviews % NUM_REVIEWS_PER_PAGE != 0:
        num_requests += 1
    log(MODULE_NAME, FUNC_NAME, '%s | Concurrent pull start' % business.id)
    concurrency_pull_start = default_timer()
    urls = create_urls_list(business.url, num_reviews)
    session = FuturesSession(max_workers=MAX_WORKERS)
    futures = []
    responses = []
    # Multi-thread requests and HTML parsing
    for url in urls:
        future = session.get(url,
                             background_callback=parse_results_in_background)
        futures.append(future)
    # Wait for callbacks to finish
    print('Response received...', end="", flush=True)
    for i, future in enumerate(futures, 1):
        response = future.result()
        responses.append(response)
        # Network pull covers the first 90% of the progress bar.
        progress = round(i * 90 / len(futures), 1)
        if task:
            task.update_state(state='PROGRESS', meta={'current': progress})
        print(str(i) + ": " + str(response.status_code) + " " +
              str(response.reason) + '...', end="", flush=True)
    concurrency_pull_end = default_timer()
    log(
        MODULE_NAME, FUNC_NAME, '%s | Concurrent pull end. Duration: %s '
        'seconds' % (business.id,
                     str(concurrency_pull_end - concurrency_pull_start)))
    # Save reviews to database
    log(MODULE_NAME, FUNC_NAME, '%s | Begin response processing' %
        business.id)
    print("Processing response (%s total)..." % num_requests, end="",
          flush=True)
    process_start = default_timer()
    for ctr, response in enumerate(responses, 1):
        print("%s..." % ctr, end="", flush=True)
        if response:
            if response.status_code == 200:
                save_reviews(response, business, latest_review_date)
            else:
                # A failed page means our local copy may be incomplete;
                # don't record today as a successful pull.
                do_not_store_latest_pull = True
                log_error(
                    MODULE_NAME, FUNC_NAME,
                    'Fetch unsuccessful. Got an HTTP status code of: %s'
                    % str(response.status_code))
        # BUG FIX: this previously used the stale loop variable `i` left
        # over from the pull loop, so the progress bar froze during
        # response processing. `ctr` is the current loop's counter.
        progress = 90 + round(ctr * 10 / len(responses), 1)
        if task:
            print("theres a task!")
            task.update_state(state='PROGRESS', meta={'current': progress})
    process_end = default_timer()
    log(
        MODULE_NAME, FUNC_NAME,
        '\nProcessing response duration: %s seconds'
        % str(process_end - process_start))
    # Update business that we fetched reviews today
    if not do_not_store_latest_pull:
        business.latest_review_pull = todays_date
        business.save()
def get_business_reviews(business, num_reviews=0, task=None, debug=False):
    """
    Fetches business reviews for a single business and saves it to the
    database.

    This is a computationally intensive function that makes multiple
    requests to the Yelp website and collect reviews for a business. Yelp
    restricts the number of reviews per page, so multiple threads must
    call multiple requests to collect all business reviews. If a business
    has 500 reviews, and Yelp only shows 20 reviews per page, 25 http
    requests must be made to Yelp. In the background of each thread, it
    parses the response HTML content to get the relevant review data for
    consumption.

    Warning: You must not set the # of max thread workers to be
    unreasonably high. This will cause an out of memory error in
    production and cause the production server to crash and restart.

    :param business: The business to fetch reviews for.
    :param num_reviews: If provided, will only get the number of reviews,
        most recent first. Otherwise, will fetch all reviews of a
        business.
    :param task: Optional Celery-style task whose state is updated with
        fetch/processing progress (0-100).
    :param debug: Debug mode is on if True.
    :return: Nothing. Reviews are saved into the database.
    """
    FUNC_NAME = inspect.currentframe().f_code.co_name
    do_not_store_latest_pull = False
    print("task for " + business.id + " is: " + str(task))
    if debug:
        business = Business.objects.get(id='girl-and-the-goat-chicago')
    if not business:
        return
    # Debug hook, intentionally disabled:
    # if task:
    #     fake_fetch(task)
    #     return
    latest_review_date = None
    todays_date = datetime.today().date()
    # Don't bother fetching if we fetched recently already.
    # NOTE(review): this throttle is deliberately disabled in this
    # revision (debugging); re-enable before shipping.
    # if business.latest_review_pull and business.latest_review_pull + \
    #         timedelta(days=DAYS_TO_DELAY_FETCHING) >= todays_date:
    #     print("Hitting too recent to fetch")
    #     log(MODULE_NAME, FUNC_NAME, '%s | Hitting too recent to fetch' %
    #         business.id)
    #     return
    if Review.objects.filter(business_id=business.id).exists():
        latest_review_date = Review.objects.filter(
            business_id=business.id
        ).latest('publish_date').publish_date
    if num_reviews <= 0:
        num_reviews = get_num_reviews_for_business(business)
    # Ceiling division: one request per page of reviews.
    num_requests = num_reviews // NUM_REVIEWS_PER_PAGE
    if num_reviews % NUM_REVIEWS_PER_PAGE != 0:
        num_requests += 1
    log(MODULE_NAME, FUNC_NAME, '%s | Concurrent pull start' % business.id)
    concurrency_pull_start = default_timer()
    urls = create_urls_list(business.url, num_reviews)
    session = FuturesSession(max_workers=MAX_WORKERS)
    futures = []
    responses = []
    # Multi-thread requests and HTML parsing
    for url in urls:
        future = session.get(url,
                             background_callback=parse_results_in_background)
        futures.append(future)
    # Wait for callbacks to finish
    print('Response received...', end="", flush=True)
    for i, future in enumerate(futures, 1):
        response = future.result()
        responses.append(response)
        # Network pull covers the first 90% of the progress bar.
        progress = round(i * 90 / len(futures), 1)
        if task:
            task.update_state(state='PROGRESS', meta={'current': progress})
        print(str(i) + ": " + str(response.status_code) + " " +
              str(response.reason) + '...', end="", flush=True)
    concurrency_pull_end = default_timer()
    log(MODULE_NAME, FUNC_NAME, '%s | Concurrent pull end. Duration: %s '
        'seconds' % (business.id,
                     str(concurrency_pull_end - concurrency_pull_start)))
    # Save reviews to database. Has debugging code.
    log(MODULE_NAME, FUNC_NAME, '%s | Begin response processing' %
        business.id)
    print("Processing response (%s total)..." % num_requests, end="",
          flush=True)
    process_start = default_timer()
    for ctr, response in enumerate(responses, 1):
        print("%s..." % ctr, end="", flush=True)
        if response:
            if response.status_code == 200:
                save_reviews(response, business, latest_review_date)
            else:
                # A failed page means our local copy may be incomplete;
                # don't record today as a successful pull.
                do_not_store_latest_pull = True
                log_error(MODULE_NAME, FUNC_NAME,
                          'Fetch unsuccessful. Got an HTTP status code of: %s'
                          % str(response.status_code))
        # BUG FIX: this previously used the stale loop variable `i` left
        # over from the pull loop, so the progress bar froze during
        # response processing. `ctr` is the current loop's counter.
        progress = 90 + round(ctr * 10 / len(responses), 1)
        if task:
            print("theres a task!")
            task.update_state(state='PROGRESS', meta={'current': progress})
    process_end = default_timer()
    log(MODULE_NAME, FUNC_NAME,
        '\nProcessing response duration: %s seconds'
        % str(process_end - process_start))
    # Update business that we fetched reviews today
    if not do_not_store_latest_pull:
        business.latest_review_pull = todays_date
        business.save()
def get_review_graph_data(business, debug=False):
    """
    Calculates the ylp rating along with graphing data and returns them.

    :param business: Business to get rating data for
    :param debug: Debug mode is on if True
    :return: A tuple that returns: timeline list of ylpline_ratings,
        Yelp review ratings, the overall smoothed rating, recent trend
        sparkline, 6-month sparkline, 12-month sparkline, and 24-month
        sparkline. Returns None when no business or no reviews exist.
    """
    FUNC_NAME = inspect.currentframe().f_code.co_name
    if debug:
        # BUG FIX: .filter() returns a QuerySet; .get() returns the
        # single Business instance the rest of the function expects.
        business = Business.objects.get(id='whitewater-excitement-lotus-2')
    elif not business:
        # BUG FIX: business.id cannot be read when business is falsy.
        log_error(MODULE_NAME, FUNC_NAME, 'No business input provided')
        return
    reviews = Review.objects.filter(business=business).order_by('publish_date')
    if not reviews:
        # BUG FIX: reviews[0] below raised IndexError for a business
        # that has no stored reviews.
        log_error(MODULE_NAME, FUNC_NAME,
                  '%s | No reviews to graph' % business.id)
        return
    today = datetime.today().date()
    # Window sizes as (days per bucket, bucket count); buckets are
    # ordered newest first while being filled.
    windows = {
        'spark': (18, 5),
        '6mo': (45, 4),
        '12mo': (92, 4),
        '24mo': (183, 4),
    }
    cutoffs = {}
    buckets = {}
    for key, (days, count) in windows.items():
        cutoffs[key] = [today - timedelta(days=days * n)
                        for n in range(1, count + 1)]
        buckets[key] = [[] for _ in range(count)]
    epoch = datetime(1970, 1, 1)

    def _epoch_ms(d):
        # Graph takes x-axis time in milliseconds since epoch.
        return (datetime(d.year, d.month, d.day) - epoch).total_seconds() * 1000

    ylpline_ratings = []  # Detail graph - ylpline ratings
    review_ratings = []   # Detail graph - review ratings
    # Seed smoothing with the oldest review. Matching prior behavior,
    # the seed review is charted but never placed in a sparkline bucket.
    seed = reviews[0]
    actual_rating = float(seed.rating)
    smooth_rating = actual_rating
    ts = _epoch_ms(seed.publish_date)
    ylpline_ratings.append([ts, smooth_rating])
    review_ratings.append([ts, actual_rating])
    prev_smooth_rating = smooth_rating
    # Completed index 0 review above. Now repeat the algorithm over the
    # rest of the collection of reviews.
    for review in reviews[1:]:
        publish_date = review.publish_date
        ts = _epoch_ms(publish_date)
        actual_rating = float(review.rating)
        # Exponential smoothing of the rating series.
        smooth_rating = float(prev_smooth_rating + SMOOTH_FACTOR *
                              (actual_rating - prev_smooth_rating))
        # Each rating lands in the first (most recent) window it fits;
        # anything older than the oldest cutoff is dropped.
        for key in windows:
            for idx, cutoff in enumerate(cutoffs[key]):
                if publish_date > cutoff:
                    buckets[key][idx].append(smooth_rating)
                    break
        ylpline_ratings.append([ts, round(smooth_rating, 2)])
        review_ratings.append([ts, actual_rating])
        prev_smooth_rating = smooth_rating
    # get_sparkline wants the oldest bucket first.
    sparkline = get_sparkline(buckets['spark'][::-1])
    sparkline_6mo = get_sparkline(buckets['6mo'][::-1])
    sparkline_12mo = get_sparkline(buckets['12mo'][::-1])
    sparkline_24mo = get_sparkline(buckets['24mo'][::-1])
    return (ylpline_ratings, review_ratings, smooth_rating, sparkline,
            sparkline_6mo, sparkline_12mo, sparkline_24mo)