def _yelp_scrape_window(business_id):
    """Work out which review dates should be scraped for one business.

    Looks up the latest Yelp review log for ``business_id``.

    Returns:
        A ``(date_range, description)`` pair. ``date_range`` is ``None`` when
        no log exists (scrape everything); otherwise it is a
        ``(logged_value, datetime.now())`` tuple, e.g.
        ``(datetime.strptime('2018-06-28', '%Y-%m-%d'),
           datetime.strptime('2018-07-01', '%Y-%m-%d'))``.
        ``description`` is the human-readable text used in log messages.
    """
    yelp_review_log = getLatestYelpReviewLog(business_id)
    if yelp_review_log:
        # First column of the first row is used as the lower bound.
        date_range = (yelp_review_log[0][0], datetime.now())
        description = f"from {date_range[0].strftime('%Y-%m-%d')} to {date_range[1].strftime('%Y-%m-%d')}"
        return date_range, description
    return None, "for all dates"


def task_yelpScraper(business_ids=None, job_type=0):
    """Scrape Yelp reviews for each business and record the outcome.

    Args:
        business_ids: Iterable of business IDs to scrape. When ``None``, the
            list is fetched with ``getTallyBusiness()``.
        job_type: Value forwarded to ``insertJobLogs`` as the job type.

    For every business the scrape window is derived from the latest review
    log, ``yelpScraper`` is invoked, and a job-log entry is written. On HTTP
    200 the reviews are stored and, if any were returned, a new review-log
    entry is inserted; any other status is logged with return code 1.
    """
    if business_ids is None:
        business_ids = getTallyBusiness()  # return a list of strings

    for business_id in business_ids:
        print(f"scraping business ID {business_id}...")

        date_range, m1 = _yelp_scrape_window(business_id)
        print(f"scraping {m1}")

        # scrape Yelp reviews
        status_code, data = yelpScraper(business_id, date_range=date_range)

        if status_code != 200:
            job_message = f"status code {status_code}"
            if status_code == 503:
                # this is special case for web scraping...
                job_message += " Wasn't able to assign an unblocked proxy IP"
            insertJobLogs(business_id, job_type, 1, job_message)
            print(job_message)
            continue

        returncode = updateYelpReviews(business_id, data)
        job_message = f"status code {status_code}, scraped total {len(data)} reviews, {m1}"
        insertJobLogs(business_id, job_type, returncode, job_message)
        if len(data) > 0:
            insertYelpReviewLog(business_id, data[0][0])  # date
        print(job_message)
def _refresh_vizdata(business_id, viztype, build_payload, timestamp_yelpreview):
    """Regenerate one visualization type for a business if it is stale.

    Args:
        business_id: ID of the business being processed.
        viztype: Visualization type identifier passed to the storage helpers.
        build_payload: Callable taking ``business_id`` and returning the
            JSON-serializable visualization data.
        timestamp_yelpreview: Timestamp of the latest Yelp review log; the
            stored vizdata is stale when it is dated before this.
    """
    data = getVizdataTimestamp(business_id, viztype)

    # Compare calendar dates only. Without .date() the comparison raises
    # TypeError: can't compare offset-naive and offset-aware datetimes
    is_stale = (len(data) == 0
                or data[0][0].date() < timestamp_yelpreview.date())
    if not is_stale:
        return

    vizdata = json.dumps(build_payload(business_id), sort_keys=False)
    # NOTE(review): json.dumps always returns a non-empty string (e.g.
    # "null"), so this guard never filters anything; kept to preserve the
    # original behavior.
    if vizdata:
        updateVizdata(business_id, viztype, vizdata)
        insertVizdataLog(business_id, viztype,
                         triggeredby=0)  # triggered by job


def task_getVizdata(business_ids=None):
    """Generate visualization data by background jobs for better user experience.

    Args:
        business_ids: Iterable of business IDs to process. When ``None``, the
            list is fetched with ``getTallyBusiness()``. An explicitly passed
            iterable (including an empty one) is used as given, matching
            ``task_yelpScraper``.

    For each business with a Yelp review log, viztypes 0, 1 and 2 are
    regenerated when their stored copy is missing or dated before the latest
    review log, and a job-log entry is written. Businesses without a review
    log are skipped without stopping the rest of the run.
    """
    if business_ids is None:
        business_ids = getTallyBusiness()  # return a list of strings

    for business_id in business_ids:
        print(
            f"Generating visualization data for business ID {business_id}...")

        review_log = getLatestYelpReviewLog(business_id)
        if not review_log:
            print("Visualization data are recent. No need to re-generate.")
            # No reviews to process for this business; move on to the next
            # one instead of aborting the whole job.
            continue
        timestamp_yelpreview = review_log[0][0]

        # viztype 0 and 3
        # 2020-01-22 viztype 0 and 3 are sharing an API for historical
        # reasons. If there is time, please change it.
        _refresh_vizdata(business_id, 0, getDataViztype0,
                         timestamp_yelpreview)
        # viztype 1
        _refresh_vizdata(business_id, 1, yelpTrendyPhrases,
                         timestamp_yelpreview)
        # viztype 2
        _refresh_vizdata(business_id, 2, yelpReviewCountMonthly,
                         timestamp_yelpreview)

        # insert a log for the task
        job_message = "Updated viztype 0,1,2,3"
        print(job_message)
        insertJobLogs(business_id, 1, 0, job_message)  # job type 1, success