def dedup_sqoot_data_hard(firsttime=False):
    '''
    Summary: Further dedup coupons by checking deals under common fields vs. their locations.
    '''
    last_deduphard_end_time = read_sqoot_log('deduphard')
    deduphard_start_time = datetime.now(pytz.utc)
    describe_section("dedup_sqoot_data_hard IS BEGINNING..", show_time())

    # Grab all active deals on display to users for deduping.
    deals_to_dedup = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                               is_duplicate=False, online=False,
                                               status='considered-active')
    if (not firsttime) and last_deduphard_end_time:
        # If not first time, further filter down to only the newly added unique deals for deduping.
        deals_to_dedup = deals_to_dedup.filter(date_added__gt=last_deduphard_end_time)

    crosscheck_by_field(deals_to_dedup, 'coupon_directlink')
    crosscheck_by_field(deals_to_dedup, 'merchant_name')

    print "FINISHED DEDUPING HARD....", show_time()
    deduphard_end_time = datetime.now(pytz.utc)
    write_sqoot_log('deduphard', deduphard_start_time, deduphard_end_time)
    print '\n'
    print "GOOD NEWS! dedup_sqoot_data_hard IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
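
# Illustrative only: a minimal sketch of how a field-based crosscheck could
# group candidate duplicates before their locations are compared. The real
# crosscheck_by_field() lives elsewhere in this module and may work
# differently; _group_by_field is a hypothetical helper, not part of the pipeline.
def _group_by_field(queryset, field_name):
    from collections import defaultdict
    groups = defaultdict(list)
    for coupon in queryset:
        key = getattr(coupon, field_name, None)
        if key:
            groups[key].append(coupon)
    # Only groups where 2+ deals share the same field value are dedup candidates.
    return dict((k, v) for k, v in groups.items() if len(v) > 1)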
def savedown_sqoot_data():
    '''
    Summary: Download Sqoot's full deal payload to sqoot_output.json, for later
             consumption by refresh_sqoot_data(indirectload=True).
    '''
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }
    print "\nSQOOT DATA LOAD STARTING..", show_time()
    categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters, timeout=5).json()['categories']
    categories_dict = establish_categories_dict(categories_array)
    reorganized_categories_array = reorganize_categories_list(categories_array)
    for category_dict in reorganized_categories_array:
        get_or_create_category(category_dict, categories_dict)

    # loading coupons and merchants
    describe_section("CHECKING THE LATEST DEAL DATA FROM SQOOT..", show_time())
    request_parameters['per_page'] = ITEMS_PER_PAGE
    active_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
    page_count = int(math.ceil(active_deal_count / float(request_parameters['per_page'])))
    print '%s deals detected, estimating %s pages to iterate' % (active_deal_count, page_count), show_time()

    describe_section("STARTING TO DOWNLOAD SQOOT DEALS..", show_time())
    sqoot_file = open("sqoot_output.json", "w")
    sqoot_file.write("[")
    for p in range(page_count):
        request_parameters['page'] = p + 1
        print '## Fetching page %s...' % (p + 1), show_time()
        response_in_json = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()
        if p > 0:
            # Separate page payloads with commas, but avoid a trailing comma
            # before ']', which would make the file invalid JSON for json.loads().
            sqoot_file.write(",")
        sqoot_file.write(json.dumps(response_in_json))
    sqoot_file.write("]")
    sqoot_file.flush()
    sqoot_file.close()
def run_thru_full_cycle(args):
    '''
    Summary: A wrapper function to run the daily refresh, clean, validate and dedup
             functions consecutively.

    Note: Takes 'firsttime' argument.
    '''
    firsttime = 'firsttime' in args
    describe_section("FULLCYCLE STARTING..", show_time())
    refresh_sqoot_data(firsttime=firsttime)
    clean_out_sqoot_data(firsttime=firsttime)
    validate_sqoot_data(firsttime=firsttime)
    dedup_sqoot_data_hard(firsttime=firsttime)
    describe_section("ALL DONE!! :)", show_time())
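
# Illustrative only: one way run_thru_full_cycle() could be wired up as a
# Django management command. This is a hypothetical sketch, not this project's
# actual entry point; Django expects the class to be named `Command` and to
# live in a management/commands/ module.
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    help = "Run the full Sqoot refresh -> clean -> validate -> dedup cycle."

    def handle(self, *args, **options):
        # Pass 'firsttime' on the initial load so each step skips its
        # incremental last-run-time filters.
        run_thru_full_cycle(args)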
def validate_sqoot_data(firsttime=False, pulseonly=False):
    '''
    Summary: Fetch a deal page and validate deal information and availability.
    '''
    last_validate_end_time = read_sqoot_log('validate')
    validate_start_time = datetime.now(pytz.utc)
    describe_section("validate_sqoot_data IS BEGINNING..", show_time())

    all_active_deals_on_display = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                            is_duplicate=False, online=False)\
                                                    .filter(Q(status='unconfirmed') | Q(status='considered-active'))
    print "...VALIDATING", len(all_active_deals_on_display), "DEALS:"
    validators = Pool(15)
    validators.map(go_validate, zip(list(all_active_deals_on_display), repeat(last_validate_end_time),
                                    repeat(firsttime), repeat(pulseonly)))

    print "FINISHED VALIDATING....", show_time()
    validate_end_time = datetime.now(pytz.utc)
    write_sqoot_log('validate', validate_start_time, validate_end_time)
    print '\n'
    print "GOOD NEWS! validate_sqoot_data IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
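
def _demo_validator_arg_packing():
    # Illustrative only: shows the zip(..., repeat(...)) packing that
    # validate_sqoot_data() above uses to fan shared flags out to Pool
    # workers; the deal values here are hypothetical placeholders.
    from itertools import repeat
    packed = zip(['deal_a', 'deal_b'], repeat(None), repeat(False), repeat(False))
    # Each worker receives one tuple: (coupon, last_validate_end_time, firsttime, pulseonly)
    assert packed == [('deal_a', None, False, False), ('deal_b', None, False, False)]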
def clean_out_sqoot_data(firsttime=False):
    '''
    Summary: Internal garbage-collection cycle that finds and soft-deletes all irrelevant
             and stale local coupons and merchants.

    Note:
    * First, find all true duplicate deals and soft-delete them.
    * Second, find all folded deals (i.e. is_duplicate=True, related_deal__isnull=False)
      that are stale (either expired or inactive, both implied and confirmed), and soft-delete them.
    * Third, find all unique deals that are stale, check for folded deals (if so, reassign)
      and soft-delete them.
    * Fourth, find all inactive merchants (no active deals), and soft-delete them.
      (A set-based alternative to this step is sketched after this function.)
    '''
    from core.signals import delete_object

    last_refresh_start_time = read_sqoot_log('refresh') if not firsttime else None
    cleanout_start_time = datetime.now(pytz.utc)
    describe_section("clean_out_sqoot_data IS BEGINNING..", show_time())
    affected_merchant_list = []  # collect a list of merchant pks whose coupons are being soft-deleted

    # First
    true_duplicate_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                     is_duplicate=True, related_deal__isnull=True)
    deals_for_update = copy(true_duplicate_deals)
    affected_merchant_list += [c.merchant.pk for c in true_duplicate_deals]
    true_duplicate_deals.update(is_deleted=True)
    # triggering deletion of duplicated coupons from search index
    for coupon in deals_for_update:
        print 'Deleted %s' % coupon.id
        delete_object.send(sender=Coupon, instance=coupon)
    print '~*~*~*~*~*~*~*~*~* First finished ~*~*~*~*~*~*~*~*~*'

    # Second
    if last_refresh_start_time:
        folded_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                 is_duplicate=True, related_deal__isnull=False)\
                                         .filter(Q(last_modified__lt=last_refresh_start_time)
                                                 | Q(status='confirmed-inactive')
                                                 | Q(end__lt=datetime.now(pytz.utc)))
    else:
        folded_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False,
                                                 is_duplicate=True, related_deal__isnull=False)\
                                         .filter(Q(status='confirmed-inactive')
                                                 | Q(end__lt=datetime.now(pytz.utc)))
    affected_merchant_list += [c.merchant.pk for c in folded_deals]
    deals_to_signal = []
    deals_to_signal += [c.pk for c in folded_deals.filter(Q(status='confirmed-inactive') | Q(end__lt=datetime.now(pytz.utc)))]
    if last_refresh_start_time:
        deals_to_signal += [c.pk for c in folded_deals.filter(last_modified__lt=last_refresh_start_time)]
        folded_deals.filter(last_modified__lt=last_refresh_start_time).update(status='implied-inactive', is_deleted=True)
    folded_deals.filter(status='confirmed-inactive').update(is_deleted=True)
    folded_deals.filter(end__lt=datetime.now(pytz.utc)).update(is_deleted=True)
    deals_to_signal = list(set(deals_to_signal))
    for coupon in Coupon.all_objects.filter(pk__in=deals_to_signal):
        print 'Deleted %s' % coupon.id
        delete_object.send(sender=Coupon, instance=coupon)
    print '~*~*~*~*~*~*~*~*~* Second finished ~*~*~*~*~*~*~*~*~*'

    # Third (Second -> Third; the order matters)
    if last_refresh_start_time:
        non_dup_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False, is_duplicate=False)\
                                          .filter(Q(last_modified__lt=last_refresh_start_time)
                                                  | Q(status='confirmed-inactive')
                                                  | Q(end__lt=datetime.now(pytz.utc)))
    else:
        non_dup_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_deleted=False, is_duplicate=False)\
                                          .filter(Q(status='confirmed-inactive')
                                                  | Q(end__lt=datetime.now(pytz.utc)))
    affected_merchant_list += [c.merchant.pk for c in non_dup_deals]
    deals_with_folded_deals = [c.pk for c in non_dup_deals
                               if Coupon.all_objects.filter(related_deal=c, is_deleted=False).count() != 0]
    for i in deals_with_folded_deals:
        reassign_representative_deal(Coupon.all_objects.get(pk=i))
    deals_to_signal = []
    deals_to_signal += [c.pk for c in non_dup_deals.filter(Q(status='confirmed-inactive') | Q(end__lt=datetime.now(pytz.utc)))]
    if last_refresh_start_time:
        deals_to_signal += [c.pk for c in non_dup_deals.filter(last_modified__lt=last_refresh_start_time)]
        non_dup_deals.filter(last_modified__lt=last_refresh_start_time).update(status='implied-inactive', is_deleted=True)
    non_dup_deals.filter(status='confirmed-inactive').update(is_deleted=True)
    non_dup_deals.filter(end__lt=datetime.now(pytz.utc)).update(is_deleted=True)
    deals_to_signal = list(set(deals_to_signal))
    for coupon in Coupon.all_objects.filter(pk__in=deals_to_signal):
        print 'Deleted %s' % coupon.id
        delete_object.send(sender=Coupon, instance=coupon)
    print '~*~*~*~*~*~*~*~*~* Third finished ~*~*~*~*~*~*~*~*~*'

    # Fourth
    affected_merchant_list = list(set(affected_merchant_list))
    inactive_merchant_list = []
    for m_pk in affected_merchant_list:
        miq = Merchant.all_objects.get(pk=m_pk)  # miq == merchant-in-question
        num_of_active_coupons_from_miq = Coupon.all_objects.filter(ref_id_source='sqoot',
                                                                   merchant=miq, is_deleted=False).count()
        if not num_of_active_coupons_from_miq:
            inactive_merchant_list.append(miq.pk)
    Merchant.all_objects.filter(pk__in=inactive_merchant_list).update(is_deleted=True)

    cleanout_end_time = datetime.now(pytz.utc)
    write_sqoot_log('cleanout', cleanout_start_time, cleanout_end_time)
    print '\n'
    print "GOOD NEWS! clean_out_sqoot_data IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
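
def _find_inactive_merchants_sketch(affected_merchant_list):
    # Illustrative only: a set-based alternative to the per-merchant COUNT loop
    # in step Fourth of clean_out_sqoot_data() above. Two queries total instead
    # of one per merchant; a sketch, not wired into the pipeline.
    merchants_with_active_deals = Coupon.all_objects.filter(
        ref_id_source='sqoot', is_deleted=False,
        merchant__pk__in=affected_merchant_list).values_list('merchant__pk', flat=True)
    return Merchant.all_objects.filter(pk__in=affected_merchant_list)\
                               .exclude(pk__in=merchants_with_active_deals)\
                               .values_list('pk', flat=True)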
def refresh_sqoot_data(indirectload=False, firsttime=False):
    '''
    Summary: Iterate through Sqoot's entire coupon payload and download and update accordingly.
    '''
    last_refresh_start_time = read_sqoot_log('refresh')
    refresh_start_time = datetime.now(pytz.utc)  # Use UTC time to compare & update coupon's 'last_modified' field
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }
    print "\nSQOOT DATA LOAD STARTING..", show_time()

    describe_section("ESTABLISHING CATEGORY DICTIONARY..", show_time())
    request_try = 1
    while True:
        try:
            categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters, timeout=5).json()['categories']
            request_try = 1
            break
        except requests.exceptions.RequestException:
            print "Request timed out after 5 seconds. Waiting 5 seconds before retrying."
            time.sleep(5)
            request_try += 1
            print "Retrying (attempt {})...".format(request_try)
    categories_dict = establish_categories_dict(categories_array)  # Returns a dict with child: parent categories
    reorganized_categories_array = reorganize_categories_list(categories_array)  # List of dicts with 'category_name' and 'category_slug'
    for category_dict in reorganized_categories_array:
        get_or_create_category(category_dict, categories_dict)

    # loading coupons and merchants
    describe_section("CHECKING THE LATEST DEAL DATA FROM SQOOT..", show_time())
    request_parameters['per_page'] = ITEMS_PER_PAGE
    while True:
        try:
            sqoot_active_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
            request_try = 1
            break
        except requests.exceptions.RequestException:
            print "Request timed out after 5 seconds. Waiting 5 seconds before retrying."
            time.sleep(5)
            request_try += 1
            print "Retrying (attempt {})...".format(request_try)
    page_count = int(math.ceil(sqoot_active_deal_count / float(request_parameters['per_page'])))
    print '%s deals detected, estimating %s pages to iterate' % (sqoot_active_deal_count, page_count), show_time()

    describe_section("STARTING TO DOWNLOAD SQOOT DEALS..", show_time())
    # Since there's only one country & dealtype for all sqoot deals - no need to check it for each coupon
    country_model = get_or_create_country()
    dealtype_model = get_or_create_dealtype()

    sqoot_output_deals = None
    if indirectload:
        sqoot_output_deals = json.loads(open("sqoot_output.json", "r").read())

    for p in range(page_count):
        request_parameters['page'] = p + 1
        print "\n"
        print '## Fetching page {} out of {}...'.format(p + 1, page_count), show_time()
        print "\n"
        if indirectload:
            response_in_json = sqoot_output_deals[p]
        else:
            while True:
                try:
                    response_in_json = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()
                    request_try = 1
                    break
                except requests.exceptions.RequestException:
                    print "Request timed out after 5 seconds. Waiting 5 seconds before retrying."
                    time.sleep(5)
                    request_try += 1
                    print "Retrying (attempt {})...".format(request_try)
        active_coupon_ids = []  # Sqoot coupon ids for all active deals on this page, per the 'page' set in request_parameters
        deals_data = response_in_json['deals']
        for deal_data in deals_data:
            sqoot_coupon_id = int(deal_data['deal']['id'])
            active_coupon_ids.append(sqoot_coupon_id)
            deal_last_updated = parse(deal_data['deal']['updated_at'] + '+0000')
            if (not firsttime) and last_refresh_start_time and (deal_last_updated < last_refresh_start_time):
                continue
            is_online_bool = deal_data['deal']['online']
            merchant_data_dict = deal_data['deal']['merchant']
            update_coupon_data(deal_data, categories_dict, merchant_data_dict, is_online_bool,
                               dealtype_model, country_model)
            print '-' * 60
        reset_db_queries()
        Coupon.all_objects.filter(ref_id_source='sqoot', ref_id__in=active_coupon_ids)\
                          .update(last_modified=datetime.now(pytz.utc))

    refresh_end_time = datetime.now(pytz.utc)
    write_sqoot_log('refresh', refresh_start_time, refresh_end_time)
    print '\n'
    print "GOOD NEWS! refresh_sqoot_data IS ALL DONE AND LOGGING IT", show_time()
    reset_db_queries()
    return refresh_start_time, refresh_end_time
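
# Illustrative only: the three retry loops in refresh_sqoot_data() above share
# one shape, which could be folded into a helper like this hypothetical
# _get_json_with_retry() (a sketch of the refactor, not part of the pipeline):
def _get_json_with_retry(url, params, wait_seconds=5):
    attempt = 1
    while True:
        try:
            return requests.get(url, params=params, timeout=wait_seconds).json()
        except requests.exceptions.RequestException:
            attempt += 1
            print "Request failed. Waiting {} seconds, then retrying (attempt {})...".format(wait_seconds, attempt)
            time.sleep(wait_seconds)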
def analyze_sqoot_deals():
    request_parameters = {
        'api_key': settings.SQOOT_PUBLIC_KEY,
    }
    # describe_section("Retrieving the latest categories..\n")
    # categories_array = requests.get(SQOOT_API_URL + 'categories', params=request_parameters).json()['categories']
    # category_slugs = [c['category']['slug'] for c in categories_array]

    describe_section("Retrieving the latest providers..\n")
    providers_array = requests.get(SQOOT_API_URL + 'providers', params=request_parameters, timeout=5).json()['providers']
    provider_slugs = [c['provider']['slug'] for c in providers_array]

    describe_section("Importing the latest 50 US cities..\n")
    target_cities = top_50_us_cities_dict

    describe_section("Checking total sqoot deals available..\n")
    total_deals_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']

    TARGET_RADIUS = 50  # miles
    request_parameters['radius'] = TARGET_RADIUS
    describe_section("Checking sqoot deals currently available in {} mi radius of the following cities..\n".format(TARGET_RADIUS))
    for city in target_cities:
        request_parameters['location'] = target_cities[city]
        per_city_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
        print city, ': ', per_city_deal_count
    print 'total sqoot deal count: ', total_deals_count
    del request_parameters['location']

    describe_section("Preparing to check deal availability from the following providers..\n")
    for p in provider_slugs:
        print p

    for p in provider_slugs:
        request_parameters['provider_slugs'] = p
        per_p_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
        if per_p_deal_count < 100:
            print "total deals available from {} too small: {}".format(p, per_p_deal_count)
            print "Skipping.."
            continue
        else:
            describe_section("Checking deals from {} for each city..\n".format(p))
            for city in target_cities:
                request_parameters['location'] = target_cities[city]
                per_city_and_p_deal_count = requests.get(SQOOT_API_URL + 'deals', params=request_parameters, timeout=5).json()['query']['total']
                print city, ': ', per_city_and_p_deal_count
            print 'total {} deal count: {}'.format(p, per_p_deal_count)
            del request_parameters['location']