def process(self):
    ''' Process all categories in a REPL-style loop. '''
    base_url = '{}/deals?api_key={}'.format(settings.SQOOT_BASE_URL,
                                            settings.SQOOT_API_KEY)
    first_visit = True
    while True:
        if self.process_cnt == 0:
            print "[PROCESS URLS] - Starting to process first time"
            self._construct_and_produce_urls(base_url, first_visit)
            first_visit = False
        else:
            print "[PROCESS URLS] - Starting to process updates"
            self._construct_and_produce_urls(base_url)
        self.process_cnt += 1
        uf.spinning_cursor(900)  # Check for updated deals every 15 minutes
def _construct_and_produce_urls(self, base_url, initial_visit=False):
    ''' Produce a deal URL for every category.

        On the initial visit we fetch everything; on subsequent
        (near real-time) visits we rely on the API's updated_after
        query parameter.
    '''
    for idx, category in enumerate(self._categories):
        print "Processing: {}".format(category)
        url = '{};category_slug={}'.format(base_url, category)
        partition_key = idx % 4  # Creating 4 partitions by default
        self._producer.produce_deal_urls(url, self._out_topic, partition_key,
                                         self.max_deals_per_page, initial_visit)
        uf.spinning_cursor(30)  # Wait 30 seconds between categories
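
# The producer used above (self._producer) is defined elsewhere in the
# project. As a rough, hypothetical sketch only: assuming the pipeline
# publishes the category URLs to Kafka via kafka-python, a minimal producer
# exposing the same produce_deal_urls(url, topic, partition_key, per_page,
# initial_visit) interface could look like the class below. The class name
# and broker address are assumptions, not the project's actual code.
import json
from kafka import KafkaProducer

class DealUrlProducer(object):
    def __init__(self, brokers='localhost:9092'):
        self.client = KafkaProducer(bootstrap_servers=brokers)

    def produce_deal_urls(self, url, topic, partition_key, per_page, initial_visit):
        message = json.dumps({'url': url,
                              'per_page': per_page,
                              'initial_visit': initial_visit})
        # Pin the message to the partition chosen by the caller (idx % 4)
        self.client.send(topic, value=message, partition=partition_key)
        self.client.flush()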
def fetch_sqoot_data(base_url):
    ''' Fetch Sqoot data and save the relevant information to file. '''
    files_location = uf.mkdir_if_not_exist()  # Folder in /tmp/exstreamly_cheap_files
    merchants_file = os.path.join(files_location, 'merchants.json')
    products_file = os.path.join(files_location, 'products.json')
    events_file = os.path.join(files_location, 'activities_events.json')
    food_nitelife_file = os.path.join(files_location, 'dining_nitelife.json')

    categories_map = map_categories(base_url)
    mvp_categories = [u'product', u'dining-nightlife', u'activities-events']
    focus_grp = reduce_categories_scope(categories_map, mvp_categories)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(hours=7)
    all_deals = []
    queue = Queue.Queue()
    while start_time < end_time:
        try:
            # The API is inconsistent, so always re-crawl the newest pages;
            # duplicates are batch-processed downstream in Spark.
            # Flatten the JSON, keep the merchant id in the deals file and
            # save the full merchant record in the merchants file.
            # first_100_deals = get_request(base_url, 'deals', 'per_page=100;radius=10000')
            # all_deals = all_deals + first_100_deals.json()['deals']
            uf.print_out('Crawling first 100 pages')
            threads = []
            for num in xrange(1, 101):
                uf.print_out('.' * num)
                thread_ = threading.Thread(
                    target=get_request,
                    name='Thread{}'.format(num),
                    args=[base_url, 'deals',
                          'page={};per_page=100;radius=10000'.format(num),
                          queue])
                thread_.start()
                threads.append(thread_)
            for thread_ in threads:  # Wait for all page requests to finish
                thread_.join()

            while not queue.empty():
                all_deals = all_deals + queue.get()

            for idx, deal in enumerate(all_deals):
                uf.print_out('Processing deal: {}'.format(idx))
                # Save the deal only if its category belongs to the MVP set
                category = category_in_mvp(focus_grp, deal['deal']['category_slug'])
                if category:
                    output = OrderedDict()
                    output['id'] = deal['deal']['id']
                    output['category'] = category
                    output['sub_category'] = deal['deal']['category_slug']
                    output['title'] = deal['deal']['short_title']
                    output['description'] = deal['deal']['description']
                    output['fine_print'] = deal['deal']['fine_print']
                    output['number_sold'] = deal['deal']['number_sold']
                    output['url'] = deal['deal']['untracked_url']
                    output['price'] = deal['deal']['price']
                    output['discount_percentage'] = deal['deal']['discount_percentage']
                    output['provider_name'] = deal['deal']['provider_name']
                    output['online'] = deal['deal']['online']
                    output['expires_at'] = deal['deal']['expires_at']
                    output['created_at'] = deal['deal']['created_at']
                    output['updated_at'] = deal['deal']['updated_at']
                    output['merchant_id'] = deal['deal']['merchant']['id']

                    # Write the deal to its per-category file
                    with open(os.path.join(files_location, str(category) + '.json'), 'a') as f:
                        f.write(json.dumps(output))
                        f.write('\n')

                    # Write the merchant info file
                    merchant_info = deal['deal']['merchant']
                    if not all(merchant_info.values()):
                        merchant_info = clean_merchant_info(merchant_info)
                    with open(merchants_file, 'a') as f:
                        f.write(json.dumps(merchant_info))
                        f.write('\n')

            start_time = datetime.datetime.now()
            uf.print_out("Time left: {} minute(s)".format((end_time - start_time).seconds / 60))
            uf.print_out("Waiting 30mins to crawl again")
            uf.spinning_cursor(1800)
        except rq.exceptions.ConnectionError:
            uf.print_out("[ConnectionError] ==> Issue with API server.")
        except rq.exceptions.ConnectTimeout:
            uf.print_out("[ConnectionTimeout] ==> Server connection timing out.")
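
# get_request is defined elsewhere in the project; the threads above rely on
# it to fetch one page of deals and put the parsed list onto the shared
# queue, while the commented-out call expects it to return the raw response.
# A minimal sketch under those assumptions (the URL layout and timeout are
# guesses, not the project's actual helper):
import requests as rq  # already aliased as rq at module level in the project

def get_request(base_url, endpoint, query, queue=None):
    ''' Fetch one API page; enqueue its deals when a queue is supplied. '''
    url = '{}/{}?{}'.format(base_url, endpoint, query)
    response = rq.get(url, timeout=30)
    if queue is not None:
        queue.put(response.json().get('deals', []))  # consumed by fetch_sqoot_data
    return response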