def __init__(self, config, consumer_mode, to_producer=True):
    ''' Init a consumer based on mode activated in input '''
    self.config = config
    self.config_section = consumer_mode
    self.to_producer = to_producer
    config_params = self.get_config_items()
    try:
        self.kafka_hosts = config_params['kafka_hosts']
        self.in_topic = config_params['in_topic']
        self.out_topic = config_params['out_topic']
        self.group = config_params['in_group']
        self.zk_hosts = config_params['zookeeper_hosts']
    except KeyError:
        raise

    uf.print_out("Trying to make connection {}".format(self.in_topic))
    self.client = KafkaClient(hosts=self.kafka_hosts)  # Create a client
    self.topic = self.client.topics[self.in_topic]  # Create topic if it does not exist
    self.consumer = self.topic.get_balanced_consumer(  # Zookeeper dynamically assigns partitions
        consumer_group=self.group,
        auto_commit_enable=True,
        zookeeper_connect=self.zk_hosts)
    uf.print_out("Made connection")

    if self.to_producer:  # Write into a producer
        try:
            self.out_group = config_params['out_group']
            self.out_topic = self.client.topics[config_params['out_topic']]
        except KeyError:
            raise
    else:
        self.output = uf.mkdir_if_not_exist()  # Write to /tmp/exstreamly_cheap
    uf.print_out("Created output file or producer stage")

    self.partitions = set()
    self.msg_cnt = 0  # Number of messages consumed by this instance
    self.init_time = datetime.now()
    self.start_time = self.init_time
    self.url_queue = Queue(maxsize=0)  # Infinitely sized
    self.semaphore = BoundedSemaphore()
        os.system('hdfs dfs -put {} {}'.format(self.temp_file_path, cached_fullpath))
        # uf.print_out('Removing temporary file - {}'.format(os.path.basename(self.temp_file_path)))
        # os.remove(self.temp_file_path)
        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = '{}/kafka_{}_{}_{}.dat'.format(output_dir, self.out_topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, "w")

def get_config_items(self):
    ''' Retrieve the relevant config settings for the section applicable
        to this type of instance: group, in_topic and out_topic if available.
    '''
    try:
        return dict(self.config.items(self.config_section))
    except configparser.NoSectionError:
        raise configparser.NoSectionError('No section: {} exists in the config file'
                                          .format(self.config_section))


if __name__ == '__main__':
    tmp_out_dir = '/home/ubuntu/exstreamly_cheap_all_deals/ingestion/kafka_messages'
    tmp_out_dir = uf.mkdir_if_not_exist(tmp_out_dir)
    uf.print_out('Output directory: {}'.format(tmp_out_dir))
    config = configparser.SafeConfigParser()
    config.read('../../config/general.conf')

    print '\nConsuming messages...'
    cons = ConsumerToHDFS(config, settings.CONSUMER_MODE_DATA)
    cons.consume_topic(tmp_out_dir)
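

# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: get_config_items()
# above simply returns dict(config.items(section)), so the section it reads
# from ../../config/general.conf must carry at least the keys used in
# __init__. The section name 'consumer_data' and the host/topic/group values
# below are assumptions for illustration only.
def _example_expected_config_section():
    ''' Build, in code, the shape of the config section this consumer expects. '''
    sample = configparser.SafeConfigParser()
    section = 'consumer_data'  # hypothetical section name
    sample.add_section(section)
    sample.set(section, 'kafka_hosts', 'localhost:9092')      # assumed host
    sample.set(section, 'zookeeper_hosts', 'localhost:2181')  # assumed host
    sample.set(section, 'in_topic', 'raw_deals')              # assumed topic
    sample.set(section, 'out_topic', 'clean_deals')           # assumed topic
    sample.set(section, 'in_group', 'deals_consumers')        # assumed group
    sample.set(section, 'out_group', 'deals_producers')       # assumed group
    # Equivalent of get_config_items() applied to this in-memory config
    return dict(sample.items(section))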
def fetch_and_clean_up(index_name):
    """ Fetch Elastic data and clean it up """
    # Logstash and HDFS general info
    output_dir = uf.mkdir_if_not_exist("/tmp/exstreamly_cheap_files/elasticsearch_cleanup")
    # logstash_file = os.path.join(output_dir, 'clean_deals.json')

    # HDFS related data
    group = "deals_data_hdfs"
    topic_id = "elastic_deals_data"
    timestamp = time.strftime("%Y%m%d%H%M%S")
    hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(timestamp))
    hadoop_path = "/exstreamly_cheap_main_files/all_deals/history"
    cached_path = "/exstreamly_cheap_main_files/all_deals/cached"
    hadoop_fullpath = "{}/{}_{}_{}.dat".format(hadoop_path, group, topic_id, timestamp)
    cached_fullpath = "{}/{}_{}_{}.dat".format(cached_path, group, topic_id, timestamp)
    uf.print_out("Writing the logs to {} which will be pushed to hdfs and S3".format(hadoop_file))

    block_cnt = 0
    client = make_client()
    cc = Search(using=client, index=index_name)
    gen = cc.scan()

    config = configparser.SafeConfigParser()
    config.read("../../config/general.conf")
    config_params = uf.get_config_items(config, settings.PRODUCER_CLEAN_ES_DATA)
    try:
        kafka_hosts = config_params["kafka_hosts"]
        topic = config_params["topic"]
        group = config_params["group"]
        zk_hosts = config_params["zookeeper_hosts"]
    except KeyError:
        raise

    kafka_client = KafkaClient(hosts=kafka_hosts)
    kafka_topic = kafka_client.topics[topic]  # Create if not exist
    uf.print_out("Producing messages to topic {}. Press Ctrl-C to terminate".format(kafka_topic.name))

    # Produce to kafka for distributed consumption
    hdp_output = open(hadoop_file, "w")
    with kafka_topic.get_producer() as producer:
        for event in gen:
            new_string = dict(eval(event.message.encode("utf-8")))
            msg = clean_data(new_string)
            # We can decide to have logstash read from file instead
            # with open(logstash_file, 'a') as log_output:
            #     log_output.write(json.dumps(msg) + '\n')

            # Write to producer
            producer.produce(json.dumps(msg))

            # Back up to file for HDFS and S3
            hdp_output.write(json.dumps(msg) + "\n")
            if hdp_output.tell() > 100000000:
                hdp_output.close()
                uf.print_out("Block {}: Flushing 100MB file to HDFS => {}".format(str(block_cnt), hadoop_fullpath))

                # Place blocked messages into history and cached folders on hdfs
                os.system("hdfs dfs -put {} {}".format(hadoop_file, hadoop_fullpath))
                os.system("hdfs dfs -put {} {}".format(hadoop_file, cached_fullpath))

                # Back up in S3
                uf.print_out("Syncing {} to S3 for back up".format(output_dir))
                os.system("aws s3 sync {} s3://emmanuel-awa/clean_data_from_elastic".format(output_dir))

                # Recreate file handler
                hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(time.strftime("%Y%m%d%H%M%S")))
                hdp_output = open(hadoop_file, "w")
                uf.print_out("Cleaned {} blocks. File size: {}KB".format(block_cnt, hdp_output.tell() / 1000))
                block_cnt += 1
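

# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: make_client() is not
# shown in this excerpt. fetch_and_clean_up() only needs it to return a
# client that elasticsearch_dsl's Search(using=...) accepts, i.e. a plain
# elasticsearch-py client. The host value below is a placeholder assumption.
def _example_make_client(hosts=None):
    ''' Minimal stand-in for make_client(). '''
    from elasticsearch import Elasticsearch
    return Elasticsearch(hosts or ['localhost:9200'])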
def fetch_sqoot_data(base_url):
    ''' Fetch Sqoot data and save relevant information to file '''
    files_location = uf.mkdir_if_not_exist()  # Folder in /tmp/exstreamly_cheap_files
    merchants_file = os.path.join(files_location, 'merchants.json')
    products_file = os.path.join(files_location, 'products.json')
    events_file = os.path.join(files_location, 'activities_events.json')
    food_nitelife_file = os.path.join(files_location, 'dining_nitelife.json')
    categories_map = map_categories(base_url)
    mvp_categories = [u'product', u'dining-nightlife', u'activities-events']
    focus_grp = reduce_categories_scope(categories_map, mvp_categories)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(hours=7)
    all_deals = []
    queue = Queue.Queue()
    while start_time < end_time:
        try:
            # Due to API inconsistencies, always get the newest ones and page 5.
            # Duplicates will be batch-processed in Spark. Combine both, flatten
            # the JSON, keep the online merchant ID in the deals file and save
            # the merchant in the merchant table.
            # first_100_deals = get_request(base_url, 'deals', 'per_page=100;radius=10000')
            # all_deals = all_deals + first_100_deals.json()['deals']
            uf.print_out('Crawling first 100 pages')
            for num in xrange(1, 101):
                uf.print_out('.' * num)
                thread_ = threading.Thread(target=get_request,
                                           name='Thread{}'.format(num),
                                           args=[base_url, 'deals',
                                                 'page={};per_page=100;radius=10000'.format(num),
                                                 queue])
                thread_.start()
                thread_.join()
            while not queue.empty():
                all_deals = all_deals + queue.get()

            for idx, deal in enumerate(all_deals):
                uf.print_out('Processing deal: {}'.format(idx))
                # If the deal category belongs to the MVP set, save it
                category = category_in_mvp(focus_grp, deal['deal']['category_slug'])
                if category:
                    output = OrderedDict()
                    output['id'] = deal['deal']['id']
                    output['category'] = category
                    output['sub_category'] = deal['deal']['category_slug']
                    output['title'] = deal['deal']['short_title']
                    output['description'] = deal['deal']['description']
                    output['fine_print'] = deal['deal']['fine_print']
                    output['number_sold'] = deal['deal']['number_sold']
                    output['url'] = deal['deal']['untracked_url']
                    output['price'] = deal['deal']['price']
                    output['discount_percentage'] = deal['deal']['discount_percentage']
                    output['provider_name'] = deal['deal']['provider_name']
                    output['online'] = deal['deal']['online']
                    output['expires_at'] = deal['deal']['expires_at']
                    output['created_at'] = deal['deal']['created_at']
                    output['updated_at'] = deal['deal']['updated_at']
                    output['merchant_id'] = deal['deal']['merchant']['id']

                    # Write deal to file
                    with open(os.path.join(files_location, str(category) + '.json'), 'a') as f:
                        f.write(json.dumps(output))
                        f.write('\n')

                    # Write merchant info file
                    merchant_info = deal['deal']['merchant']
                    if not all(merchant_info.values()):
                        merchant_info = clean_merchant_info(merchant_info)
                    with open(os.path.join(files_location, 'merchants.json'), 'a') as f:
                        f.write(json.dumps(merchant_info))
                        f.write('\n')

            start_time = datetime.datetime.now()
            uf.print_out("Time left: {} minute(s)".format((end_time - start_time).seconds / 60))
            uf.print_out("Waiting 30mins to crawl again")
            uf.spinning_cursor(1800)
        except rq.exceptions.ConnectionError:
            uf.print_out("[ConnectionError] ==> Issue with API server.")
        except rq.exceptions.ConnectTimeout:
            uf.print_out("[ConnectionTimeout] ==> Server connection timing out.")
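

# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: get_request() is
# referenced above but not shown. The crawler hands it (base_url, endpoint,
# params, queue) and later extends all_deals with whatever was queued, so a
# list of deal dicts is assumed here. 'rq' is assumed to be the requests
# library (the except clauses above catch rq.exceptions.*), and the URL
# layout of the Sqoot API call is an assumption for illustration.
def _example_get_request(base_url, endpoint, params, queue):
    ''' Fetch one page of deals and push the resulting list onto the queue. '''
    url = '{}{}?{}'.format(base_url, endpoint, params)  # assumed URL layout
    response = rq.get(url, timeout=30)
    response.raise_for_status()
    queue.put(response.json().get('deals', []))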