def __init__(self):
    self._max_num_channels = 10  # User can sub to max 10 channels
    config = configparser.SafeConfigParser()
    config.read('../../config/general.conf')
    try:
        config_params = dict(config.items(settings.PRODUCER_SIMULATE_USERS))
        self.kafka_hosts = config_params['kafka_hosts']
        self.out_topic = config_params['out_topic']
        self.group = config_params['group']
        self.zk_hosts = config_params['zookeeper_hosts']
    except configparser.NoSectionError:
        raise configparser.NoSectionError('No section: {} exists in the config file'
                                          .format(settings.PRODUCER_SIMULATE_USERS))
    except KeyError:
        raise

    # Create kafka client and produce to topic if it exists, else create it.
    self.kafka_client = KafkaClient(hosts=self.kafka_hosts)
    self.out_topic = self.to_str(self.out_topic)  # kafka only handles string bytes
    self.topic = self.kafka_client.topics[self.out_topic]
    print type(self.out_topic), self.out_topic
    uf.print_out('''
        Connected with Client on {}. Getting ready to produce messages to topic {}.
        Press Ctrl-C to interrupt.
        '''.format(self.kafka_client, self.topic.name))
    self.msg_cnt = 0
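# The constructor above expects a section (named by settings.PRODUCER_SIMULATE_USERS) in
# config/general.conf containing at least the four keys it reads. A minimal sketch of
# writing such a section with SafeConfigParser -- the section name and host values below
# are placeholders, not the project's actual configuration.
import configparser

sample = configparser.SafeConfigParser()
sample.add_section('producer_simulate_users')  # hypothetical section name
sample.set('producer_simulate_users', 'kafka_hosts', 'localhost:9092')
sample.set('producer_simulate_users', 'out_topic', 'user_subscriptions')
sample.set('producer_simulate_users', 'group', 'simulate_users_group')
sample.set('producer_simulate_users', 'zookeeper_hosts', 'localhost:2181')
with open('general.conf.sample', 'w') as cfg:
    sample.write(cfg)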
def create_schemas(server_ip, keyspace_name, tables=None, column_description=''):
    # Set up connection to the cassandra cluster
    cluster = Cluster([str(server_ip)])
    session = cluster.connect()

    # Create keyspace
    if not create_keyspace_if_not_exists(session, keyspace_name):
        uf.print_out('Keyspace {} already exists!'.format(keyspace_name))
    else:
        uf.print_out('Keyspace {} is now created!'.format(keyspace_name))
    session.set_keyspace(keyspace_name)

    # Create tables
    tables = tables or ['products', 'activities_events', 'dining_nightlife']
    column_description = column_description or \
        '''
        id bigint, merchant_id bigint, provider text, title text,
        category text, sub_category text, description text, fine_print text,
        price float, percentage_disc float, number_sold int,
        created_at timestamp, expires_at timestamp, updated_at timestamp,
        url text, online boolean,
        PRIMARY KEY (id, updated_at)
        '''
    for table in tables:
        create_tables(session, keyspace_name, table, column_description)
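# create_keyspace_if_not_exists is called above but not defined in this snippet.
# A minimal sketch, assuming a single-datacenter SimpleStrategy keyspace; the
# replication factor is an assumption.
def create_keyspace_if_not_exists(session, keyspace_name, replication_factor=3):
    ''' Create the keyspace if missing; return False if it already existed '''
    if keyspace_name in session.cluster.metadata.keyspaces:
        return False
    session.execute(
        '''
        CREATE KEYSPACE IF NOT EXISTS {} WITH replication =
        {{'class': 'SimpleStrategy', 'replication_factor': {}}}
        '''.format(keyspace_name, replication_factor)
    )
    return True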
def __init__(self, config, consumer_mode):
    ''' Init a consumer based on mode activated in input '''
    self.temp_file_path = None
    self.temp_file = None
    self.block_cnt = 0
    self.msg_cnt = 0
    self.config = config
    self.config_section = consumer_mode
    config_params = self.get_config_items()
    try:
        self.kafka_hosts = config_params['kafka_hosts']
        self.out_topic = str(config_params['out_topic'])
        self.group = str(config_params['hdfs_group'])
        self.hadoop_path = str(config_params['hadoop_path'])
        self.cached_path = str(config_params['cached_path'])
        self.zk_hosts = config_params['zookeeper_hosts']
    except KeyError:
        raise

    uf.print_out("Trying to make connection with params {}".format(config_params))
    self.client = KafkaClient(hosts=self.kafka_hosts)   # Create a client
    self.topic = self.client.topics[self.out_topic]     # Create topic if it does not exist
    self.consumer = self.topic.get_balanced_consumer(   # Zookeeper dynamically assigns partitions
        consumer_group=self.group,
        auto_commit_enable=True,
        zookeeper_connect=self.zk_hosts)
    uf.print_out("Made connection")
def flush_to_hdfs(self, output_dir):
    '''Flushes the file into HDFS.

    Flushes the file into two folders under HDFS - History (to rebuild the
    batch view if the source of truth is down) and Cache, which gets flushed
    at time intervals.

    Args:
        output_dir: temp folder before loading to HDFS

    Returns:
        None
    '''
    uf.print_out('Written {} to {}'.format(self.temp_file.tell(), self.temp_file_path))
    self.temp_file.close()

    timestamp = time.strftime('%Y%m%d%H%M%S')
    hadoop_fullpath = '{}/{}_{}_{}.dat'.format(self.hadoop_path, self.group,
                                               self.out_topic, timestamp)
    cached_fullpath = '{}/{}_{}_{}.dat'.format(self.cached_path, self.group,
                                               self.out_topic, timestamp)
    print "Block {}: Flushing 100MB file to HDFS => {}".format(str(self.block_cnt),
                                                               hadoop_fullpath)
    self.block_cnt += 1

    # place blocked messages into history and cached folders on hdfs
    os.system('hdfs dfs -put {} {}'.format(self.temp_file_path, hadoop_fullpath))
    os.system('hdfs dfs -put {} {}'.format(self.temp_file_path, cached_fullpath))
    # uf.print_out('Removing temporary file - {}'.format(os.path.basename(self.temp_file_path)))
    # os.remove(self.temp_file_path)

    timestamp = time.strftime('%Y%m%d%H%M%S')
    self.temp_file_path = '{}/kafka_{}_{}_{}.dat'.format(output_dir, self.out_topic,
                                                         self.group, timestamp)
    self.temp_file = open(self.temp_file_path, "w")
def consume_topic(self, output_dir):
    '''Consumes a stream of messages from the "messages" topic.

    Code template from https://github.com/aouyang1/PuppyPlaydate.git

    Args:
        output_dir: string representing the directory in which to buffer
            the 100MB file before transferring it to HDFS

    Returns:
        None
    '''
    timestamp = time.strftime('%Y%m%d%H%M%S')

    # open file for writing
    self.temp_file_path = '{}/kafka_{}_{}_{}.dat'.format(output_dir, self.out_topic,
                                                         self.group, timestamp)
    self.temp_file = open(self.temp_file_path, 'w')
    uf.print_out('Starting to consume and write to temp')
    while True:
        try:
            # get one consumer message - max_queued = 2000
            message = self.consumer.consume()
            # print('consuming {}....'.format(message.value))
            self.msg_cnt += 1
            # print "Message size {}".format(len(message), type(message))
            # uf.print_out(message.value)
            self.temp_file.write(message.value)
            self.temp_file.write('\n')

            uf.print_out('Consumed {}: File size now: {}KB'.format(self.msg_cnt,
                                                                   self.temp_file.tell() / 1000))
            # file size > 100MB
            if self.temp_file.tell() > 100000000:
                self.flush_to_hdfs(output_dir)
        except:
            print "In the except"
            raise
            # Balanced consumer restarts automatically.
def get_users_channels(self):
    """ Retrieve a user's assigned channels.

    Only for the engineered data. Ideally user selects his own channels.
    """
    if self._channels:
        return self._channels
    else:
        uf.print_out("[ERROR] - User: {} needs to subscribe first.".format(self._user))
def create_tables(session, keyspace, table_name, column_descrptn):
    ''' Create table '''
    session.execute(
        '''
        CREATE TABLE IF NOT EXISTS {}.{} ({})
        '''.format(keyspace, table_name, column_descrptn)
    )
    uf.print_out('Table {} in keyspace {} is now created!'.format(table_name, keyspace))
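# Hedged usage sketch: creating the keyspace and the default deal tables on a single
# Cassandra node. The server IP is a placeholder; the 'deals' keyspace name matches the
# one used by the other scripts in this project.
if __name__ == '__main__':
    create_schemas('127.0.0.1', 'deals')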
def consumer_url(self):
    ''' Consume a kafka message and get the url to fetch '''
    self.start_time = datetime.now()  # For logging
    uf.print_out("Inside Consumer url")
    while True:
        uf.print_out("Trying to consume message")
        message = self.consumer.consume()  # Read one message (url)
        uf.print_out(message.value)
        self.partitions.add(message.partition.id)
        self.get_category_deals(message)
def get_category_deals(self, msg):
    ''' Fetch all deals from url found in msg '''
    url = self.get_url_msg(msg)
    list_of_pages = self.get_pagenums_msg(msg)
    num_threads = len(list_of_pages)
    uf.print_out("Inside get_category_deals: {} \n{}".format(num_threads, url))
    if self.queue_urls(url, list_of_pages):
        for idx in xrange(num_threads):
            worker = Thread(target=self.fetch_request_data,
                            name='Thread-{}'.format(idx),
                            args=(list_of_pages[idx],))
            worker.setDaemon(True)
            worker.start()
    else:
        raise Queue.Full
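# get_url_msg, get_pagenums_msg and queue_urls are used above but not shown in this
# snippet. A minimal sketch, assuming the consumed Kafka message value is a JSON object
# of the form {"url": "...", "pages": [1, 2, ...]} and that `json` is imported at module
# level (it is used elsewhere in this class); the actual message schema may differ.
def get_url_msg(self, msg):
    ''' Extract the base url from a consumed message (hypothetical schema) '''
    return json.loads(msg.value)['url']

def get_pagenums_msg(self, msg):
    ''' Extract the list of page numbers from a consumed message (hypothetical schema) '''
    return json.loads(msg.value)['pages']

def queue_urls(self, url, list_of_pages):
    ''' Queue one url per page for the worker threads; the ';page=' separator mirrors
        the Sqoot-style query strings used elsewhere in this project (assumption) '''
    for page in list_of_pages:
        self.url_queue.put('{};page={}'.format(url, page))
    return not self.url_queue.empty()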
def simulate(self, num_of_users=1000000000):
    ''' Simulate users subscribing to channels '''
    with self.topic.get_producer() as prod:
        for num in xrange(1, num_of_users + 1):
            full_name = self._generate_random_name()
            num_channels = randint(1, self._max_num_channels)
            sub = SubscribeDeal(full_name)
            subscriptions = sub.subscribe(num_channels)

            # Produce the subscription object to producer
            print "Testing {}".format(subscriptions)
            prod.produce(subscriptions)
            uf.print_out("[SUCCESSFUL] - {} subscribed ==> {}.".format(sub.get_users_name(),
                                                                       sub.get_users_channels()))
            uf.print_out("[SUCCESSFUL] - {} Users written to producer".format(num))
def _filter_json_fields(self, all_deals):
    ''' Select only relevant json fields in deals '''
    for idx, deal in enumerate(all_deals):
        uf.print_out('Processing deal: {}'.format(idx))
        if deal:
            output = OrderedDict()
            output['id'] = deal['deal']['id']
            output['category'] = deal['deal']['category_slug']
            output['sub_category'] = deal['deal']['category_slug']
            output['title'] = deal['deal']['short_title']
            output['description'] = deal['deal']['description']
            output['fine_print'] = deal['deal']['fine_print']
            output['number_sold'] = deal['deal']['number_sold']
            output['url'] = deal['deal']['untracked_url']
            output['price'] = deal['deal']['price']
            output['discount_percentage'] = deal['deal']['discount_percentage']
            output['provider_name'] = deal['deal']['provider_name']
            output['online'] = deal['deal']['online']
            output['expires_at'] = deal['deal']['expires_at']
            output['created_at'] = deal['deal']['created_at']
            output['updated_at'] = deal['deal']['updated_at']
            output['merchant_id'] = deal['deal']['merchant']['id']

            # Online merchants have null fields. Change them to ''
            # and then flatten merchant info
            merchant_info = deal['deal']['merchant']
            if not all(merchant_info.values()):
                merchant_info = self._clean_merchant_info(merchant_info)
            output['merchant_name'] = merchant_info['name']
            output['merchant_address'] = merchant_info['address']
            output['merchant_locality'] = merchant_info['locality']
            output['merchant_region'] = merchant_info['region']
            output['merchant_postal_code'] = merchant_info['postal_code']
            output['merchant_country'] = merchant_info['country']
            output['merchant_latitude'] = merchant_info['latitude']
            output['merchant_longitude'] = merchant_info['longitude']
            output['merchant_phone_number'] = merchant_info['phone_number']
            yield output
        else:
            uf.print_out('[EMPTY DEAL] - Could not process: #{}'.format(idx))
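# _clean_merchant_info is referenced above but not shown here. A minimal sketch, assuming
# all it needs to do is replace null merchant fields with empty strings so the flattening
# step above never hits a missing value.
def _clean_merchant_info(self, merchant_info):
    ''' Replace None values in merchant info with empty strings (sketch) '''
    return {key: (value if value is not None else '')
            for key, value in merchant_info.iteritems()}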
def __init__(self, config, consumer_mode, to_producer=True):
    ''' Init a consumer based on mode activated in input '''
    self.config = config
    self.config_section = consumer_mode
    self.to_producer = to_producer
    config_params = self.get_config_items()
    try:
        self.kafka_hosts = config_params['kafka_hosts']
        self.in_topic = config_params['in_topic']
        self.out_topic = config_params['out_topic']
        self.group = config_params['in_group']
        self.zk_hosts = config_params['zookeeper_hosts']
    except KeyError:
        raise

    uf.print_out("Trying to make connection {}".format(self.in_topic))
    self.client = KafkaClient(hosts=self.kafka_hosts)   # Create a client
    self.topic = self.client.topics[self.in_topic]      # Create topic if it does not exist
    self.consumer = self.topic.get_balanced_consumer(   # Zookeeper dynamically assigns partitions
        consumer_group=self.group,
        auto_commit_enable=True,
        zookeeper_connect=self.zk_hosts)
    uf.print_out("Made connection")

    if self.to_producer:  # write into producer
        try:
            self.out_group = config_params['out_group']
            self.out_topic = self.client.topics[config_params['out_topic']]
        except KeyError:
            raise
    else:
        self.output = uf.mkdir_if_not_exist()  # write to /tmp/exstreamly_cheap
    uf.print_out("Created output file or producer stage")

    self.partitions = set()
    self.msg_cnt = 0  # Num consumed by instance.
    self.init_time = datetime.now()
    self.start_time = self.init_time
    self.url_queue = Queue(maxsize=0)  # infinitely sized
    self.semaphore = BoundedSemaphore()
from src.helper_modules import utility_functions as uf
from src.batch_process import hdfs_batch_process as hbp
from create_database_schema import *

# Constants definition
INPUT_KEY_CONVERTER = "com.parsely.spark.converters.FromUsersCQLKeyConverter"
INPUT_VALUE_CONVERTER = "com.parsely.spark.converters.FromUsersCQLValueConverter"
OUTPUT_KEY_CONVERTER = "com.parsely.spark.converters.ToCassandraCQLKeyConverter"
OUTPUT_VALUE_CONVERTER = "com.parsely.spark.converters.ToCassandraCQLValueConverter"

if __name__ == '__main__':
    cluster = Cluster(['172.31.2.39'])
    session = cluster.connect('deals')
    categories = ['merchants', 'dining_nightlife', 'activities_events', 'products']
    for category in categories:
        uf.print_out('Cleaning {} Table.'.format(category.capitalize()))
        file_name = ('hdfs://52.1.154.19:9000/exstreamly_cheap_files/'
                     'exstreamly_cheap_files/{}.json'.format(category))
        df_category = hbp.create_dataframe(file_name)
        df_category = df_category.dropna()
        df_category = hbp.remove_duplicate_deals(df_category)
        unique_vals = hbp.count_unique_rows(df_category)
        uf.print_out('Number of unique {} serving deals: {}'.format(category, unique_vals))

        # Insert dataFrames with all our categories into Cassandra
        df_category.registerTempTable('{}'.format(category))
        if category == 'merchants':
            df_category.select('id', 'name', 'address', 'postal_code', 'country',
                               'phone_number', 'region', 'longitude', 'latitude', 'url') \
                       .write.format('org.apache.spark.sql.cassandra') \
                       .options(table='merchants', keyspace='deals') \
                       .save(mode='append')
        else:  # other categories
            df_category.write.format('org.apache.spark.sql.cassandra') \
                       .options(table='{}'.format(category), keyspace='deals') \
                       .save(mode='append')
        # Insert everything into these specific query tables
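# The hdfs_batch_process helpers (imported as hbp) are not shown here. A minimal sketch
# under Spark 1.x, matching the registerTempTable() call above; the context names, app
# name and the de-duplication key ('id') are assumptions.
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName='hdfs_batch_process')
sql_context = SQLContext(sc)

def create_dataframe(file_name):
    ''' Load a newline-delimited JSON file from HDFS into a DataFrame '''
    return sql_context.read.json(file_name)

def remove_duplicate_deals(df):
    ''' Drop rows that share the same deal id, keeping one copy '''
    return df.dropDuplicates(['id'])

def count_unique_rows(df):
    ''' Count fully distinct rows in the DataFrame '''
    return df.distinct().count()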
def fetch_sqoot_data(base_url):
    ''' Fetch Sqoot data and save relevant information to file '''
    files_location = uf.mkdir_if_not_exist()  # Folder in /tmp/exstreamly_cheap_files
    merchants_file = os.path.join(files_location, 'merchants.json')
    products_file = os.path.join(files_location, 'products.json')
    events_file = os.path.join(files_location, 'activities_events.json')
    food_nitelife_file = os.path.join(files_location, 'dining_nitelife.json')

    categories_map = map_categories(base_url)
    mvp_categories = [u'product', u'dining-nightlife', u'activities-events']
    focus_grp = reduce_categories_scope(categories_map, mvp_categories)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(hours=7)
    all_deals = []
    queue = Queue.Queue()
    while start_time < end_time:
        try:
            # Due to api inconsistencies, always fetch the newest pages.
            # Duplicates will be batch processed in Spark.
            # Flatten JSON, keep online merchant ID in deals file.
            # Save merchant in merchant table.
            # first_100_deals = get_request(base_url, 'deals', 'per_page=100;radius=10000')
            # all_deals = all_deals + first_100_deals.json()['deals']
            uf.print_out('Crawling first 100 pages')
            for num in xrange(1, 101):
                uf.print_out('.' * num)
                thread_ = threading.Thread(target=get_request,
                                           name='Thread{}'.format(num),
                                           args=[base_url, 'deals',
                                                 'page={};per_page=100;radius=10000'.format(num),
                                                 queue])
                thread_.start()
                thread_.join()
            while not queue.empty():
                all_deals = all_deals + queue.get()

            for idx, deal in enumerate(all_deals):
                uf.print_out('Processing deal: {}'.format(idx))
                # If deal category belongs to mvp, save
                category = category_in_mvp(focus_grp, deal['deal']['category_slug'])
                if category:
                    output = OrderedDict()
                    output['id'] = deal['deal']['id']
                    output['category'] = category
                    output['sub_category'] = deal['deal']['category_slug']
                    output['title'] = deal['deal']['short_title']
                    output['description'] = deal['deal']['description']
                    output['fine_print'] = deal['deal']['fine_print']
                    output['number_sold'] = deal['deal']['number_sold']
                    output['url'] = deal['deal']['untracked_url']
                    output['price'] = deal['deal']['price']
                    output['discount_percentage'] = deal['deal']['discount_percentage']
                    output['provider_name'] = deal['deal']['provider_name']
                    output['online'] = deal['deal']['online']
                    output['expires_at'] = deal['deal']['expires_at']
                    output['created_at'] = deal['deal']['created_at']
                    output['updated_at'] = deal['deal']['updated_at']
                    output['merchant_id'] = deal['deal']['merchant']['id']

                    # Write deal to file
                    with open(os.path.join(files_location, str(category) + '.json'), 'a') as f:
                        f.write(json.dumps(output))
                        f.write('\n')

                    # Write merchant info file
                    merchant_info = deal['deal']['merchant']
                    if not all(merchant_info.values()):
                        merchant_info = clean_merchant_info(merchant_info)
                    with open(os.path.join(files_location, 'merchants.json'), 'a') as f:
                        f.write(json.dumps(merchant_info))
                        f.write('\n')

            start_time = datetime.datetime.now()
            uf.print_out("Time left: {} minute(s)".format((end_time - start_time).seconds / 60))
            uf.print_out("Waiting 30mins to crawl again")
            uf.spinning_cursor(1800)
        except rq.exceptions.ConnectionError:
            uf.print_out("[ConnectionError] ==> Issue with API server.")
        except rq.exceptions.ConnectTimeout:
            uf.print_out("[ConnectionTimeout] ==> Server connection timing out.")
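# get_request and category_in_mvp are defined elsewhere in this module. A minimal sketch,
# assuming a Sqoot-style endpoint of the form <base_url>/<resource>?<params>, that any
# API key is already embedded in the params string, and that focus_grp maps an MVP
# category name to the collection of slugs it covers -- all of these are assumptions.
def get_request(base_url, resource, params, queue=None):
    ''' Issue a GET request; if a queue is given, put the list of deals on it '''
    response = rq.get('{}/{}?{}'.format(base_url, resource, params))
    if queue is not None and response.ok:
        queue.put(response.json().get('deals', []))
    return response

def category_in_mvp(focus_grp, category_slug):
    ''' Map a deal's category_slug to one of the MVP categories, or return None '''
    for category, slugs in focus_grp.iteritems():
        if category_slug in slugs:
            return category
    return None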
def fetch_request_data(self, page_num):
    ''' Fetch request data from queued up urls '''
    uf.print_out("Inside fetch_request_data")
    while not self.url_queue.empty():
        uf.print_out("Trying to dequeue.... Is queue empty? {}".format(self.url_queue.empty()))
        url = self.url_queue.get()
        try:
            req = rq.get(url)
        except rq.exceptions.RequestException:
            continue
        if not req.ok:
            continue
        try:
            data = req.json()['deals']
        except simplejson.scanner.JSONDecodeError:
            continue
        if not data:  # JSON object OK but no deals.
            uf.print_out("No deals found on page {}. Continuing....".format(page_num))
            continue

        # Write deals to output one at a time
        for deal in self._filter_json_fields(data):
            self.msg_cnt += 1
            if self.to_producer:  # write to producer
                with self.out_topic.get_producer() as prod:
                    prod.produce(json.dumps(deal))
                uf.print_out("{} strings written to producer".format(self.msg_cnt))
            else:  # write to file
                uf.print_out("Waiting to acquire lock...")
                self.semaphore.acquire()  # Thread safe I/O write
                uf.print_out(" ==> Got the lock...")
                uf.print_out("Trying to write to file")
                with open('deals.json', 'a') as f:
                    f.write(json.dumps(deal))
                    f.write('\n')
                uf.print_out("{} strings written to file".format(self.msg_cnt))
                self.semaphore.release()
                uf.print_out(" ==> Released the lock...")
        self.url_queue.task_done()
if __name__ == '__main__':
    cluster = Cluster(['172.31.2.39'])
    session = cluster.connect('deals')

    locations = []
    for line in fetch_all_locations():
        locations.append(line)

    # Prepared statement for the users table
    query = 'INSERT INTO users (full_name, time_of_creation, latitude, longitude) VALUES (?,?,?,?)'
    prepared = session.prepare(query)

    # From the locations list, assign each user a random location
    for num in xrange(1, 1000000001):
        random_location = random.choice(locations)

        # Create user object
        user = UserProfile()
        user.assign_location(random_location)
        print repr(user)

        # Insert into DB
        uf.print_out('Inserting into database...')
        ts = uuid.uuid1()
        bound = prepared.bind((user.get_name(), ts, float(random_location[1]),
                               float(random_location[2])))
        session.execute(bound)
    session.shutdown()
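# fetch_all_locations is not shown in this snippet. A minimal sketch, assuming a local
# CSV file (path and column order are assumptions) with one "city,latitude,longitude"
# row per line, matching the [1]/[2] indexing used above.
import csv

def fetch_all_locations(file_path='locations.csv'):
    ''' Yield [city, latitude, longitude] rows from a CSV file (sketch) '''
    with open(file_path) as f:
        for row in csv.reader(f):
            if row:
                yield row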
def fetch_and_clean_up(index_name):
    """ Fetch Elastic data and clean it up """
    # Logstash and HDFS general info
    output_dir = uf.mkdir_if_not_exist("/tmp/exstreamly_cheap_files/elasticsearch_cleanup")
    # logstash_file = os.path.join(output_dir, 'clean_deals.json')

    # HDFS related data
    group = "deals_data_hdfs"
    topic_id = "elastic_deals_data"
    timestamp = time.strftime("%Y%m%d%H%M%S")
    hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(timestamp))
    hadoop_path = "/exstreamly_cheap_main_files/all_deals/history"
    cached_path = "/exstreamly_cheap_main_files/all_deals/cached"
    hadoop_fullpath = "{}/{}_{}_{}.dat".format(hadoop_path, group, topic_id, timestamp)
    cached_fullpath = "{}/{}_{}_{}.dat".format(cached_path, group, topic_id, timestamp)
    uf.print_out("Writing the logs to {} which will be pushed to hdfs and S3".format(hadoop_file))
    block_cnt = 0

    client = make_client()
    cc = Search(using=client, index=index_name)
    gen = cc.scan()

    config = configparser.SafeConfigParser()
    config.read("../../config/general.conf")
    config_params = uf.get_config_items(config, settings.PRODUCER_CLEAN_ES_DATA)
    try:
        kafka_hosts = config_params["kafka_hosts"]
        topic = config_params["topic"]
        group = config_params["group"]
        zk_hosts = config_params["zookeeper_hosts"]
    except KeyError:
        raise

    kafka_client = KafkaClient(hosts=kafka_hosts)
    kafka_topic = kafka_client.topics[topic]  # Create if not exist
    uf.print_out("Producing messages to topic {}. Press Ctrl-C to terminate".format(kafka_topic.name))

    # Produce to kafka for distributed consumption
    hdp_output = open(hadoop_file, "w")
    with kafka_topic.get_producer() as producer:
        for event in gen:
            new_string = dict(eval(event.message.encode("utf-8")))
            msg = clean_data(new_string)
            # We can decide to have logstash read from file instead
            # with open(logstash_file, 'a') as log_output:
            #     log_output.write(json.dumps(msg) + '\n')

            # Write to producer.
            producer.produce(json.dumps(msg))
            # Back up to file for HDFS and S3
            hdp_output.write(json.dumps(msg) + "\n")
            if hdp_output.tell() > 100000000:
                hdp_output.close()
                uf.print_out("Block {}: Flushing 100MB file to HDFS => {}".format(str(block_cnt),
                                                                                  hadoop_fullpath))
                # place blocked messages into history and cached folders on hdfs
                os.system("hdfs dfs -put {} {}".format(hadoop_file, hadoop_fullpath))
                os.system("hdfs dfs -put {} {}".format(hadoop_file, cached_fullpath))

                # Back up in S3
                uf.print_out("Syncing {} to S3 for back up".format(output_dir))
                os.system("aws s3 sync {} s3://emmanuel-awa/clean_data_from_elastic".format(output_dir))

                # Recreate file handler
                hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(time.strftime("%Y%m%d%H%M%S")))
                hdp_output = open(hadoop_file, "w")
                uf.print_out("Cleaned {} blocks. File size: {}KB".format(block_cnt, hdp_output.tell() / 1000))
                block_cnt += 1


if __name__ == "__main__":
    # Clean up both indexes in ES now and merge them as one
    for index in ["all_deals_data", "all_deals_data_index"]:
        uf.print_out("Processing {}....".format(index))
        fetch_and_clean_up(index)
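# make_client and clean_data are defined elsewhere in this module. A minimal sketch,
# assuming elasticsearch-py is installed, that the host below is a placeholder, and that
# clean_data only needs to strip empty values before the record is produced to Kafka.
from elasticsearch import Elasticsearch

def make_client(host='localhost', port=9200):
    ''' Build an Elasticsearch client for use with elasticsearch_dsl.Search '''
    return Elasticsearch([{'host': host, 'port': port}])

def clean_data(record):
    ''' Drop empty fields from a raw ES record (sketch) '''
    return {key: value for key, value in record.iteritems()
            if value not in (None, '', 'null')}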
def _generate_random_name(self):
    ''' Generate random full name.

    Using open source `names` library from treyhunner.
    '''
    return names.get_full_name()

def to_str(self, unicode_or_str):
    ''' Convert unicode to string to write to output '''
    if isinstance(unicode_or_str, unicode):
        val = unicode_or_str.encode('utf-8')
    else:
        val = unicode_or_str
    return val


if __name__ == '__main__':
    sim = SimulateInteraction()
    uf.print_out('[START] - {}....'.format((datetime.now()).strftime("%Y-%m-%dT%H:%M:%S%Z")))
    sim.simulate()
    uf.print_out('[FINISH] - {}....'.format((datetime.now()).strftime("%Y-%m-%dT%H:%M:%S%Z")))
def get_config_items(self):
    ''' Retrieve the config settings for the section applicable to this type of
        instance: group, in_topic and out_topic, if available
    '''
    try:
        return dict(self.config.items(self.config_section))
    except configparser.NoSectionError:
        raise configparser.NoSectionError('No section: {} exists in the config file'
                                          .format(self.config_section))


if __name__ == '__main__':
    tmp_out_dir = '/home/ubuntu/exstreamly_cheap_all_deals/ingestion/kafka_messages'
    tmp_out_dir = uf.mkdir_if_not_exist(tmp_out_dir)
    uf.print_out('Output format: {}'.format(tmp_out_dir))
    config = configparser.SafeConfigParser()
    config.read('../../config/general.conf')

    print '\nConsuming messages...'
    cons = ConsumerToHDFS(config, settings.CONSUMER_MODE_DATA)
    cons.consume_topic(tmp_out_dir)