def __init__(self):
    self._max_num_channels = 10  # User can sub to max 10 channels
    config = configparser.SafeConfigParser()
    config.read('../../config/general.conf')
    try:
        config_params = dict(config.items(settings.PRODUCER_SIMULATE_USERS))
        self.kafka_hosts = config_params['kafka_hosts']
        self.out_topic = config_params['out_topic']
        self.group = config_params['group']
        self.zk_hosts = config_params['zookeeper_hosts']
    except configparser.NoSectionError:
        raise configparser.NoSectionError('No section: {} exists in the config file'
                                          .format(settings.PRODUCER_SIMULATE_USERS))
    except KeyError:
        raise

    # Create kafka client and produce to topic if it exists, else create it.
    self.kafka_client = KafkaClient(hosts=self.kafka_hosts)
    self.out_topic = self.to_str(self.out_topic)  # kafka only handles string bytes
    self.topic = self.kafka_client.topics[self.out_topic]
    print type(self.out_topic), self.out_topic
    uf.print_out('''
        Connected with Client on {}. Getting ready to produce messages to topic {}.
        Press Ctrl-C to interrupt.
        '''.format(self.kafka_client, self.topic.name))
    self.msg_cnt = 0
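# The constructor above expects a section (named by settings.PRODUCER_SIMULATE_USERS) in
# config/general.conf containing at least the four keys it reads. A minimal sketch of
# writing such a section with SafeConfigParser -- the section name and host values below
# are placeholders, not the project's actual configuration.
import configparser

sample = configparser.SafeConfigParser()
sample.add_section('producer_simulate_users')  # hypothetical section name
sample.set('producer_simulate_users', 'kafka_hosts', 'localhost:9092')
sample.set('producer_simulate_users', 'out_topic', 'user_subscriptions')
sample.set('producer_simulate_users', 'group', 'simulate_users_group')
sample.set('producer_simulate_users', 'zookeeper_hosts', 'localhost:2181')
with open('general.conf.sample', 'w') as cfg:
    sample.write(cfg)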
def create_schemas(server_ip, keyspace_name, tables=None, column_description=''):
    # Set up connection to the cassandra cluster
    cluster = Cluster([str(server_ip)])
    session = cluster.connect()

    # Create keyspace
    if not create_keyspace_if_not_exists(session, keyspace_name):
        uf.print_out('Keyspace {} already exists!'.format(keyspace_name))
    else:
        uf.print_out('Keyspace {} is now created!'.format(keyspace_name))
    session.set_keyspace(keyspace_name)

    # Create tables
    tables = tables or ['products', 'activities_events', 'dining_nightlife']
    column_description = column_description or \
        '''
        id bigint, merchant_id bigint, provider text, title text,
        category text, sub_category text, description text, fine_print text,
        price float, percentage_disc float, number_sold int,
        created_at timestamp, expires_at timestamp, updated_at timestamp,
        url text, online boolean,
        PRIMARY KEY (id, updated_at)
        '''
    for table in tables:
        create_tables(session, keyspace_name, table, column_description)
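# create_keyspace_if_not_exists is called above but not defined in this snippet.
# A minimal sketch, assuming a single-datacenter SimpleStrategy keyspace; the
# replication factor is an assumption.
def create_keyspace_if_not_exists(session, keyspace_name, replication_factor=3):
    ''' Create the keyspace if missing; return False if it already existed '''
    if keyspace_name in session.cluster.metadata.keyspaces:
        return False
    session.execute(
        '''
        CREATE KEYSPACE IF NOT EXISTS {} WITH replication =
        {{'class': 'SimpleStrategy', 'replication_factor': {}}}
        '''.format(keyspace_name, replication_factor)
    )
    return True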
def __init__(self, config, consumer_mode):
    ''' Init a consumer based on mode activated in input '''
    self.temp_file_path = None
    self.temp_file = None
    self.block_cnt = 0
    self.msg_cnt = 0
    self.config = config
    self.config_section = consumer_mode
    config_params = self.get_config_items()
    try:
        self.kafka_hosts = config_params['kafka_hosts']
        self.out_topic = str(config_params['out_topic'])
        self.group = str(config_params['hdfs_group'])
        self.hadoop_path = str(config_params['hadoop_path'])
        self.cached_path = str(config_params['cached_path'])
        self.zk_hosts = config_params['zookeeper_hosts']
    except KeyError:
        raise

    uf.print_out("Trying to make connection with params {}".format(config_params))
    self.client = KafkaClient(hosts=self.kafka_hosts)   # Create a client
    self.topic = self.client.topics[self.out_topic]     # Create topic if it does not exist
    self.consumer = self.topic.get_balanced_consumer(   # Zookeeper dynamically assigns partitions
        consumer_group=self.group,
        auto_commit_enable=True,
        zookeeper_connect=self.zk_hosts)
    uf.print_out("Made connection")
def flush_to_hdfs(self, output_dir):
    '''Flushes the file into HDFS.

    Flushes the file into two folders under HDFS - History (to rebuild the
    batch view if the source of truth is down) and Cache, which gets flushed
    at time intervals.

    Args:
        output_dir: temp folder before loading to HDFS

    Returns:
        None
    '''
    uf.print_out('Written {} to {}'.format(self.temp_file.tell(), self.temp_file_path))
    self.temp_file.close()

    timestamp = time.strftime('%Y%m%d%H%M%S')
    hadoop_fullpath = '{}/{}_{}_{}.dat'.format(self.hadoop_path, self.group,
                                               self.out_topic, timestamp)
    cached_fullpath = '{}/{}_{}_{}.dat'.format(self.cached_path, self.group,
                                               self.out_topic, timestamp)
    print "Block {}: Flushing 100MB file to HDFS => {}".format(str(self.block_cnt),
                                                               hadoop_fullpath)
    self.block_cnt += 1

    # place blocked messages into history and cached folders on hdfs
    os.system('hdfs dfs -put {} {}'.format(self.temp_file_path, hadoop_fullpath))
    os.system('hdfs dfs -put {} {}'.format(self.temp_file_path, cached_fullpath))
    # uf.print_out('Removing temporary file - {}'.format(os.path.basename(self.temp_file_path)))
    # os.remove(self.temp_file_path)

    timestamp = time.strftime('%Y%m%d%H%M%S')
    self.temp_file_path = '{}/kafka_{}_{}_{}.dat'.format(output_dir, self.out_topic,
                                                         self.group, timestamp)
    self.temp_file = open(self.temp_file_path, "w")
def consume_topic(self, output_dir):
    '''Consumes a stream of messages from the "messages" topic.

    Code template from https://github.com/aouyang1/PuppyPlaydate.git

    Args:
        output_dir: string representing the directory in which to buffer
            the 100MB file before transferring it to HDFS

    Returns:
        None
    '''
    timestamp = time.strftime('%Y%m%d%H%M%S')

    # open file for writing
    self.temp_file_path = '{}/kafka_{}_{}_{}.dat'.format(output_dir, self.out_topic,
                                                         self.group, timestamp)
    self.temp_file = open(self.temp_file_path, 'w')
    uf.print_out('Starting to consume and write to temp')
    while True:
        try:
            # get one consumer message - max_queued = 2000
            message = self.consumer.consume()
            # print('consuming {}....'.format(message.value))
            self.msg_cnt += 1
            # print "Message size {}".format(len(message), type(message))
            # uf.print_out(message.value)
            self.temp_file.write(message.value)
            self.temp_file.write('\n')

            uf.print_out('Consumed {}: File size now: {}KB'.format(self.msg_cnt,
                                                                   self.temp_file.tell() / 1000))
            # file size > 100MB
            if self.temp_file.tell() > 100000000:
                self.flush_to_hdfs(output_dir)
        except:
            print "In the except"
            raise
            # Balanced consumer restarts automatically.
def get_users_channels(self):
    """ Retrieve a user's assigned channels.

    Only for the engineered data. Ideally user selects his own channels.
    """
    if self._channels:
        return self._channels
    else:
        uf.print_out("[ERROR] - User: {} needs to subscribe first.".format(self._user))
def create_tables(session, keyspace, table_name, column_descrptn):
    ''' Create table '''
    session.execute(
        '''
        CREATE TABLE IF NOT EXISTS {}.{} ({})
        '''.format(keyspace, table_name, column_descrptn)
    )
    uf.print_out('Table {} in keyspace {} is now created!'.format(table_name, keyspace))
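# Hedged usage sketch: creating the keyspace and the default deal tables on a single
# Cassandra node. The server IP is a placeholder; the 'deals' keyspace name matches the
# one used by the other scripts in this project.
if __name__ == '__main__':
    create_schemas('127.0.0.1', 'deals')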
def consumer_url(self):
    ''' Consume a kafka message and get the url to fetch '''
    self.start_time = datetime.now()  # For logging
    uf.print_out("Inside Consumer url")
    while True:
        uf.print_out("Trying to consume message")
        message = self.consumer.consume()  # Read one message (url)
        uf.print_out(message.value)
        self.partitions.add(message.partition.id)
        self.get_category_deals(message)
def get_category_deals(self, msg):
    ''' Fetch all deals from url found in msg '''
    url = self.get_url_msg(msg)
    list_of_pages = self.get_pagenums_msg(msg)
    num_threads = len(list_of_pages)
    uf.print_out("Inside get_category_deals: {} \n{}".format(num_threads, url))
    if self.queue_urls(url, list_of_pages):
        for idx in xrange(num_threads):
            worker = Thread(target=self.fetch_request_data,
                            name='Thread-{}'.format(idx),
                            args=(list_of_pages[idx],))
            worker.setDaemon(True)
            worker.start()
    else:
        raise Queue.Full
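# get_url_msg, get_pagenums_msg and queue_urls are used above but not shown in this
# snippet. A minimal sketch, assuming the consumed Kafka message value is a JSON object
# of the form {"url": "...", "pages": [1, 2, ...]} and that `json` is imported at module
# level (it is used elsewhere in this class); the actual message schema may differ.
def get_url_msg(self, msg):
    ''' Extract the base url from a consumed message (hypothetical schema) '''
    return json.loads(msg.value)['url']

def get_pagenums_msg(self, msg):
    ''' Extract the list of page numbers from a consumed message (hypothetical schema) '''
    return json.loads(msg.value)['pages']

def queue_urls(self, url, list_of_pages):
    ''' Queue one url per page for the worker threads; the ';page=' separator mirrors
        the Sqoot-style query strings used elsewhere in this project (assumption) '''
    for page in list_of_pages:
        self.url_queue.put('{};page={}'.format(url, page))
    return not self.url_queue.empty()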
def simulate(self, num_of_users=1000000000):
    ''' Simulate users subscribing to channels '''
    with self.topic.get_producer() as prod:
        for num in xrange(1, num_of_users + 1):
            full_name = self._generate_random_name()
            num_channels = randint(1, self._max_num_channels)
            sub = SubscribeDeal(full_name)
            subscriptions = sub.subscribe(num_channels)

            # Produce the subscription object to producer
            print "Testing {}".format(subscriptions)
            prod.produce(subscriptions)
            uf.print_out("[SUCCESSFUL] - {} subscribed ==> {}.".format(sub.get_users_name(),
                                                                       sub.get_users_channels()))
            uf.print_out("[SUCCESSFUL] - {} Users written to producer".format(num))
def _filter_json_fields(self, all_deals):
    ''' Select only relevant json fields in deals '''
    for idx, deal in enumerate(all_deals):
        uf.print_out('Processing deal: {}'.format(idx))
        if deal:
            output = OrderedDict()
            output['id'] = deal['deal']['id']
            output['category'] = deal['deal']['category_slug']
            output['sub_category'] = deal['deal']['category_slug']
            output['title'] = deal['deal']['short_title']
            output['description'] = deal['deal']['description']
            output['fine_print'] = deal['deal']['fine_print']
            output['number_sold'] = deal['deal']['number_sold']
            output['url'] = deal['deal']['untracked_url']
            output['price'] = deal['deal']['price']
            output['discount_percentage'] = deal['deal']['discount_percentage']
            output['provider_name'] = deal['deal']['provider_name']
            output['online'] = deal['deal']['online']
            output['expires_at'] = deal['deal']['expires_at']
            output['created_at'] = deal['deal']['created_at']
            output['updated_at'] = deal['deal']['updated_at']
            output['merchant_id'] = deal['deal']['merchant']['id']

            # Online merchants have null fields. Change them to ''
            # and then flatten merchant info
            merchant_info = deal['deal']['merchant']
            if not all(merchant_info.values()):
                merchant_info = self._clean_merchant_info(merchant_info)
            output['merchant_name'] = merchant_info['name']
            output['merchant_address'] = merchant_info['address']
            output['merchant_locality'] = merchant_info['locality']
            output['merchant_region'] = merchant_info['region']
            output['merchant_postal_code'] = merchant_info['postal_code']
            output['merchant_country'] = merchant_info['country']
            output['merchant_latitude'] = merchant_info['latitude']
            output['merchant_longitude'] = merchant_info['longitude']
            output['merchant_phone_number'] = merchant_info['phone_number']
            yield output
        else:
            uf.print_out('[EMPTY DEAL] - Could not process: #{}'.format(idx))
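# _clean_merchant_info is referenced above but not shown here. A minimal sketch, assuming
# all it needs to do is replace null merchant fields with empty strings so the flattening
# step above never hits a missing value.
def _clean_merchant_info(self, merchant_info):
    ''' Replace None values in merchant info with empty strings (sketch) '''
    return {key: (value if value is not None else '')
            for key, value in merchant_info.iteritems()}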
def __init__(self, config, consumer_mode, to_producer=True):
    ''' Init a consumer based on mode activated in input '''
    self.config = config
    self.config_section = consumer_mode
    self.to_producer = to_producer
    config_params = self.get_config_items()
    try:
        self.kafka_hosts = config_params['kafka_hosts']
        self.in_topic = config_params['in_topic']
        self.out_topic = config_params['out_topic']
        self.group = config_params['in_group']
        self.zk_hosts = config_params['zookeeper_hosts']
    except KeyError:
        raise

    uf.print_out("Trying to make connection {}".format(self.in_topic))
    self.client = KafkaClient(hosts=self.kafka_hosts)   # Create a client
    self.topic = self.client.topics[self.in_topic]      # Create topic if it does not exist
    self.consumer = self.topic.get_balanced_consumer(   # Zookeeper dynamically assigns partitions
        consumer_group=self.group,
        auto_commit_enable=True,
        zookeeper_connect=self.zk_hosts)
    uf.print_out("Made connection")

    if self.to_producer:  # write into producer
        try:
            self.out_group = config_params['out_group']
            self.out_topic = self.client.topics[config_params['out_topic']]
        except KeyError:
            raise
    else:
        self.output = uf.mkdir_if_not_exist()  # write to /tmp/exstreamly_cheap
    uf.print_out("Created output file or producer stage")

    self.partitions = set()
    self.msg_cnt = 0  # Num consumed by instance.
    self.init_time = datetime.now()
    self.start_time = self.init_time
    self.url_queue = Queue(maxsize=0)  # infinitely sized
    self.semaphore = BoundedSemaphore()
from src.helper_modules import utility_functions as uf
from src.batch_process import hdfs_batch_process as hbp
from create_database_schema import *

# Constants definition
INPUT_KEY_CONVERTER = "com.parsely.spark.converters.FromUsersCQLKeyConverter"
INPUT_VALUE_CONVERTER = "com.parsely.spark.converters.FromUsersCQLValueConverter"
OUTPUT_KEY_CONVERTER = "com.parsely.spark.converters.ToCassandraCQLKeyConverter"
OUTPUT_VALUE_CONVERTER = "com.parsely.spark.converters.ToCassandraCQLValueConverter"

if __name__ == '__main__':
    cluster = Cluster(['172.31.2.39'])
    session = cluster.connect('deals')
    categories = ['merchants', 'dining_nightlife', 'activities_events', 'products']
    for category in categories:
        uf.print_out('Cleaning {} Table.'.format(category.capitalize()))
        file_name = ('hdfs://52.1.154.19:9000/exstreamly_cheap_files/'
                     'exstreamly_cheap_files/{}.json'.format(category))
        df_category = hbp.create_dataframe(file_name)
        df_category = df_category.dropna()
        df_category = hbp.remove_duplicate_deals(df_category)
        unique_vals = hbp.count_unique_rows(df_category)
        uf.print_out('Number of unique {} serving deals: {}'.format(category, unique_vals))

        # Insert dataFrames with all our categories into Cassandra
        df_category.registerTempTable('{}'.format(category))
        if category == 'merchants':
            df_category.select('id', 'name', 'address', 'postal_code', 'country',
                               'phone_number', 'region', 'longitude', 'latitude', 'url') \
                       .write.format('org.apache.spark.sql.cassandra') \
                       .options(table='merchants', keyspace='deals') \
                       .save(mode='append')
        else:  # other categories
            df_category.write.format('org.apache.spark.sql.cassandra') \
                       .options(table='{}'.format(category), keyspace='deals') \
                       .save(mode='append')
        # Insert everything into these specific query tables
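# The hdfs_batch_process helpers (imported as hbp) are not shown here. A minimal sketch
# under Spark 1.x, matching the registerTempTable() call above; the context names, app
# name and the de-duplication key ('id') are assumptions.
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName='hdfs_batch_process')
sql_context = SQLContext(sc)

def create_dataframe(file_name):
    ''' Load a newline-delimited JSON file from HDFS into a DataFrame '''
    return sql_context.read.json(file_name)

def remove_duplicate_deals(df):
    ''' Drop rows that share the same deal id, keeping one copy '''
    return df.dropDuplicates(['id'])

def count_unique_rows(df):
    ''' Count fully distinct rows in the DataFrame '''
    return df.distinct().count()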
def fetch_sqoot_data(base_url):
    ''' Fetch Sqoot data and save relevant information to file '''
    files_location = uf.mkdir_if_not_exist()  # Folder in /tmp/exstreamly_cheap_files
    merchants_file = os.path.join(files_location, 'merchants.json')
    products_file = os.path.join(files_location, 'products.json')
    events_file = os.path.join(files_location, 'activities_events.json')
    food_nitelife_file = os.path.join(files_location, 'dining_nitelife.json')

    categories_map = map_categories(base_url)
    mvp_categories = [u'product', u'dining-nightlife', u'activities-events']
    focus_grp = reduce_categories_scope(categories_map, mvp_categories)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(hours=7)
    all_deals = []
    queue = Queue.Queue()
    while start_time < end_time:
        try:
            # Due to api inconsistencies, always fetch the newest pages.
            # Duplicates will be batch processed in Spark.
            # Flatten JSON, keep online merchant ID in deals file.
            # Save merchant in merchant table.
            # first_100_deals = get_request(base_url, 'deals', 'per_page=100;radius=10000')
            # all_deals = all_deals + first_100_deals.json()['deals']
            uf.print_out('Crawling first 100 pages')
            for num in xrange(1, 101):
                uf.print_out('.' * num)
                thread_ = threading.Thread(target=get_request,
                                           name='Thread{}'.format(num),
                                           args=[base_url, 'deals',
                                                 'page={};per_page=100;radius=10000'.format(num),
                                                 queue])
                thread_.start()
                thread_.join()
            while not queue.empty():
                all_deals = all_deals + queue.get()

            for idx, deal in enumerate(all_deals):
                uf.print_out('Processing deal: {}'.format(idx))
                # If deal category belongs to mvp, save
                category = category_in_mvp(focus_grp, deal['deal']['category_slug'])
                if category:
                    output = OrderedDict()
                    output['id'] = deal['deal']['id']
                    output['category'] = category
                    output['sub_category'] = deal['deal']['category_slug']
                    output['title'] = deal['deal']['short_title']
                    output['description'] = deal['deal']['description']
                    output['fine_print'] = deal['deal']['fine_print']
                    output['number_sold'] = deal['deal']['number_sold']
                    output['url'] = deal['deal']['untracked_url']
                    output['price'] = deal['deal']['price']
                    output['discount_percentage'] = deal['deal']['discount_percentage']
                    output['provider_name'] = deal['deal']['provider_name']
                    output['online'] = deal['deal']['online']
                    output['expires_at'] = deal['deal']['expires_at']
                    output['created_at'] = deal['deal']['created_at']
                    output['updated_at'] = deal['deal']['updated_at']
                    output['merchant_id'] = deal['deal']['merchant']['id']

                    # Write deal to file
                    with open(os.path.join(files_location, str(category) + '.json'), 'a') as f:
                        f.write(json.dumps(output))
                        f.write('\n')

                    # Write merchant info file
                    merchant_info = deal['deal']['merchant']
                    if not all(merchant_info.values()):
                        merchant_info = clean_merchant_info(merchant_info)
                    with open(os.path.join(files_location, 'merchants.json'), 'a') as f:
                        f.write(json.dumps(merchant_info))
                        f.write('\n')

            start_time = datetime.datetime.now()
            uf.print_out("Time left: {} minute(s)".format((end_time - start_time).seconds / 60))
            uf.print_out("Waiting 30mins to crawl again")
            uf.spinning_cursor(1800)
        except rq.exceptions.ConnectionError:
            uf.print_out("[ConnectionError] ==> Issue with API server.")
        except rq.exceptions.ConnectTimeout:
            uf.print_out("[ConnectionTimeout] ==> Server connection timing out.")
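# get_request and category_in_mvp are defined elsewhere in this module. A minimal sketch,
# assuming a Sqoot-style endpoint of the form <base_url>/<resource>?<params>, that any
# API key is already embedded in the params string, and that focus_grp maps an MVP
# category name to the collection of slugs it covers -- all of these are assumptions.
def get_request(base_url, resource, params, queue=None):
    ''' Issue a GET request; if a queue is given, put the list of deals on it '''
    response = rq.get('{}/{}?{}'.format(base_url, resource, params))
    if queue is not None and response.ok:
        queue.put(response.json().get('deals', []))
    return response

def category_in_mvp(focus_grp, category_slug):
    ''' Map a deal's category_slug to one of the MVP categories, or return None '''
    for category, slugs in focus_grp.iteritems():
        if category_slug in slugs:
            return category
    return None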
def fetch_request_data(self, page_num):
    ''' Fetch request data from queued up urls '''
    uf.print_out("Inside fetch_request_data")
    while not self.url_queue.empty():
        uf.print_out("Trying to dequeue.... Is queue empty? {}".format(self.url_queue.empty()))
        url = self.url_queue.get()
        try:
            req = rq.get(url)
        except rq.exceptions.RequestException:
            continue
        if not req.ok:
            continue
        try:
            data = req.json()['deals']
        except simplejson.scanner.JSONDecodeError:
            continue
        if not data:  # JSON object OK but no deals.
            uf.print_out("No deals found on page {}. Continuing....".format(page_num))
            continue

        # Write deals to output one at a time
        for deal in self._filter_json_fields(data):
            self.msg_cnt += 1
            if self.to_producer:  # write to producer
                with self.out_topic.get_producer() as prod:
                    prod.produce(json.dumps(deal))
                uf.print_out("{} strings written to producer".format(self.msg_cnt))
            else:  # write to file
                uf.print_out("Waiting to acquire lock...")
                self.semaphore.acquire()  # Thread safe I/O write
                uf.print_out(" ==> Got the lock...")
                uf.print_out("Trying to write to file")
                with open('deals.json', 'a') as f:
                    f.write(json.dumps(deal))
                    f.write('\n')
                uf.print_out("{} strings written to file".format(self.msg_cnt))
                self.semaphore.release()
                uf.print_out(" ==> Released the lock...")
        self.url_queue.task_done()
if __name__ == '__main__':
    cluster = Cluster(['172.31.2.39'])
    session = cluster.connect('deals')

    locations = []
    for line in fetch_all_locations():
        locations.append(line)

    # Prepared statement for the users table
    query = 'INSERT INTO users (full_name, time_of_creation, latitude, longitude) VALUES (?,?,?,?)'
    prepared = session.prepare(query)

    # From the locations list, assign each user a random location
    for num in xrange(1, 1000000001):
        random_location = random.choice(locations)

        # Create user object
        user = UserProfile()
        user.assign_location(random_location)
        print repr(user)

        # Insert into DB
        uf.print_out('Inserting into database...')
        ts = uuid.uuid1()
        bound = prepared.bind((user.get_name(), ts, float(random_location[1]),
                               float(random_location[2])))
        session.execute(bound)
    session.shutdown()
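# fetch_all_locations is not shown in this snippet. A minimal sketch, assuming a local
# CSV file (path and column order are assumptions) with one "city,latitude,longitude"
# row per line, matching the [1]/[2] indexing used above.
import csv

def fetch_all_locations(file_path='locations.csv'):
    ''' Yield [city, latitude, longitude] rows from a CSV file (sketch) '''
    with open(file_path) as f:
        for row in csv.reader(f):
            if row:
                yield row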
def fetch_and_clean_up(index_name):
    """ Fetch Elastic data and clean it up """
    # Logstash and HDFS general info
    output_dir = uf.mkdir_if_not_exist("/tmp/exstreamly_cheap_files/elasticsearch_cleanup")
    # logstash_file = os.path.join(output_dir, 'clean_deals.json')

    # HDFS related data
    group = "deals_data_hdfs"
    topic_id = "elastic_deals_data"
    timestamp = time.strftime("%Y%m%d%H%M%S")
    hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(timestamp))
    hadoop_path = "/exstreamly_cheap_main_files/all_deals/history"
    cached_path = "/exstreamly_cheap_main_files/all_deals/cached"
    hadoop_fullpath = "{}/{}_{}_{}.dat".format(hadoop_path, group, topic_id, timestamp)
    cached_fullpath = "{}/{}_{}_{}.dat".format(cached_path, group, topic_id, timestamp)
    uf.print_out("Writing the logs to {} which will be pushed to hdfs and S3".format(hadoop_file))
    block_cnt = 0

    client = make_client()
    cc = Search(using=client, index=index_name)
    gen = cc.scan()

    config = configparser.SafeConfigParser()
    config.read("../../config/general.conf")
    config_params = uf.get_config_items(config, settings.PRODUCER_CLEAN_ES_DATA)
    try:
        kafka_hosts = config_params["kafka_hosts"]
        topic = config_params["topic"]
        group = config_params["group"]
        zk_hosts = config_params["zookeeper_hosts"]
    except KeyError:
        raise

    kafka_client = KafkaClient(hosts=kafka_hosts)
    kafka_topic = kafka_client.topics[topic]  # Create if not exist
    uf.print_out("Producing messages to topic {}. Press Ctrl-C to terminate".format(kafka_topic.name))

    # Produce to kafka for distributed consumption
    hdp_output = open(hadoop_file, "w")
    with kafka_topic.get_producer() as producer:
        for event in gen:
            new_string = dict(eval(event.message.encode("utf-8")))
            msg = clean_data(new_string)
            # We can decide to have logstash read from file instead
            # with open(logstash_file, 'a') as log_output:
            #     log_output.write(json.dumps(msg) + '\n')

            # Write to producer.
            producer.produce(json.dumps(msg))
            # Back up to file for HDFS and S3
            hdp_output.write(json.dumps(msg) + "\n")
            if hdp_output.tell() > 100000000:
                hdp_output.close()
                uf.print_out("Block {}: Flushing 100MB file to HDFS => {}".format(str(block_cnt),
                                                                                  hadoop_fullpath))
                # place blocked messages into history and cached folders on hdfs
                os.system("hdfs dfs -put {} {}".format(hadoop_file, hadoop_fullpath))
                os.system("hdfs dfs -put {} {}".format(hadoop_file, cached_fullpath))

                # Back up in S3
                uf.print_out("Syncing {} to S3 for back up".format(output_dir))
                os.system("aws s3 sync {} s3://emmanuel-awa/clean_data_from_elastic".format(output_dir))

                # Recreate file handler
                hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(time.strftime("%Y%m%d%H%M%S")))
                hdp_output = open(hadoop_file, "w")
                uf.print_out("Cleaned {} blocks. File size: {}KB".format(block_cnt, hdp_output.tell() / 1000))
                block_cnt += 1


if __name__ == "__main__":
    # Clean up both indexes in ES now and merge them as one
    for index in ["all_deals_data", "all_deals_data_index"]:
        uf.print_out("Processing {}....".format(index))
        fetch_and_clean_up(index)
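# make_client and clean_data are defined elsewhere in this module. A minimal sketch,
# assuming elasticsearch-py is installed, that the host below is a placeholder, and that
# clean_data only needs to strip empty values before the record is produced to Kafka.
from elasticsearch import Elasticsearch

def make_client(host='localhost', port=9200):
    ''' Build an Elasticsearch client for use with elasticsearch_dsl.Search '''
    return Elasticsearch([{'host': host, 'port': port}])

def clean_data(record):
    ''' Drop empty fields from a raw ES record (sketch) '''
    return {key: value for key, value in record.iteritems()
            if value not in (None, '', 'null')}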
def _generate_random_name(self):
    ''' Generate random full name.

    Using open source `names` library from treyhunner.
    '''
    return names.get_full_name()

def to_str(self, unicode_or_str):
    ''' Convert unicode to string to write to output '''
    if isinstance(unicode_or_str, unicode):
        val = unicode_or_str.encode('utf-8')
    else:
        val = unicode_or_str
    return val


if __name__ == '__main__':
    sim = SimulateInteraction()
    uf.print_out('[START] - {}....'.format((datetime.now()).strftime("%Y-%m-%dT%H:%M:%S%Z")))
    sim.simulate()
    uf.print_out('[FINISH] - {}....'.format((datetime.now()).strftime("%Y-%m-%dT%H:%M:%S%Z")))
def get_config_items(self):
    ''' Retrieve the config settings for the section applicable to this type of
        instance: group, in_topic and out_topic, if available
    '''
    try:
        return dict(self.config.items(self.config_section))
    except configparser.NoSectionError:
        raise configparser.NoSectionError('No section: {} exists in the config file'
                                          .format(self.config_section))


if __name__ == '__main__':
    tmp_out_dir = '/home/ubuntu/exstreamly_cheap_all_deals/ingestion/kafka_messages'
    tmp_out_dir = uf.mkdir_if_not_exist(tmp_out_dir)
    uf.print_out('Output format: {}'.format(tmp_out_dir))
    config = configparser.SafeConfigParser()
    config.read('../../config/general.conf')

    print '\nConsuming messages...'
    cons = ConsumerToHDFS(config, settings.CONSUMER_MODE_DATA)
    cons.consume_topic(tmp_out_dir)