Example No. 1
 def ingest(self):                    
     debug_print("Ingesting directory {}".format(self.directory))               
     debug_print("Ingesting the files \n{}".format(self.files))                    
     is_lambda =  self.context[c.KEY_LAMBDA_FUNCTION] is not None
     timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
     target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
     compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
     sep = self.context[c.KEY_SEPERATOR_PARTITION]
     memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]             
     memory_used = mutil.get_memory_usage()             
     main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)        
     main_file_data = self.__append(None, main_file_data)           
     keys_ingested = []
     for file in self.files:
         debug_print("\tProcessing file {}".format(file))
         key_parts = KeyParts(file, sep)
         duration = datetime.datetime.utcnow() - key_parts.filename_timestamp            
         if duration.total_seconds() < 300:
             debug_print("The file '{}' is {}s old.  It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
             continue
         keys_ingested.append(file)            
         data = self.__open(file, main_file_data)
         if data is None:
             continue            
         size_in_megabytes = self.__size(file)            
         main_file_data = self.__append(main_file_data, data) 
         del data
         gc.collect()
         current_dataframe_size = sys.getsizeof(main_file_data)        
         #break conditions
         #1. Memory limit exceeded
         #2. Time window exceeded
         #3. Target excretion size hit
         main_file_size_mb += size_in_megabytes
         memory_used = mutil.get_memory_usage()           
         debug_print("\t\tSize on S3: {}MB Size of new dataset: {}bytes Estimated Compression Ratio: {} Memory Used: {}% Project Compression Size {}MB  Target Excretion Size {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
         if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size :
             print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
             break                
     
     #only save the files if we have a reasonable amount of time remaining before the lambda timeout.
     debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))    
     debug_print("There were {} keys ingested.  The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested))
     if len(keys_ingested)>0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:            
         main_file_data = self.__convert_to_submission_df(main_file_data)
         gc.collect()
         self.__excret(self.directory, main_filename, main_file_data, sep)            
         self.__delete_keys(keys_ingested)
     elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:            
         print "Time has run out!  We have less than {} seconds remaining before this lambda times out.  Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
         print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout) 
         print "S3 Save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS) 
         print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))                        
        
     remaining_files = list(set(self.files) - set(keys_ingested))
     if len(remaining_files) > 0:        
         debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files))               
         self.__add_to_sqs(remaining_files)        
     print "I've consumed everything I can in bucket '{}'".format(self.directory)
     return
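The ingest method above budgets all of its work with util.elapsed(self.context) and util.time_remaining(self.context), and derives its aggregation window from the maximum lambda execution time (the timeout branch logs it as "Max Lambda Execution Time * c.RATIO_OF_MAX_LAMBDA_TIME"). A minimal sketch of what those timing helpers might look like; the constant names mirror the c module used in these examples, but the bodies and the ratio value are illustrative assumptions, not the library's actual implementation:

import time

# Assumed stand-ins for the constants referenced via the c module above.
KEY_START_TIME = 'start_time'
KEY_MAX_LAMBDA_TIME = 'max_lambda_time'
RATIO_OF_MAX_LAMBDA_TIME = 0.8  # illustrative value only

def elapsed(context):
    # Seconds since the lambda handler recorded its start time in the context.
    return time.time() - context[KEY_START_TIME]

def time_remaining(context):
    # Seconds left before the configured maximum lambda execution time is reached.
    return context[KEY_MAX_LAMBDA_TIME] - elapsed(context)

def calculate_aggregate_window_timeout(max_lambda_time):
    # The aggregation window is a fixed fraction of the maximum lambda time.
    return max_lambda_time * RATIO_OF_MAX_LAMBDA_TIME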
Example No. 2
def flush_and_delete(context, metric_sets):
    if len(metric_sets)>0:
        print "[{}]Time remaining {} seconds".format(context[c.KEY_REQUEST_ID], util.time_remaining(context))
        print "[{}]Saving aggregated metrics to S3 bucket '{}'".format(context[c.KEY_REQUEST_ID], os.environ[c.RES_S3_STORAGE])
        context[c.CW_ATTR_SAVE_DURATION] = save_duration = flush(context, metric_sets)   
        #Delete only messages that succeeded across every partition/schema save.
        #A message payload can span multiple partitions/schemas, so a partially
        #processed message is re-queued rather than deleted.  Re-processing it may
        #duplicate some data, but that is preferable to deleting the message and
        #losing the unsaved portion.
        context[c.KEY_THREAD_POOL].wait()
        print "[{}]Time remaining {} seconds".format(context[c.KEY_REQUEST_ID], util.time_remaining(context))
        print "[{}]Deleting messages from SQS queue '{}'".format(context[c.KEY_REQUEST_ID],context[c.KEY_SQS].queue_url)
        delete_duration = context[c.KEY_SQS].delete_message_batch(context[c.KEY_SUCCEEDED_MSG_IDS])
        write_cloudwatch_metrics(context, save_duration, delete_duration)
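flush_and_delete only deletes messages whose IDs were recorded as having succeeded, and a single SQS DeleteMessageBatch call accepts at most ten entries. A sketch of how a queue wrapper's delete_message_batch might chunk those deletions with boto3; the helper function and the assumption that each recorded entry already carries an Id and ReceiptHandle are illustrative, not the original code:

import time
import boto3

def delete_message_batch(queue_url, succeeded_entries):
    # succeeded_entries: list of {'Id': ..., 'ReceiptHandle': ...} dicts
    # accumulated while the partition saves were running.
    sqs = boto3.client('sqs')
    start = time.time()
    for i in range(0, len(succeeded_entries), 10):  # SQS allows 10 entries per batch
        batch = succeeded_entries[i:i + 10]
        sqs.delete_message_batch(QueueUrl=queue_url, Entries=batch)
    return time.time() - start  # duration, analogous to delete_duration above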
Example No. 3
 def __break_retry(self, backoff, current_attempt_number):        
     time_remaining = util.time_remaining(self.__context)
     print "The lambda has {} seconds remaining. It started at {}.".format(time_remaining, datetime.datetime.fromtimestamp(self.__context[c.KEY_START_TIME]).strftime('%Y-%m-%d %H:%M:%S'))        
     if time_remaining - backoff < self.__context[c.CW_ATTR_SAVE_DURATION] or current_attempt_number >= self.__context[c.KEY_BACKOFF_MAX_TRYS]:
         print "Breaking the lock attempt as the lambda is nearing timeout or the maximum retry limit has been hit.  The lambda has {} seconds remaining.  There have been {} attempts made.".format(time_remaining, current_attempt_number)
         return True
     return False
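A hypothetical caller for __break_retry, showing how it might sit inside a lock-acquisition loop with exponential backoff. The acquire_lock helper and the backoff schedule are assumptions added for illustration; only __break_retry comes from the example above:

 def __try_acquire_lock(self):
     import random
     import time
     attempt = 0
     while not self.acquire_lock():  # hypothetical single lock attempt
         attempt += 1
         backoff = min(2 ** attempt, 30) * random.uniform(0.5, 1.0)
         if self.__break_retry(backoff, attempt):
             return False  # give up: near timeout or maximum retries reached
         time.sleep(backoff)
     return True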
Example No. 4
def save(context, metric_sets, partition, schema_hash):
    util.debug_print("\t{}:".format(partition))
    for schema_hash, dict in metric_sets[partition].iteritems():
        if util.time_remaining(context) <= (
                context[c.CW_ATTR_DELETE_DURATION] + 20):
            break
        columns = dict[c.KEY_SET_COLUMNS]
        if len(dict[c.KEY_SET]) == 0:
            continue
        values = dict[c.KEY_SET].values()
        set = pd.DataFrame(values, columns=columns)
        util.debug_print("\t\t{}:".format(schema_hash))
        path = create_file_path(partition, schema_hash,
                                context[c.KEY_SEPERATOR_PARTITION])
        util.debug_print("Writing to path '{}' with set:\n {}".format(
            path, set))
        elapsed = 0
        try:
            util.debug_print("Writing to partition '{}'".format(partition))

            write(context[c.KEY_METRIC_BUCKET], path, set,
                  context[c.KEY_SEPERATOR_PARTITION],
                  schema.object_encoding(columns))
            context[c.KEY_SUCCEEDED_MSG_IDS] += dict[c.KEY_MSG_IDS]
        except Exception as e:
            print "[{}]An error occured writing to path '{}'.\nSet: {}\nError: \n{}".format(
                context[c.KEY_REQUEST_ID], path, set, traceback.format_exc())
            raise e
        finally:
            number_of_rows = len(values)
            if c.INFO_TOTAL_ROWS not in context[c.KEY_AGGREGATOR].info:
                context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] = 0
            context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] += number_of_rows
            del set
            del columns
Example No. 5
    def __proto_http(self, scheme, address, port, timeout=None):
        url = scheme + "://" + address + ":" + str(port) + "/"
        request = urllib2.Request(url=url, headers=self._http_headers)
        result = None
        response = None
        t1 = time.time()

        try:
            # response is a urllib.addinfourl object
            response = self._proxy_url.open_url(request, timeout=timeout)
        except HTTPError as http_err:
            # HTTPError implies a response from an HTTP server
            # (but indicating something other than success), and
            # since it is also a urllib.addinfourl object, it can be
            # handled just like a normal response.
            response = http_err
        except Exception as err:
            logger.debug("%s failed: %s" % (url, err))

        if response:
            try:
                status_code = response.getcode()
                logger.debug("%s returned HTTP status code %s" % (url, status_code))
                result = True
                remaining = util.time_remaining(t1, timeout)
                self._process_response(response, scheme, address, port, remaining, self._args.download_limit)
            finally:
                response.close()
        return result
Example No. 6
def save(context, metric_sets, partition, schema_hash):
    util.debug_print("\t{}:".format(partition))
    paths = []
    for schema_hash, dict in iteritems(metric_sets[partition]):
        if util.time_remaining(context) <= (
                context[c.CW_ATTR_DELETE_DURATION] + 20):
            break
        columns = dict[c.KEY_SET_COLUMNS]
        if len(dict[c.KEY_SET]) == 0:
            continue
        values = dict[c.KEY_SET].values()
        set = pd.DataFrame(values, columns=columns)

        util.debug_print("\t\t{}:".format(schema_hash))
        path = create_file_path(partition, schema_hash,
                                context[c.KEY_SEPERATOR_PARTITION])
        paths.append(path)
        util.debug_print("Writing to path '{}' with set:\n {}".format(
            path, set))
        elapsed = 0
        try:
            util.debug_print("Writing to partition '{}'".format(partition))
            write(context[c.KEY_METRIC_BUCKET], path, set,
                  context[c.KEY_SEPERATOR_PARTITION],
                  schema.object_encoding(columns))
            handoff_event_to_emitter(context, context[c.KEY_METRIC_BUCKET],
                                     path, set)
            context[c.KEY_SUCCEEDED_MSG_IDS] += dict[c.KEY_MSG_IDS]
            util.debug_print("Save complete to path '{}'".format(path))
        except Exception as e:
            print(
                "[{}]An error occured writing to path '{}'.\nSet: {}\nError: \n{}"
                .format(context[c.KEY_REQUEST_ID], path, set,
                        traceback.format_exc()))
            raise e
        finally:
            number_of_rows = len(values)
            if c.INFO_TOTAL_ROWS not in context[c.KEY_AGGREGATOR].info:
                context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] = 0
            context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] += number_of_rows
            del set
            del values
            del columns
            gc.collect()
    try:
        util.debug_print("Adding amoeba message to SQS queue '{}'".format(
            context[c.KEY_SQS_AMOEBA].queue_url))
        context[c.KEY_SQS_AMOEBA].send_generic_message(
            json.dumps({"paths": paths}))
    except Exception as e:
        print(
            "[{}]An error occured writing messages to the Amoeba SQS queue. \nError: \n{}"
            .format(context[c.KEY_REQUEST_ID], traceback.format_exc()))
        raise e
Example No. 7
 def time_remaining(self):
     remaining = None
     self.acquire()
     try:
         remaining = util.time_remaining(self._start_time, self._timeout)
         if remaining:
             logger.debug("Probe of %s will time out in %.2f seconds" %
                          (self._hostname, remaining))
     finally:
         self.release()
     return remaining
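Unlike the context-based calls in the earlier examples, this snippet (and the urllib2-based probes above and below) passes a start time and a timeout directly to util.time_remaining. A minimal sketch of that two-argument form, with the None handling and the zero clamp inferred from how the callers use the result rather than taken from the library:

import time

def time_remaining(start_time, timeout):
    # A timeout of None means no deadline; callers treat None as "do not limit".
    if timeout is None:
        return None
    # Otherwise return whatever is left of the time budget, never negative.
    return max(0.0, timeout - (time.time() - start_time))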
Example No. 8
def process(context):
    print mutil.get_memory_object()
    write_initial_stats(context)
    process_bytes = mutil.get_process_memory_usage_bytes()
    if c.KEY_SQS_QUEUE_URL not in context or context[
            c.KEY_SQS_QUEUE_URL] is None:
        context[c.KEY_SQS].set_queue_url(lowest_load_queue=False)
    #execute at least once
    messages_to_process = None
    inflight_messages = 0
    elapsed = 0
    metric_sets = dict({})
    context[c.KEY_AGGREGATOR] = Aggregator(context, metric_sets)
    messages = []
    last_queue_size_check = context[c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER]
    growth_rate = last_check = 0
    last_message_count = None
    timeout = calculate_aggregate_window_timeout(context)
    value = datetime.datetime.fromtimestamp(context[c.KEY_START_TIME])
    message_processing_time = 0
    print "[{}]Using SQS queue URL '{}'".format(context[c.KEY_REQUEST_ID],
                                                context[c.KEY_SQS].queue_url)
    print "[{}]Started the consumer at {}.  The aggregation window is {} seconds.".format(
        context[c.KEY_REQUEST_ID], value.strftime('%Y-%m-%d %H:%M:%S'),
        timeout)
    while elapsed < timeout:
        if elapsed > last_check:
            last_check = elapsed + context[c.KEY_FREQUENCY_TO_CHECK_SQS_STATE]
            response = context[c.KEY_SQS].get_queue_attributes()
            inflight_messages = int(response['Attributes']
                                    ['ApproximateNumberOfMessagesNotVisible'])
            messages_to_process = int(
                response['Attributes']['ApproximateNumberOfMessages'])
            if last_message_count is None:
                last_message_count = messages_to_process
            else:
                growth_rate = last_message_count if last_message_count == 0 else float(
                    messages_to_process -
                    last_message_count) / last_message_count
                last_message_count = messages_to_process
                grow_if_threshold_hit(
                    context, growth_rate,
                    context[c.KEY_GROWTH_RATE_BEFORE_ADDING_LAMBDAS])

            #if the queue is growing slowly and is above 30,000 messages launch a new consumer
            if elapsed > last_queue_size_check:
                last_queue_size_check = elapsed + context[
                    c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER]
                print "[{}]\nThere are approximately {} messages that require processing.\n" \
                    "There are {} in-flight messages.\n" \
                    "{} seconds have elapsed and there is {} seconds remaining before timeout.\n" \
                    "The queue growth rate is {}\n" \
                    "{} message(s) were processed.".format(context[c.KEY_REQUEST_ID], messages_to_process,inflight_messages,round(elapsed,2),util.time_remaining(context),growth_rate,len(messages))
                if messages_to_process > context[
                        c.
                        KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER] and inflight_messages <= context[
                            c.KEY_MAX_INFLIGHT_MESSAGES]:
                    print "The queue size is greater than {}. Launching another consumer.".format(
                        context[c.KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER])
                    add_consumer(context)
            if last_message_count == 0:
                print "[{}]No more messages to process.".format(
                    context[c.KEY_REQUEST_ID])
                break
        messages = context[c.KEY_SQS].read_queue()

        if len(messages) > 0:
            start = time.time()
            context[c.KEY_AGGREGATOR].append_default_metrics_and_partition(
                messages)
            message_processing_time = round(
                ((time.time() - start) + message_processing_time) / 2, 4)
        else:
            if len(metric_sets) > 1:
                print "[{}]No more messages to process.".format(
                    context[c.KEY_REQUEST_ID])
                break
            else:
                print "[{}]No metric sets to process. Exiting.".format(
                    context[c.KEY_REQUEST_ID])
                return

        #start throttling the message processing when the SQS inflight messages is at 80% (16,000)
        #one queue is only allowed to have 20,000 maximum messages being processed (in-flight)
        usage = mutil.get_memory_usage()
        if inflight_messages > 16000:
            print "[{}]Stopping aggregation.  There are too many messages in flight.  Currently there are {} messages in flight.".format(
                context[c.KEY_REQUEST_ID], inflight_messages)
            break
        if usage > context[c.KEY_MEMORY_FLUSH_TRIGGER]:
            print "[{}]Stopping aggregation.  Memory safe level threshold exceeded.  The lambda is currently at {}%.".format(
                context[c.KEY_REQUEST_ID], usage)
            break
        if util.elapsed(context) + message_processing_time > timeout:
            print "[{}]Stopping aggregation.  The elapsed time and the projected message processing time exceeds the timeout window.  Messages are taking {} seconds to process.  There is {} seconds left before time out and {} seconds for aggregation.".format(
                context[c.KEY_REQUEST_ID], message_processing_time,
                util.time_remaining(context), timeout)
            break

        elapsed = util.elapsed(context)

    util.debug_print(
        "[{}]Lambda has completed the agreggation phase.  Elapsed time was {} seconds and we have {} seconds remaining. There are {} in-flight messages and {} remaining messages to process."
        .format(context[c.KEY_REQUEST_ID], elapsed,
                util.time_remaining(context), inflight_messages,
                messages_to_process))
    context[c.KEY_THREAD_POOL].wait()
    bytes_consumed = mutil.get_process_memory_usage_bytes()
    memory_usage = str(mutil.get_memory_usage())
    print mutil.get_memory_object()
    tables = metric_sets[c.KEY_TABLES]
    del metric_sets[c.KEY_TABLES]
    flush_and_delete(context, metric_sets)
    context[c.KEY_THREAD_POOL].wait()
    update_glue_crawler_datastores(context, tables)
    print mutil.get_memory_object()
    print "[{}]Elapsed time {} seconds. ".format(context[c.KEY_REQUEST_ID],
                                                 util.elapsed(context))
    print "[{}]Message processing averaged {} seconds per message. ".format(
        context[c.KEY_REQUEST_ID], message_processing_time)
    print "[{}]The process consumed {} KB of memory.".format(
        context[c.KEY_REQUEST_ID], bytes_consumed / 1024)
    print '[{}]The memory utilization was at {}%.'.format(
        context[c.KEY_REQUEST_ID], memory_usage)
    print '[{}]The process used {} KB for converting messages to parquet format.'.format(
        context[c.KEY_REQUEST_ID], (bytes_consumed - process_bytes) / 1024)
    print "[{}]The save process took {} seconds.".format(
        context[c.KEY_REQUEST_ID], context[c.CW_ATTR_SAVE_DURATION])
    print "[{}]Processed {} uncompressed bytes.".format(
        context[c.KEY_REQUEST_ID],
        context[c.KEY_AGGREGATOR].bytes_uncompressed)
    print "[{}]Processed {} metrics. ".format(context[c.KEY_REQUEST_ID],
                                              context[c.KEY_AGGREGATOR].rows)
    print "[{}]Processed {} messages. ".format(
        context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].messages)
    print "[{}]Average metrics per minute {}. ".format(
        context[c.KEY_REQUEST_ID],
        round(
            context[c.KEY_AGGREGATOR].rows / util.elasped_time_in_min(context),
            2))
    print "[{}]Average messages per minute {}. ".format(
        context[c.KEY_REQUEST_ID],
        round(
            context[c.KEY_AGGREGATOR].messages /
            util.elasped_time_in_min(context), 2))
    print "[{}]Average uncompressed bytes per minute {}. ".format(
        context[c.KEY_REQUEST_ID],
        round(
            context[c.KEY_AGGREGATOR].bytes_uncompressed /
            util.elasped_time_in_min(context), 2))
    print "[{}]There are approximately {} messages that require processing.".format(
        context[c.KEY_REQUEST_ID],
        messages_to_process if messages_to_process else 0)
    print "[{}]There are {} in-flight messages.".format(
        context[c.KEY_REQUEST_ID], inflight_messages)
    print "[{}]There was {} seconds remaining before timeout. ".format(
        context[c.KEY_REQUEST_ID], util.time_remaining(context))
    del tables
    del metric_sets
    gc.collect()
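When the queue keeps growing past the spawn threshold, process calls add_consumer(context) to fan out the work. A sketch of how such a helper might asynchronously invoke another copy of the consumer lambda with boto3; the helper signature and payload shape are assumptions for illustration, not the project's actual add_consumer:

import json
import boto3

def add_consumer(function_name, payload=None):
    # Fire-and-forget invocation of another consumer lambda so it can start
    # draining the SQS queue in parallel with the current one.
    client = boto3.client('lambda')
    client.invoke(
        FunctionName=function_name,
        InvocationType='Event',             # asynchronous invocation
        Payload=json.dumps(payload or {}))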
Example No. 9
def ingest(event, lambdacontext):
    starttime = time.time()
    gc.collect()
    root = event.get("root", None)
    print "Initial memory size:", mutil.get_memory_object()
    print "Started amoeba with root {}".format(root)
    context = event.get("context", {})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_START_TIME] = starttime
    is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None
    bucket = os.environ[c.ENV_S3_STORAGE]
    crawler = Crawler(context, bucket)
    roots = crawler.crawl_from_relative(root)
    s3_fs = s3fs.S3FileSystem()
    s3 = S3(context, bucket)
    timeout = calculate_aggregate_window_timeout(
        context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = context[c.KEY_SEPERATOR_PARTITION]
    memory_used = mutil.get_memory_usage()
    projected_compressed_file_size_in_mb = 0
    print "Hunting for {} seconds in bucket '{}'".format(timeout, bucket)
    for path in roots:
        #The GLUE Crawler does not work well when a single key in S3 contains varying data schemas.
        files = roots[path]
        if len(files) == 1:
            continue
        debug_print("\t\tIngesting path {}".format(path))
        df = {}
        keys_ingested = []
        data = None

        for file in files:
            debug_print("\t\t\t{}".format(file))
            key = "{}/{}".format(path, file)
            try:
                size_in_megabytes = s3.size_in_megabytes(key)
            except ClientError as e:
                if str(e.response['Error']['Code']) == '404':
                    continue
                else:
                    print "Error: ", e.response['Error']['Code'], key
                    raise e

            if size_in_megabytes > target_excretion_size:
                debug_print(
                    "Skipping file '{}'.  It has reached the targetted file size"
                    .format(key))
                continue
            size_in_bytes = size_in_megabytes * 1024 * 1024
            try:
                data = reader.read(s3_fs, bucket, key)
                keys_ingested.append(key)
            except ClientError as e:
                print e.response['Error']['Code'], "key=>", key
                #handle corrupt files, this can happen if a write did not finish correctly
                if e.message == "Seek before start of file":
                    print "Deleting corrupt file %s", key
                    s3.delete([key])
                elif e.response['Error']['Code'] == 'NoSuchKey':
                    print '{}: for key {}'.format(e.response['Error']['Code'],
                                                  key)
                else:
                    util.logger.error(e)
                continue
            for row in data.itertuples(index=True):
                row = row.__dict__
                del row['Index']
                key_parts = KeyParts(key, sep)
                uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event,
                                           row[c.TERTIARY_KEY])
                df_size = len(row)

                debug_print(
                    "\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}"
                    .format(size_in_megabytes, df_size, compression_ratio))

                #a dictionary is the fastest way to create a unique set.
                if uuid_key in df:
                    debug_print(
                        "\t\t\tFound duplication in key '{}'.  Keeping the first occurrence."
                        .format(key))
                else:
                    df[uuid_key] = row

                current_dataframe_size = len(df)
                #break conditions
                #1. Memory limit exceeded
                #2. Time window exceeded
                #3. Target excretion size hit
                projected_compressed_file_size_in_mb = (
                    compression_ratio * current_dataframe_size) / 1048576.0
                memory_used = mutil.get_memory_usage()
                debug_print(
                    "\t\t\t{} seconds have elapsed.  {} kilobytes of memory have been used. The projected compressed file size is {} MB.  We are targetting an excretion file size of {} MB."
                    .format(util.elapsed(context), memory_used / 1024,
                            projected_compressed_file_size_in_mb,
                            target_excretion_size))
                if util.elapsed(context) > timeout or memory_used > context[
                        c.
                        KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                    break
            if util.elapsed(context) > timeout or memory_used > context[
                    c.
                    KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                print "Elapsed", util.elapsed(context), "Start:", context[
                    c.
                    KEY_START_TIME], "Timeout:", timeout, "Has timed out:", util.elapsed(
                        context
                    ) > timeout, "Mem Used:", memory_used, "Max Memory %:", context[
                        c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                break
        if len(keys_ingested) > 0 and util.time_remaining(context) > 45:
            values = df.values()
            #Since the schema hash is the same for every row in this set, the first row's keys can serve as the column headers for the rest.
            columns = values[0].keys()
            set = pd.DataFrame(values, columns=columns)
            excret(s3, bucket, path, set, keys_ingested, sep,
                   m_schema.object_encoding(columns))
            del set
        elif util.time_remaining(context) <= 45:
            return
        del data
        del df
        del keys_ingested

        if util.elapsed(
                context) > timeout or mutil.get_process_memory_usage_bytes(
                ) >= c.ONE_GB_IN_BYTES:
            print "\tThe elapsed time threshold of {} seconds has been hit or the memory threshold of {} megabytes has been hit. Time: {}s, Memory: {}MB".format(
                timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context),
                mutil.get_process_memory_usage_megabytes())
            return
    print "I've consumed everything I can in bucket '{}'".format(bucket)
    return
Example No. 10
    def _process_response(self, infile, proto, address, port, timeout=None, dl_limit=sys.maxint):
        download_dir = self._args.download_dir
        try:
            util.mkdir_p(download_dir)
        except Exception as e:
            logger.error("Failed to create download directory %s: %s" %s (download_dir, err))
            return None

        host = address[0:-hs_sfx_len] if address.endswith(hs_sfx) else address
        outfilename = "%s/%s_%s_%s" % (download_dir, host, port, proto)
        try:
            outfile = open(outfilename, 'w')
        except Exception as err:
            logger.error("Failed to open %s for output: %s" % (outfilename, err))
            return None

        try:
            http_code = infile.getcode()
            headers = infile.info()
        except:
            http_code = 0
            headers = {}

        if http_code > 0:
            http_code_name = 'Unknown'
            if httplib.responses.has_key(http_code):
                http_code_name = httplib.responses[http_code]
            outfile.write("%d %s\n" % (http_code, http_code_name))
            for key in headers.keys():
                outfile.write("%s: %s\n" % (key, headers[key]))
            outfile.write("\n")

        decompressor = None
        if headers.has_key('content-encoding'):
            encoding = headers['content-encoding'].lower()
            if encoding in ['gzip', 'deflate', 'zlib']:
                decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)

        max_read = min(1024, dl_limit)
        total_len = 0
        remaining = timeout
        rin = [infile]
        win = []
        xin = []
        try:
            try:
                while total_len < dl_limit:
                    # Workaround for closed socket
                    try:
                        infile.fileno()
                    except:
                        break
                    t1 = time.time()
                    rout, wout, xout = select.select(rin, win, xin, remaining)
                    if infile not in rout:
                        logger.warn("Read timeout on %s:%d (proto %s)" % (address, port, proto))
                        break
                    data = infile.read(max_read)
                    data_len = len(data)
                    if data_len <= 0:
                        break
                    if decompressor:
                        ddata = decompressor.decompress(data)
                        outfile.write(ddata)
                        total_len += len(ddata)
                    else:
                        outfile.write(data)
                        total_len += data_len
                    remaining = util.time_remaining(t1, remaining)
            finally:
                outfile.close()
        except Exception as err:
            logger.error("Error while downloading to %s: %s" % (outfilename, err))
            return None

        return total_len