Example #1
    def __init__(self, aws_region, s3_bucket, s3_prefix, invoker,
                 runtime_s3_bucket, runtime_s3_key, job_max_runtime):
        self.aws_region = aws_region
        self.s3_bucket = s3_bucket
        self.s3_prefix = s3_prefix

        self.session = botocore.session.get_session()
        self.invoker = invoker
        self.s3client = self.session.create_client('s3',
                                                   region_name=aws_region)
        self.job_max_runtime = job_max_runtime

        self.runtime_bucket = runtime_s3_bucket
        self.runtime_key = runtime_s3_key
        self.runtime_meta_info = runtime.get_runtime_info(
            runtime_s3_bucket, runtime_s3_key)
        if not runtime.runtime_key_valid(self.runtime_meta_info):
            raise Exception(
                "The indicated runtime: s3://{}/{} is not approprite for this python version"
                .format(runtime_s3_bucket, runtime_s3_key))

        if 'preinstalls' in self.runtime_meta_info:
            logger.info("using serializer with meta-supplied preinstalls")
            self.serializer = serialize.SerializeIndependent(
                self.runtime_meta_info['preinstalls'])
        else:
            self.serializer = serialize.SerializeIndependent()
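
A minimal construction sketch for the snippet above (not from the source): the enclosing class is assumed to be named Executor, the bucket names and runtime key are placeholders, and my_invoker stands in for an invoker object built elsewhere.

# Hypothetical usage; the class name, buckets, and runtime key are assumptions.
executor = Executor(aws_region="us-west-2",
                    s3_bucket="my-pywren-bucket",       # bucket for staging job data (assumed)
                    s3_prefix="pywren.jobs",            # key prefix for job objects (assumed)
                    invoker=my_invoker,                 # invoker object created elsewhere
                    runtime_s3_bucket="my-runtime-bucket",
                    runtime_s3_key="pywren.runtime/python3.6.tar.gz",  # must match this Python version
                    job_max_runtime=300)                # seconds a single job may run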
Example #2
    def __init__(self, invoker, config, job_max_runtime):
        self.invoker = invoker
        self.job_max_runtime = job_max_runtime

        self.config = config
        self.storage_config = wrenconfig.extract_storage_config(self.config)
        self.storage = storage.Storage(self.storage_config)
        self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])
        # print('runtime_meta_info: ', self.runtime_meta_info)

        # Force-include a few extra packages in the preinstall list so the
        # serializer treats them as already present on the runtime ('thrift'
        # is listed under both capitalizations to cover either spelling).
        if 'preinstalls' in self.runtime_meta_info:
            self.runtime_meta_info['preinstalls'].append(['pandas', True])
            self.runtime_meta_info['preinstalls'].append(['thrift', True])
            self.runtime_meta_info['preinstalls'].append(['Thrift', True])

        if 'preinstalls' in self.runtime_meta_info:
            logger.info("using serializer with meta-supplied preinstalls")
            self.serializer = serialize.SerializeIndependent(
                self.runtime_meta_info['preinstalls'])
        else:
            self.serializer = serialize.SerializeIndependent()

        self.map_item_limit = None
        if 'scheduler' in self.config:
            if 'map_item_limit' in config['scheduler']:
                self.map_item_limit = config['scheduler']['map_item_limit']
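
A sketch (not from the source) of the config dict this constructor reads: config['runtime'] is handed to runtime.get_runtime_info() and config['scheduler']['map_item_limit'] is optional; the nested runtime layout and the storage section are assumptions, and Executor/my_invoker are assumed names as in the sketch after Example #1.

# Hypothetical config fragment; only the keys read above are shown, and the
# exact runtime/storage value formats are assumptions.
config = {
    'runtime': {'s3_bucket': 'my-runtime-bucket',
                's3_key': 'pywren.runtime/python3.6.tar.gz'},
    'scheduler': {'map_item_limit': 10000},   # optional cap on items per map call
    # ...plus whatever storage settings wrenconfig.extract_storage_config() expects
}
executor = Executor(invoker=my_invoker, config=config, job_max_runtime=300)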
Example #3
    def __init__(self, invoker, config, job_max_runtime):
        self.invoker = invoker
        self.job_max_runtime = job_max_runtime

        self.config = config
        self.storage_config = wrenconfig.extract_storage_config(self.config)
        self.storage = storage.Storage(self.storage_config)
        self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])

        if 'preinstalls' in self.runtime_meta_info:
            logger.info("using serializer with meta-supplied preinstalls")
            self.serializer = serialize.SerializeIndependent(
                self.runtime_meta_info['preinstalls'])
        else:
            self.serializer = serialize.SerializeIndependent()
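
For reference, a sketch of the runtime metadata shape these constructors consume; the [package_name, flag] pair format mirrors the appends in Example #2, and the other field names are assumptions.

# Hypothetical runtime metadata returned by runtime.get_runtime_info().
runtime_meta_info = {
    'python_ver': '3.6',                                 # assumed field name
    'preinstalls': [['numpy', True], ['boto3', True]],   # [package_name, flag] pairs
}
serializer = serialize.SerializeIndependent(runtime_meta_info['preinstalls'])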
Example #4
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial, launch_granularity,
                   timeout, log_granularity, autoscale_policy,
                   failure_percentage, max_failure_events, failure_time):
    # set up logging
    logger = logging.getLogger()
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, failure_percentage,
         max_failure_events, failure_time))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "failure_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

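    # Build the test input: shard a random (problem_size x 1) vector into S3 as a
    # BigMatrix, form the PSD matrix X X^T with a pywren-backed gemm, and generate
    # the Cholesky instruction DAG for it.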
    X = np.random.randn(problem_size, 1)
    pwex = pywren.default_executor()
    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
        problem_size, shard_size),
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    pipeline_width = pipeline
    if (priority):
        num_priorities = 5
    else:
        num_priorities = 1
    if (lru):
        cache_size = 5
    else:
        cache_size = 0

    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR,
                                     port=REDIS_PORT,
                                     password=REDIS_PASS,
                                     db=0,
                                     socket_timeout=5)

    if (truncate is not None):
        instructions = instructions[:truncate]
    config = pwex.config

    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)
    redis_env = {
        "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
        "REDIS_PASS": os.environ.get("REDIS_PASS", "")
    }

    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    failure_times = []
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = priority
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["failure_times"] = failure_times

    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    failure_keys = [
        "{0}_failure_{1}_{2}".format(program.hash, i, 0)
        for i in range(start_cores)
    ]
    all_futures = pwex.map(lambda x: job_runner.lambdapack_run_with_failures(
        failure_keys[x],
        program,
        pipeline_width=pipeline_width,
        cache_size=cache_size,
        timeout=timeout),
                           range(start_cores),
                           extra_env=redis_env)
    start_time = time.time()
    last_run_time = start_time
    last_failure = time.time()
    num_failure_events = 0

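    # Monitoring loop: every log_granularity seconds, sample the SQS queues and
    # Redis counters, record progress metrics, apply the autoscaling policy, and
    # inject failures until the program finishes.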
    while (program.program_status() == lp.PS.RUNNING):
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            client = boto3.client('sqs')
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)
        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        if (busy_workers is None):
            busy_workers = 0
        else:
            busy_workers = int(busy_workers)
        up_workers = program.get_up()

        if (up_workers is None):
            up_workers = 0
        else:
            up_workers = int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)

        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if ((curr_time % INFO_FREQ) == 0):
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))

        #print("{5}: Not Ready: {0}, Ready: {1}, Running: {4}, Post OP: {2},  Done: {3}".format(not_ready_count, ready_count, post_op_count, done_count, running_count, curr_time))
        current_gflops = program.get_flops()
        if (current_gflops is None):
            current_gflops = 0
        else:
            current_gflops = int(current_gflops) / 1e9

        flops.append(current_gflops)
        current_gbytes_read = program.get_read()
        if (current_gbytes_read is None):
            current_gbytes_read = 0
        else:
            current_gbytes_read = int(current_gbytes_read) / 1e9

        reads.append(current_gbytes_read)
        current_gbytes_write = program.get_write()
        if (current_gbytes_write is None):
            current_gbytes_write = 0
        else:
            current_gbytes_write = int(current_gbytes_write) / 1e9
        writes.append(current_gbytes_write)
        #print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}".format(curr_time, current_gflops, current_gbytes_read, current_gbytes_write))

        time_since_launch = time.time() - last_run_time
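        # Autoscaling: the "dynamic" policy tops workers up toward
        # ceil(waiting / pipeline_width), capped at max_cores, once
        # launch_granularity seconds have passed and the worker count has
        # fallen below half that target; "constant_timeout" relaunches
        # max_cores workers after 75% of the worker timeout has elapsed.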
        if (autoscale_policy == "dynamic"):
            if (time_since_launch > launch_granularity
                    and up_workers < np.ceil(waiting * 0.5 / pipeline_width)
                    and up_workers < max_cores):
                cores_to_launch = int(
                    min(
                        np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x],
                        program,
                        pipeline_width=pipeline_width,
                        cache_size=cache_size,
                        timeout=timeout),
                    range(cores_to_launch),
                    extra_env=redis_env)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        elif (autoscale_policy == "constant_timeout"):
            if (time_since_launch > (0.75 * timeout)):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x],
                        program,
                        pipeline_width=pipeline_width,
                        cache_size=cache_size,
                        timeout=timeout),
                    range(cores_to_launch),
                    extra_env=redis_env)
                last_run_time = time.time()
                failure_keys += _failure_keys
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")

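        # Failure injection: after failure_time seconds, flag a random
        # failure_percentage of the launched workers by setting their Redis
        # failure keys; lambdapack_run_with_failures is expected to watch these
        # keys and kill the corresponding job when one is set.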
        if ((time.time() - last_failure) > failure_time
                and num_failure_events < max_failure_events):
            logging.info("Killing some jobs")
            idxs = np.random.choice(len(failure_keys),
                                    int(failure_percentage *
                                        len(failure_keys)),
                                    replace=False)
            num_failure_events += 1
            last_failure = time.time()
            failure_times.append(last_failure)
            for i in idxs:
                logging.info("Killing: job {0}".format(i))
                REDIS_CLIENT.set(failure_keys[i], 1)

    exp["all_futures"] = all_futures
    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        if (run_count is None):
            run_count = 0
        else:
            run_count = int(run_count)

        if (run_count != 1):
            logger.info("PC: {0}, Run Count: {1}".format(pc, run_count))

    e = time.time()
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))
    exp["total_runtime"] = e - t
    exp["num_failure_events"] = num_failure_events
    # collect per-instruction-block profiling info from the workers
    executor = fs.ThreadPoolExecutor(72)
    futures = []
    for i in range(0, program.num_inst_blocks, 1):
        futures.append(executor.submit(program.get_profiling_info, i))
    res = fs.wait(futures)
    profiled_blocks = [f.result() for f in futures]
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string

    read, write, total_flops, bins, instructions, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))
    # save other stuff
    try:
        os.mkdir("failure_experiments/")
    except FileExistsError:
        pass
    exp_bytes = pickle.dumps(exp)
    dump_path = "failure_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)
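
A small loader sketch (not part of the experiment script) for reading a dumped result back; the path format mirrors the dump above, and the hash placeholder stands for the md5 hash used in the log and pickle file names.

# Hypothetical loader for a dumped experiment result.
import pickle

arg_hash = "0123abcd..."  # the md5 hash printed in the experiment log file name
with open("failure_experiments/{0}.pickle".format(arg_hash), "rb") as f:
    exp = pickle.load(f)
print(exp["total_runtime"], exp["flop_rate"], exp["num_failure_events"])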