Пример #1
0
    def __init__(self, invoker, config, job_max_runtime):
        """
        Initialise storage access, runtime metadata and the serializer.

        :param invoker: Stored unchanged on `self.invoker`.
        :param config: Configuration mapping. Must contain a 'runtime' entry;
            may contain a 'scheduler' entry with an optional 'map_item_limit'.
        :param job_max_runtime: Stored unchanged on `self.job_max_runtime`.
        """
        self.invoker = invoker
        self.job_max_runtime = job_max_runtime

        self.config = config
        self.storage_config = wrenconfig.extract_storage_config(self.config)
        self.storage = storage.Storage(self.storage_config)
        self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])

        if 'preinstalls' in self.runtime_meta_info:
            # Only touch the list once we know it exists: appending before
            # this check raised KeyError for runtimes without 'preinstalls'
            # and made the fallback branch below unreachable.
            preinstalls = self.runtime_meta_info['preinstalls']
            # Extra entries treated as preinstalled in addition to the ones
            # the runtime reports (presumably [module_name, is_package]
            # pairs -- TODO confirm against the serializer).
            preinstalls.extend([
                ['pandas', True],
                ['thrift', True],
                ['Thrift', True],
            ])
            logger.info("using serializer with meta-supplied preinstalls")
            self.serializer = serialize.SerializeIndependent(preinstalls)
        else:
            self.serializer = serialize.SerializeIndependent()

        # Optional scheduler setting; stays None when not configured.
        self.map_item_limit = None
        if ('scheduler' in self.config
                and 'map_item_limit' in config['scheduler']):
            self.map_item_limit = config['scheduler']['map_item_limit']
Пример #2
0
    def cancel(self, storage_handler=None):
        """
        Write a cancellation marker for this call to storage.

        :param storage_handler: Storage handler used to write the marker.
            When None, one is built from the default configuration.
        """
        # TODO Figure out a better way for this function to have
        # access to a custom storage handler
        handler = storage_handler
        if handler is None:
            default_config = wrenconfig.default()
            handler = storage.Storage(
                wrenconfig.extract_storage_config(default_config))

        handler.put_cancelled(self.callset_id, self.call_id, "CANCEL")
Пример #3
0
def _wait(fs, THREADPOOL_SIZE):
    """
    Internal helper doing most of the work of the WAIT task.

    Splits `fs` into futures that are done and futures that are not. Futures
    whose call id is listed by storage as complete have `result()` called on
    them (without raising) from a thread pool before returning.

    :param fs: Futures to inspect. The unfinished ones must all share one
        callset_id, otherwise NotImplementedError is raised.
    :param THREADPOOL_SIZE: Number of worker threads used to fetch results.
    :return: Tuple `(fs_dones, fs_notdones)`.
    """

    def _is_finished(fut):
        return fut._state in [JobState.success, JobState.error]

    # Nothing pending: hand the input back unchanged.
    pending = [fut for fut in fs if not _is_finished(fut)]
    if not pending:
        return fs, []

    # Only a single callset among the pending futures is supported.
    callset_ids = {fut.callset_id for fut in pending}
    if len(callset_ids) > 1:
        raise NotImplementedError()
    callset_id = callset_ids.pop()  # FIXME assume only one

    # Ask storage which call ids of this callset are complete.
    storage_handler = storage.Storage(
        wrenconfig.extract_storage_config(wrenconfig.default()))
    completed_call_ids = set(storage_handler.get_callset_status(callset_id))

    fs_dones = []
    fs_notdones = []
    newly_done = []
    for fut in fs:
        if _is_finished(fut):
            # already resolved locally, nothing to fetch
            fs_dones.append(fut)
        elif fut.call_id in completed_call_ids:
            newly_done.append(fut)
            fs_dones.append(fut)
        else:
            fs_notdones.append(fut)

    def _fetch(fut):
        fut.result(throw_except=False, storage_handler=storage_handler)

    workers = ThreadPool(THREADPOOL_SIZE)
    workers.map(_fetch, newly_done)

    workers.close()
    workers.join()

    return fs_dones, fs_notdones
Пример #4
0
    def __init__(self, invoker, config, job_max_runtime):
        """
        Initialise storage access, runtime metadata and the serializer.

        :param invoker: Stored unchanged on `self.invoker`.
        :param config: Configuration mapping; must contain a 'runtime' entry.
        :param job_max_runtime: Stored unchanged on `self.job_max_runtime`.
        """
        self.invoker = invoker
        self.job_max_runtime = job_max_runtime
        self.config = config

        # Storage backend derived from the supplied configuration.
        self.storage_config = wrenconfig.extract_storage_config(self.config)
        self.storage = storage.Storage(self.storage_config)

        self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])

        # Hand the runtime's preinstalls to the serializer when it has any.
        if 'preinstalls' not in self.runtime_meta_info:
            self.serializer = serialize.SerializeIndependent()
        else:
            logger.info("using serializer with meta-supplied preinstalls")
            meta_preinstalls = self.runtime_meta_info['preinstalls']
            self.serializer = serialize.SerializeIndependent(meta_preinstalls)
Пример #5
0
def get_runtime_info(runtime_config, storage_handler=None):
    """
    Fetch the metadata for a runtime from storage and validate it.

    :param runtime_config: Runtime description passed to the storage handler.
    :param storage_handler: Storage handler to query. When None, one is built
        from the default configuration.
    :return: The runtime metadata returned by the storage handler.
    :raises Exception: If `runtime_valid` rejects the metadata.
    """
    handler = storage_handler
    if handler is None:
        handler = storage.Storage(
            wrenconfig.extract_storage_config(wrenconfig.default()))

    runtime_meta = handler.get_runtime_info(runtime_config)
    if runtime_valid(runtime_meta):
        return runtime_meta

    message_template = ("The indicated runtime: {} " +
                        "is not approprite for this python version.")
    raise Exception(message_template.format(runtime_config))
Пример #6
0
def handle_generic_failure(future, failed_warc_paths, exc):
    """
    Report a failed future once, printing the best traceback available.

    The future is added to `failed_warc_paths`; if it is already in there the
    failure has been reported before and nothing happens.

    :param future: The failed future; its `callset_id` and `call_id` are used
        to look up the call status in storage.
    :param failed_warc_paths: Mutable set of futures already reported.
    :param exc: The error that was observed, included in the printed message.
    """
    if future in failed_warc_paths:
        return
    failed_warc_paths.add(future)

    # Wrap in a tuple so an `exc` that is itself a tuple formats correctly.
    print('A future failed with error: %s' % (exc,))
    print('')

    storage_config = wrenconfig.extract_storage_config(
        wrenconfig.default())
    storage_handler = storage.Storage(storage_config)

    call_status = storage_handler.get_call_status(future.callset_id,
                                                  future.call_id)
    # The status lookup can yield None (no status written yet); treat that
    # like a status without a remote traceback instead of crashing here.
    exception_traceback = None
    if call_status is not None:
        exception_traceback = call_status.get('exception_traceback', None)

    if exception_traceback is not None:
        print(exception_traceback)
    else:
        # Fall back to the traceback of the exception being handled, if any.
        traceback.print_tb(sys.exc_info()[2])
Пример #7
0
    def result(self,
               timeout=None,
               check_only=False,
               throw_except=True,
               storage_handler=None):
        """
        Return the value produced by the call, polling storage until done.

        check_only = True implies we only check if the job is completed.

        # FIXME check_only is the worst API and should be refactored
        # out to be part of done()

        Modelled on `concurrent.futures.Future.result`, with these
        differences: `timeout` is not implemented, and failures are raised as
        plain `Exception` (or the exception raised by the call itself).

        :param timeout: Not implemented; any value other than None raises
            NotImplementedError. Default None (wait indefinitely).
        :param check_only: Only report completion, without waiting: return
            True if the job is complete and False otherwise. Default False.
        :param throw_except: Raise if the call or the remote handler failed;
            when False, return None instead. Default True.
        :param storage_handler: Storage handler to poll cloud storage. When
            None, one is built from the default configuration. Default None.
        :return: Result of the call; None for a failed call when
            `throw_except` is False; a bool when `check_only` is True.
        :raises ValueError: If the job has not been invoked yet.
        :raises NotImplementedError: If `timeout` is not None.
        :raises Exception: If the remote handler reported a failure (version
            mismatch, out of time, cancelled, non-zero return code, other)
            and `throw_except` is True.
        """
        if self._state == JobState.new:
            raise ValueError("job not yet invoked")

        if check_only:
            if self._state == JobState.success or self._state == JobState.error:
                return True

        # Already resolved locally: answer from the cached outcome.
        if self._state == JobState.success:
            return self._return_val

        if self._state == JobState.error:
            if throw_except:
                raise self._exception
            else:
                return None

        if storage_handler is None:
            storage_config = wrenconfig.extract_storage_config(
                wrenconfig.default())
            storage_handler = storage.Storage(storage_config)

        storage_utils.check_storage_path(storage_handler.get_storage_config(),
                                         self.storage_path)

        call_status = storage_handler.get_call_status(self.callset_id,
                                                      self.call_id)

        self.status_query_count += 1

        ## FIXME implement timeout
        if timeout is not None:
            raise NotImplementedError()

        if check_only:
            # A status object in storage means the call has completed.
            return call_status is not None

        # Poll until the status object shows up.
        while call_status is None:
            time.sleep(self.GET_RESULT_SLEEP_SECS)
            call_status = storage_handler.get_call_status(
                self.callset_id, self.call_id)

            self.status_query_count += 1
        self._invoke_metadata['status_done_timestamp'] = time.time()
        self._invoke_metadata['status_query_count'] = self.status_query_count

        self.run_status = call_status  # this is the remote status information
        self.invoke_status = self._invoke_metadata  # local status information

        if call_status['exception'] is not None:
            # the wrenhandler had an exception
            exception_str = call_status['exception']

            exception_args = call_status['exception_args']
            if exception_args[0] == "WRONGVERSION":
                if throw_except:
                    raise Exception("Pywren version mismatch: remote " + \
                        "expected version {}, local library is version {}".format(
                            exception_args[2], exception_args[3]))
                return None
            elif exception_args[0] == "OUTATIME":
                if throw_except:
                    raise Exception("process ran out of time")
                return None
            elif exception_args[0] == "CANCELLED":
                if throw_except:
                    raise Exception("job was cancelled")
                # Like every other handler failure: stop here instead of
                # falling through to download output of a cancelled job.
                return None
            elif exception_args[0] == "RETCODE":
                if throw_except:
                    raise Exception(
                        "python process failed, returned a non-zero return code"
                        "(check stdout for information)")
                return None
            else:
                if throw_except:
                    if 'exception_traceback' in call_status:
                        logger.error(call_status['exception_traceback'])
                    raise Exception(exception_str, *exception_args)
                return None

        # FIXME this shouldn't be called if check_only is True
        # NOTE(review): the call output is unpickled as-is; this is only safe
        # if the storage bucket is trusted.
        call_output_time = time.time()
        call_invoker_result = pickle.loads(
            storage_handler.get_call_output(self.callset_id, self.call_id))

        call_output_time_done = time.time()
        self._invoke_metadata[
            'download_output_time'] = call_output_time_done - call_output_time

        self._invoke_metadata[
            'download_output_timestamp'] = call_output_time_done
        call_success = call_invoker_result['success']
        logger.info("ResponseFuture.result() {} {} call_success {}".format(
            self.callset_id, self.call_id, call_success))

        self._call_invoker_result = call_invoker_result

        if call_success:

            self._return_val = call_invoker_result['result']
            self._set_state(JobState.success)
            return self._return_val
        else:
            # Record the failure so later calls answer from the cached state.
            self._set_state(JobState.error)
            self._exception = call_invoker_result['result']
            self._traceback = (call_invoker_result['exc_type'],
                               call_invoker_result['exc_value'],
                               call_invoker_result['exc_traceback'])

            if throw_except:

                if call_invoker_result.get('pickle_fail', False):
                    logging.warning(
                        "there was an error pickling. The original exception: " + \
                            "{}\nThe pickling exception: {}".format(
                                call_invoker_result['exc_value'],
                                str(call_invoker_result['pickle_exception'])))

                    reraise(Exception, call_invoker_result['exc_value'],
                            call_invoker_result['exc_traceback'])
                else:
                    # reraise the exception
                    reraise(*self._traceback)
            else:
                return None  # nothing, don't raise, no value
Пример #8
0
    def result(self,
               timeout=None,
               check_only=False,
               throw_except=True,
               storage_handler=None):
        """
        Return the value produced by the call, polling storage until done.

        Modelled on `concurrent.futures.Future.result`, with these
        differences: `timeout` is not implemented, and failures are raised as
        plain `Exception` (or the exception raised by the call itself).

        :param timeout: Not implemented; any value other than None raises
            NotImplementedError. Default None (wait indefinitely).
        :param check_only: When True, return None immediately if the call has
            not completed yet instead of waiting. Default False.
        :param throw_except: Raise if the call or the remote handler failed;
            when False, return None instead. Default True.
        :param storage_handler: Storage handler to poll cloud storage. When
            None, one is built from the default configuration. Default None.
        :return: Result of the call, or None for a failed call when
            `throw_except` is False.
        :raises ValueError: If the job has not been invoked yet.
        :raises NotImplementedError: If `timeout` is not None.
        :raises Exception: If the remote handler reported a failure and
            `throw_except` is True.
        """
        if self._state == JobState.new:
            raise ValueError("job not yet invoked")

        # Already resolved locally: answer from the cached outcome.
        if self._state == JobState.success:
            return self._return_val

        if self._state == JobState.error:
            if throw_except:
                raise self._exception
            else:
                return None

        if storage_handler is None:
            storage_config = wrenconfig.extract_storage_config(
                wrenconfig.default())
            storage_handler = storage.Storage(storage_config)

        storage_utils.check_storage_path(storage_handler.get_storage_config(),
                                         self.storage_path)

        call_status = storage_handler.get_call_status(self.callset_id,
                                                      self.call_id)
        self.status_query_count += 1

        ## FIXME implement timeout
        if timeout is not None:
            raise NotImplementedError()

        if check_only is True:
            if call_status is None:
                return None

        # Poll until the status object shows up.
        while call_status is None:
            time.sleep(self.GET_RESULT_SLEEP_SECS)
            call_status = storage_handler.get_call_status(
                self.callset_id, self.call_id)

            self.status_query_count += 1
        self._invoke_metadata['status_done_timestamp'] = time.time()
        self._invoke_metadata['status_query_count'] = self.status_query_count

        self.run_status = call_status  # this is the remote status information
        self.invoke_status = self._invoke_metadata  # local status information
        # Diagnostics go through the logger: the former Python 2 print
        # statements were syntax errors on Python 3.
        logger.debug("callset_id: %s", self.callset_id)
        if call_status['exception'] is not None:
            # the wrenhandler had an exception
            exception_str = call_status['exception']
            logger.debug("call_status: %s", call_status)
            exception_args = call_status['exception_args']
            if exception_args[0] == "WRONGVERSION":
                if throw_except:
                    raise Exception("Pywren version mismatch: remote " + \
                        "expected version {}, local library is version {}".format(
                            exception_args[2], exception_args[3]))
                return None
            elif exception_args[0] == "OUTATIME":
                if throw_except:
                    raise Exception("process ran out of time")
                return None
            else:
                if throw_except:
                    if 'exception_traceback' in call_status:
                        logger.error(call_status['exception_traceback'])
                    raise Exception(exception_str, *exception_args)
                return None

        # NOTE(review): the call output is unpickled as-is; this is only safe
        # if the storage bucket is trusted.
        call_output_time = time.time()
        call_invoker_result = pickle.loads(
            storage_handler.get_call_output(self.callset_id, self.call_id))

        call_output_time_done = time.time()
        self._invoke_metadata[
            'download_output_time'] = call_output_time_done - call_output_time

        self._invoke_metadata[
            'download_output_timestamp'] = call_output_time_done
        call_success = call_invoker_result['success']
        logger.info("ResponseFuture.result() {} {} call_success {}".format(
            self.callset_id, self.call_id, call_success))

        self._call_invoker_result = call_invoker_result

        if call_success:

            self._return_val = call_invoker_result['result']
            self._state = JobState.success
            return self._return_val

        # Record the failure regardless of `throw_except`. Previously the
        # state was only updated when raising, so a failed call fetched with
        # throw_except=False stayed "not done" and was downloaded again on
        # every later call.
        logger.debug("exception")
        self._exception = call_invoker_result['result']
        self._traceback = (call_invoker_result['exc_type'],
                           call_invoker_result['exc_value'],
                           call_invoker_result['exc_traceback'])
        self._state = JobState.error

        if not throw_except:
            return None  # nothing, don't raise, no value

        if call_invoker_result.get('pickle_fail', False):
            logging.warning(
                "there was an error pickling. The original exception: " + \
                    "{}\nThe pickling exception: {}".format(
                        call_invoker_result['exc_value'],
                        str(call_invoker_result['pickle_exception'])))

            reraise(Exception, call_invoker_result['exc_value'],
                    call_invoker_result['exc_traceback'])
        else:
            # reraise the exception
            reraise(*self._traceback)
Пример #9
0
def _wait(fs,
          return_early_n,
          max_direct_query_n,
          random_query=False,
          THREADPOOL_SIZE=16):
    """
    Internal function that performs the majority of the WAIT task work.

    For the list of futures `fs` we:
    1. use list() to quickly get a list of which ones are done (but
       list can be behind due to eventual consistency issues)
    2. then individually call get_status on at most `max_direct_query_n`
       of the remaining futures, returning early once at least
       `return_early_n` have been found done.

    This can mitigate the stragglers.

    :param fs: Futures to inspect. The unfinished ones must all share one
        callset_id, otherwise NotImplementedError is raised.
    :param return_early_n: Stop issuing direct status queries once this many
        of the unfinished futures are known to be done.
    :param max_direct_query_n: Upper bound on the number of direct status
        queries issued in this call.
    :param random_query: Query the futures in random order instead of the
        order in which they were given. Default False.
    :param THREADPOOL_SIZE: Worker threads used for status queries and result
        downloads; also the size of one query batch. Default 16.
    :return: Tuple `(fs_dones, fs_notdones)`.
    """

    # get all the futures that are not yet done
    not_done_futures = [
        f for f in fs if f._state not in [JobState.success, JobState.error]
    ]
    if len(not_done_futures) == 0:
        return fs, []

    storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
    storage_handler = storage.Storage(storage_config)

    ### Callset optimization via object store convenience functions:
    # check if the not-done ones have the same callset_id
    present_callsets = set([f.callset_id for f in not_done_futures])
    if len(present_callsets) > 1:
        raise NotImplementedError()

    # get the list of all objects in this callset
    callset_id = present_callsets.pop()  # FIXME assume only one

    # note this returns everything done, so we have to figure out
    # the intersection of those that are done
    callids_done_in_callset = set(
        storage_handler.get_callset_status(callset_id))

    not_done_call_ids = set([f.call_id for f in not_done_futures])

    done_call_ids = not_done_call_ids.intersection(callids_done_in_callset)
    not_done_call_ids = not_done_call_ids - done_call_ids

    still_not_done_futures = [
        f for f in not_done_futures if (f.call_id in not_done_call_ids)
    ]

    def fetch_future_status(f):
        return storage_handler.get_call_status(f.callset_id, f.call_id)

    def get_result(f):
        f.result(throw_except=False, storage_handler=storage_handler)

    pool = ThreadPool(THREADPOOL_SIZE)
    try:
        # now try up to max_direct_query_n direct status queries, quitting
        # once we have return_early_n done.
        query_count = 0
        max_queries = min(max_direct_query_n, len(still_not_done_futures))

        if random_query:
            random.shuffle(still_not_done_futures)

        while query_count < max_queries:

            if len(done_call_ids) >= return_early_n:
                break
            # Cap the batch at max_queries so the last batch cannot push the
            # number of direct queries past max_direct_query_n.
            batch_end = min(query_count + THREADPOOL_SIZE, max_queries)
            fs_to_query = still_not_done_futures[query_count:batch_end]

            fs_statuses = pool.map(fetch_future_status, fs_to_query)

            # a status object in storage means the call has completed
            done_call_ids.update(
                f.call_id for f, status in zip(fs_to_query, fs_statuses)
                if status is not None)
            query_count += len(fs_to_query)

        # now we walk through all the original queries and get
        # the ones that are actually done.
        fs_dones = []
        fs_notdones = []

        f_to_wait_on = []
        for f in fs:
            if f._state in [JobState.success, JobState.error]:
                # done, don't need to do anything
                fs_dones.append(f)
            elif f.call_id in done_call_ids:
                f_to_wait_on.append(f)
                fs_dones.append(f)
            else:
                fs_notdones.append(f)

        pool.map(get_result, f_to_wait_on)
    finally:
        # Always release the worker threads, even if a query or download
        # raised.
        pool.close()
        pool.join()

    return fs_dones, fs_notdones