Example #1
def host_dump(client, payload, target_filename, host_view=None, pre_warm=True):
    """Send payload to each host and dump it on the filesystem

    Nothing is done in case the file already exists.

    The payload is shipped only once per node in the cluster.

    """
    if host_view is None:
        host_view = get_host_view(client)

    client = host_view.client

    @interactive
    def dump_payload(payload, filename):
        from sklearn.externals import joblib
        import os
        folder = os.path.dirname(filename)
        if not os.path.exists(folder):
            os.makedirs(folder)
        return joblib.dump(payload, filename)

    missing_ids = _missing_file_engine_ids(host_view, target_filename)
    if missing_ids:
        first_id = missing_ids[0]

        # Do a first dispatch to the first node to avoid concurrent write in
        # case of shared filesystem
        client[first_id].apply_sync(dump_payload, payload, target_filename)

        # Refetch the list of engine ids where the file is missing
        missing_ids = _missing_file_engine_ids(host_view, target_filename)

        # Restrict the view to hosts where the target data file is still
        # missing for the final dispatch
        client[missing_ids].apply_sync(dump_payload, payload, target_filename)

    if pre_warm:
        warm_mmap(client, [target_filename], host_view=host_view)
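
A minimal usage sketch, assuming an IPython.parallel cluster is already running (e.g. started with ipcluster) and that get_host_view and _missing_file_engine_ids are defined in the same module as host_dump; the target path is only illustrative:

# Minimal usage sketch (assumptions: a running IPython.parallel cluster,
# host_dump importable together with its helpers; the path is illustrative).
import numpy as np
from IPython.parallel import Client

client = Client()

# A list of arrays, so the pre-warming pass can iterate and touch each one.
payload = [np.random.randn(1000, 10), np.random.randint(0, 2, size=1000)]

# Dump the payload once per physical host; engines on the same host share it.
host_dump(client, payload, '/tmp/shared_data/payload.pkl')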
Example #2
def warm_mmap(client, data_filenames, host_view=None):
    """Trigger a disk load on all the arrays data_filenames.

    Assume the files are shared on all the hosts using NFS or
    have been previously been dumped there with the host_dump function.
    """
    if host_view is None:
        host_view = get_host_view(client)

    # Second step: for each data file and host, mmap the arrays of the file
    # and trigger a sequential read of all the arrays' data
    @interactive
    def load_in_memory(filenames):
        from sklearn.externals import joblib
        for filename in filenames:
            arrays = joblib.load(filename, mmap_mode='r')
            for array in arrays:
                if hasattr(array, 'max'):
                    array.max()  # trigger the disk read

    data_filenames = [os.path.abspath(f) for f in data_filenames]
    host_view.apply_sync(load_in_memory, data_filenames)
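
A minimal usage sketch for pre-warming files that were already dumped on each host (e.g. by host_dump above), assuming the same cluster setup; the filename is only illustrative:

# Minimal usage sketch (assumptions: the file already exists on every host,
# a running IPython.parallel cluster; the filename is illustrative).
from IPython.parallel import Client

client = Client()

# Memory-map the arrays on each host and sequentially read them from disk
# so later computations hit the OS page cache instead of cold storage.
warm_mmap(client, ['/tmp/shared_data/payload.pkl'])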