def subparser(subparsers):
    '''Register subparser modules'''
    deployparser = subparsers.add_parser(
        'deploy', help='Deploy data on a RADOS-Ceph cluster.')
    deployparser.add_argument(
        'paths',
        metavar='path',
        type=str,
        nargs='+',
        help='Data path(s) to deploy on the remote cluster.')
    deployparser.add_argument('--admin',
                              metavar='id',
                              dest='admin_id',
                              type=int,
                              default=None,
                              help='ID of the Ceph admin node.')
    deployparser.add_argument(
        '--mountpoint',
        metavar='path',
        type=str,
        default=start_defaults.mountpoint_path(),
        help='Mountpoint for CephFS on all nodes (default={}).'.format(
            start_defaults.mountpoint_path()))
    deployparser.add_argument(
        '--stripe',
        metavar='amount',
        type=int,
        default=defaults.stripe(),
        help=
        'Striping, in megabytes (default={}MB). Must be a multiple of 4. Make sure that every file is smaller than the set stripe size.'
        .format(defaults.stripe()))
    deployparser.add_argument(
        '--copy-multiplier',
        metavar='amount',
        dest='copy_multiplier',
        type=int,
        default=1,
        help=
        'Copy multiplier (default=1). Every file will be copied "amount"-1 times on the remote, to make the data look "amount" times larger. This multiplier is applied first.'
    )
    deployparser.add_argument(
        '--link-multiplier',
        metavar='amount',
        dest='link_multiplier',
        type=int,
        default=1,
        help=
        'Link multiplier (default=1). Every file will receive "amount"-1 hardlinks on the remote, to make the data look "amount" times larger. This multiplier is applied second. Note that we first apply the copy multiplier, meaning: the link multiplier is applied on copies of files, and the dataset inflation stacks.'
    )
    deployparser.add_argument('--silent',
                              help='If set, less boot output is shown.',
                              action='store_true')
    return [deployparser]
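
# A rough usage sketch of how a registration hook like subparser() above is typically wired
# into a top-level argparse parser. This is a hypothetical, self-contained stand-in (the
# 'rados-deploy' prog name is made up, and hard-coded defaults replace the project's
# defaults.stripe() / start_defaults.mountpoint_path()); it is not the project's actual CLI entry point.
import argparse


def register_deploy(subparsers):
    parser = subparsers.add_parser('deploy', help='Deploy data on a RADOS-Ceph cluster.')
    parser.add_argument('paths', metavar='path', type=str, nargs='+',
                        help='Data path(s) to deploy on the remote cluster.')
    parser.add_argument('--stripe', metavar='amount', type=int, default=4,
                        help='Striping, in megabytes. Must be a multiple of 4.')
    parser.add_argument('--silent', action='store_true',
                        help='If set, less boot output is shown.')
    return [parser]


if __name__ == '__main__':
    top = argparse.ArgumentParser(prog='rados-deploy')
    subparsers = top.add_subparsers(dest='command')
    register_deploy(subparsers)
    # Equivalent to: rados-deploy deploy /data/a.parquet /data/b.parquet --stripe 8
    parsed = top.parse_args(['deploy', '/data/a.parquet', '/data/b.parquet', '--stripe', '8'])
    print(parsed.command, parsed.paths, parsed.stripe)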
# Example #2
def subparser(subparsers):
    '''Register subparser modules'''
    deployparser = subparsers.add_parser(
        'deploy', help='Deploy data generators on a RADOS-Ceph cluster.')
    deployparser.add_argument(
        'cmd',
        metavar='cmd',
        type=str,
        help=
        'Command to execute on the remote cluster. Note: $JAVA_HOME/bin/java is available for Java applications, and python3 for Python applications. If the command itself needs flags starting with "-", prefix them with "--" (e.g. "-- -h") so they are not parsed as options of this tool.'
    )
    deployparser.add_argument('--admin',
                              metavar='id',
                              dest='admin_id',
                              type=int,
                              default=None,
                              help='ID of the Ceph admin node.')
    deployparser.add_argument(
        '--paths',
        metavar='path',
        type=str,
        nargs='+',
        help=
        'Path(s) to applications to deploy on the remote cluster. Given applications will be available in the CWD for command execution.'
    )
    deployparser.add_argument(
        '--mountpoint',
        metavar='path',
        type=str,
        default=start_defaults.mountpoint_path(),
        help='Mountpoint for CephFS on all nodes (default={}).'.format(
            start_defaults.mountpoint_path()))
    deployparser.add_argument(
        '--stripe',
        metavar='amount',
        type=int,
        default=defaults.stripe(),
        help=
        'Striping, in megabytes (default={}MB). Must be a multiple of 4. Make sure that every file is smaller than the set stripe size.'
        .format(defaults.stripe()))
    deployparser.add_argument(
        '--multiplier',
        metavar='amount',
        type=int,
        default=1,
        help=
        'Data multiplier (default=1). Every copied file will receive "amount"-1 hardlinks, to make the data look "amount" times larger.'
    )
    deployparser.add_argument('--silent',
                              help='If set, less boot output is shown.',
                              action='store_true')
    return [deployparser]
def deploy(reservation,
           paths=None,
           key_path=None,
           admin_id=None,
           connectionwrapper=None,
           stripe=defaults.stripe(),
           copy_multiplier=1,
           link_multiplier=1,
           mountpoint_path=start_defaults.mountpoint_path(),
           silent=False):
    '''Deploy data on remote RADOS-Ceph clusters, on an existing reservation.
    Dataset sizes can be inflated on the remote, using 2 strategies:
     1. link multiplication: Every dataset file receives `x` hardlinks.
        The hardlinks make the dataset size appear to be `x` times larger, but in reality, just the original file consumes space.
        This method is very fast, but has drawbacks: only the original files are stored by Ceph.
        When using the RADOS-Arrow connector, this means Arrow will send work only to the nodes that contain the original data.
        E.g: If we deploy 1 file of 64MB, with link multiplier 1024, the data will appear to be 64GB.
             The storage space used on RADOS-Ceph will still be 64MB, because we have 1 real file of 64MB, and 1023 hardlinks to that 1 file.
             The actual data is only stored on 3 OSDs (with the default Ceph replication factor of 3).
             Arrow will then send all work to those 3 OSDs containing the data, while the rest sit idle.
     2. file multiplication: Every dataset file receives `x` copies.
        This method is slower than the one listed above, because real data has to be copied.
        It also actually increases storage usage, contrary to the above.
        However, because we multiply real data, the load is balanced across nodes, to the extent that Ceph balances data placement.

    Note that multiple multiplication techniques can be combined, in which case they stack.
    E.g: If we deploy 1 file of 64MB, with a copy multiplier 4 and a link multiplier 1024, we get 4 real files (1 original + 3 copies),
         and each file gets 1023 hardlinks assigned to it. A standalone sketch of this arithmetic follows this function.

    Args:
        reservation (`metareserve.Reservation`): Reservation object with all nodes to start RADOS-Ceph on.
        key_path (optional str): Path to SSH key, which we use to connect to nodes. If `None`, we do not authenticate using an IdentityFile.
        admin_id (optional int): Node id of the ceph admin. If `None`, the node with lowest public ip value (string comparison) will be picked.
        connectionwrapper (optional RemotoSSHWrapper): If set, uses given connection, instead of building a new one.
        paths (optional list(str)): Data paths to offload to the remote cluster. Can be relative to CWD or absolute.
        stripe (optional int): Ceph object stripe property, in megabytes.
        copy_multiplier (optional int): If set to a value `x`, makes the dataset appear `x` times larger by copying every file `x`-1 times. Does nothing if `x`<=1.
        link_multiplier (optional int): If set to a value `x`, makes the dataset appear `x` times larger by adding `x`-1 hardlinks for every transferred file. Does nothing if `x`<=1.
        mountpoint_path (optional str): Path where CephFS is mounted on all nodes.
        silent (optional bool): If set, we only print errors and critical info. Otherwise, more verbose output.

    Returns:
        `True` on success, `False` otherwise.'''
    module = importer.import_full_path(
        fs.join(fs.dirname(fs.abspath(__file__)), 'internal', 'data_deploy',
                'rados_deploy.deploy.plugin.py'))
    dest = mountpoint_path  # Destination path on the remote: the CephFS mountpoint.
    args = []
    kwargs = {
        'admin_id': admin_id,
        'connectionwrapper': connectionwrapper,
        'stripe': stripe,
        'copy_multiplier': copy_multiplier,
        'link_multiplier': link_multiplier
    }
    return module.execute(reservation, key_path, paths, dest, silent, *args,
                          **kwargs)
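
# The inflation arithmetic described in the docstring above, illustrated with a small
# standalone sketch. This is not part of the project: it only mimics, on a local temporary
# directory, how the copy multiplier and link multiplier stack (real copies consume space,
# hardlinks only inflate the apparent size).
import os
import shutil
import tempfile


def inflate(src_file, dest_dir, copy_multiplier=1, link_multiplier=1):
    '''Place src_file in dest_dir as copy_multiplier real files (original included),
    then give every real file link_multiplier-1 hardlinks.'''
    base = os.path.join(dest_dir, os.path.basename(src_file))
    shutil.copy(src_file, base)
    real_files = [base]
    for i in range(1, max(copy_multiplier, 1)):  # Real copies: these consume storage.
        copy_path = '{}.copy{}'.format(base, i)
        shutil.copy(src_file, copy_path)
        real_files.append(copy_path)
    for path in real_files:                      # Hardlinks: apparent size only.
        for i in range(1, max(link_multiplier, 1)):
            os.link(path, '{}.link{}'.format(path, i))


with tempfile.TemporaryDirectory() as tmp:
    src = os.path.join(tmp, 'original.bin')
    with open(src, 'wb') as f:
        f.write(b'\0' * (64 * 1024))             # 64KB stand-in for a 64MB file.
    dest = os.path.join(tmp, 'remote')
    os.mkdir(dest)
    inflate(src, dest, copy_multiplier=4, link_multiplier=4)
    entries = [os.path.join(dest, e) for e in os.listdir(dest)]
    apparent = sum(os.path.getsize(e) for e in entries)
    real = len({os.stat(e).st_ino for e in entries}) * os.path.getsize(src)
    # 16 entries: 4 real files (1 original + 3 copies), each with 3 hardlinks.
    # Apparent size is 16x the original; real storage use is only 4x.
    print('{} entries, apparent {}KB, real {}KB'.format(len(entries), apparent // 1024, real // 1024))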
# Example #4
def execute(reservation, key_path, paths, dest, silent, copy_multiplier, link_multiplier, *args, **kwargs):
    '''Deploy data on remote RADOS-Ceph clusters, on an existing reservation.
    Dataset sizes can be inflated on the remote, using 2 strategies:
     1. link multiplication: Every dataset file receives `x` hardlinks.
        The hardlinks make the dataset size appear to be `x` times larger, but in reality, just the original file consumes space.
        This method is very fast, but has drawbacks: only the original files are stored by Ceph.
        When using the RADOS-Arrow connector, this means Arrow will send work only to the nodes that contain the original data.
        E.g: If we deploy 1 file of 64MB, with link multiplier 1024, the data will appear to be 64GB.
             The storage space used on RADOS-Ceph will still be 64MB, because we have 1 real file of 64MB, and 1023 hardlinks to that 1 file.
             The actual data is only stored on 3 OSDs (with the default Ceph replication factor of 3).
             Arrow will then send all work to those 3 OSDs containing the data, while the rest sit idle.
     2. file multiplication: Every dataset file receives `x` copies.
        This method is slower than the one listed above, because real data has to be copied.
        It also actually increases storage usage, contrary to the above.
        However, because we multiply real data, the load is balanced across nodes, to the extent that Ceph balances data placement.

    Note that multiple multiplication techniques can be combined, in which case they stack.
    E.g: If we deploy 1 file of 64MB, with a copy multiplier 4 and a link multiplier 1024, we get 4 real files (1 original + 3 copies),
         and each file gets 1023 hardlinks assigned to it.

    Returns:
        `True` on success, `False` otherwise.'''
    connectionwrapper = kwargs.get('connectionwrapper')
    admin_id = kwargs.get('admin_id')
    stripe = kwargs.get('stripe') or defaults.stripe()

    if stripe < 4:
        raise ValueError('Stripe size must be equal to or greater than 4MB (and a multiple of 4MB)!')
    if stripe % 4 != 0:
        raise ValueError('Stripe size must be a multiple of 4MB!')

    admin_node, _ = _pick_admin(reservation, admin=admin_id)
    use_local_connections = connectionwrapper is None
    if use_local_connections: # We did not get a connection, so we must make one.
        ssh_kwargs = {'IdentitiesOnly': 'yes', 'StrictHostKeyChecking': 'no'}
        if key_path:
            ssh_kwargs['IdentityFile'] = key_path

        connectionwrapper = ssh_wrapper.get_wrapper(admin_node, admin_node.ip_public, ssh_params=_merge_kwargs(ssh_kwargs, {'User': admin_node.extra_info['user']}), silent=silent)
    else: # We received a connection; check that it is valid.
        if not connectionwrapper.open:
            raise ValueError('Provided connection is not open.')

    retval = _execute_internal(connectionwrapper, reservation, paths, dest, silent, copy_multiplier, link_multiplier, admin_node, stripe)
    if use_local_connections: # Only close connections we created ourselves; caller-provided connections stay open.
        ssh_wrapper.close_wrappers([connectionwrapper])
    return retval
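
# execute() above builds an SSH connection only when the caller did not pass one, and closes
# only connections it built itself. A hedged, generic sketch of that ownership rule, with
# hypothetical open_connection/close_connection callables standing in for
# ssh_wrapper.get_wrapper / ssh_wrapper.close_wrappers:
import contextlib


@contextlib.contextmanager
def maybe_own_connection(connection, open_connection, close_connection):
    '''Yield a usable connection; build one only when the caller passed None,
    and close only connections built here.'''
    created_here = connection is None
    if created_here:
        connection = open_connection()
    try:
        yield connection
    finally:
        if created_here:  # Never close a caller-owned connection.
            close_connection(connection)


def _open_dummy():
    print('opened')
    return object()


def _close_dummy(conn):
    print('closed')


with maybe_own_connection(None, _open_dummy, _close_dummy) as conn:
    pass      # Prints 'opened', then 'closed' on exit: we created it, so we close it.

with maybe_own_connection(object(), _open_dummy, _close_dummy) as conn:
    pass      # Prints nothing: the connection is caller-owned and stays open.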
def generate(reservation,
             key_path=None,
             admin_id=None,
             cmd=None,
             paths=None,
             stripe=defaults.stripe(),
             multiplier=1,
             mountpoint_path=start_defaults.mountpoint_path(),
             silent=False):
    '''Generate data on a RADOS-Ceph cluster, on an existing reservation.
    Args:
        reservation (`metareserve.Reservation`): Reservation object with all nodes to start RADOS-Ceph on.
        key_path (optional str): Path to SSH key, which we use to connect to nodes. If `None`, we do not authenticate using an IdentityFile.
        admin_id (optional int): Node id of the ceph admin. If `None`, the node with lowest public ip value (string comparison) will be picked.
        cmd (optional str): Command to execute on the remote cluster to generate the data.
        paths (optional list(str)): Data paths to offload to the remote cluster. Can be relative to CWD or absolute.
        stripe (optional int): Ceph object stripe property, in megabytes.
        multiplier (optional int): If set to a value `x`, makes the dataset appear `x` times larger by adding `x`-1 hardlinks for every transferred file. Does nothing if `x`<=1.
        mountpoint_path (optional str): Path where CephFS is mounted on all nodes.
        silent (optional bool): If set, we only print errors and critical info. Otherwise, more verbose output.

    Returns:
        `True` on success, `False` otherwise.'''
    if not reservation or len(reservation) == 0:
        raise ValueError('Reservation does not contain any items' +
                         (' (reservation=None)' if not reservation else ''))
    if stripe < 4:
        raise ValueError(
            'Stripe size must be equal to or greater than 4MB (and a multiple of 4MB)!'
        )
    if stripe % 4 != 0:
        raise ValueError('Stripe size must be a multiple of 4MB!')
    if not cmd:
        raise ValueError('Command to generate data not provided.')
    # Remote data generation is not implemented yet; only the argument validation above runs.
    raise NotImplementedError
# Example #6
def parse(args):
    parser = argparse.ArgumentParser(prog='...')
    parser.add_argument('--admin', metavar='id', dest='admin_id', type=int, default=None, help='ID of the node that will be the primary or admin node.')
    parser.add_argument('--stripe', metavar='amount', type=int, default=defaults.stripe(), help='Striping, in megabytes (default={}MB). Must be a multiple of 4. Make sure that every file is smaller than the set stripe size.'.format(defaults.stripe()))
    args = parser.parse_args(args)
    return True, [], {'admin_id': args.admin_id, 'stripe': args.stripe}
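
# A sketch of how a plugin-style parse() hook like the one above might be consumed. This is a
# hypothetical, self-contained stand-in: the hard-coded stripe default of 4 replaces the
# project's defaults.stripe(), and the (ok, args, kwargs) consumption shown is an assumption
# about the plugin convention, not the project's documented API.
import argparse


def parse_standalone(args):
    parser = argparse.ArgumentParser(prog='...')
    parser.add_argument('--admin', metavar='id', dest='admin_id', type=int, default=None)
    parser.add_argument('--stripe', metavar='amount', type=int, default=4)
    parsed = parser.parse_args(args)
    return True, [], {'admin_id': parsed.admin_id, 'stripe': parsed.stripe}


ok, extra_args, extra_kwargs = parse_standalone(['--admin', '3', '--stripe', '8'])
if ok:
    # A caller would forward these as *extra_args, **extra_kwargs to a plugin's execute().
    print(extra_kwargs)  # {'admin_id': 3, 'stripe': 8}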