Пример #1
0
def calculate_splits(config):
    """Pick a split strategy based on ``config`` and return its result.

    Keys read from ``config``:

    * ``input_uri`` -- MongoDB URI of the input collection.  When missing,
      ``mongodb://localhost/test.in`` is used and stored back in ``config``.
    * ``create_input_splits`` -- when falsy, a single split is calculated.
    * ``use_shards`` / ``use_chunks`` -- when either is truthy and the
      ``collstats`` result marks the collection as sharded, the sharded
      split calculation is used; otherwise the unsharded one is.

    :param config: dict-like job configuration; ``input_uri`` is filled in
        with the default when absent.
    :returns: the return value of the selected ``calculate_*`` helper.
    """
    # TODO: if the user does not specify an input URI, construct it from the
    # db/collection name instead of relying on a fixed default.
    uri = config.setdefault("input_uri", "mongodb://localhost/test.in")
    collection_name = uri_parser.parse_uri(uri)['collection']

    db = get_database(uri)
    stats = db.command("collstats", collection_name)

    is_sharded = stats.get("sharded", False)
    use_shards = config.get("use_shards", False)
    use_chunks = config.get("use_chunks", False)

    logging.info(
        "Calculate Splits Code ... Use Shards? - %s\n"
        "Use Chunks? - %s\n"
        "Collection Sharded? - %s",
        use_shards, use_chunks, is_sharded)

    # The full config is only useful when debugging, and its URIs may embed
    # credentials, so keep it out of INFO-level output.
    logging.debug("Split configuration: %s", config)

    if not config.get("create_input_splits"):
        logging.info("Creation of Input Splits is disabled; "
                     "Non-Split mode calculation entering.")
        return calculate_single_split(config)

    logging.info("Creation of Input Splits is enabled.")
    if is_sharded and (use_shards or use_chunks):
        if use_shards and use_chunks:
            logging.warning(
                "Combining 'use chunks' and 'read from shards directly' can "
                "have unexpected & erratic behavior in a live system due to "
                "chunk migrations.")

        logging.info("Sharding mode calculation entering.")
        return calculate_sharded_splits(config, use_shards, use_chunks, uri)

    # It is perfectly ok for sharded setups to run with a normally calculated
    # split; it may even be more efficient in some cases.
    logging.info("Using Unsharded Split mode "
                 "(Calculating multiple splits though)")
    return calculate_unsharded_splits(config, uri)
Пример #2
0
def calculate_splits(config):
    """Pick a split strategy based on ``config`` and return its result.

    Keys read from ``config``:

    * ``input_uri`` -- MongoDB URI of the input collection.  When missing,
      ``mongodb://localhost/test.in`` is used and stored back in ``config``.
    * ``create_input_splits`` -- when falsy, a single split is calculated.
    * ``use_shards`` / ``use_chunks`` -- when either is truthy and the
      ``collstats`` result marks the collection as sharded, the sharded
      split calculation is used; otherwise the unsharded one is.

    :param config: dict-like job configuration; ``input_uri`` is filled in
        with the default when absent.
    :returns: the return value of the selected ``calculate_*`` helper.
    """
    # TODO: if the user does not specify an input URI, construct it from the
    # db/collection name instead of relying on a fixed default.
    uri = config.setdefault("input_uri", "mongodb://localhost/test.in")
    collection_name = uri_parser.parse_uri(uri)['collection']

    db = get_database(uri)
    stats = db.command("collstats", collection_name)

    is_sharded = stats.get("sharded", False)
    use_shards = config.get("use_shards", False)
    use_chunks = config.get("use_chunks", False)

    logging.info(
        "Calculate Splits Code ... Use Shards? - %s\n"
        "Use Chunks? - %s\n"
        "Collection Sharded? - %s",
        use_shards, use_chunks, is_sharded)

    # The full config is only useful when debugging, and its URIs may embed
    # credentials, so keep it out of INFO-level output.
    logging.debug("Split configuration: %s", config)

    if not config.get("create_input_splits"):
        logging.info("Creation of Input Splits is disabled; "
                     "Non-Split mode calculation entering.")
        return calculate_single_split(config)

    logging.info("Creation of Input Splits is enabled.")
    if is_sharded and (use_shards or use_chunks):
        if use_shards and use_chunks:
            logging.warning(
                "Combining 'use chunks' and 'read from shards directly' can "
                "have unexpected & erratic behavior in a live system due to "
                "chunk migrations.")

        logging.info("Sharding mode calculation entering.")
        return calculate_sharded_splits(config, use_shards, use_chunks, uri)

    # It is perfectly ok for sharded setups to run with a normally calculated
    # split; it may even be more efficient in some cases.
    logging.info("Using Unsharded Split mode "
                 "(Calculating multiple splits though)")
    return calculate_unsharded_splits(config, uri)
Пример #3
0
def calculate_splits(config):
    """Pick a split strategy based on ``config`` and return its result.

    Keys read from ``config``:

    * ``input_uri`` -- MongoDB URI of the input collection.  When missing,
      ``mongodb://localhost/test.in`` is used and stored back in ``config``.
    * ``create_input_splits`` -- when falsy, a single split is calculated.
    * ``use_shards`` / ``use_chunks`` -- when either is truthy and the
      ``collstats`` result marks the collection as sharded, the sharded
      split calculation is used; otherwise the unsharded one is.
    * ``admin_uri`` -- only consulted on the unsharded path; defaults to
      ``mongodb://localhost/admin`` and is stored back in ``config``.

    :param config: dict-like job configuration; missing URI defaults are
        filled in as described above.
    :returns: the return value of the selected ``calculate_*`` helper.
    """
    # TODO: if the user does not specify an input URI, construct it from the
    # db/collection name instead of relying on a fixed default.
    uri = config.setdefault("input_uri", "mongodb://localhost/test.in")
    uri_info = uri_parser.parse_uri(uri)

    db = get_database(uri)
    stats = db.command("collstats", uri_info['collection'])

    is_sharded = stats.get('sharded', False)
    use_shards = config.get("use_shards", False)
    use_chunks = config.get("use_chunks", False)

    if not config.get("create_input_splits"):
        logging.info("Creation of Input Splits is disabled; "
                     "Non-Split mode calculation entering.")
        return calculate_single_split(config)

    logging.info("Creation of Input Splits is enabled.")
    if is_sharded and (use_shards or use_chunks):
        if use_shards and use_chunks:
            logging.warning(
                "Combining 'use chunks' and 'read from shards directly' can "
                "have unexpected & erratic behavior in a live system due to "
                "chunk migrations.")

        logging.info("Sharding mode calculation entering.")
        return calculate_sharded_splits(config, use_shards, use_chunks, uri)

    # It is perfectly ok for sharded setups to run with a normally calculated
    # split; it may even be more efficient in some cases.
    adminuri = config.setdefault('admin_uri', 'mongodb://localhost/admin')
    logging.info("Using Unsharded Split mode "
                 "(Calculating multiple splits though)")
    return calculate_unsharded_splits(config, uri, adminuri)
Пример #4
0
def calculate_unsharded_splits(config, uri, adminuri):
    """Split a collection into key ranges using the ``splitVector`` command.

    Issues ``splitVector`` through the database obtained from ``adminuri``
    for the collection obtained from ``uri``, then turns the returned split
    keys into consecutive ranges, building one split per range with
    ``_split``.

    Keys read from ``config``: ``query`` (defaults to ``{}``), ``split_key``,
    ``split_size``, ``split_min`` and ``split_max``.  ``min``/``max`` are
    only sent with the command when both bounds are set.

    :param config: dict-like job configuration.
    :param uri: URI passed to ``get_collection`` to locate the collection.
    :param adminuri: URI passed to ``get_database`` to run the command.
    :returns: list with the ``format_uri_with_query()`` result of every
        split.  When the command yields no split keys, the list holds a
        single split spanning ``split_min`` to ``split_max``.
    :raises Exception: if the command result has a truthy ``err`` value or
        its ``ok`` value is not ``1.0``.
    """
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)
    admindb = get_database(adminuri)

    query = config.get("query", {})

    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # Build the split command; it should end up looking like:
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #    ('force', True), ('keyPattern', {'x': 1})])
    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False

    split_max = config.get('split_max')
    split_min = config.get('split_min')
    if split_min is not None and split_max is not None:
        cmd["min"] = split_min
        cmd["max"] = split_max

    logging.debug("Issuing Command: %s", cmd)
    data = admindb.command(cmd)
    logging.debug("%r", data)

    # Results should look like this:
    # {u'ok': 1.0, u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    # {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}
    if data.get("err"):
        raise Exception(data.get("err"))
    elif data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    split_keys = data.get('splitKeys') or []
    if split_keys:
        logging.info("Calculated %s splits", len(split_keys))
    else:
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. "
            "Proceeding with a *single* split. Data may be too small, try "
            "lowering 'mongo.input.split_size' if this is undesirable.")

    # Pair up consecutive boundaries: split_min -> key1 -> ... -> split_max.
    # With no split keys this produces exactly one split covering the whole
    # range, which is what the warning above promises (previously an empty
    # list was returned in that case).
    bounds = [split_min] + list(split_keys) + [split_max]
    splits = [_split(config, query, lower, upper)
              for lower, upper in zip(bounds[:-1], bounds[1:])]

    return [s.format_uri_with_query() for s in splits]