def calculate_splits(config):
    """Read ``config`` and dispatch to the matching split calculation.

    NOTE(review): this file defines ``calculate_splits`` a second time
    further down; at import time the later definition replaces this one.
    Confirm which copy is meant to survive.

    Config keys read here:
      * ``input_uri``: MongoDB URI of the input collection. Defaults to
        ``mongodb://localhost/test.in`` and is written back into ``config``.
      * ``create_input_splits``: when falsy, a single split is produced.
      * ``use_shards`` / ``use_chunks``: request the sharded split path; only
        honoured when ``collstats`` reports the collection as sharded.
      * ``admin_uri``: URI handed to the unsharded splitter. Defaults to
        ``mongodb://localhost/admin`` and is written back into ``config``.

    :param config: mutable mapping of job settings (updated in place with
        the defaults described above).
    :returns: whatever the selected helper returns
        (``calculate_sharded_splits``, ``calculate_unsharded_splits`` or
        ``calculate_single_split``).
    """
    # if the user does not specify an inputURI we will need to construct it
    # from the db/collection name TODO
    uri = config.setdefault("input_uri", "mongodb://localhost/test.in")
    uri_info = uri_parser.parse_uri(uri)

    db = get_database(uri)
    stats = db.command("collstats", uri_info['collection'])

    is_sharded = stats.get("sharded", False)
    use_shards = config.get("use_shards", False)
    use_chunks = config.get("use_chunks", False)

    logging.info(
        " Calculate Splits Code ... Use Shards? - %s\n"
        "Use Chunks? - %s\n"
        "Collection Sharded? - %s",
        use_shards, use_chunks, is_sharded)
    # Full config dump is diagnostic only, so keep it out of INFO output.
    logging.debug("Split configuration: %r", config)

    if not config.get("create_input_splits"):
        logging.info("Creation of Input Splits is disabled; "
                     "Non-Split mode calculation entering.")
        return calculate_single_split(config)

    logging.info("Creation of Input Splits is enabled.")
    if is_sharded and (use_shards or use_chunks):
        if use_shards and use_chunks:
            logging.warning(
                "Combining 'use chunks' and 'read from shards "
                "directly' can have unexpected & erratic behavior in a live "
                "system due to chunk migrations. ")
        logging.info("Sharding mode calculation entering.")
        return calculate_sharded_splits(config, use_shards, use_chunks, uri)

    # perfectly ok for sharded setups to run with a normally calculated
    # split. May even be more efficient for some cases
    # calculate_unsharded_splits takes the admin URI as its third argument.
    adminuri = config.setdefault('admin_uri', 'mongodb://localhost/admin')
    logging.info("Using Unsharded Split mode "
                 "(Calculating multiple splits though)")
    return calculate_unsharded_splits(config, uri, adminuri)
def calculate_splits(config):
    """Read ``config`` and dispatch to the matching split calculation.

    Config keys read here:
      * ``input_uri``: MongoDB URI of the input collection. Defaults to
        ``mongodb://localhost/test.in`` and is written back into ``config``.
      * ``create_input_splits``: when falsy, a single split is produced.
      * ``use_shards`` / ``use_chunks``: request the sharded split path; only
        honoured when ``collstats`` reports the collection as sharded.
      * ``admin_uri``: URI handed to the unsharded splitter. Defaults to
        ``mongodb://localhost/admin`` and is written back into ``config``
        (only on the unsharded path).

    :param config: mutable mapping of job settings (updated in place with
        the defaults described above).
    :returns: whatever the selected helper returns
        (``calculate_sharded_splits``, ``calculate_unsharded_splits`` or
        ``calculate_single_split``).
    """
    # if the user does not specify an inputURI we will need to construct it
    # from the db/collection name TODO
    uri = config.setdefault("input_uri", "mongodb://localhost/test.in")
    uri_info = uri_parser.parse_uri(uri)

    db = get_database(uri)
    stats = db.command("collstats", uri_info['collection'])

    is_sharded = stats.get('sharded', False)
    use_shards = config.get("use_shards", False)
    use_chunks = config.get("use_chunks", False)

    if not config.get("create_input_splits"):
        logging.info("Creation of Input Splits is disabled; "
                     "Non-Split mode calculation entering.")
        return calculate_single_split(config)

    logging.info("Creation of Input Splits is enabled.")
    if is_sharded and (use_shards or use_chunks):
        if use_shards and use_chunks:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "Combining 'use chunks' and 'read from shards "
                "directly' can have unexpected & erratic behavior in a live "
                "system due to chunk migrations. ")
        logging.info("Sharding mode calculation entering.")
        return calculate_sharded_splits(config, use_shards, use_chunks, uri)

    # perfectly ok for sharded setups to run with a normally calculated
    # split. May even be more efficient for some cases
    adminuri = config.setdefault('admin_uri', 'mongodb://localhost/admin')
    logging.info("Using Unsharded Split mode "
                 "(Calculating multiple splits though)")
    return calculate_unsharded_splits(config, uri, adminuri)
def calculate_unsharded_splits(config, uri, adminuri):
    """Compute input splits by running ``splitVector`` on the collection.

    The command is issued against the database obtained from ``adminuri``
    and targets the collection obtained from ``uri``. Each consecutive pair
    of returned split keys becomes one split; the first split starts at
    ``split_min`` and the last one ends at ``split_max``. When the server
    returns no split keys, one split covering ``split_min``..``split_max``
    is produced, as the warning message states.

    Config keys read here: ``query`` (defaults to ``{}``), ``split_key``,
    ``split_size``, ``split_min`` and ``split_max``. The min/max bounds are
    only sent to the server when both are set.

    :param config: mapping of job settings.
    :param uri: URI of the collection to split.
    :param adminuri: URI of the database used to run the command.
    :returns: list with one ``format_uri_with_query()`` result per split
        (presumably URI strings -- confirm against ``_split``).
    :raises Exception: if the command reply carries ``err`` or its ``ok``
        field is not ``1.0``.
    """
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)
    admindb = get_database(adminuri)

    query = config.get("query", {})
    split_key = config.get('split_key')
    split_size = config.get('split_size')
    split_min = config.get('split_min')
    split_max = config.get('split_max')
    full_name = coll.full_name

    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # The command document is order-sensitive, hence SON. It should look like:
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #      ('force', True), ('keyPattern', {'x': 1})])
    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False
    if split_min is not None and split_max is not None:
        cmd["min"] = split_min
        cmd["max"] = split_max

    logging.debug("Issuing Command: %s", cmd)
    data = admindb.command(cmd)
    logging.debug("%r", data)

    # results should look like this
    # {u'ok': 1.0,
    #  u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    #                 {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}
    if data.get("err"):
        raise Exception(data.get("err"))
    if data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    # A missing or empty 'splitKeys' both mean "no boundaries found".
    split_keys = data.get('splitKeys') or []
    if not split_keys:
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. "
            "Proceeding with a *single* split. "
            "Data may be too small, try lowering "
            "'mongo.input.split_size' if this is undesirable.")
    else:
        logging.info("Calculated %s splits", len(split_keys))

    splits = []
    last_key = split_min
    for bound in split_keys:
        splits.append(_split(config, query, last_key, bound))
        last_key = bound
    # Always close the range; with no split keys this is the single split.
    splits.append(_split(config, query, last_key, split_max))

    return [s.format_uri_with_query() for s in splits]