def fetch_splits_from_shards(config, uri):
    """Internal method to fetch splits from a sharded db.

    Connects to the cluster's ``config`` database, enumerates the shard
    hosts, and computes unsharded splits against each shard directly.

    :param config: job configuration dict; its ``input_uri`` key is
        rewritten per-shard as a side effect
    :param uri: mongodb URI of the cluster
    :returns: The splits
    """
    # Bypassing mongos and reading the backend mongods directly can
    # produce wrong results (e.g. mid chunk-migration), hence the warning.
    logging.warning(
        "WARNING getting splits that connect directly to the backend mongods"
        " is risky and might not produce correct results")
    connection = get_connection(uri)
    configDB = connection["config"]
    shardsColl = configDB["shards"]
    shardSet = []
    cur = shardsColl.find()
    for row in cur:
        host = row.get('host')
        # A shard host may look like "rsName/host1:port,host2:port";
        # drop the replica-set name prefix and keep only the host list.
        slashIndex = host.find("/")
        if slashIndex > 0:
            host = host[slashIndex + 1:]
        shardSet.append(host)
    splits = []
    for host in shardSet:
        new_uri = get_new_URI(uri, host)
        config['input_uri'] = new_uri
        splits += calculate_unsharded_splits(config, new_uri)
    return splits
def __init__(self, stream, params):
    """Prepare the output connection from the job's parameter object.

    All attributes of *params* are flattened into a plain options dict,
    and the mongo connection/collection are opened from ``output_uri``.
    """
    opts = dict(params.__dict__.iteritems())
    self.uri = opts.get("output_uri")
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)
    self.key_name = opts.get("job_output_key", "_id")
    self.value_name = opts.get("job_output_value")
def __init__(self, stream, params):
    """Open the output connection described by the job parameters.

    Copies every attribute of *params* into a settings dict and then
    resolves the connection and collection from ``output_uri``.
    """
    settings = {k: v for k, v in vars(params).iteritems()}
    self.uri = settings.get('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)
    self.key_name = settings.get('job_output_key', '_id')
    self.value_name = settings.get('job_output_value')
def __init__(self, stream, params):
    """Set up the output target, including write-mode options.

    Besides the connection/collection from ``output_uri``, this variant
    also reads ``add_action`` (default ``'insert'``) and ``add_upsert``
    (default ``False``) from the parameters.
    """
    cfg = dict(params.__dict__)
    self.uri = cfg.get('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)
    self.key_name = cfg.get('job_output_key', '_id')
    self.value_name = cfg.get('job_output_value')
    self.add_action = cfg.get('add_action', 'insert')
    self.add_upsert = cfg.get('add_upsert', False)
def __init__(self, params):
    """Configure the output target from the ``mongodb`` section of *params*.

    Reads the options nested under the ``'mongodb'`` key (an empty dict
    when absent), then opens the connection/collection named by
    ``output_uri``. Also captures write-mode options and ``base_doc``.
    """
    mongo_opts = dict(params.get('mongodb', {}))
    self.uri = mongo_opts.get('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)
    self.key_name = mongo_opts.get('job_output_key', '_id')
    self.value_name = mongo_opts.get('job_output_value')
    self.add_action = mongo_opts.get('add_action', 'insert')
    self.add_upsert = mongo_opts.get('add_upsert', False)
    self.base_doc = mongo_opts.get('base_doc', {})
def fetch_splits_via_chunks(config, uri, use_shards):
    """Retrieves split objects based on chunks in mongo.

    Reads the chunk ranges for the target namespace from the cluster's
    ``config.chunks`` collection and emits one split per chunk, bounded
    with ``$min``/``$max`` on the shard key.

    :param config: job configuration dict (query, input_uri, fields, ...)
    :param uri: mongodb URI of the cluster
    :param use_shards: if True, point each split directly at the shard
        owning the chunk instead of at the mongos (risky)
    :returns: The splits, formatted as URIs with embedded queries for disco
    """
    originalQuery = config.get("query")
    if use_shards:
        logging.warning(
            "WARNING getting splits that connect directly to the backend"
            " mongods is risky and might not produce correct results")
    logging.debug("fetch_splits_via_chunks: originalQuery: %s" % originalQuery)
    connection = get_connection(uri)
    configDB = connection["config"]
    shardMap = {}
    if use_shards:
        shardsColl = configDB["shards"]
        cur = shardsColl.find()
        for row in cur:
            host = row.get('host')
            # Strip a "replicaSetName/" prefix from the host string.
            slashIndex = host.find("/")
            if slashIndex > 0:
                host = host[slashIndex + 1:]
            shardMap[row.get('_id')] = host
    logging.debug(
        "MongoInputFormat.getSplitsUsingChunks(): shard map is: %s" % shardMap)
    chunksCollection = configDB["chunks"]
    logging.info(configDB.collection_names())
    query = bson.son.SON()
    uri_info = uri_parser.parse_uri(uri)
    # Chunks are keyed by namespace: "<database>.<collection>".
    query["ns"] = uri_info['database'] + '.' + uri_info['collection']
    cur = chunksCollection.find(query)
    # Bug fix: logging uses %-style lazy args, not print-style extra args;
    # the original logging.info("query is ", query) raised a format error.
    logging.info("query is %s", query)
    logging.info(cur.count())
    logging.info(chunksCollection.find().count())
    numChunks = 0
    splits = []
    for row in cur:
        numChunks += 1
        minObj = row.get('min')
        shardKeyQuery = bson.son.SON()
        # Renamed from min/max to avoid shadowing the builtins.
        chunk_min = bson.son.SON()
        chunk_max = bson.son.SON()
        for key in minObj:
            tMin = minObj[key]
            tMax = (row.get('max'))[key]
            # @to-do do type comparison first?
            chunk_min[key] = tMin
            chunk_max[key] = tMax
        if originalQuery is None:
            originalQuery = bson.son.SON()
        shardKeyQuery["$query"] = originalQuery
        shardKeyQuery["$min"] = chunk_min
        shardKeyQuery["$max"] = chunk_max
        inputURI = config.get("input_uri")
        if use_shards:
            shardName = row.get('shard')
            host = shardMap[shardName]
            inputURI = get_new_URI(inputURI, host)
        splits.append(MongoInputSplit(
            inputURI,
            config.get("input_key"),
            shardKeyQuery,
            config.get("fields"),
            config.get("sort"),
            config.get("limit", 0),
            config.get("skip", 0),
            config.get("timeout", True),
            config.get("slave_ok", False)))
    # return splits in uri format for disco
    return [s.format_uri_with_query() for s in splits]
def fetch_splits_via_chunks(config, uri, use_shards):
    """Retrieves split objects based on chunks in mongo.

    Enumerates the chunk documents for the target namespace in
    ``config.chunks`` and produces one split per chunk, constrained by
    ``$min``/``$max`` on the shard key.

    :param config: job configuration dict (query, input_uri, fields, ...)
    :param uri: mongodb URI of the cluster
    :param use_shards: if True, target each split at the shard owning
        the chunk rather than at the mongos (risky)
    :returns: The splits, formatted as URIs with embedded queries for disco
    """
    originalQuery = config.get("query")
    if use_shards:
        logging.warning(
            "WARNING getting splits that connect directly to the backend"
            " mongods is risky and might not produce correct results")
    logging.debug("fetch_splits_via_chunks: originalQuery: %s" % originalQuery)
    connection = get_connection(uri)
    configDB = connection["config"]
    shardMap = {}
    if use_shards:
        shardsColl = configDB["shards"]
        cur = shardsColl.find()
        for row in cur:
            host = row.get('host')
            # Strip a "replicaSetName/" prefix from the host string.
            slashIndex = host.find("/")
            if slashIndex > 0:
                host = host[slashIndex + 1:]
            shardMap[row.get('_id')] = host
    logging.debug(
        "MongoInputFormat.getSplitsUsingChunks(): shard map is: %s" % shardMap)
    chunksCollection = configDB["chunks"]
    logging.info(configDB.collection_names())
    query = bson.son.SON()
    uri_info = uri_parser.parse_uri(uri)
    # Chunks are keyed by namespace: "<database>.<collection>".
    query["ns"] = uri_info['database'] + '.' + uri_info['collection']
    cur = chunksCollection.find(query)
    # Bug fix: logging uses %-style lazy args, not print-style extra args;
    # the original logging.info("query is ", query) raised a format error.
    logging.info("query is %s", query)
    logging.info(cur.count())
    logging.info(chunksCollection.find().count())
    numChunks = 0
    splits = []
    for row in cur:
        numChunks += 1
        minObj = row.get('min')
        shardKeyQuery = bson.son.SON()
        # Renamed from min/max to avoid shadowing the builtins.
        chunk_min = bson.son.SON()
        chunk_max = bson.son.SON()
        for key in minObj:
            tMin = minObj[key]
            tMax = (row.get('max'))[key]
            # @to-do do type comparison first?
            chunk_min[key] = tMin
            chunk_max[key] = tMax
        if originalQuery is None:
            originalQuery = bson.son.SON()
        shardKeyQuery["$query"] = originalQuery
        shardKeyQuery["$min"] = chunk_min
        shardKeyQuery["$max"] = chunk_max
        inputURI = config.get("input_uri")
        if use_shards:
            shardName = row.get('shard')
            host = shardMap[shardName]
            inputURI = get_new_URI(inputURI, host)
        splits.append(MongoInputSplit(
            inputURI,
            config.get("input_key"),
            shardKeyQuery,
            config.get("fields"),
            config.get("sort"),
            config.get("limit", 0),
            config.get("skip", 0),
            config.get("timeout", True),
            config.get("slave_ok", False)))
    # return splits in uri format for disco
    return [s.format_uri_with_query() for s in splits]