def fetch_splits_from_shards(config, uri, slave_ok):
    """Internal method to fetch splits from a sharded db.

    Reads the ``shards`` collection of the ``config`` database reached
    through *uri*, builds one URI per shard with ``get_new_URI`` and
    concatenates the results of ``calculate_unsharded_splits`` for each one.

    :param config: dict-like job configuration. Its ``'input_uri'`` entry is
        overwritten with each shard URI in turn, so on return it holds the
        URI of the last shard processed.
    :param uri: MongoDB URI used to reach the ``config`` database.
    :param slave_ok: forwarded unchanged to ``get_new_URI`` and
        ``calculate_unsharded_splits``.
    :returns: The splits
    """
    logging.warning("WARNING getting splits that connect directly to the backend mongods is risky and might not produce correct results")
    connection = get_connection(uri)

    configDB = connection["config"]
    shardsColl = configDB["shards"]

    shardSet = []
    cur = shardsColl.find()
    try:
        for row in cur:
            host = row.get('host')
            # Keep only what follows the first "/" in the host entry
            # (presumably a "<replica-set>/<hosts>" form -- verify against
            # the deployment's config.shards documents).
            slashIndex = host.find("/")
            if slashIndex > 0:
                host = host[slashIndex + 1:]
            shardSet.append(host)
    finally:
        # Release the server-side cursor even if iteration fails.
        cur.close()

    splits = []
    for host in shardSet:
        new_uri = get_new_URI(uri, host, slave_ok)
        config['input_uri'] = new_uri
        splits += calculate_unsharded_splits(config, slave_ok, new_uri)

    return splits
def __init__(self, stream, params):
    """Set up the output connection from the attributes of *params*.

    :param stream: accepted for interface compatibility; not used here.
    :param params: object whose instance attributes hold the job
        configuration. The attributes read are ``output_uri``,
        ``job_output_key`` (defaults to ``'_id'``) and ``job_output_value``.
    """
    from mongo_util import get_connection, get_collection

    # Snapshot the attributes into a plain dict so that missing settings
    # fall back to defaults through ``.get`` instead of raising.
    config = dict(params.__dict__)

    self.uri = config.get('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)
    self.key_name = config.get('job_output_key', '_id')
    self.value_name = config.get('job_output_value')
def runTest(self):
    """Run splitVector by hand, then check calculate_splits also yields splits.

    Requires network access to the hard-coded MongoDB host and a
    module-level ``config`` providing ``db_name``, ``collection_name`` and
    ``split_size``.
    """
    #put 20000 objects in a database, call for a split by hand, then a split by the class
    conn = get_connection(
        'mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter'
    )
    db = conn[config.get('db_name')]
    coll = db[config.get('collection_name')]
    #print db.command("collstats", coll.full_name)

    #NOTE: need to run this code once to populate the database, after that
    #comment it out
    #
    #    for i in range(40000):
    #        post = {"name" : i, "date": datetime.datetime.utcnow()}
    #        coll.insert(post)

    #print coll.count()

    command = bson.son.SON()
    command['splitVector'] = coll.full_name
    command['maxChunkSize'] = config.get('split_size')
    command['force'] = False
    command['keyPattern'] = {'_id': 1}
    #SON([('splitVector', u'test.twitter'), ('maxChunkSize', 1), ('keyPattern', {'_id': 1}), ('force', False)])

    results = db.command(command)
    man_splits = results.get('splitKeys')

    assert results.get('ok') == 1.0, 'split command did not return with 1.0 ok'
    print(results)
    # Check before calling len(): a missing 'splitKeys' entry would otherwise
    # raise TypeError instead of reporting the intended failure message.
    assert man_splits, 'no splitKeys returned'
    print('man_splits =  %s' % len(man_splits))

    #now do it through MongoSplit
    splits = calculate_splits(config)

    assert splits, 'MongoSplitter did not return the right splits'
    logging.info('Calculated %s MongoInputSplits' % len(splits))
    #assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits"
def runTest(self):
    """Run splitVector by hand, then check calculate_splits also yields splits.

    Requires network access to the hard-coded MongoDB host and a
    module-level ``config`` providing ``db_name``, ``collection_name`` and
    ``split_size``.
    """
    # put 20000 objects in a database, call for a split by hand, then a split by the class
    conn = get_connection("mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter")
    db = conn[config.get("db_name")]
    coll = db[config.get("collection_name")]
    # print db.command("collstats", coll.full_name)

    # NOTE: need to run this code once to populate the database, after that
    # comment it out
    #
    #     for i in range(40000):
    #         post = {"name" : i, "date": datetime.datetime.utcnow()}
    #         coll.insert(post)

    # print coll.count()

    command = bson.son.SON()
    command["splitVector"] = coll.full_name
    command["maxChunkSize"] = config.get("split_size")
    command["force"] = False
    command["keyPattern"] = {"_id": 1}
    # SON([('splitVector', u'test.twitter'), ('maxChunkSize', 1), ('keyPattern', {'_id': 1}), ('force', False)])

    results = db.command(command)
    man_splits = results.get("splitKeys")

    assert results.get("ok") == 1.0, "split command did not return with 1.0 ok"
    print(results)
    # Check before calling len(): a missing "splitKeys" entry would otherwise
    # raise TypeError instead of reporting the intended failure message.
    assert man_splits, "no splitKeys returned"
    print("man_splits =  %s" % len(man_splits))

    # now do it through MongoSplit
    splits = calculate_splits(config)

    assert splits, "MongoSplitter did not return the right splits"
    logging.info("Calculated %s MongoInputSplits" % len(splits))
    # assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits"
def init_databases(config_file):
    """Bind the module-level database handles using *config_file*.

    Assigns three globals:

    * ``userdata_db`` -- ``mongo_util.get_db('entities_main', ...)``
    * ``plog_db``     -- the ``'kadb_pl'`` database on the ``'datastore'``
      connection
    * ``report_db``   -- ``mongo_util.get_db('reporting', ...)``

    :param config_file: passed through unchanged to the ``mongo_util``
        helpers (presumably a path to or handle on the connection
        settings -- confirm against ``mongo_util``).
    """
    global userdata_db, plog_db, report_db

    userdata_db = mongo_util.get_db('entities_main', config_file)

    datastore_conn = mongo_util.get_connection('datastore', config_file)
    plog_db = datastore_conn['kadb_pl']

    report_db = mongo_util.get_db('reporting', config_file)
def fetch_splits_via_chunks(config, uri, useShards, slaveOk):
    """Retrieves split objects based on chunks in mongo.

    Looks up the chunk documents for the namespace named in *uri* in the
    ``chunks`` collection of the ``config`` database and builds one
    ``MongoInputSplit`` per chunk, bounded by that chunk's ``min``/``max``.

    :param config: dict-like job configuration. Keys read: ``query``,
        ``input_uri``, ``input_key``, ``fields``, ``sort``, ``limit``
        (default 0), ``skip`` (default 0) and ``timeout`` (default True).
    :param uri: MongoDB URI used to reach the ``config`` database; its
        database and collection parts form the chunk namespace.
    :param useShards: when true, each split's URI is rewritten with
        ``get_new_URI`` to target the host of the shard owning the chunk.
    :param slaveOk: forwarded unchanged to ``get_new_URI``.
    :returns: The splits, each rendered with ``format_uri_with_query()``.
    """
    originalQuery = config.get("query")

    if useShards:
        logging.warning(
            "WARNING getting splits that connect directly to the "
            "backend mongods is risky and might not produce correct results")

    logging.debug("fetch_splits_via_chunks: originalQuery: %s" % originalQuery)

    connection = get_connection(uri)
    configDB = connection["config"]

    # Maps shard _id -> host string; only populated when useShards is set.
    shardMap = {}
    if useShards:
        shardsColl = configDB["shards"]
        cur = shardsColl.find()
        try:
            for row in cur:
                host = row.get('host')
                # Keep only what follows the first "/" in the host entry
                # (presumably a "<replica-set>/<hosts>" form -- verify).
                slashIndex = host.find("/")
                if slashIndex > 0:
                    host = host[slashIndex + 1:]
                shardMap[row.get('_id')] = host
        finally:
            # Release the server-side cursor even if iteration fails.
            cur.close()

    logging.debug("MongoInputFormat.getSplitsUsingChunks(): shard map is: %s" % shardMap)

    chunksCollection = configDB["chunks"]
    logging.info(configDB.collection_names())

    query = bson.son.SON()
    uri_info = uri_parser.parse_uri(uri)
    query["ns"] = uri_info['database'] + '.' + uri_info['collection']

    # Every split shares the caller's query; fall back to an empty one.
    if originalQuery is None:
        originalQuery = bson.son.SON()

    splits = []
    cur = chunksCollection.find(query)
    try:
        logging.info("query is %s", query)
        logging.info(cur.count())
        logging.info(chunksCollection.find().count())

        for row in cur:
            minObj = row.get('min')
            maxObj = row.get('max')

            lowerBound = bson.son.SON()
            upperBound = bson.son.SON()
            for key in minObj:
                #@to-do do type comparison first?
                lowerBound[key] = minObj[key]
                upperBound[key] = maxObj[key]

            shardKeyQuery = bson.son.SON()
            shardKeyQuery["$query"] = originalQuery
            shardKeyQuery["$min"] = lowerBound
            shardKeyQuery["$max"] = upperBound

            inputURI = config.get("input_uri")
            if useShards:
                shardName = row.get('shard')
                host = shardMap[shardName]
                inputURI = get_new_URI(inputURI, host, slaveOk)

            splits.append(MongoInputSplit(
                inputURI,
                config.get("input_key"),
                shardKeyQuery,
                config.get("fields"),
                config.get("sort"),
                config.get("limit", 0),
                config.get("skip", 0),
                config.get("timeout", True)))
    finally:
        cur.close()

    # return splits in uri format for disco
    return [s.format_uri_with_query() for s in splits]