def open(url=None, task=None):
    """Run the MongoDB query encoded in ``url`` and return its wrapped cursor.

    ``url`` is a JSON document, decoded with ``json_util.object_hook`` and
    loaded into a ``SON``. It must carry the keys ``inputURI``, ``query``,
    ``fields``, ``skip``, ``limit``, ``timeout``, ``sort`` and ``slave_ok``.
    ``task`` is accepted but never read by this function.

    Returns a ``MongoWrapper`` built around the cursor produced by ``find``.
    """
    from mongo_util import get_collection

    request = son.SON(json.loads(url, object_hook=json_util.object_hook))
    uri = request['inputURI']

    # The parsed result is not used; the call is kept so the URI still goes
    # through the parser before any other key is read.
    uri_parser.parse_uri(uri)

    # Keyword arguments are evaluated left to right, so the keys are read in
    # a fixed order before the collection is opened.
    find_kwargs = dict(
        spec=request['query'],
        fields=request['fields'],
        skip=request['skip'],
        limit=request['limit'],
        timeout=request['timeout'],
        sort=request['sort'],
        slave_okay=request['slave_ok'],
    )

    # Workaround noted by the original author: connect first, then choose the
    # database by name. Presumably get_collection does this; verify there.
    cursor = get_collection(uri).find(**find_kwargs)
    return MongoWrapper(cursor)
def __init__(self, stream, params):
    """Set up the MongoDB output target from the attributes of ``params``.

    ``stream`` is accepted but not used here. The instance attributes of
    ``params`` are read for ``output_uri``, ``job_output_key`` (defaulting
    to ``'_id'``) and ``job_output_value``.
    """
    from mongo_util import get_connection, get_collection

    # Snapshot the instance attributes of params into a plain dict.
    settings = dict(params.__dict__)

    self.uri = settings.get('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)

    # Document field names under which output keys and values are stored.
    self.key_name = settings.get('job_output_key', '_id')
    self.value_name = settings.get('job_output_value')
def test_traditional_way(): start = time.clock() col = get_collection(config['input_uri']) count = {} cur = col.find() for row in cur: age = row['age']/10 if age in count: count[age] += 1 else: count[age] = 1 end = time.clock() print "Time used: ", end-start for key in count: print key,count[key]
def open(url=None, task=None):
    """Run the MongoDB query encoded in ``url`` and return its wrapped cursor.

    ``url`` is a JSON document, decoded with ``json_util.object_hook`` and
    loaded into a ``SON``. It must carry the keys ``inputURI``, ``query``,
    ``fields``, ``skip``, ``limit``, ``timeout`` and ``sort``. ``task`` is
    accepted but never read by this function.

    Returns a ``MongoWrapper`` built around the cursor produced by ``find``.
    """
    from mongo_util import get_collection

    request = son.SON(json.loads(url, object_hook=json_util.object_hook))
    uri = request['inputURI']

    # The parsed result is not used; the call is kept so the URI still goes
    # through the parser before the remaining keys are read.
    uri_parser.parse_uri(uri)

    option_keys = ('query', 'fields', 'skip', 'limit', 'timeout', 'sort')
    spec, fields, skip, limit, timeout, sort = (
        request[key] for key in option_keys
    )

    # Workaround noted by the original author: connect first, then choose the
    # database by name. Presumably get_collection does this; verify there.
    collection = get_collection(uri)
    cursor = collection.find(
        spec=spec,
        fields=fields,
        skip=skip,
        limit=limit,
        sort=sort,
        timeout=timeout,
    )
    return MongoWrapper(cursor)
def calculate_unsharded_splits(config, slaveOk, uri):
    """Compute input splits for an unsharded collection via ``splitVector``.

    Args:
        config: mapping read for the optional ``query`` (defaults to ``{}``),
            ``split_key`` and ``split_size``; it is also handed to ``_split``.
        slaveOk: accepted for interface compatibility; not used here.
        uri: passed to ``get_collection`` to obtain the collection to split.

    Returns:
        A list holding the ``format_uri_with_query()`` result of every split.
        Consecutive split keys bound each split, and a final split with an
        open upper bound is always appended. When the server returns no split
        keys the list therefore contains exactly one split.

    Raises:
        Exception: if the command response carries a truthy ``err`` value, or
            if its ``ok`` field is not ``1.0``.
    """
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)
    query = config.get("query") if "query" in config else {}

    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # The command document is ordered and must start with the command name,
    # e.g. SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #           ('keyPattern', {'x': 1}), ('force', False)])
    cmd = bson.son.SON([
        ("splitVector", full_name),
        ("maxChunkSize", split_size),
        ("keyPattern", split_key),
        ("force", False),
    ])

    logging.debug("Issuing Command: %s", cmd)
    data = coll.database.command(cmd)
    logging.debug("%r", data)

    # A successful response looks like:
    # {u'ok': 1.0, u'splitKeys': [{u'_id': ObjectId('...')},
    #                             {u'_id': ObjectId('...')}]}
    if data.get("err"):
        raise Exception(data.get("err"))
    if data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    # Normalise a missing/None 'splitKeys' to an empty list so the loop below
    # is always safe and the single-split fallback promised by the warning
    # really happens.
    split_keys = data.get('splitKeys') or []
    if split_keys:
        logging.info("Calculated %s splits", len(split_keys))
    else:
        # Adjacent literals instead of backslash continuations inside the
        # string, which leaked stray characters into the log message.
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. "
            "Proceeding with a *single* split. Data may be too small, try "
            "lowering 'mongo.input.split_size' if this is undesirable.")

    splits = []
    lower_bound = None
    for upper_bound in split_keys:
        splits.append(_split(config, query, lower_bound, upper_bound))
        lower_bound = upper_bound
    # Final split: from the last key (or the very start) to the end.
    splits.append(_split(config, query, lower_bound, None))

    return [s.format_uri_with_query() for s in splits]