예제 #1
0
def _open(input_description, task=None):
    """Return a :class:`~mongodisco.mongodb_input.MongoWrapper`
    which wraps a cursor selecting just those documents relevant
    to one particular map operation. `input_description` is
    a JSON string describing the documents to find, and looks like::

        {  "inputURI": "mongodb://discomaster.zeroclues.net:27017/test.twitter",
           "keyField": null,
           "query": {
             "$query": {},
             "$min": {"_id": {"$oid": "4fae7a97fa22c41aeb5d78f8"}},
             "$max": {"_id": {"$oid": "4fae7b27fa22c41aeb5d96b5"}}},
           "fields": null,
           "sort": null,
           "limit": 0,
           "skip": 0,
           "timeout": false,
           "slave_ok": false  }

    ``slave_ok`` is optional and is treated as ``false`` when absent; the
    keys ``inputURI``, ``query``, ``fields``, ``sort``, ``limit``, ``skip``
    and ``timeout`` are required. ``keyField`` is not read by this function.

    :param input_description: JSON string in the format shown above.
    :param task: accepted but not used by this function.
    :raises KeyError: if one of the required keys is missing.
    """
    parsed = json.loads(input_description, object_hook=object_hook)
    collection = get_collection(parsed['inputURI'])

    cursor = collection.find(
        spec=parsed['query'],
        fields=parsed['fields'],
        skip=parsed['skip'],
        limit=parsed['limit'],
        sort=parsed['sort'],
        timeout=parsed['timeout'],
        # Descriptions in the documented format carry no "slave_ok" key;
        # fall back to False instead of failing with KeyError.
        slave_okay=parsed.get('slave_ok', False),
    )
    return MongoWrapper(cursor)
예제 #2
0
    def __init__(self, stream, params):
        """Set up the MongoDB output target from the job parameters.

        :param stream: accepted but not used by this constructor.
        :param params: object whose instance attributes hold the settings
            read here: ``output_uri``, ``job_output_key`` (defaults to
            ``"_id"``) and ``job_output_value``.
        """
        # Snapshot the attribute dict so missing settings can use defaults.
        settings = dict(params.__dict__)

        self.uri = settings.get("output_uri")
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = settings.get("job_output_key", "_id")
        self.value_name = settings.get("job_output_value")
예제 #3
0
    def __init__(self, stream, params):
        """Read the output settings from ``params`` and open the target.

        :param stream: accepted but not used by this constructor.
        :param params: object whose instance attributes supply
            ``output_uri``, ``job_output_key`` (defaults to ``'_id'``)
            and ``job_output_value``.
        """
        # Work on a copy of the attribute dict rather than on params itself.
        options = dict(params.__dict__)
        lookup = options.get

        self.uri = lookup('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = lookup('job_output_key', '_id')
        self.value_name = lookup('job_output_value')
예제 #4
0
    def __init__(self, stream, params):
        """Configure the MongoDB output from the attributes of ``params``.

        :param stream: accepted but not used by this constructor.
        :param params: object whose instance attributes supply
            ``output_uri``, ``job_output_key`` (defaults to ``'_id'``),
            ``job_output_value``, ``add_action`` (defaults to ``'insert'``)
            and ``add_upsert`` (defaults to ``False``).
        """
        # Copy the attribute dict so defaults can be applied per setting.
        settings = dict(params.__dict__)

        self.uri = settings.get('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)

        self.key_name = settings.get('job_output_key', '_id')
        self.value_name = settings.get('job_output_value')
        self.add_action = settings.get('add_action', 'insert')
        self.add_upsert = settings.get('add_upsert', False)
예제 #5
0
    def __init__(self, params):
        """Configure the MongoDB output from the ``'mongodb'`` entry of ``params``.

        :param params: mapping; its ``'mongodb'`` entry (an empty mapping
            when absent) supplies ``output_uri``, ``job_output_key``
            (defaults to ``'_id'``), ``job_output_value``, ``add_action``
            (defaults to ``'insert'``), ``add_upsert`` (defaults to
            ``False``) and ``base_doc`` (defaults to ``{}``).
        """
        # Copy the sub-mapping so defaults can be applied per setting.
        settings = dict(params.get('mongodb', {}))

        self.uri = settings.get('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)

        self.key_name = settings.get('job_output_key', '_id')
        self.value_name = settings.get('job_output_value')
        self.add_action = settings.get('add_action', 'insert')
        self.add_upsert = settings.get('add_upsert', False)
        self.base_doc = settings.get('base_doc', {})
예제 #6
0
def test_traditional_way():
    start = time.clock()

    col = get_collection(config['input_uri'])
    count = {}
    cur = col.find()
    for row in cur:
        age = row['age'] / 10
        if age in count:
            count[age] += 1
        else:
            count[age] = 1

    end = time.clock()
    print "Time used: ", end - start
    for key in count:
        print key, count[key]
예제 #7
0
def test_traditional_way():
    start = time.clock()

    col = get_collection(config['input_uri'])
    count = {}
    cur = col.find()
    for row in cur:
        age = row['age']/10
        if age in count:
            count[age] += 1
        else:
            count[age] = 1

    end = time.clock()
    print "Time used: ", end-start
    for key in count:
        print key,count[key]
예제 #8
0
def calculate_unsharded_splits(config, uri, adminuri):
    """Compute input splits for a collection using ``splitVector``.

    The ``splitVector`` command is run on the database returned by
    ``get_database(adminuri)``. Each returned split key becomes the
    boundary between two consecutive splits. If no split keys come back,
    a single split spanning ``split_min`` to ``split_max`` is produced.

    :param config: job configuration mapping. Keys read here: ``query``,
        ``split_key``, ``split_size``, ``split_min`` and ``split_max``.
    :param uri: URI of the collection to split.
    :param adminuri: URI of the database the command is issued against.
    :returns: list with one ``format_uri_with_query()`` result per split.
    :raises Exception: if the command reply carries ``err`` or its ``ok``
        field is not ``1.0``.
    """
    splits = []  # will return this
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)
    admindb = get_database(adminuri)

    query = config.get("query", {})

    # create the command to do the splits
    # command to split should look like this VV
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #    ('force', True), ('keyPattern', {'x': 1})])

    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # SON preserves insertion order, so "splitVector" stays the first key.
    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False

    # Explicit bounds are only sent when both ends are configured.
    split_max = config.get('split_max')
    split_min = config.get('split_min')
    if split_min is not None and split_max is not None:
        cmd["min"] = split_min
        cmd["max"] = split_max

    logging.debug("Issuing Command: %s", cmd)
    data = admindb.command(cmd)
    logging.debug("%r", data)

    # results should look like this
    # {u'ok': 1.0, u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    # {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}

    if data.get("err"):
        raise Exception(data.get("err"))
    elif data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    split_data = data.get('splitKeys')
    if not split_data:
        logging.warning("WARNING: No Input Splits were calculated by the split code. \
                Proceeding with a *single* split. Data may be too small, try lowering \
                'mongo.input.split_size'  if this is undesirable.")
    else:
        logging.info("Calculated %s splits", len(split_data))

    # Each split key closes one split and opens the next. The final split
    # is always emitted; with no split keys it is the single split
    # announced by the warning above, spanning split_min..split_max.
    last_key = split_min
    for bound in split_data or ():
        splits.append(_split(config, query, last_key, bound))
        last_key = bound
    splits.append(_split(config, query, last_key, split_max))

    return [s.format_uri_with_query() for s in splits]
예제 #9
0
def calculate_unsharded_splits(config, uri):
    """Compute input splits for a collection using ``splitVector``.

    The ``splitVector`` command is run on the collection's own database
    (``coll.database``). Each returned split key becomes the boundary
    between two consecutive splits. If no split keys come back, a single
    split spanning ``split_min`` to ``split_max`` is produced.

    :param config: job configuration mapping. Keys read here: ``query``,
        ``split_key``, ``split_size``, ``split_min`` and ``split_max``.
    :param uri: URI of the collection to split.
    :returns: list with one ``format_uri_with_query()`` result per split.
    :raises Exception: if the command reply carries ``err`` or its ``ok``
        field is not ``1.0``.
    """
    splits = []  # will return this
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)

    query = config.get("query", {})

    # create the command to do the splits
    # command to split should look like this VV
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #    ('force', True), ('keyPattern', {'x': 1})])

    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # SON preserves insertion order, so "splitVector" stays the first key.
    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False

    # Explicit bounds are only sent when both ends are configured.
    split_max = config.get('split_max')
    split_min = config.get('split_min')
    if split_min is not None and split_max is not None:
        cmd["min"] = split_min
        cmd["max"] = split_max

    logging.debug("Issuing Command: %s", cmd)
    data = coll.database.command(cmd)
    logging.debug("%r", data)

    # results should look like this
    # {u'ok': 1.0, u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    # {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}

    if data.get("err"):
        raise Exception(data.get("err"))
    elif data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    split_data = data.get('splitKeys')
    if not split_data:
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. \
                Proceeding with a *single* split. Data may be too small, try lowering \
                'mongo.input.split_size'  if this is undesirable.")
    else:
        logging.info("Calculated %s splits", len(split_data))

    # Each split key closes one split and opens the next. The final split
    # is always emitted; with no split keys it is the single split
    # announced by the warning above, spanning split_min..split_max.
    last_key = split_min
    for bound in split_data or ():
        splits.append(_split(config, query, last_key, bound))
        last_key = bound
    splits.append(_split(config, query, last_key, split_max))

    return [s.format_uri_with_query() for s in splits]