Example #1
def get_cluster_state(mongos):
    """
    returns a dictionary that contains the subset of cluster state we care about
    """
    global shard_clients

    # this won't work well with a large (thousands?) number of shards
    shards_collection = mongos['config']['shards']
    shards = [shard for shard in shards_collection.find()]

    state = {}
    state['shard_names'] = [shard['_id'] for shard in shards]
    state['shard_names'].sort()

    members = {}
    oplog_positions = {}
    for shard in shards:
        # get statuses for all replica set members
        try:
            repl_set, host = shard['host'].split('/')
        except ValueError:
            print >> sys.stderr, "ERROR: can't get replica set status for %s" % shard['_id']
            sys.exit(1)

        # get cached connection, if one exists
        if repl_set in shard_clients:
            shard_client = shard_clients[repl_set]
        else:
            shard_client = pymongo.MongoClient(
                host,
                replicaSet=repl_set,
                read_preference=ReadPreference.PRIMARY,
                socketTimeoutMS=120000)
            shard_clients[repl_set] = shard_client

        rs_status = shard_client.admin.command('replSetGetStatus')
        for member in rs_status['members']:
            members[member['name']] = member['stateStr']

        # get last oplog positions
        last_oplog_entry = utils.get_last_oplog_entry(shard_client)
        oplog_positions[repl_set] = last_oplog_entry['ts']

    state['members'] = members
    state['oplog_positions'] = oplog_positions

    return state
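
The snippet above expects a pymongo client connected to a mongos and a module-level shard_clients cache. The sketch below is illustrative usage, not part of the original listing: the host name is made up, and the comments describe the shape of the dict built by the code above.

# Hypothetical usage sketch -- not from the original project.
import pymongo

shard_clients = {}  # module-level cache the function mutates via `global`

mongos = pymongo.MongoClient('mongos.example.com', 27017)
state = get_cluster_state(mongos)

# state['shard_names']     -> sorted shard _ids, e.g. ['shard0000', 'shard0001']
# state['members']         -> {'host:port': 'PRIMARY' / 'SECONDARY' / ...}
# state['oplog_positions'] -> {replica_set_name: last oplog Timestamp per shard}
unhealthy = [m for m, s in state['members'].items()
             if s not in ('PRIMARY', 'SECONDARY', 'ARBITER')]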
Example #2
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet. Ensures that
    the destination is empty before starting the copy.

    Does no safety checks -- this is up to the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(
        source['host'],
        source['port'],
        ensure_direct=True,
        max_pool_size=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%d is not" %
            (source['host'], source['port']))

    dest_client = utils.mongo_connect(dest['host'],
                                      dest['port'],
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest,
                              CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(fields=["_id"],
                                    snapshot=True,
                                    timeout=False)
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
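
_find_and_insert_batch_worker is not part of this listing. The sketch below only illustrates the batch find-then-insert pattern implied by the calls above; it is an assumption, not the pebble/hydra implementation, and the stats.inserted counter is hypothetical.

# Assumed shape of the batch worker spawned above (illustrative only):
# fetch one batch of documents by _id from the source, bulk-insert them
# into the destination, and bump a shared counter.
def _find_and_insert_batch_worker(source_collection, dest_collection, ids, stats):
    docs = list(source_collection.find({'_id': {'$in': ids}}))
    if docs:
        dest_collection.insert(docs)  # pymongo 2.x bulk insert, matching the older driver API above
    stats.inserted += len(docs)  # hypothetical Stats field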
Example #3
File: copier.py  Project: pebble/hydra
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet. Ensures that
    the destination is empty before starting the copy.

    Does no safety checks -- this is up to the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        raise Exception("for performance reasons, sources must be mongod instances; %s:%d is not" %
                        (source['host'], source['port']))

    dest_client = utils.mongo_connect(dest,
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent)/100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(
        projection={'_id':True},
        modifiers={'$snapshot':True}
    )
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
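
For reference, a hedged invocation sketch matching the docstring's parameter shapes; the host names, database names, and state-database path below are placeholders, not values from the original project.

# Illustrative call (all values are made up):
source = {'host': 'shard0.example.com', 'port': 27017,
          'db': 'app', 'collection': 'users'}
dest = {'host': 'newshard0.example.com', 'port': 27017,
        'db': 'app', 'collection': 'users'}

# percent=None copies every document; percent=10 copies roughly a 10% subset
# chosen by _id, and percent=0 skips straight to the index-building state.
copy_collection(source, dest, state_path='copy_users_state.db', percent=None)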