示例#1
0
def fetch_splits_from_shards(config, uri):
    """Internal method to fetch splits from a sharded db

    Reads every document of the ``shards`` collection in the ``config``
    database of the connection returned by ``get_connection(uri)``, then
    calculates unsharded splits against each shard host and concatenates
    the results.

    Note: ``config['input_uri']`` is overwritten with each per-shard URI, so
    after the call it holds the URI of the last shard visited.

    :param config: job configuration mapping; mutated as described above
    :param uri: MongoDB URI passed to ``get_connection`` and used as the
        template for the per-shard URIs
    :returns: The splits
    """
    # logging.warn is a deprecated alias; logging.warning is the supported
    # spelling and is available on both Python 2 and 3.
    logging.warning(
        "WARNING getting splits that connect directly to the backend "
        "mongods is risky and might not produce correct results"
    )
    connection = get_connection(uri)
    shardsColl = connection["config"]["shards"]

    # Collect all hosts first so the shards cursor is fully consumed before
    # any per-shard work starts.
    shardSet = []
    for row in shardsColl.find():
        host = row.get('host')
        # When the value contains a "/" after its first character, keep only
        # the part that follows it.
        slashIndex = host.find("/")
        if slashIndex > 0:
            host = host[slashIndex + 1:]
        shardSet.append(host)

    splits = []
    for host in shardSet:
        new_uri = get_new_URI(uri, host)
        config['input_uri'] = new_uri
        splits.extend(calculate_unsharded_splits(config, new_uri))
        # I think this is better than the commented way

    return splits
    '''
示例#2
0
def fetch_splits_from_shards(config, uri):
    """Internal method to fetch splits from a sharded db

    Reads every document of the ``shards`` collection in the ``config``
    database of the connection returned by ``get_connection(uri)``, then
    calculates unsharded splits against each shard host and concatenates
    the results.

    Note: ``config['input_uri']`` is overwritten with each per-shard URI, so
    after the call it holds the URI of the last shard visited.

    :param config: job configuration mapping; mutated as described above
    :param uri: MongoDB URI passed to ``get_connection`` and used as the
        template for the per-shard URIs
    :returns: The splits
    """
    # logging.warn is a deprecated alias; logging.warning is the supported
    # spelling and is available on both Python 2 and 3.
    logging.warning(
        "WARNING getting splits that connect directly to the backend "
        "mongods is risky and might not produce correct results"
    )
    connection = get_connection(uri)
    shardsColl = connection["config"]["shards"]

    # Collect all hosts first so the shards cursor is fully consumed before
    # any per-shard work starts.
    shardSet = []
    for row in shardsColl.find():
        host = row.get('host')
        # When the value contains a "/" after its first character, keep only
        # the part that follows it.
        slashIndex = host.find("/")
        if slashIndex > 0:
            host = host[slashIndex + 1:]
        shardSet.append(host)

    splits = []
    for host in shardSet:
        new_uri = get_new_URI(uri, host)
        config['input_uri'] = new_uri
        splits.extend(calculate_unsharded_splits(config, new_uri))
        # I think this is better than the commented way

    return splits

    '''
示例#3
0
    def __init__(self, stream, params):
        """Configure the output target from the attributes of ``params``.

        :param stream: not used by this constructor
        :param params: object whose instance attributes (``__dict__``)
            supply ``output_uri``, ``job_output_key`` (default ``"_id"``)
            and ``job_output_value``
        """
        # Shallow-copy the attributes. dict() works on Python 2 and 3,
        # whereas dict.iteritems() only exists on Python 2.
        config = dict(params.__dict__)

        self.uri = config.get("output_uri")
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = config.get("job_output_key", "_id")
        self.value_name = config.get("job_output_value")
示例#4
0
    def __init__(self, stream, params):
        """Configure the output target from the attributes of ``params``.

        :param stream: not used by this constructor
        :param params: object whose instance attributes (``__dict__``)
            supply ``output_uri``, ``job_output_key`` (default ``'_id'``)
            and ``job_output_value``
        """
        # Shallow-copy the attributes. dict() works on Python 2 and 3,
        # whereas dict.iteritems() only exists on Python 2.
        config = dict(params.__dict__)

        self.uri = config.get('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = config.get('job_output_key', '_id')
        self.value_name = config.get('job_output_value')
示例#5
0
    def __init__(self, stream, params):
        """Configure the output target from the attributes of ``params``.

        :param stream: not used by this constructor
        :param params: object whose instance attributes (``__dict__``)
            supply ``output_uri``, ``job_output_key`` (default ``'_id'``),
            ``job_output_value``, ``add_action`` (default ``'insert'``) and
            ``add_upsert`` (default ``False``)
        """
        # Shallow-copy the attributes. dict() works on Python 2 and 3,
        # whereas dict.iteritems() only exists on Python 2.
        config = dict(params.__dict__)

        self.uri = config.get('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = config.get('job_output_key', '_id')
        self.value_name = config.get('job_output_value')
        self.add_action = config.get('add_action', 'insert')
        self.add_upsert = config.get('add_upsert', False)
示例#6
0
    def __init__(self, params):
        """Configure the output target from the ``'mongodb'`` entry of ``params``.

        :param params: mapping-like object; its ``'mongodb'`` value (an empty
            dict when absent) supplies ``output_uri``, ``job_output_key``,
            ``job_output_value``, ``add_action``, ``add_upsert`` and
            ``base_doc``
        """
        # Work on a shallow copy of the 'mongodb' settings.
        settings = dict(params.get('mongodb', {}).items())
        lookup = settings.get

        self.uri = lookup('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)

        # Field names and write behaviour, with their defaults.
        self.key_name = lookup('job_output_key', '_id')
        self.value_name = lookup('job_output_value')
        self.add_action = lookup('add_action', 'insert')
        self.add_upsert = lookup('add_upsert', False)
        self.base_doc = lookup('base_doc', {})
示例#7
0
def fetch_splits_via_chunks(config, uri, use_shards):
    """Retrieves split objects based on chunks in mongo

    One split is built for every document of the ``chunks`` collection (in
    the ``config`` database) whose ``ns`` equals ``<database>.<collection>``
    as parsed from ``uri``. Each split wraps the configured query together
    with the chunk's ``min`` and ``max`` bounds.

    :param config: job configuration mapping; the keys ``query``,
        ``input_uri``, ``input_key``, ``fields``, ``sort``, ``limit``,
        ``skip``, ``timeout`` and ``slave_ok`` are read
    :param uri: MongoDB URI used to connect and to derive the namespace
    :param use_shards: when true, each split's input URI is rewritten with
        the host recorded for the chunk's shard
    :returns: The splits, each formatted via ``format_uri_with_query()``
    :raises KeyError: if ``use_shards`` is true and a chunk references a
        shard id that is not present in the ``shards`` collection
    """
    originalQuery = config.get("query")
    if use_shards:
        # Adjacent literals keep the message on one logical line; a
        # backslash continuation inside the string would embed the source
        # indentation in the logged text.
        logging.warning(
            "WARNING getting splits that connect directly to the backend "
            "mongods is risky and might not produce correct results")

    logging.debug("fetch_splits_via_chunks: originalQuery: %s", originalQuery)

    connection = get_connection(uri)
    configDB = connection["config"]

    # Maps shard id -> host; only populated when splits should target shards.
    shardMap = {}
    if use_shards:
        for row in configDB["shards"].find():
            host = row.get('host')
            # When the value contains a "/" after its first character, keep
            # only the part that follows it.
            slashIndex = host.find("/")
            if slashIndex > 0:
                host = host[slashIndex + 1:]
            shardMap[row.get('_id')] = host

    logging.debug("MongoInputFormat.getSplitsUsingChunks(): shard map is: %s",
                  shardMap)

    chunksCollection = configDB["chunks"]
    # NOTE(review): collection_names() and cursor.count() were removed in
    # PyMongo 4; these diagnostics need revisiting before a driver upgrade.
    logging.info(configDB.collection_names())
    query = bson.son.SON()

    uri_info = uri_parser.parse_uri(uri)
    query["ns"] = uri_info['database'] + '.' + uri_info['collection']

    cur = chunksCollection.find(query)
    # The placeholder is required: passing an argument without one makes the
    # logging module report a formatting error instead of the message.
    logging.info("query is %s", query)
    logging.info(cur.count())
    logging.info(chunksCollection.find().count())

    # Loop-invariant values, resolved once.
    if originalQuery is None:
        originalQuery = bson.son.SON()
    baseInputURI = config.get("input_uri")

    splits = []

    for row in cur:
        minObj = row.get('min')
        maxObj = row.get('max')
        chunkMin = bson.son.SON()
        chunkMax = bson.son.SON()

        # Copy the bounds key by key, following the key order of 'min'.
        for key in minObj:
            # TODO: do type comparison first?
            chunkMin[key] = minObj[key]
            chunkMax[key] = maxObj[key]

        shardKeyQuery = bson.son.SON()
        shardKeyQuery["$query"] = originalQuery
        shardKeyQuery["$min"] = chunkMin
        shardKeyQuery["$max"] = chunkMax

        inputURI = baseInputURI
        if use_shards:
            shardName = row.get('shard')
            host = shardMap[shardName]
            inputURI = get_new_URI(inputURI, host)

        splits.append(MongoInputSplit(
            inputURI,
            config.get("input_key"),
            shardKeyQuery,
            config.get("fields"),
            config.get("sort"),
            config.get("limit", 0),
            config.get("skip", 0),
            config.get("timeout", True),
            config.get("slave_ok", False)))

    # return splits in uri format for disco
    return [s.format_uri_with_query() for s in splits]
示例#8
0
def fetch_splits_via_chunks(config, uri, use_shards):
    """Retrieves split objects based on chunks in mongo

    One split is built for every document of the ``chunks`` collection (in
    the ``config`` database) whose ``ns`` equals ``<database>.<collection>``
    as parsed from ``uri``. Each split wraps the configured query together
    with the chunk's ``min`` and ``max`` bounds.

    :param config: job configuration mapping; the keys ``query``,
        ``input_uri``, ``input_key``, ``fields``, ``sort``, ``limit``,
        ``skip``, ``timeout`` and ``slave_ok`` are read
    :param uri: MongoDB URI used to connect and to derive the namespace
    :param use_shards: when true, each split's input URI is rewritten with
        the host recorded for the chunk's shard
    :returns: The splits, each formatted via ``format_uri_with_query()``
    :raises KeyError: if ``use_shards`` is true and a chunk references a
        shard id that is not present in the ``shards`` collection
    """
    originalQuery = config.get("query")
    if use_shards:
        # Adjacent literals keep the message on one logical line; a
        # backslash continuation inside the string would embed the source
        # indentation in the logged text.
        logging.warning(
            "WARNING getting splits that connect directly to the backend "
            "mongods is risky and might not produce correct results")

    logging.debug("fetch_splits_via_chunks: originalQuery: %s", originalQuery)

    connection = get_connection(uri)
    configDB = connection["config"]

    # Maps shard id -> host; only populated when splits should target shards.
    shardMap = {}
    if use_shards:
        for row in configDB["shards"].find():
            host = row.get('host')
            # When the value contains a "/" after its first character, keep
            # only the part that follows it.
            slashIndex = host.find("/")
            if slashIndex > 0:
                host = host[slashIndex + 1:]
            shardMap[row.get('_id')] = host

    logging.debug("MongoInputFormat.getSplitsUsingChunks(): shard map is: %s",
                  shardMap)

    chunksCollection = configDB["chunks"]
    # NOTE(review): collection_names() and cursor.count() were removed in
    # PyMongo 4; these diagnostics need revisiting before a driver upgrade.
    logging.info(configDB.collection_names())
    query = bson.son.SON()

    uri_info = uri_parser.parse_uri(uri)
    query["ns"] = uri_info['database'] + '.' + uri_info['collection']

    cur = chunksCollection.find(query)
    # The placeholder is required: passing an argument without one makes the
    # logging module report a formatting error instead of the message.
    logging.info("query is %s", query)
    logging.info(cur.count())
    logging.info(chunksCollection.find().count())

    # Loop-invariant values, resolved once.
    if originalQuery is None:
        originalQuery = bson.son.SON()
    baseInputURI = config.get("input_uri")

    splits = []

    for row in cur:
        minObj = row.get('min')
        maxObj = row.get('max')
        chunkMin = bson.son.SON()
        chunkMax = bson.son.SON()

        # Copy the bounds key by key, following the key order of 'min'.
        for key in minObj:
            # TODO: do type comparison first?
            chunkMin[key] = minObj[key]
            chunkMax[key] = maxObj[key]

        shardKeyQuery = bson.son.SON()
        shardKeyQuery["$query"] = originalQuery
        shardKeyQuery["$min"] = chunkMin
        shardKeyQuery["$max"] = chunkMax

        inputURI = baseInputURI
        if use_shards:
            shardName = row.get('shard')
            host = shardMap[shardName]
            inputURI = get_new_URI(inputURI, host)

        splits.append(
            MongoInputSplit(inputURI, config.get("input_key"), shardKeyQuery,
                            config.get("fields"), config.get("sort"),
                            config.get("limit", 0), config.get("skip", 0),
                            config.get("timeout", True),
                            config.get("slave_ok", False)))

    # return splits in uri format for disco
    return [s.format_uri_with_query() for s in splits]