示例#1
0
def limit_rdd(rdd, limit=None, randomSeed=1234):
    if not limit:
        return rdd
    else:
        # Originally:
        # rdd_ingest = sc.parallelize(rdd_ingest.take(limit))
        # However:
        # Because take/takeSample collects back to master, can needlessly 
        # create "task too large" condition
        # Therefore, instead, sample approximately 'limit' elements
        ratio = float(limit) / rdd.count()
        rdd_limited = rdd.sample(False, ratio, seed=randomSeed)
        logging.info("RDD limited to {} elements (requested: {})".format(rdd_limited.count(), limit))
        return rdd_limited
示例#2
0
def show_partitioning(rdd, show=True):
    """Seems to be significantly more expensive on cluster than locally"""
    if show:
        partitionCount = rdd.getNumPartitions()
        try:
            valueCount = rdd.countApprox(1000, confidence=0.50)
        except:
            valueCount = -1
        try:
            name = rdd.name() or None
        except:
            pass
        name = name or "anonymous"
        logging.info("For RDD %s, there are %d partitions with on average %s values" % 
                     (name, partitionCount, int(valueCount/float(partitionCount))))
示例#3
0
def debug_dump_rdd(rdd, 
                   debugLevel=0,
                   keys=True, 
                   listElements=False,
                   outputDir=None,
                   name=None):

    debugOutputRootDir = outputDir + '_debug'
    try:
        name = name or rdd.name() or None
    except:
        pass
    name = name or "anonymous-%d" % randint(10000,99999)
    outdir = os.path.join(debugOutputRootDir, name)

    if debugLevel > 0:
        startTime = time.time()
        # compute key count
        keyCount = None
        try:
            keyCount = rdd.keys().count() if keys else None
        except:
            pass
        # compute row count
        rowCount = None
        try:
            rowCount = rdd.count()
        except:
            pass
        # compute element count
        elementCount = None
        try:
            elementCount = rdd.mapValues(lambda x: len(x) if isinstance(x, (list, tuple)) else 0).values().sum() if listElements else None
        except:
            pass

        rdd.saveAsTextFile(outdir)
        endTime = time.time()
        elapsedTime = endTime - startTime
        logging.info("Wrote [%s] to outdir %r: [keys=%s, rows=%s, elements=%s]" % 
                     (str(timedelta(seconds=elapsedTime)), outdir, keyCount, rowCount, elementCount))