Example #1
import dpark


def make_filter(feature, feature_point, data, np):
    feature_point = unicode(feature_point.decode('utf8'))
    rdd = dpark.parallelize(data, numSlices=1)

    def _has_feature(item):
        if item[feature] is None:
            return False
        return feature_point in item[feature]

    def _is_feature(item):
        return item[feature] == feature_point

    def _compare_feature(item):
        try:
            result = float(item[feature]) >= float(feature_point)
            return result
        except Exception:
            return False

    def _has_not_feature(item):
        if item[feature] is None:
            return True
        return feature_point not in item[feature]

    def _is_not_feature(item):
        return item[feature] != feature_point

    def _not_compare_feature(item):
        try:
            return float(item[feature]) < float(feature_point)
        except Exception:
            return True

    np_map = {
        0: set([_has_not_feature, _is_not_feature, _not_compare_feature]),
        1: set([_has_feature, _is_feature, _compare_feature])
    }

    feature_map = {
        'language': set([_has_feature, _has_not_feature]),
        'countries': set([_has_feature, _has_not_feature]),
        'tags': set([_has_feature, _has_not_feature]),
        'rate': set([_compare_feature, _not_compare_feature]),
        'people': set([_compare_feature, _not_compare_feature]),
        'editors': set([_has_feature, _has_not_feature]),
        'directors': set([_has_feature, _has_not_feature]),
        'actors': set([_has_feature, _has_not_feature]),
        'year': set([_compare_feature, _not_compare_feature]),
        'length': set([_compare_feature, _not_compare_feature]),
        'types': set([_has_feature, _has_not_feature])
    }

    decision = list(np_map[np] & feature_map[feature])[0]
    return rdd.filter(decision).collect()
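A minimal usage sketch for make_filter; the movie records and arguments below are hypothetical, chosen only to show the 0/1 flag selecting a predicate or its negation:

movies = [
    {'year': '2001', 'tags': [u'drama', u'war'], 'rate': '8.1'},
    {'year': '1994', 'tags': [u'comedy'], 'rate': '7.3'},
]
print(make_filter('year', '2000', movies, 1))    # records with year >= 2000
print(make_filter('tags', 'comedy', movies, 0))  # records whose tags do not contain 'comedy'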
Example #2
def pixelStats(urls,
               variable,
               nPartitions,
               timeFromFilename=TimeFromFilenameDOY,
               groupByKeys=GroupByKeys,
               accumulators=Accumulators,
               cachePath=CachePath,
               mode='dpark',
               modes=Modes):
    '''Compute a global (or regional) pixel mean field in parallel, given a list of URLs pointing to netCDF files.'''
    baseKey = groupByKeys[0]
    if baseKey == 'month':
        urlsByKey = splitByMonth(urls, timeFromFilename)
    else:
        print('pixelStats: Unrecognized groupByKey "%s".  Must be in %s' %
              (baseKey, str(groupByKeys)),
              file=sys.stderr)
        sys.exit(1)

    if mode == 'sequential':
        accum = [accumulate(u, variable, accumulators) for u in urlsByKey]
        merged = reduce(combine, accum)
        stats = statsFromAccumulators(merged)

    elif mode == 'dpark':
        import dpark
        urls = dpark.parallelize(urlsByKey, nPartitions)  # RDD of URL lists
        accum = urls.map(
            lambda urls: accumulate(urls, variable, accumulators))  # RDD of stats accumulators
        merged = accum.reduce(combine)         # merge accumulators on the head node
        stats = statsFromAccumulators(merged)  # compute final stats from the accumulators

    elif mode == 'spark':
        from pyspark import SparkContext
        sc = SparkContext(appName="PixelStats")
        urls = sc.parallelize(urlsByKey, nPartitions)  # RDD of URL lists
        accum = urls.map(
            lambda urls: accumulate(urls, variable, accumulators))  # RDD of stats accumulators
        merged = accum.reduce(combine)         # merge accumulators on the head node
        stats = statsFromAccumulators(merged)  # compute final stats from the accumulators

    else:
        stats = None
        if mode not in modes:
            print('pixelStats: Unrecognized mode "%s".  Must be in %s' %
                  (mode, str(modes)),
                  file=sys.stderr)
            sys.exit(1)
    return stats
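A hedged sketch of invoking pixelStats; the file names, variable, and partition count are invented, and the module-level defaults it references (GroupByKeys, Accumulators, splitByMonth, and so on) are assumed to be defined alongside it:

urls = ['A2008001.L3m_DAY_SST_sst_4km.nc',   # hypothetical daily netCDF granules
        'A2008032.L3m_DAY_SST_sst_4km.nc']
stats = pixelStats(urls, 'sst', nPartitions=2, mode='dpark')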
Example #3
import dpark


def make_filter(feature, feature_point, data, np):
    print np
    print "===================================="
    feature_point = unicode(feature_point.decode('utf8'))
    rdd = dpark.parallelize(data, numSlices=1)

    def _has_feature(item):
        if item[feature] is None:
            return False
        return feature_point in item[feature]

    def _is_feature(item):
        return item[feature] == feature_point

    def _compare_feature(item):
        try:
            result = float(item[feature]) >= float(feature_point)
            return result
        except Exception:
            return False

    def _has_not_feature(item):
        if item[feature] is None:
            return True
        return feature_point not in item[feature]

    def _is_not_feature(item):
        return item[feature] != feature_point

    def _not_compare_feature(item):
        try:
            return float(item[feature]) < float(feature_point)
        except Exception:
            return True

    np_map = {0: set(
        [_has_not_feature, _is_not_feature, _not_compare_feature]),
        1: set([_has_feature, _is_feature, _compare_feature])}

    feature_map = {'language': set([_has_feature, _has_not_feature]),
                   'countries': set([_has_feature, _has_not_feature]),
                   'tags': set([_has_feature, _has_not_feature]),
                   'rate': set([_compare_feature, _not_compare_feature]),
                   'people': set([_compare_feature, _not_compare_feature]),
                   'editors': set([_has_feature, _has_not_feature]),
                   'directors': set([_has_feature, _has_not_feature]),
                   'actors': set([_has_feature, _has_not_feature]),
                   'year': set([_compare_feature, _not_compare_feature]),
                   'length': set([_compare_feature, _not_compare_feature]),
                   'types': set([_has_feature, _has_not_feature])}

    decision = list(np_map[np] & feature_map[feature])[0]
    return rdd.filter(decision).collect()
Example #4
import dpark


def get_phidias_point(feature_set, citerion=CrossEncropyCiterion):
    """
    Iterate over all of feature_set to find
    the best split criterion.
    """
    feature_rdd = dpark.parallelize(feature_set)
    total_count = feature_rdd.count()

    def _label_count(item):
        return (item, 1)

    def _count_stat(item):
        return (item[0], sum(item[1]))

    def _compute_criterion(item):
        return (item[0], citerion()(item[1], total_count))

    def _max_criterion(item1, item2):
        return item1 if item1[1] > item2[1] else item2

    return feature_rdd.map(_label_count).groupByKey().map(_count_stat).map(_compute_criterion).collect()
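A minimal sketch of calling get_phidias_point, assuming CrossEncropyCiterion (the class the default argument points at) is defined elsewhere and that feature_set is simply a flat list of labels; both are assumptions, not something the example states:

labels = ['spam', 'ham', 'ham', 'spam', 'ham']
for label, score in get_phidias_point(labels):
    print('%s: %s' % (label, score))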
Example #5
import dpark


def get_phidias_point(feature_set, citerion=CrossEncropyCiterion):
    """
    Iterate over all of feature_set to find
    the best split criterion.
    """
    feature_rdd = dpark.parallelize(feature_set, numSlices=1)
    total_count = feature_rdd.count()

    def _label_count(item):
        return (item, 1)

    def _count_stat(item):
        return (item[0], sum(item[1]))

    def _compute_criterion(item):
        return (item[0], citerion()(item[1], total_count))

    def _max_criterion(item1, item2):
        return item1 if item1[1] > item2[1] else item2

    return feature_rdd.map(_label_count).groupByKey()\
        .map(_count_stat).map(_compute_criterion).sort(key=lambda x: x[1], reverse=True).take(20)
Example #6
import dpark


def get_phidias_point(feature_set, citerion=CrossEncropyCiterion):
    """
    Iterate over all of feature_set to find
    the best split criterion.
    """
    feature_rdd = dpark.parallelize(feature_set, numSlices=1)
    total_count = feature_rdd.count()

    def _label_count(item):
        return (item, 1)

    def _count_stat(item):
        return (item[0], sum(item[1]))

    def _compute_criterion(item):
        return (item[0], citerion()(item[1], total_count))

    def _max_criterion(item1, item2):
        return item1 if item1[1] > item2[1] else item2

    return feature_rdd.map(_label_count).groupByKey()\
        .map(_count_stat).map(_compute_criterion).sort(key=lambda x: x[1], reverse=True).take(20)
Example #7
    def _expand_xs(self):
        past_xs = self.xs[-self.d:][::-1]  # last d values of xs, most recent first

        def n_to_b(n):
            # binary digits of n, least-significant bit first
            string = bin(n)[2:]
            b_s = [int(s) for s in string][::-1]
            return b_s

        def add(n):
            b_s = n_to_b(n)
            xs = [past_x for past_x, i in zip(past_xs, b_s)]
            if xs[-1] == '*':
                return 0
            else:
                for i, x in zip(b_s, xs)[:-1]:
                    if i < (x != '*'):
                        return 0
            return xs[-1]

        rdd = dpark.parallelize([i for i in range(2**self.d-1)], 5)
        rdd = rdd.map(add)
        self.expand_xs = rdd.collect()
Example #8
# coding: utf-8
import dpark


def set_diff(rdd1, rdd2):
    """
    Return an RDD with elements in rdd1 but not in rdd2.
    """
    pair_rdd1 = rdd1.map(lambda x: (x, None))
    pair_rdd2 = rdd2.map(lambda x: (x, 1))
    return pair_rdd1.leftOuterJoin(pair_rdd2)\
                    .filter(lambda x: not x[1][1])\
                    .map(lambda x: x[0])


if __name__ == '__main__':
    rdd1 = dpark.parallelize([1, 2, 3, 4])
    rdd2 = dpark.parallelize([3, 4, 5, 6])
    diff = set_diff(rdd1, rdd2)
    rs = diff.collect()
    assert sorted(rs) == [1, 2]  # DPark does not guarantee element order
Example #9
import math
import random
import os, sys
from pprint import pprint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import dpark

# range
nums = dpark.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x,y:x+y)

# text search
f = dpark.textFile("./", ext='py').map(lambda x:x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x:x.split()).map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y).cache()
pprint(counts.filter(lambda (_,v): v>50).collectAsMap())
pprint(sorted(counts.filter(lambda (_,v): v>20).map(lambda (x,y):(y,x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s"%v ).saveAsTextFile("wc/"))

# Pi
import random
def rand(i):
    x = random.random()
    y = random.random()