def gen_rdd(self, **kwargs):
    use_limit = kwargs.get('use_limit', False)
    rdd = self.st.gen_rdd(use_limit=use_limit)
    if self.st.limit:
        # enforce LIMIT by materializing at most `limit` rows locally
        rdd = dpark.makeRDD(take(rdd, self.st.limit))
    return rdd
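# A minimal single-machine sketch (not dpark's actual `take`) of the pattern
# the LIMIT handling above relies on: pull at most `n` rows out of a sequence
# of partitions, stopping early instead of materializing everything. The
# helper name and structure here are illustrative assumptions.
def take_sketch(partitions, n):
    out = []
    for part in partitions:
        for row in part:
            if len(out) >= n:
                return out
            out.append(row)
    return out

# e.g. take_sketch([range(100), range(100, 200)], 3) -> [0, 1, 2]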
    # body of sgd(i_j): one SGD sweep over block (i, j) of the matrix
    for x in range(m):
        for y in range(v):
            pred = Wi[x].dot(Hj[y])
            err = int(Oij[x][y]) - int(pred)
            w = Wi[x] + GAMMA * (Hj[y] * err - LAMBDA * Wi[x])
            h = Hj[y] + GAMMA * (Wi[x] * err - LAMBDA * Hj[y])
            Wi[x] = w
            Hj[y] = h
    W.put(i, Wi)
    H.put(j, Hj)

rdd = dpark.makeRDD(list(range(d)))
rdd = rdd.cartesian(rdd).cache()


def calc_err(i_j):
    (i, j) = i_j
    Wi = W.get(i)
    Hj = H.get(j)
    ori = ori_b.value
    # squared reconstruction error of block (i, j)
    Rij = Wi.dot(Hj.T)
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()


J = list(range(d))
# Earlier Python 2 variant of the same SGD example (xrange, tuple
# parameters), including the flush calls and the driver loop.
        for y in xrange(v):
            pred = Wi[x].dot(Hj[y])
            err = int(Oij[x][y]) - int(pred)
            w = Wi[x] + GAMMA * (Hj[y] * err - LAMBDA * Wi[x])
            h = Hj[y] + GAMMA * (Wi[x] * err - LAMBDA * Hj[y])
            Wi[x] = w
            Hj[y] = h
    W.put(i, Wi)
    H.put(j, Hj)
    W.flush()
    H.flush()

rdd = dpark.makeRDD(range(d))
rdd = rdd.cartesian(rdd).cache()


def calc_err((i, j)):
    Wi = W.get(i)
    Hj = H.get(j)
    ori = ori_b.value
    Rij = Wi.dot(Hj.T)
    Oij = ori[i * m:(i + 1) * m, j * v:(j + 1) * v]
    return ((Rij - Oij) ** 2).sum()


J = range(d)
while True:
    for i in xrange(d):
        dpark.makeRDD(zip(range(d), J), d).foreach(sgd)
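# Standalone NumPy sketch of the update rule used by sgd()/calc_err() above,
# run on a single random block. GAMMA/LAMBDA values and block sizes are
# illustrative assumptions, and the example's integer truncation of the
# prediction is omitted here.
import numpy as np

GAMMA, LAMBDA = 0.02, 0.1
m, v, k = 4, 5, 3                      # block height, block width, rank
rng = np.random.RandomState(0)
Oij = rng.randint(0, 5, size=(m, v))   # observed block
Wi = rng.rand(m, k)                    # row factors for this block
Hj = rng.rand(v, k)                    # column factors for this block

for _ in range(200):                   # a few SGD sweeps over the block
    for x in range(m):
        for y in range(v):
            pred = Wi[x].dot(Hj[y])
            err = Oij[x][y] - pred
            w = Wi[x] + GAMMA * (Hj[y] * err - LAMBDA * Wi[x])
            h = Hj[y] + GAMMA * (Wi[x] * err - LAMBDA * Hj[y])
            Wi[x] = w
            Hj[y] = h

# the same quantity calc_err() computes for one block
print(((Wi.dot(Hj.T) - Oij) ** 2).sum())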
2. For the usage of jit types and signatures, please refer to the Numba
   documentation: <http://numba.github.com/numba-doc/0.10/index.html>
'''
from dpark import _ctx as dpark, jit, autojit
import numpy


@jit('f8(f8[:])')           # eagerly compiled with an explicit signature
def add1(x):
    sum = 0.0
    for i in xrange(x.shape[0]):
        sum += i * x[i]
    return sum


@autojit                    # types inferred at call time
def add2(x):
    sum = 0.0
    for i in xrange(x.shape[0]):
        sum += i * x[i]
    return sum


def add3(x):                # plain Python baseline
    sum = 0.0
    for i in xrange(x.shape[0]):
        sum += i * x[i]
    return sum


rdd = dpark.makeRDD(range(0, 10)).map(lambda x: numpy.arange(x * 1e7, (x + 1) * 1e7))
print rdd.map(add1).collect()
print rdd.map(add2).collect()
print rdd.map(add3).collect()
from __future__ import print_function

from dpark import _ctx as dpark, jit
import numpy
from six.moves import range


@jit('f8(f8[:])')           # eagerly compiled with an explicit signature
def add1(x):
    sum = 0.0
    for i in range(x.shape[0]):
        sum += i * x[i]
    return sum


@jit                        # lazily compiled; types inferred at call time
def add2(x):
    sum = 0.0
    for i in range(x.shape[0]):
        sum += i * x[i]
    return sum


def add3(x):                # plain Python baseline for comparison
    sum = 0.0
    for i in range(x.shape[0]):
        sum += i * x[i]
    return sum


rdd = dpark.makeRDD(list(range(0, 10))).map(lambda x: numpy.arange(x * 1e7, (x + 1) * 1e7))
print(rdd.map(add1).collect())
print(rdd.map(add2).collect())
print(rdd.map(add3).collect())
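# Local sketch of the same three tiers without dpark: the decorator comes
# from numba when it is installed; the stub below just keeps the snippet
# runnable without it. The signature string 'f8(f8[:])' means "takes a 1-D
# float64 array, returns a float64". The function name is illustrative.
try:
    from numba import jit
except ImportError:
    def jit(*args, **kwargs):           # no-op fallback decorator
        if args and callable(args[0]):
            return args[0]
        return lambda f: f

import numpy


@jit('f8(f8[:])')
def weighted_sum(x):
    s = 0.0
    for i in range(x.shape[0]):
        s += i * x[i]
    return s


print(weighted_sum(numpy.arange(1e6)))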
def gen_rdd(self, **kwargs):
    use_limit = kwargs.get('use_limit', False)
    table = self.table
    # walk the select list and collect all aggregate (set) expressions
    q = [x[0] for x in self.select_list
         if isinstance(x, tuple) and isinstance(x[0], Expr)]
    scope = set()
    while q:
        expr = q.pop(0)
        if isinstance(expr, SetExpr):
            scope.add(expr)
        q += expr.get_subexpr()

    schema = Schema(table, self.select_list)
    # LIMIT can be pushed down only for a plain scan
    # (no aggregates, GROUP BY, WHERE or HAVING)
    use_limit = use_limit or (
        self.limit is not None and not scope and self.group_by is None
        and self.where is None and self.having is None
    )
    rdd = table.gen_rdd(use_limit=use_limit)
    if self.where:
        rdd = rdd.filter(self.where(schema))

    if scope or self.group_by is not None:
        column_len = len(table.get_columns())
        # slot 0 carries a representative row through the aggregation
        creators = [lambda x: x]
        mergers = [lambda r, x: r]
        combiners = [lambda r1, r2: r1]
        mappers = [lambda r: r]
        for i, s in enumerate(scope):
            # aggregate values land after the table columns in the output row
            schema.mappers[s] = lambda _, i=column_len + i: lambda row: row[i]
            creator, merger, combiner, mapper = s.compute(schema)
            creators.append(creator)
            mergers.append(merger)
            combiners.append(combiner)
            mappers.append(mapper)

        if self.group_by is not None:
            keys = [c(schema) for c in self.group_by]
            key = lambda row: tuple(k(row) for k in keys)
            agg = Aggregator(
                lambda x: [c(x) for c in creators],
                lambda r, x: [m(r[i], x) for i, m in enumerate(mergers)],
                lambda r1, r2: [c(r1[i], r2[i]) for i, c in enumerate(combiners)],
            )
            rdd = rdd.map(lambda x: (key(x), x)).combineByKey(agg)\
                .mapValue(lambda r: [m(r[i]) for i, m in enumerate(mappers)])\
                .map(lambda k_v: k_v[1][0] + k_v[1][1:])
        else:
            # no GROUP BY: fold each partition, then reduce the partials
            def fun(split):
                r = None
                for x in iter(split):
                    if r is None:
                        r = [c(x) for c in creators]
                    else:
                        r = [m(r[i], x) for i, m in enumerate(mergers)]
                return [r]
            rdd = rdd.mapPartitions(fun).filter(lambda x: x is not None)
            result = rdd.reduce(
                lambda r1, r2: [c(r1[i], r2[i]) for i, c in enumerate(combiners)])
            result = [m(result[i]) for i, m in enumerate(mappers)]
            rdd = dpark.makeRDD([result[0] + result[1:]])

    if self.select_list == '*':
        output_mapper = lambda row: row
    else:
        fun_list = [c(schema) for c, _ in self.select_list]
        output_mapper = lambda row: [f(row) for f in fun_list]

    if self.having:
        rdd = rdd.filter(self.having(schema))
    if self.order:
        reverse = (self.order[1] == 'DESC')
        rdd = rdd.sort(key=self.order[0](schema), reverse=reverse)
    rdd = rdd.map(output_mapper)
    return rdd
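# Pure-Python sketch of the creator/merger/combiner/mapper protocol that the
# executor above feeds to Aggregator/combineByKey, using a hypothetical AVG
# aggregate as the example; the aggregation state is a (sum, count) pair.
def avg_creator(row):
    return (row, 1)

def avg_merger(state, row):
    return (state[0] + row, state[1] + 1)

def avg_combiner(s1, s2):                # merge partial states across partitions
    return (s1[0] + s2[0], s1[1] + s2[1])

def avg_mapper(state):                   # finalize: state -> output value
    return state[0] / state[1]

# one "partition" per inner list; mimic the fold-then-reduce path for one key
partitions = [[1, 2, 3], [4, 5]]
partials = []
for part in partitions:
    state = None
    for x in part:
        state = avg_creator(x) if state is None else avg_merger(state, x)
    partials.append(state)
state = partials[0]
for s in partials[1:]:
    state = avg_combiner(state, s)
print(avg_mapper(state))                 # -> 3.0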