def test_empty_RDD(self):
    ctx = SparkContext()
    rdd = ctx.emptyRDD()
    self.assertEqual(type(rdd), RDD)
    l = rdd.collect()
    self.assertEqual(type(l), list)
    self.assertEqual(len(l), 0)
def test_minion_perform_deliver_success():
    workflow_id = '6666'
    app_id = '1000'
    job_id = '1'
    out_queue = 'queue_2000'

    sconf = SparkConf()
    sc = SparkContext(master='', conf=sconf)
    rdd = sc.parallelize(get_records())
    df0 = DataFrame(rdd=rdd)

    with mock.patch('redis.StrictRedis',
                    mock_strict_redis_client) as mocked_redis:
        redis_conn = mocked_redis()
        state_control = StateControlRedis(redis_conn)
        data = {
            'workflow_id': workflow_id,
            'app_id': app_id,
            'job_id': job_id,
            'type': 'deliver',
            'task_id': '033f-284ab-28987e',
            'port': 'port0',
            'output': out_queue,
            'workflow': ''
        }
        state_control.push_app_queue(app_id, json.dumps(data))

        minion = SparkMinion(redis_conn=redis_conn,
                             workflow_id=workflow_id,
                             app_id=app_id,
                             config=config)
        minion._emit_event = dummy_emit_event
        minion._state = {
            data['task_id']: {
                'port0': {'output': df0, 'sample': []},
                'time': 35.92
            }
        }
        minion._process_message()

        # Discard first status message
        state_control.pop_app_output_queue(app_id, False)

        msg = json.loads(state_control.pop_app_output_queue(app_id, False))
        assert msg['status'] == 'SUCCESS', 'Invalid status'
        assert msg['code'] == minion.MNN002[0], 'Invalid code'

        # CSV data
        csv_records = '\n'.join(
            map(dataframe_util.convert_to_csv, get_records()))
        result = json.loads(state_control.pop_queue(out_queue, False))
        assert result['sample'] == csv_records, 'Wrong CSV generated'
def test_text_file(self):
    ctx = SparkContext()
    for start, stop, step in self.TEST_RANGES:
        with NamedTemporaryFile(mode='w') as f:
            l = ['{}\n'.format(x) for x in range(start, stop, step)]
            for x in l:
                f.write(x)
            f.flush()
            f.seek(0)
            rdd = ctx.textFile(f.name)
            self.assertEqual(l, rdd.collect())
def spark_ctx():
    """A simple spark context."""
    if IF_DUMMY_SPARK:
        from dummy_spark import SparkConf, SparkContext
        conf = SparkConf()
        ctx = SparkContext(master='', conf=conf)
    else:
        from pyspark import SparkConf, SparkContext
        conf = SparkConf().setMaster('local[2]').setAppName('drudge-unittest')
        ctx = SparkContext(conf=conf)
    return ctx
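# A minimal usage sketch, assuming spark_ctx is exposed as a pytest fixture
# (e.g. decorated with @pytest.fixture); the test name below is illustrative,
# not taken from the suite above.
def test_parallelize_roundtrip(spark_ctx):
    rdd = spark_ctx.parallelize([1, 2, 3])
    assert sorted(rdd.collect()) == [1, 2, 3]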
def test_nonzero_by_cartan():
    # For the pairing algebra, N_p * P_p or Pdag_p * N_p should be ZERO.
    # That is what we test here.

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p = names.A_dumms[0]

    # Operators
    N_p = names.N_[p]
    Pdag_p = names.P_dag[p]
    P_p = names.P_[p]

    # expressions
    expr1 = dr.simplify(Pdag_p * N_p)
    expr2 = dr.simplify(N_p * P_p)

    # assertions
    assert expr1 == 0
    assert expr2 == 0
def test_fermi_anti_comm_rules():
    # Test anti-commutation relations for the fermionic algebra

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q = names.A_dumms[:2]

    # fermion operators
    cdag_p_up = names.c_dag[p, UP]
    cdag_p_dn = names.c_dag[p, DN]
    c_q_up = names.c_[q, UP]
    c_q_dn = names.c_[q, DN]

    # Anti-commutation relations
    expr1 = dr.simplify(cdag_p_up * c_q_dn + c_q_dn * cdag_p_up)
    expr2 = dr.simplify(cdag_p_dn * c_q_up + c_q_up * cdag_p_dn)
    expr3 = dr.simplify(cdag_p_up * c_q_up + c_q_up * cdag_p_up)
    expr4 = dr.simplify(cdag_p_dn * c_q_dn + c_q_dn * cdag_p_dn)

    # Assertions
    assert expr1 == 0
    assert expr2 == 0
    assert dr.simplify(expr3 - delK(p, q)) == 0
    assert dr.simplify(expr4 - delK(p, q)) == 0
def test_spinflip_su2_comm_rules():
    # Test commutation relations for the spin-flip SU2 algebra

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q = names.A_dumms[:2]

    # BCS Operators
    Jp_p = names.J_p[p]
    Jm_q = names.J_m[q]
    Jz_p = names.J_z[p]
    Jz_q = names.J_z[q]

    # Commutation relations
    expr1 = dr.simplify(Jp_p * Jm_q - Jm_q * Jp_p)
    expr2 = dr.simplify(Jz_q * Jp_p - Jp_p * Jz_q)
    expr3 = dr.simplify(Jz_p * Jm_q - Jm_q * Jz_p)

    # Assertions
    assert dr.simplify(expr1 - delK(p, q) * 2 * Jz_p) == 0
    assert dr.simplify(expr2 - delK(p, q) * Jp_p) == 0
    assert dr.simplify(expr3 + delK(p, q) * Jm_q) == 0
def test_unique_indices_functionality():
    # Test for the unique-indices functionality

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q, r, s = names.A_dumms[:4]

    # list of unique indices should be empty
    assert dr.unique_del_lists == []

    # declare r and s to be unique indices
    dr.unique_indices([r, s])

    # check the unique indices list now:
    # unique_del_lists is a list of sets
    assert dr.unique_del_lists[0] == {r, s}

    # Expression evaluation
    e_pq = names.e_[p, q]
    expr = dr.simplify((delK(r, s) + delK(p, r)) * e_pq)
    expr2 = dr.simplify(delK(p, r) * e_pq)

    # assertion
    assert dr.simplify(expr - expr2) == 0
def test_canonical_ordering():
    # Test the canonical ordering functionality

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q, r, s = names.A_dumms[:4]

    # Operators
    cdag_p_up = names.c_dag[p, UP]
    cdag_p_dn = names.c_dag[p, DN]
    c_q_up = names.c_[q, UP]
    c_q_dn = names.c_[q, DN]
    Pdag_p = names.P_dag[p]
    N_q = names.N_[q]
    P_r = names.P_[r]
    Jp_p = names.J_p[p]
    Jz_q = names.J_z[q]
    Jm_r = names.J_m[r]

    # Let all the indices be unique, so no commutation terms arise
    dr.unique_indices([p, q, r, s])

    # expressions for intra algebra ordering
    expr1 = dr.simplify(c_q_up * c_q_dn * cdag_p_up * cdag_p_dn)
    expr2 = dr.simplify(P_r * N_q * Pdag_p)
    expr3 = dr.simplify(Jm_r * Jz_q * Jp_p)

    # assertions
    assert dr.simplify(expr1 + cdag_p_up * cdag_p_dn * c_q_dn * c_q_up) == 0
    assert dr.simplify(expr2 - Pdag_p * N_q * P_r) == 0
    assert dr.simplify(expr3 - Jp_p * Jz_q * Jm_r) == 0

    # expressions for inter algebra ordering
    Pdag_r = names.P_dag[r]
    N_r = names.N_[r]
    expr1a = dr.simplify(cdag_p_up * cdag_p_dn * Pdag_r * N_r * P_r)

    Jp_q = names.J_p[q]
    Jm_q = names.J_m[q]
    expr2a = dr.simplify(cdag_p_up * cdag_p_dn * Pdag_r * Jp_q * Jz_q * Jm_q)

    # assertions
    assert dr.simplify(
        expr1a - Pdag_r * N_r * P_r * cdag_p_up * cdag_p_dn) == 0
    assert dr.simplify(
        expr2a - Pdag_r * Jp_q * Jz_q * Jm_q * cdag_p_up * cdag_p_dn) == 0
def test_nilpotency_of_operators():
    # Test the nilpotency of fermion and pairing-SU2 operators

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q, r = names.A_dumms[:3]

    # Operators
    cdag_p = names.c_dag[p, UP]
    c_p = names.c_[p, UP]
    N_q = names.N_[q]
    Pdag_q = names.P_dag[q]
    P_q = names.P_[q]
    Jp_r = names.J_p[r]
    Jm_r = names.J_m[r]
    Jz_r = names.J_z[r]

    # Expressions
    expr1a = dr.simplify(cdag_p * cdag_p)
    expr1b = dr.simplify(c_p * c_p)
    expr2a = dr.simplify(Pdag_q * Pdag_q)
    expr2b = dr.simplify(P_q * P_q)
    expr3a = dr.simplify(Jp_r * Jp_r)
    expr3b = dr.simplify(Jm_r * Jm_r)

    # assertions
    assert expr1a == 0
    assert expr1b == 0
    assert expr2a == 0
    assert expr2b == 0
    assert expr3a == 0
    assert expr3b == 0
import re

import pywt
from scipy.spatial import distance


class Wavelet:
    def __init__(self, context, file, sample_size):
        self.sc = SparkContext(context, 'Wavelet')
        self.file_size = self.sc.textFile(file).count()
        self.sample_size = sample_size
        self.graph_size = int(self.file_size / self.sample_size)
        self.file = file

    def wavelet(self, column, name):
        sample_size = self.sample_size
        sc = self.sc
        link = self.file
        length = self.file_size

        # Index pool: built as [length, ..., 1] and consumed from the end via
        # pop(), so the lines receive running indices 1, 2, ..., length.
        tab = []
        for i in range(0, length):
            tab.append(length - i)

        def get_key(iterator, size):
            # Map a running row index to a bucket of `size` consecutive rows.
            return int(iterator / size)

        rdd = sc \
            .textFile(link) \
            .filter(lambda line: name not in line) \
            .map(lambda line: (get_key(tab.pop(), sample_size),
                               re.split(r';', line)[column])) \
            .groupByKey().mapValues(list) \
            .map(lambda line: (line[0], pywt.dwt(line[1], 'db1')[1]))

        def get_previous_line(line):
            # Euclidean distance between this bucket's detail coefficients and
            # the previous bucket's. Note: referencing `rdd` inside another
            # rdd.map() only works with a list-backed dummy RDD; real Spark
            # forbids nested RDD access.
            iterator = line[0]
            if iterator == 0:
                prev = rdd.filter(
                    lambda my_line: my_line[0] == iterator).collect()[0][1]
            else:
                prev = rdd.filter(
                    lambda my_line: my_line[0] == iterator - 1).collect()[0][1]
            return distance.euclidean(line[1], prev)

        return rdd \
            .map(lambda line: get_previous_line(line)) \
            .collect()
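# Hypothetical usage sketch: the master string, CSV path, column index and
# header token below are illustrative only, not taken from the class above.
w = Wavelet('local[1]', 'measurements.csv', sample_size=16)
dists = w.wavelet(column=2, name='sensor_id')
print(dists)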
def test_pairing_comm_rules():
    # Test commutation relations for the pairing SU2 algebra

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q = names.A_dumms[:2]

    # BCS Operators
    Pdag_p = names.P_dag[p]
    P_q = names.P_[q]
    N_p = names.N[p]
    Nup_p = dr.N_up[p]
    Ndn_p = dr.N_dn[p]
    N_q = names.N[q]
    Nup_q = dr.N_up[q]
    Ndn_q = dr.N_dn[q]

    # Commutation relations
    expr1 = dr.simplify(Pdag_p * P_q - P_q * Pdag_p)
    expr2 = dr.simplify(N_q * Pdag_p - Pdag_p * N_q)
    expr2a = dr.simplify(Nup_q * Pdag_p - Pdag_p * Nup_q)
    expr2b = dr.simplify(Ndn_q * Pdag_p - Pdag_p * Ndn_q)
    expr3 = dr.simplify(N_p * P_q - P_q * N_p)
    expr3a = dr.simplify(Nup_p * P_q - P_q * Nup_p)
    expr3b = dr.simplify(Ndn_p * P_q - P_q * Ndn_p)

    # Assertions
    assert dr.simplify(expr1 - delK(p, q) * (names.N[p] - 1)) == 0
    assert dr.simplify(expr2 - 2 * delK(p, q) * Pdag_p) == 0
    assert dr.simplify(expr2a - delK(p, q) * Pdag_p) == 0
    assert dr.simplify(expr2b - delK(p, q) * Pdag_p) == 0
    assert dr.simplify(expr3 + 2 * delK(p, q) * P_q) == 0
    assert dr.simplify(expr3a + delK(p, q) * P_q) == 0
    assert dr.simplify(expr3b + delK(p, q) * P_q) == 0
def test_get_seniority_zero():
    # Get seniority-zero expressions corresponding to some test results that
    # we already know. This will indirectly also test extract_su2.

    # Initialise drudge
    ctx = SparkContext()
    dr = AGPFermi(ctx)

    # namespace
    names = dr.names

    # Indices
    p, q, r, s = names.A_dumms[:4]

    # Operators
    cdag_p_up = names.c_dag[p, UP]
    cdag_p_dn = names.c_dag[p, DN]
    c_p_up = names.c_[p, UP]
    c_p_dn = names.c_[p, DN]

    # expression1: should simplify to N_p * N_p / 4
    expr1a = dr.simplify(cdag_p_up * cdag_p_dn * c_p_dn * c_p_up)
    expr1 = dr.get_seniority_zero(expr1a)
    res1 = dr.simplify(names.N_[p] * names.N_[p] / 4)

    # expression2: should simplify to 2 * Pdag_p * P_q (when p != q)
    e_pq = names.e_[p, q]
    dr.unique_indices([p, q])
    expr2a = dr.simplify(e_pq * e_pq)
    expr2 = dr.get_seniority_zero(expr2a)
    res2 = dr.simplify(names.P_dag[p] * names.P_[q] * 2)

    # assertions
    assert dr.simplify(expr1 - res1) == 0
    assert dr.simplify(expr2 - res2) == 0
def test_not_implemented_methods(self):
    sc = SparkContext(master='', conf=SparkConf())
    rdd = sc.parallelize([])
    with self.assertRaises(NotImplementedError): rdd._pickled()
    with self.assertRaises(NotImplementedError): rdd.mapPartitionsWithIndex(None, None)
    with self.assertRaises(NotImplementedError): rdd._computeFractionForSampleSize(None, None, None)
    with self.assertRaises(NotImplementedError): rdd.pipe(None, None)
    with self.assertRaises(NotImplementedError): rdd.reduce(None)
    with self.assertRaises(NotImplementedError): rdd.treeReduce(None, None)
    with self.assertRaises(NotImplementedError): rdd.fold(None, None)
    with self.assertRaises(NotImplementedError): rdd.aggregate(None, None, None)
    with self.assertRaises(NotImplementedError): rdd.treeAggregate(None, None, None, None)
    with self.assertRaises(NotImplementedError): rdd.stats()
    with self.assertRaises(NotImplementedError): rdd.histogram(None)
    with self.assertRaises(NotImplementedError): rdd.variance()
    with self.assertRaises(NotImplementedError): rdd.stdev()
    with self.assertRaises(NotImplementedError): rdd.sampleStdev()
    with self.assertRaises(NotImplementedError): rdd.sampleVariance()
    with self.assertRaises(NotImplementedError): rdd.countByValue()
    with self.assertRaises(NotImplementedError): rdd.top(None, None)
    with self.assertRaises(NotImplementedError): rdd.takeOrdered(None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsNewAPIHadoopDataset(None, None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsNewAPIHadoopFile(None, None, None, None, None, None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsHadoopDataset(None, None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsHadoopFile(None, None, None, None, None, None, None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsSequenceFile(None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsPickleFile(None, None)
    with self.assertRaises(NotImplementedError): rdd.saveAsTextFile(None, None)
    with self.assertRaises(NotImplementedError): rdd.collectAsMap()
    with self.assertRaises(NotImplementedError): rdd.keys()
    with self.assertRaises(NotImplementedError): rdd.values()
    with self.assertRaises(NotImplementedError): rdd.reduceByKeyLocally(None)
    with self.assertRaises(NotImplementedError): rdd.countByKey()
    with self.assertRaises(NotImplementedError): rdd.join(None, None)
    with self.assertRaises(NotImplementedError): rdd.leftOuterJoin(None, None)
    with self.assertRaises(NotImplementedError): rdd.rightOuterJoin(None, None)
    with self.assertRaises(NotImplementedError): rdd.fullOuterJoin(None, None)
    with self.assertRaises(NotImplementedError): rdd.partitionBy(None, None)
    with self.assertRaises(NotImplementedError): rdd.combineByKey(None, None, None, None)
    with self.assertRaises(NotImplementedError): rdd.aggregateByKey(None, None, None, None)
    with self.assertRaises(NotImplementedError): rdd.foldByKey(None, None, None)
    with self.assertRaises(NotImplementedError): rdd._can_spill()
    with self.assertRaises(NotImplementedError): rdd._memory_limit()
    with self.assertRaises(NotImplementedError): rdd.groupWith(None, None)
    with self.assertRaises(NotImplementedError): rdd.sampleByKey(None, None, None)
    with self.assertRaises(NotImplementedError): rdd.subtractByKey(None, None)
    with self.assertRaises(NotImplementedError): rdd.subtract(None, None)
    with self.assertRaises(NotImplementedError): rdd.keyBy(None)
    with self.assertRaises(NotImplementedError): rdd.repartition(None)
    with self.assertRaises(NotImplementedError): rdd.coalesce(None, None)
    with self.assertRaises(NotImplementedError): rdd.zipWithUniqueId()
    with self.assertRaises(NotImplementedError): rdd.toDebugString()
    with self.assertRaises(NotImplementedError): rdd.getStorageLevel()
    with self.assertRaises(NotImplementedError): rdd._to_java_object_rdd()
"""Configures a simple drudge for reduced BCS model.""" from dummy_spark import SparkContext #from pyspark import SparkContext from sympy import Symbol, collect, Add, Mul, Integer, symbols, factor, diff, IndexedBase from bcs import ReducedBCSDrudge from drudge import InvariantIndexable, Perm, IDENT, NEG ctx = SparkContext() raise_ = ReducedBCSDrudge.DEFAULT_RAISE lower = ReducedBCSDrudge.DEFAULT_LOWER dr = ReducedBCSDrudge(ctx, interact=InvariantIndexable(Symbol('G')), specials={(raise_, lower): 2 * raise_ * lower - 1}) #==================================== # AGP expected values: #==================================== # case: z00 Z00 = IndexedBase('Z00') # case: z02 Z02 = IndexedBase('Z02') dr.set_symm(Z02, Perm([1, 0], IDENT), valence=2) # case: z04 Z04 = IndexedBase('Z04') dr.set_symm(Z04, Perm([1, 0, 2, 3], IDENT), Perm([0, 1, 3, 2], IDENT),
def test_with_block(self):
    with SparkContext():
        pass
    self.assertTrue(True)
def test_version(self):
    ctx = SparkContext()
    self.assertEqual(ctx.version, SparkContext.DUMMY_VERSION)
import os
import random

from dummy_spark import SparkContext, SparkConf
from dummy_spark.sql import SQLContext

__author__ = 'willmcginnis'

# make a spark conf
sconf = SparkConf()

# set some property (won't do anything)
sconf.set('spark.executor.extraClassPath', 'foo')

# use the spark conf to make a spark context
sc = SparkContext(master='', conf=sconf)

# set the log level (also doesn't do anything)
sc.setLogLevel('INFO')

# maybe make a useless sqlcontext (nothing implemented here yet)
sqlctx = SQLContext(sc)

# add pyfile just appends to the sys path
sc.addPyFile(os.path.dirname(__file__))

# do some hadoop configuration into the ether
sc._jsc.hadoopConfiguration().set('foo', 'bar')

# maybe make some data
rdd = sc.parallelize([1, 2, 3, 4, 5])
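# A short continuation sketch (not part of the original example): dummy_spark
# RDDs wrap plain Python lists, so the usual transformation chain collects
# back to a list.
squares = rdd.map(lambda x: x * x).filter(lambda x: x > 4).collect()
print(squares)  # expected: [9, 16, 25]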
def test_parallelize_set(self):
    ctx = SparkContext()
    for start, stop, step in self.TEST_RANGES:
        l = list(range(start, stop, step))
        rdd = ctx.parallelize(set(l))
        self.assertEqual(sorted(l), sorted(rdd.collect()))
def test_range(self):
    ctx = SparkContext()
    for start, stop, step in self.TEST_RANGES:
        l = list(range(start, stop, step))
        rdd = ctx.range(start, stop, step)
        self.assertEqual(l, rdd.collect())
def test_add_py_file(self):
    with SparkContext() as ctx:
        ctx.addPyFile(__file__)
    self.assertTrue(True)
def test_hadoop_config(self):
    ctx = SparkContext()
    jvm = ctx._jsc
    hc = jvm.hadoopConfiguration()
    hc.set('key', 'value')
    self.assertEqual(hc.get('key'), 'value')
def test_not_implemented_methods(self):
    ctx = SparkContext()
    with self.assertRaises(NotImplementedError): ctx._checkpointFile(None, None)
    with self.assertRaises(NotImplementedError): ctx._dictToJavaMap(None)
    with self.assertRaises(NotImplementedError): ctx._getJavaStorageLevel(None)
    with self.assertRaises(NotImplementedError): ctx.accumulator(None)
    with self.assertRaises(NotImplementedError): ctx.addFile(None)
    with self.assertRaises(NotImplementedError): ctx.binaryFiles(None)
    with self.assertRaises(NotImplementedError): ctx.binaryRecords(None, None)
    with self.assertRaises(NotImplementedError): ctx.broadcast(None)
    with self.assertRaises(NotImplementedError): ctx.cancelAllJobs()
    with self.assertRaises(NotImplementedError): ctx.cancelJobGroup(None)
    with self.assertRaises(NotImplementedError): ctx.clearFiles()
    with self.assertRaises(NotImplementedError): ctx.dump_profiles(None)
    with self.assertRaises(NotImplementedError): ctx.getLocalProperty(None)
    with self.assertRaises(NotImplementedError): ctx.hadoopFile(None, None, None, None)
    with self.assertRaises(NotImplementedError): ctx.hadoopRDD(None, None, None)
    with self.assertRaises(NotImplementedError): ctx.newAPIHadoopFile(None, None, None, None)
    with self.assertRaises(NotImplementedError): ctx.pickleFile(None)
    with self.assertRaises(NotImplementedError): ctx.runJob(None, None)
    with self.assertRaises(NotImplementedError): ctx.sequenceFile(None)
    with self.assertRaises(NotImplementedError): ctx.setCheckpointDir(None)
    with self.assertRaises(NotImplementedError): ctx.setJobGroup(None, None)
    with self.assertRaises(NotImplementedError): ctx.setLocalProperty(None, None)
    with self.assertRaises(NotImplementedError): ctx.show_profiles()
    with self.assertRaises(NotImplementedError): ctx.sparkUser()
    with self.assertRaises(NotImplementedError): ctx.statusTracker()
    with self.assertRaises(NotImplementedError): ctx.union(None)
    with self.assertRaises(NotImplementedError): ctx.wholeTextFiles(None)
class RDDTests(unittest.TestCase):

    SPARK_CONTEXT = SparkContext(master='', conf=SparkConf())
    TEST_RANGES = [
        (0, 0, 1),
        (0, 10, 1),
        (0, 10, 2),
        (0, 100, 13),
        (0, 1000, 17),
        (0, 10000, 31),
    ]
    SAMPLE_FRACTION = 0.10
    SAMPLE_SEED = 1234

    def test_init(self):
        for start, stop, step in self.TEST_RANGES:
            l = list(range(start, stop, step))
            rdd = RDD(l, self.SPARK_CONTEXT)
            self.assertEqual(l, rdd.collect())
        s = set(range(100))
        rdd = RDD(s, self.SPARK_CONTEXT)
        self.assertEqual(sorted(list(s)), sorted(rdd.collect()))
        t = (1, 2, 3)
        with self.assertRaises(AttributeError):
            RDD(t, self.SPARK_CONTEXT)
        with self.assertRaises(AttributeError):
            RDD('', self.SPARK_CONTEXT)

    def test_ctx(self):
        rdd = RDD([], self.SPARK_CONTEXT)
        self.assertEqual(rdd.ctx, self.SPARK_CONTEXT)

    @staticmethod
    def square(x):
        return x ** 2

    def test_map(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = map(RDDTests.square, l1)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.map(RDDTests.square)
            self.assertEqual(rdd.collect(), list(l2))

    @staticmethod
    def triplicate(x):
        return [x, x, x]

    def test_flat_map(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = map(RDDTests.triplicate, l1)
            l3 = []
            for sl in l2:
                l3.extend(sl)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.flatMap(RDDTests.triplicate)
            self.assertEqual(rdd.collect(), list(l3))

    @staticmethod
    def is_square(x):
        return x == x ** 2

    def test_filter(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = filter(RDDTests.is_square, l1)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.filter(RDDTests.is_square)
            self.assertEqual(rdd.collect(), list(l2))

    @staticmethod
    def return_one(x):
        return x - x + 1

    def test_distinct(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            rdd = rdd.map(RDDTests.return_one)
            rdd = rdd.distinct()
            if len(l) > 0:
                self.assertEqual(rdd.collect(), [1])
            else:
                self.assertEqual(rdd.collect(), [])

    def test_sample_with_replacement(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample = rdd.sample(True, self.SAMPLE_FRACTION).collect()
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_with_replacement_with_seed(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample1 = rdd.sample(
                True, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
            sample2 = rdd.sample(
                True, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
            self.assertEqual(sorted(sample1), sorted(sample2))
            sample = sample1
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_without_replacement(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample = rdd.sample(False, self.SAMPLE_FRACTION).collect()
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            self.assertEqual(sorted(l), sorted(set(l)))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_without_replacement_with_seed(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample1 = rdd.sample(
                False, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
            sample2 = rdd.sample(
                False, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
            self.assertEqual(sorted(sample1), sorted(sample2))
            sample = sample1
            self.assertEqual(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            self.assertEqual(sorted(l), sorted(set(l)))
            for item in sample:
                self.assertTrue(item in l)

    def test_union(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.union(rdd2)
                self.assertEqual(sorted(rdd.collect()),
                                 sorted(list(l1) + list(l2)))

    def test_intersection(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.intersection(rdd2)
                self.assertEqual(sorted(rdd.collect()),
                                 sorted([x for x in l1 if x in l2]))

    def test_group_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.groupByKey()
        r = rdd.collect()
        r = [(kv[0], list(kv[1])) for kv in r]
        self.assertEqual(sorted(r),
                         sorted([(1, [1]), (2, [1, 2]), (3, [1, 2, 3])]))

    def test_reduce_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        print(rdd)
        self.assertEqual(sorted(rdd.collect()),
                         sorted([(1, 1), (2, 3), (3, 6)]))

    def test_cartesian(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.cartesian(rdd2)
                r = rdd.collect()
                self.assertEqual(len(r), len(l1) * len(l2))
                for t, u in r:
                    self.assertTrue(t in l1)
                    self.assertTrue(u in l2)

    def test_cogroup(self):
        l1 = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        l2 = [(2, 10), (2, 20), (3, 10), (3, 20), (3, 30), (4, 40)]
        rdd1 = RDD(l1, self.SPARK_CONTEXT)
        rdd2 = RDD(l2, self.SPARK_CONTEXT)
        rdd = rdd1.cogroup(rdd2)
        l = rdd.collect()
        self.assertEqual(
            sorted(l),
            sorted([
                (1, [1], []),
                (2, [1, 2], [10, 20]),
                (3, [1, 2, 3], [10, 20, 30]),
                (4, [], [40]),
            ]))

    def test_word_count_1(self):
        lines = [
            'grape banana apple',
        ]
        expected_output = [
            ('apple', 1),
            ('banana', 1),
            ('grape', 1),
        ]
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        output = rdd.collect()
        self.assertEqual(sorted(output), sorted(expected_output))

    def test_word_count_2(self):
        lines = [
            'apple',
            'apple banana',
            'apple banana',
            'apple banana grape',
        ]
        expected_output = [
            ('apple', 4),
            ('banana', 3),
            ('grape', 1),
        ]
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        output = rdd.collect()
        self.assertEqual(sorted(output), sorted(expected_output))

    def test_word_count_3(self):
        lines = [
            'apple',
            'apple banana',
            'apple banana',
            'apple banana grape',
            'banana grape',
            'banana',
        ]
        expected_output = [
            ('apple', 4),
            ('banana', 5),
            ('grape', 2),
        ]
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)
        output = rdd.collect()
        self.assertEqual(sorted(output), sorted(expected_output))

    def test_left_outer_join(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd1 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4])])
        rdd2 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4]),
                               ('B', [4, 5, 6])])
        out = rdd1.leftOuterJoin(rdd2).collect()
        print(out)
        self.assertEqual(len(out), 2)

    def test_keys(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        self.assertListEqual(rdd.keys().collect(), ['A', 'B', 'C'])

    def test_values(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        self.assertListEqual(rdd.values().collect(), [1, 2, 3])

    def test_combineByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([
            ('A', 1),
            ('B', 2),
            ('B', 3),
            ('C', 4),
            ('C', 5),
            ('A', 6),
        ])

        def create_combiner(a):
            return [a]

        def merge_value(a, b):
            a.append(b)
            return a

        def merge_combiners(a, b):
            a.extend(b)
            return a

        rdd = rdd.combineByKey(create_combiner, merge_value, merge_combiners)
        self.assertListEqual(
            rdd.collect(),
            [('A', [1, 6]), ('B', [2, 3]), ('C', [4, 5])],
        )

    def test_sortByKey_ascending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([
            ('e', 5),
            ('d', 4),
            ('c', 3),
            ('b', 2),
            ('a', 1),
        ]).sortByKey(ascending=True))
        self.assertListEqual(
            rdd.collect(),
            [
                ('a', 1),
                ('b', 2),
                ('c', 3),
                ('d', 4),
                ('e', 5),
            ],
        )

    def test_sortByKey_descending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([
            ('a', 1),
            ('b', 2),
            ('c', 3),
            ('d', 4),
            ('e', 5),
        ]).sortByKey(ascending=False))
        self.assertListEqual(
            rdd.collect(),
            [
                ('e', 5),
                ('d', 4),
                ('c', 3),
                ('b', 2),
                ('a', 1),
            ],
        )

    def test_sortBy_ascending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([5, 4, 3, 2, 1])
               .sortBy(lambda x: x, ascending=True))
        self.assertListEqual(rdd.collect(), [1, 2, 3, 4, 5])

    def test_sortBy_descending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([1, 2, 3, 4, 5])
               .sortBy(lambda x: x, ascending=False))
        self.assertListEqual(rdd.collect(), [5, 4, 3, 2, 1])

    def test_subtractByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd1 = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        rdd2 = sc.parallelize([('A', None), ('C', None)])
        self.assertListEqual(rdd1.subtractByKey(rdd2).collect(), [('B', 2)])

    def test_not_implemented_methods(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([])
        with self.assertRaises(NotImplementedError): rdd._pickled()
        with self.assertRaises(NotImplementedError): rdd.mapPartitionsWithIndex(None, None)
        with self.assertRaises(NotImplementedError): rdd._computeFractionForSampleSize(None, None, None)
        with self.assertRaises(NotImplementedError): rdd.pipe(None, None)
        with self.assertRaises(NotImplementedError): rdd.reduce(None)
        with self.assertRaises(NotImplementedError): rdd.treeReduce(None, None)
        with self.assertRaises(NotImplementedError): rdd.fold(None, None)
        with self.assertRaises(NotImplementedError): rdd.aggregate(None, None, None)
        with self.assertRaises(NotImplementedError): rdd.treeAggregate(None, None, None, None)
        with self.assertRaises(NotImplementedError): rdd.stats()
        with self.assertRaises(NotImplementedError): rdd.histogram(None)
        with self.assertRaises(NotImplementedError): rdd.variance()
        with self.assertRaises(NotImplementedError): rdd.stdev()
        with self.assertRaises(NotImplementedError): rdd.sampleStdev()
        with self.assertRaises(NotImplementedError): rdd.sampleVariance()
        with self.assertRaises(NotImplementedError): rdd.countByValue()
        with self.assertRaises(NotImplementedError): rdd.top(None, None)
        with self.assertRaises(NotImplementedError): rdd.takeOrdered(None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsNewAPIHadoopDataset(None, None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsNewAPIHadoopFile(None, None, None, None, None, None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsHadoopDataset(None, None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsHadoopFile(None, None, None, None, None, None, None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsSequenceFile(None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsPickleFile(None, None)
        with self.assertRaises(NotImplementedError): rdd.saveAsTextFile(None, None)
        with self.assertRaises(NotImplementedError): rdd.collectAsMap()
        with self.assertRaises(NotImplementedError): rdd.reduceByKeyLocally(None)
        with self.assertRaises(NotImplementedError): rdd.countByKey()
        with self.assertRaises(NotImplementedError): rdd.join(None, None)
        with self.assertRaises(NotImplementedError): rdd.rightOuterJoin(None, None)
        with self.assertRaises(NotImplementedError): rdd.fullOuterJoin(None, None)
        with self.assertRaises(NotImplementedError): rdd.foldByKey(None, None, None)
        with self.assertRaises(NotImplementedError): rdd._can_spill()
        with self.assertRaises(NotImplementedError): rdd._memory_limit()
        with self.assertRaises(NotImplementedError): rdd.groupWith(None, None)
        with self.assertRaises(NotImplementedError): rdd.sampleByKey(None, None, None)
        with self.assertRaises(NotImplementedError): rdd.subtract(None, None)
        with self.assertRaises(NotImplementedError): rdd.coalesce(None, None)
        with self.assertRaises(NotImplementedError): rdd.toDebugString()
        with self.assertRaises(NotImplementedError): rdd.getStorageLevel()
        with self.assertRaises(NotImplementedError): rdd._to_java_object_rdd()
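# A minimal equivalence sketch (illustrative, not part of RDDTests): for the
# list-building combiners used in test_combineByKey above, combineByKey gives
# the same result as grouping and listing the values per key.
def combine_by_key_sketch():
    sc = SparkContext(master='', conf=SparkConf())
    pairs = [('A', 1), ('B', 2), ('B', 3), ('C', 4), ('C', 5), ('A', 6)]
    grouped = (sc.parallelize(pairs)
               .groupByKey()
               .map(lambda kv: (kv[0], list(kv[1])))
               .collect())
    assert sorted(grouped) == [('A', [1, 6]), ('B', [2, 3]), ('C', [4, 5])]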
""" Configuration file for SU(4) Lipkin Model in the Author: Gaurav Harsha Date: July 29, 2019 """ import collections import functools from dummy_spark import SparkContext # from pyspark import SparkContext from sympy import Symbol, collect, Add, Mul, Integer, symbols, factor, diff from su4 import * ctx = SparkContext('local[*]', 'su4') dr = SU4LatticeDrudge(ctx) nams = dr.names DRUDGE = dr
def test_not_implemented_methods(self):
    ctx = SparkContext()
    with self.assertRaises(NotImplementedError): ctx._checkpointFile(None, None)
    with self.assertRaises(NotImplementedError): ctx._dictToJavaMap(None)
    with self.assertRaises(NotImplementedError): ctx._getJavaStorageLevel(None)
    with self.assertRaises(NotImplementedError): ctx.accumulator(None)
    with self.assertRaises(NotImplementedError): ctx.addFile(None)
    with self.assertRaises(NotImplementedError): ctx.binaryFiles(None)
    with self.assertRaises(NotImplementedError): ctx.binaryRecords(None, None)
    with self.assertRaises(NotImplementedError): ctx.broadcast(None)
    with self.assertRaises(NotImplementedError): ctx.cancelAllJobs()
    with self.assertRaises(NotImplementedError): ctx.cancelJobGroup(None)
    with self.assertRaises(NotImplementedError): ctx.clearFiles()
    with self.assertRaises(NotImplementedError): ctx.dump_profiles(None)
    with self.assertRaises(NotImplementedError): ctx.getLocalProperty(None)
    with self.assertRaises(NotImplementedError): ctx.hadoopFile(None, None, None, None)
    with self.assertRaises(NotImplementedError): ctx.hadoopRDD(None, None, None)
    with self.assertRaises(NotImplementedError): ctx.newAPIHadoopFile(None, None, None, None)
    with self.assertRaises(NotImplementedError): ctx.newAPIHadoopRDD(None, None, None)
    with self.assertRaises(NotImplementedError): ctx.pickleFile(None)
    with self.assertRaises(NotImplementedError): ctx.runJob(None, None)
    with self.assertRaises(NotImplementedError): ctx.sequenceFile(None)
    with self.assertRaises(NotImplementedError): ctx.setCheckpointDir(None)
    with self.assertRaises(NotImplementedError): ctx.setJobGroup(None, None)
    with self.assertRaises(NotImplementedError): ctx.setLocalProperty(None, None)
    with self.assertRaises(NotImplementedError): ctx.show_profiles()
    with self.assertRaises(NotImplementedError): ctx.sparkUser()
    with self.assertRaises(NotImplementedError): ctx.statusTracker()
    with self.assertRaises(NotImplementedError): ctx.union(None)
    with self.assertRaises(NotImplementedError): ctx.wholeTextFiles(None)