import unittest

from pyspark import SparkContext
from pyspark.sql import SparkSession


class MLlibTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext('local[4]', "MLlib tests")
        self.spark = SparkSession(self.sc)

    def tearDown(self):
        self.spark.stop()
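A minimal sketch of a test built on this fixture; the class name, columns, and data are illustrative assumptions, not part of the original suite.

# Hypothetical subclass: shows how a test can use the SparkSession from setUp.
class ExampleDataFrameTest(MLlibTestCase):
    def test_create_dataframe(self):
        df = self.spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
        self.assertEqual(df.count(), 2)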
import time

from pyspark import SparkConf, SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession


def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G'))
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        # convert_sparse_to_dataframe is defined elsewhere in this module.
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed / iterations))
    finally:
        spark.stop()

    return times
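A hedged usage sketch: the random COO matrix is a stand-in for real ratings data, assuming convert_sparse_to_dataframe consumes the matrix's row/col/data fields (which would match the ALS column names above).

# Illustrative call only; the matrix shape, density, and factor sizes are made up.
import scipy.sparse

ratings = scipy.sparse.random(1000, 500, density=0.01, format='coo')
times = benchmark_spark(ratings, factors=[32, 64, 128])
print(times)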
def test_active_session_with_None_and_not_None_context(self):
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    sc = None
    session = None
    try:
        sc = SparkContext._active_spark_context
        self.assertEqual(sc, None)
        activeSession = SparkSession.getActiveSession()
        self.assertEqual(activeSession, None)
        sparkConf = SparkConf()
        sc = SparkContext.getOrCreate(sparkConf)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertFalse(activeSession.isDefined())
        session = SparkSession(sc)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertTrue(activeSession.isDefined())
        activeSession2 = SparkSession.getActiveSession()
        self.assertNotEqual(activeSession2, None)
    finally:
        if session is not None:
            session.stop()
        if sc is not None:
            sc.stop()
connection_counts = spark.sql('''
    SELECT id, COUNT(*) AS n_connections
    FROM (
        SELECT id_1 AS id FROM connections
        UNION ALL
        SELECT id_2 AS id FROM connections
    )
    GROUP BY 1
    ORDER BY 2 DESC
''')
connection_counts.show(20)
connection_counts.createOrReplaceTempView('connection_counts')

avg_connections = spark.sql(
    'SELECT COUNT(*), AVG(n_connections) FROM connection_counts').collect()[0]
msg = '''
{0} ids in the dataset, with an average connection count of {1}.
'''.format(*avg_connections)
print(msg)

spark.stop()
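The same aggregation can be written with the DataFrame API; a hedged sketch, assuming a registered connections view with columns id_1 and id_2 as in the SQL above.

from pyspark.sql import functions as F

# DataFrame-API equivalent of the SQL above (sketch only).
connections = spark.table('connections')
ids = (connections.select(F.col('id_1').alias('id'))
       .union(connections.select(F.col('id_2').alias('id'))))  # UNION ALL semantics
(ids.groupBy('id')
    .agg(F.count('id').alias('n_connections'))
    .orderBy(F.desc('n_connections'))
    .show(20))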
import os
import shutil
import tempfile

from delta.tables import DeltaTable
from pyspark.sql import Row, SparkSession, SQLContext
from pyspark.sql.types import IntegerType, StringType, StructField, StructType


class DeltaTableTests(PySparkTestCase):

    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations,
            [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode,
            [Row("Overwrite")],
            StructType([StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.
        self.assertEqual(True, self.__checkFileExists('bac.txt'))

        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        self.tempFile2 = self.tempFile + "_"
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile),
            [('a', 1), ('b', 2), ('c', 3)])

        # test if convert to delta with partition columns works
        df.write.partitionBy("value").format("parquet").save(self.tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile2 + "`",
                                       schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        self.assertEqual(df.count(), expectedDF.count())
        self.assertEqual(len(df.columns), len(expectedDF.columns))
        self.assertEqual([], df.subtract(expectedDF).take(1))
        self.assertEqual([], expectedDF.subtract(df).take(1))

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
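Outside the test harness, the same vacuum flow can be exercised against a standalone session; a sketch assuming the Delta Lake package is on the classpath, with an illustrative table path.

from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("delta_vacuum_example")
         .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog",
                 "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .getOrCreate())

dt = DeltaTable.forPath(spark, "/tmp/example-delta-table")  # path is an assumption
# The retention safety check must be disabled before vacuuming with 0 hours,
# exactly as test_vacuum does above.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
dt.vacuum(0.0)
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "true")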
import os
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession

# testBase, StockCustReturnByPrdInd and _travel_row are provided by the
# project's test utilities and the module under test.


class SCDHTest(testBase):

    def setUp(self):
        StockCustReturnByPrdInd.logLevel = 'debug'
        self.scdh = StockCustReturnByPrdInd(None)
        os.environ['SPARK_HOME'] = "/usr/local/Cellar/apache-spark/2.2.0/libexec"
        sys.path.append("/usr/local/Cellar/apache-spark/2.2.0/libexec/python")
        conf = SparkConf().setMaster("local").setAppName("hello")
        self.spark = SparkSession(SparkContext(conf=conf))

    def tearDown(self):
        self.spark.stop()

    def test_local_spark(self):
        doc = self.spark.createDataFrame([['a', 'b', 'c'], ['b', 'd', 'd']])
        doc.show()
        print("successful!")

    def test_get_base_data(self):
        self.scdh._get_base_data("2017-03-16", "2017-03-18", 1, 5)

    def test_init_data(self):
        self.scdh.init_data()

    def test_daily_compute(self):
        self.scdh.daily_compute("2017-03-16", "2017-03-16")

    def test_check_1(self):
        sql = """
            SELECT * from adatatest.stock_cust_daily_return
            where short_return_rate>1 or long_return_rate>1 or total_return_rate>1
        """
        self.spark.sql(sql)

    def test_travel_row(self):
        # Expected result for this case:
        # stock_cust_return_by_prd_ind.prd_ind unknown
        # stock_cust_return_by_prd_ind.return -44623.789999999964
        # stock_cust_return_by_prd_ind.return_rate -0.006018969744297111
        # stock_cust_return_by_prd_ind.trade_id 12466
        # stock_cust_return_by_prd_ind.return_ratio 0.4610100448676952
        # stock_cust_return_by_prd_ind.return_rank 2
        # stock_cust_return_by_prd_ind.return_rate_rank 1
        # stock_cust_return_by_prd_ind.busi_date 2017-03-23
        # stock_cust_return_by_prd_ind.compute 7
        #
        # The detail_list below was collected with:
        # spark.sql("""
        #     select trade_id,prd_ind,collect_list(detail_item) detail_list from (
        #         select trade_id,trim(prd_ind) prd_ind,
        #             (str_to_map(concat(
        #                 'pre_mkt_val:',pre_mkt_val,
        #                 ',now_mkt_val:',now_mkt_val,
        #                 ',pos_cash_flow:',pos_cash_flow,
        #                 ',neg_cash_flow:',pos_cash_flow,
        #                 ',exception_label:',exception_label,
        #                 ',trd_type:',trd_type,
        #                 ',return:',return,
        #                 ',busi_date:',busi_date),",",":")) detail_item
        #         from adatatest.stock_cust_daily_holding
        #         where busi_date<='2017-03-23' and trade_id='12466' and prd_ind='unknown'
        #     ) a
        #     GROUP by trade_id,prd_ind
        # """)
        r = Row(trade_id=u'12466', prd_ind=u'unknown', detail_list=[
            {u'return': u'-13008.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1263402.0', u'now_mkt_val': u'1250394.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'6344.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'135176.0', u'now_mkt_val': u'141520.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12803.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1308384.0', u'now_mkt_val': u'1295581.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-4.229999999999563', u'trd_type': u'long_related', u'pos_cash_flow': u'16940.23', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'16936.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'16940.23'},
            {u'return': u'1612.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'208052.0', u'now_mkt_val': u'209664.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'35466.53', u'now_mkt_val': u'18526.3', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'4730.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'679400.0', u'now_mkt_val': u'684130.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-1662.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'271183.0', u'now_mkt_val': u'269521.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-693.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-130207.0', u'now_mkt_val': u'-130900.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-21138.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1284540.0', u'now_mkt_val': u'1263402.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'2079.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-132286.0', u'now_mkt_val': u'-130207.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'6771.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'128405.0', u'now_mkt_val': u'135176.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'306163.19', u'now_mkt_val': u'35466.53', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12470.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'691870.0', u'now_mkt_val': u'679400.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-122.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'128527.0', u'now_mkt_val': u'128405.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'11.429999999999836', u'trd_type': u'long_related', u'pos_cash_flow': u'2273.57', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'2285.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'2273.57'},
            {u'return': u'539.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-132825.0', u'now_mkt_val': u'-132286.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'8673.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1327382.0', u'now_mkt_val': u'1336055.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'15399.439999999944', u'trd_type': u'long_related', u'pos_cash_flow': u'1274560.56', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'1289960.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'1274560.56'},
            {u'return': u'197.7399999999907', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'-132825.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'1510820.28', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'12845.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1320466.0', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'3497135.97', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'2177000.0', u'now_mkt_val': u'3497135.97', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'},
            {u'return': u'-17.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'2247.0', u'now_mkt_val': u'2230.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1317897.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-38.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'2285.0', u'now_mkt_val': u'2247.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'5138.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1338449.0', u'now_mkt_val': u'-1333311.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-27671.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1336055.0', u'now_mkt_val': u'1308384.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2808.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'210860.0', u'now_mkt_val': u'208052.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'486.3400000000256', u'trd_type': u'long_related', u'pos_cash_flow': u'270696.66', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'271183.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'270696.66'},
            {u'return': u'-2753.609999999986', u'trd_type': u'long_related', u'pos_cash_flow': u'694623.61', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'691870.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'694623.61'},
            {u'return': u'-5420.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1289960.0', u'now_mkt_val': u'1284540.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2569.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1335880.0', u'now_mkt_val': u'-1338449.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1510820.28', u'now_mkt_val': u'306163.19', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'1299.6199999999953', u'trd_type': u'long_related', u'pos_cash_flow': u'209560.38', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'210860.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'209560.38'},
            {u'return': u'-15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1320466.0', u'now_mkt_val': u'-1335880.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'5451.600000000093', u'trd_type': u'long_related', u'pos_cash_flow': u'1321930.4', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'1327382.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'1321930.4'},
            {u'return': u'150.9100000000035', u'trd_type': u'long_related', u'pos_cash_flow': u'128376.09', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'128527.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'128376.09'},
            {u'return': u'-13175.030000000028', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'-1333311.0', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'},
        ])
        r2 = _travel_row(r, '2017-03-23')
        # The original assertTrue(int(...), -44623) treated -44623 as the failure
        # message and always passed; assertEqual actually checks the value.
        self.assertEqual(int(r2.get("return")), -44623)
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext(appName="Convert CSV's to Dataframe")
ss = SparkSession(sc)

# Convert street files
filesPath = "hdfs://namenode:9000/csvfiles/street/*.csv"
df = ss.read.csv(filesPath)
df.printSchema()
print("Number of rows: " + str(df.count()))

dfPath = "hdfs://namenode:9000/dataframes/street.csv"
df.write.csv(dfPath)

ss.stop()
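If the street CSVs carry a header row, the read can also pick up column names and infer types; a hedged variant, since whether the files actually have headers is an assumption about the data.

# Variant: use the header row for column names and infer column types.
df = ss.read.csv(filesPath, header=True, inferSchema=True)
df.printSchema()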
import subprocess
import threading

from pyspark import SparkConf, SparkContext
from pyspark.java_gateway import launch_gateway
from pyspark.sql import SparkSession


class SparkWithCustomGateway:

    def __init__(self):
        # spark_nlp_config, memory, gpu, cache_folder, log_folder,
        # cluster_tmp_dir and output_level are provided by the enclosing module.
        spark_conf = SparkConf()
        spark_conf.setAppName(spark_nlp_config.app_name)
        spark_conf.setMaster(spark_nlp_config.master)
        spark_conf.set("spark.driver.memory", memory)
        spark_conf.set("spark.serializer", spark_nlp_config.serializer)
        spark_conf.set("spark.kryoserializer.buffer.max",
                       spark_nlp_config.serializer_max_buffer)
        spark_conf.set("spark.driver.maxResultSize",
                       spark_nlp_config.driver_max_result_size)
        if gpu:
            spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark)
        else:
            spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark)
        # SparkConf has no .config() method; these settings use .set() as well.
        if cache_folder != '':
            spark_conf.set("spark.jsl.settings.pretrained.cache_folder", cache_folder)
        if log_folder != '':
            spark_conf.set("spark.jsl.settings.annotator.log_folder", log_folder)
        if cluster_tmp_dir != '':
            spark_conf.set("spark.jsl.settings.storage.cluster_tmp_dir", cluster_tmp_dir)

        # Make the py4j JVM stdout and stderr available without buffering
        popen_kwargs = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
            'bufsize': 0,
        }

        # Launch the gateway with our custom settings
        self.gateway = launch_gateway(conf=spark_conf, popen_kwargs=popen_kwargs)
        self.process = self.gateway.proc

        # Use the gateway we launched
        spark_context = SparkContext(gateway=self.gateway)
        self.spark_session = SparkSession(spark_context)

        self.out_thread = threading.Thread(target=self.output_reader)
        self.error_thread = threading.Thread(target=self.error_reader)
        self.std_background_listeners()

    def std_background_listeners(self):
        self.out_thread.start()
        self.error_thread.start()

    def output_reader(self):
        for line in iter(self.process.stdout.readline, b''):
            print('{0}'.format(line.decode('utf-8')), end='')

    def error_reader(self):
        RED = '\033[91m'
        RESET = '\033[0m'
        for line in iter(self.process.stderr.readline, b''):
            if output_level == 0:
                print(RED + '{0}'.format(line.decode('utf-8')) + RESET, end='')
            else:
                # output just info
                pass

    def shutdown(self):
        self.spark_session.stop()
        self.gateway.shutdown()
        self.process.communicate()
        self.out_thread.join()
        self.error_thread.join()
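A hedged usage sketch; it assumes the module-level configuration names referenced in __init__ are already defined, and the DataFrame contents are illustrative.

# Bring up the session, run a trivial job, then tear the gateway down.
custom = SparkWithCustomGateway()
try:
    df = custom.spark_session.createDataFrame([(1, 'a')], ['id', 'label'])
    df.show()
finally:
    custom.shutdown()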
import os
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F

# testBase, StockCustReturnByPrd and _travel_row are provided by the
# project's test utilities and the module under test.


class SCDHTest(testBase):

    def setUp(self):
        StockCustReturnByPrd.logLevel = 'debug'
        os.environ['SPARK_HOME'] = "/usr/local/Cellar/apache-spark/2.2.0/libexec"
        sys.path.append("/usr/local/Cellar/apache-spark/2.2.0/libexec/python")
        conf = SparkConf().setMaster("local").setAppName("hello")
        self.spark = SparkSession(SparkContext(conf=conf))
        self.scdh = StockCustReturnByPrd(self.spark)

    def tearDown(self):
        self.spark.stop()

    def test_mapPartition(self):
        df = self.spark.createDataFrame([{u"a": "1", u"b": "2"},
                                         {u"a": "3", u"b": "4"}])
        df.persist()

        def t(ite, s):
            for ss in ite:
                yield {"t": ss.a, "s1": s}

        d = self.spark.createDataFrame(df.rdd.mapPartitions(lambda x: t(x, 1), 2))
        d.withColumn("month", F.lit("hh")).show()

    def test_local_spark(self):
        doc = self.spark.createDataFrame([['a', 'b', 'c'], ['b', 'd', 'd']])
        doc.show()
        print("successful!")

    def test_get_base_data(self):
        self.scdh._get_base_data("2017-03-16", "2017-03-18", 1, 5)

    def test_init_data(self):
        self.scdh.init_data()

    def test_daily_compute(self):
        self.scdh.daily_compute("2017-03-16", "2017-03-16")

    def test_check_1(self):
        sql = """
            SELECT * from adatatest.stock_cust_daily_return
            where short_return_rate>1 or long_return_rate>1 or total_return_rate>1
        """
        self.spark.sql(sql)

    # Unit tests for return and return rate.
    def test_travel_row(self):
        row1 = Row(trade_id=u'10036', prd_no=u'2.300262',
                   prd_ind=u'\u73af\u5883\u4e0e\u8bbe\u65bd\u670d\u52a1',
                   detail_list=[
                       {u'return': u'5058', u'trd_type': u'long_related', u'pos_cash_flow': u'3297265', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'3302324.0', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'3297265'},
                   ])
        row2 = Row(trade_id=u'10178', prd_no=u'1.600729', prd_ind=u'unknown',
                   detail_list=[
                       {u'return': u'-18', u'trd_type': u'long_related', u'pos_cash_flow': u'13508', u'exception_label': u'0', u'pre_mkt_val': u'12000', u'now_mkt_val': u'13490.0', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'13508'},
                       {u'return': u'3816.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-255036.0', u'now_mkt_val': u'-251220.0', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
                       {u'return': u'-381', u'trd_type': u'short_related', u'pos_cash_flow': u'45000', u'exception_label': u'0', u'pre_mkt_val': u'3121', u'now_mkt_val': u'-255036.0', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'},
                       {u'return': u'-400.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'13490.0', u'now_mkt_val': u'13090.0', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
                   ])

        rowDict = _travel_row(row1, '2017-03-18')
        # Return for a single record.
        self.assertEqual(rowDict.get("return"), 5058, "return error!!!")
        # Return rate for a single record.
        return_rate = 5058 * 1.0 / (3297265 + 3297265 + 0.0)
        print(rowDict.get("return_rate"), return_rate)
        self.assertEqual(
            rowDict.get("return_rate"), return_rate,
            "first[{}],second[{}],msg[{}]".format(
                rowDict.get("return_rate"), return_rate,
                "return rate isn't expected"))

        # Cover every trade type and compute return.
        rowDict2 = _travel_row(row2, '2017-03-18')
        self.assertEqual(rowDict2.get("return"), -18 - 381 - 400.0 + 3816.0,
                         "return isn't expected")
        # Cover every trade type and compute return_rate.
        expected_rate = (-18 - 381 - 400.0 + 3816.0) / (
            12000 + 13508 + 13508 + 251220.0 + 45000)
        print(rowDict2.get("return_rate"), expected_rate)
        self.assertEqual(rowDict2.get("return_rate"), expected_rate,
                         "return rate isn't expected")

    def test_check_date_detail(self):
        # test trade_id=1987
        """
        stock_cust_return_by_prd.prd_ind 建筑机械与重型卡车
        stock_cust_return_by_prd.prd_no 2.000816
        stock_cust_return_by_prd.return -115226.41999999993
        stock_cust_return_by_prd.return_rate -0.009502134028067698
        stock_cust_return_by_prd.trade_id 1987
        stock_cust_return_by_prd.return_ratio 0.6923033340521149
        stock_cust_return_by_prd.return_rank 4
        stock_cust_return_by_prd.return_rate_rank 2
        stock_cust_return_by_prd.busi_date 2017-03-23
        stock_cust_return_by_prd.compute 7
        """
        r = Row(trade_id=u'1987', prd_no=u'2.000816',
                prd_ind=u'\u5efa\u7b51\u673a\u68b0\u4e0e\u91cd\u578b\u5361\u8f66',
                detail_list=[
                    {u'return': u'-115226.41999999993', u'trd_type': u'long_related', u'pos_cash_flow': u'6063186.42', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'5947960.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'6063186.42'},
                ])
        rowDict2 = _travel_row(r, '2017-03-18').get("return")
        self.assertTrue(int(rowDict2) == -115226)

        # Expected result for the second record:
        # stock_cust_return_by_prd.prd_ind 基础化工
        # stock_cust_return_by_prd.prd_no 1.600301
        # stock_cust_return_by_prd.return -5924.530000000028
        # stock_cust_return_by_prd.return_rate -0.009933835241121989
        # stock_cust_return_by_prd.trade_id 12466
        # stock_cust_return_by_prd.return_ratio 0.051785613453405335
        # stock_cust_return_by_prd.return_rank 4
        # stock_cust_return_by_prd.return_rate_rank 6
        # stock_cust_return_by_prd.busi_date 2017-03-23
        # stock_cust_return_by_prd.compute 7
        r2 = Row(trade_id=u'12466', prd_no=u'1.600301',
                 prd_ind=u'\u57fa\u7840\u5316\u5de5',
                 detail_list=[
                     {u'return': u'225.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'296100.0', u'now_mkt_val': u'296325.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
                     {u'return': u'-4050.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'296325.0', u'now_mkt_val': u'292275.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
                     {u'return': u'-2099.530000000028', u'trd_type': u'long_related', u'pos_cash_flow': u'298199.53', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'296100.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'298199.53'},
                 ])
        rd = _travel_row(r2, "2017-03-18").get("return")
        # The original assertTrue(int(rd), -5924) treated -5924 as the failure
        # message and always passed; assertEqual actually compares the value.
        self.assertEqual(int(rd), -5924)
        rd_rate = _travel_row(r2, "2017-03-18").get("return_rate")
        print(rd_rate)

    def test_checkdata_sql(self):
        df = self.spark.sql("""
            select df.trade_id, df.prd_no, df.return, df.return_rate,
                   df.return_ratio, df.return_rank, df.return_rate_rank, busi_date
            from adatatest.stock_cust_return_by_prd df
            where return > 0
        """)
        (df.where(df.trade_id == '17898')
           .where(df.busi_date == '2017-03-31')
           .where(df.prd_no == '2.002763')
           .select(df.trade_id, df.prd_no, "return", df.return_rate,
                   df.return_ratio, df.return_rank, df.return_rate_rank,
                   df.busi_date)
           .orderBy("return_rank", "busi_date")
           .show())
        (df.where(df.trade_id == '17898')
           .where(df.busi_date == '2017-03-31')
           .where(df.prd_no == '2.002763')
           .select("return_ratio")
           .orderBy("return_rank", "busi_date"))