def getEvaluationResults(self):
    """
    Returns the ROC curve with TPR, FPR.
    """
    ctx = SparkContext._active_spark_context
    sql_ctx = SQLContext.getOrCreate(ctx)
    return DataFrame(self._java_obj.getEvaluationResults(), sql_ctx)

def getScoredDataset(self):
    """
    Returns the scored dataset for the best model.
    """
    ctx = SparkContext._active_spark_context
    sql_ctx = SQLContext.getOrCreate(ctx)
    return DataFrame(self._java_obj.getScoredDataset(), sql_ctx)

def getBestModelMetrics(self):
    """
    Returns all of the best model metrics results from the evaluator.
    """
    ctx = SparkContext._active_spark_context
    sql_ctx = SQLContext.getOrCreate(ctx)
    return DataFrame(self._java_obj.getBestModelMetrics(), sql_ctx)

def getAllModelMetrics(self):
    """
    Returns a table of metrics from all models compared in the evaluation.
    """
    ctx = SparkContext._active_spark_context
    sql_ctx = SQLContext.getOrCreate(ctx)
    return DataFrame(self._java_obj.getAllModelMetrics(), sql_ctx)

def saveNativeModel(self, sparkSession, filename):
    """
    Save the booster in string format to a local or WASB remote location.
    """
    ctx = SparkContext.getOrCreate()
    sql_ctx = SQLContext.getOrCreate(ctx)
    jsession = sql_ctx.sparkSession._jsparkSession
    self._java_obj.saveNativeModel(jsession, filename)

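# A minimal usage sketch for saveNativeModel above (hedged: `model` and the
# output paths are hypothetical, not from the source). Note the implementation
# shown ignores the passed session and resolves one via SparkContext.getOrCreate().
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
model.saveNativeModel(spark, "/tmp/native_booster.txt")            # local path
model.saveNativeModel(spark, "wasb:///models/native_booster.txt")  # WASB remote location
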
def test_udf(self):
    self.spark.catalog.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType())
    [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
    self.assertEqual(row[0], 5)

    # This is to check if a deprecated 'SQLContext.registerFunction' can call its alias.
    sqlContext = SQLContext.getOrCreate(self.spark.sparkContext)
    sqlContext.registerFunction("oneArg", lambda x: len(x), IntegerType())
    [row] = sqlContext.sql("SELECT oneArg('test')").collect()
    self.assertEqual(row[0], 4)

def read_data(path):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext.getOrCreate(sc)
    fc = ts.flint.FlintContext(sqlContext)
    data = (sqlContext.read.csv(path, header=True, inferSchema=True)
            .withColumn('time', date_parser('%Y-%m-%d %H:%M:%S')(col('date'))))
    # is_sorted expects a boolean; the string 'False' would be treated as truthy
    df = fc.read.dataframe(data, is_sorted=False)
    return df

def test_get_or_create(self):
    sc = None
    sql_context = None
    try:
        sc = SparkContext('local[4]', "SQLContextTests")
        sql_context = SQLContext.getOrCreate(sc)
        assert isinstance(sql_context, SQLContext)
    finally:
        if sql_context is not None:
            sql_context.sparkSession.stop()
        if sc is not None:
            sc.stop()

def __init__(self, app_name="Spark App", spark_conf=None):
    self._app_name = app_name
    if isinstance(spark_conf, SparkConf):
        self._conf = spark_conf
    else:
        self._conf = SparkConf().setAppName(self._app_name)
    self._sc = SparkContext.getOrCreate(conf=self._conf)
    # SparkContext.version is a string (e.g. "2.4.0"), so parse it before
    # comparing; SQLContext.getOrCreate was added in Spark 1.6.
    major, minor = (int(part) for part in self._sc.version.split('.')[:2])
    self._sqlContext = SQLContext(self._sc) \
        if (major, minor) < (1, 6) \
        else SQLContext.getOrCreate(self._sc)

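# Hypothetical usage of the wrapper whose __init__ appears above; the class
# name SparkApp is assumed (it is not given in the source), and only the
# private attributes shown there are available.
from pyspark import SparkConf

app = SparkApp(app_name="example-job",
               spark_conf=SparkConf().setAppName("example-job").setMaster("local[2]"))
df = app._sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.show()
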
def test_non_existed_udf(self):
    spark = self.spark
    self.assertRaisesRegex(
        AnalysisException,
        "Can not load class non_existed_udf",
        lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf"),
    )

    # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias.
    sqlContext = SQLContext.getOrCreate(self.spark.sparkContext)
    self.assertRaisesRegex(
        AnalysisException,
        "Can not load class non_existed_udf",
        lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf"),
    )

def read_data(path):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    sql_context = SQLContext.getOrCreate(sc=sc)
    created_at = pd.read_csv(path)
    created_at = created_at.set_index('date')
    created_at.index = pd.to_datetime(created_at.index)
    ts = created_at['item_cnt_day'].resample('M').sum()
    ts_df = pd.Series.to_frame(ts)
    ts_df.reset_index(level=0, inplace=True)
    ts_df = ts_df.rename(columns={"date": "date", "item_cnt_day": "sales"})
    df = sql_context.createDataFrame(ts_df)
    return df

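# Hedged usage sketch for read_data above: the path is illustrative, and the CSV
# is assumed to contain 'date' and 'item_cnt_day' columns, as the function expects.
monthly_sales = read_data("data/sales_train.csv")
monthly_sales.show(5)  # Spark DataFrame with 'date' and 'sales' columns
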
def test_udf_registration_returns_udf(self):
    df = self.spark.range(10)
    add_three = self.spark.udf.register("add_three", lambda x: x + 3, IntegerType())
    self.assertListEqual(
        df.selectExpr("add_three(id) AS plus_three").collect(),
        df.select(add_three("id").alias("plus_three")).collect(),
    )

    # This is to check if a 'SQLContext.udf' can call its alias.
    sqlContext = SQLContext.getOrCreate(self.spark.sparkContext)
    add_four = sqlContext.udf.register("add_four", lambda x: x + 4, IntegerType())
    self.assertListEqual(
        df.selectExpr("add_four(id) AS plus_four").collect(),
        df.select(add_four("id").alias("plus_four")).collect(),
    )

def apply_model(self, sc, model, data_frame):
    """
    Runs the model on a data frame

    :param model: PipelineModel from pyspark
    :param data_frame: Pyspark data frame
    :return: transformed pyspark data frame
    """
    from pyspark.ml.linalg import Vectors, VectorUDT

    sql_ctx = SQLContext.getOrCreate(sc)
    # adds DenseVector columns for features and scaled_features
    vector_scaled_df = self._vector_scale(data_frame)
    transformed_data = model.transform(vector_scaled_df)  # adds prediction

    # udf's
    udf_cast_vector = F.udf(lambda x: Vectors.dense(x), VectorUDT())

    # Depending on the algorithm, different methods will extract the cluster centers
    if self._algorithm == 'GaussianMixture':
        # convert gaussian mean/covariance dataframe to pandas dataframe
        pandas_cluster_centers = model.stages[-1].gaussiansDF.toPandas()
        centers = sql_ctx.createDataFrame(
            self.gen_gaussians_center(self._dict_parameters['k'],
                                      pandas_cluster_centers))
        merged_df = transformed_data.join(
            centers, self._dict_parameters['predictionCol'], 'inner')
        # cast the 'mean' column back to a DenseVector
        merged_df = merged_df.withColumn('centers', udf_cast_vector('mean'))
    else:
        np_centers = model.stages[-1].clusterCenters()
        centers = self.gen_cluster_center(k=self._dict_parameters['k'],
                                          centers=np_centers)
        broadcast_center = sc.broadcast(centers)
        # Create a user defined function that adds the cluster centers to the data frame
        udf_assign_cluster = F.udf(
            f=lambda x: Vectors.dense(broadcast_center.value[x]),
            returnType=VectorUDT())
        merged_df = transformed_data.withColumn(  # adds DenseVector of centers
            "centers", udf_assign_cluster(self._dict_parameters['predictionCol']))

    # return the result
    return merged_df

)
logger_data_import.addHandler(logger_file_handler_parameter)
logger_file_handler_parameter.setFormatter(logger_formatter_parameter)

from pyspark import SparkContext, SQLContext
from pyspark.sql import types as T
from IPython import display
from ipywidgets import widgets

sc = SparkContext.getOrCreate()
sql_context = SQLContext.getOrCreate(sc)


class GeneralDataImport(object):
    """
    Data object to handle importation of various types of data
    """
    counter = 0
    file_ending = {'txt': 'text',
                   'csv': 'csv',
                   'parquet': 'parquet',
                   'jbdc': 'jbdc',
                   'json': 'json'
                   }

    def __init__(self, path=None, **kwargs):

def getPerformanceStatistics(self):
    """
    Returns the performance statistics as a DataFrame.
    """
    ctx = SparkContext._active_spark_context
    sql_ctx = SQLContext.getOrCreate(ctx)
    return DataFrame(self._java_obj.getPerformanceStatistics(), sql_ctx)

def run_spark_application():
    # Create the Spark context and the SQL context
    sc = SparkContext(appName="Stocks")
    spark = SQLContext.getOrCreate(sc)

    amazonDataFrame = createDataFrame(spark, "amazon.csv")
    amazonInfo = selectInfoFromDataFrame(amazonDataFrame, "amazon")
    googDataFrame = createDataFrame(spark, "google.csv")
    googInfo = selectInfoFromDataFrame(googDataFrame, "google")
    facebookDataFrame = createDataFrame(spark, "facebook.csv")
    facebookInfo = selectInfoFromDataFrame(facebookDataFrame, "facebook")

    # Collect all dates and closing prices into one DataFrame
    dataTable = amazonInfo.join(
        googInfo, amazonInfo.amazonDate == googInfo.googleDate).select(
            "amazonDate", "closeAmazon", "closeGoogle")
    dataTable = dataTable.join(
        facebookInfo, dataTable.amazonDate == facebookInfo.facebookDate).select(
            dataTable["amazonDate"].alias("date"), "closeAmazon", "closeGoogle",
            "closeFacebook")

    # Reshape the data so that the first column holds the dates, the second the
    # symbols, and the last the closing price for that day
    amazFormatted = selectInfoAsNewNames(dataTable, "amazon")
    faceBookFormatted = selectInfoAsNewNames(dataTable, "facebook")
    googFormatted = selectInfoAsNewNames(dataTable, "google")

    # Union the columns together, then reorder them by date
    formattedDataTable = amazFormatted.union(faceBookFormatted).union(googFormatted)
    formattedDataTable = formattedDataTable.orderBy(formattedDataTable.date.asc())

    # Construct the final DataFrame
    # 1: add timestamp and price as two new columns based on date and closing price
    finalDf = formattedDataTable.withColumn(
        "timestamp", to_timestamp(formattedDataTable.date)).withColumn(
            "price", formattedDataTable["closingPrice"].cast("double"))
    # 2: drop the original date and closingPrice columns
    finalDf = finalDf.drop("date", "closingPrice").sort("timestamp")
    finalDf.registerTempTable("preData")
    finalDf.show()

    # Gather the data needed to create a time series RDD
    minDate = finalDf.selectExpr(
        "min(timestamp)").collect()[0]["min(timestamp)"]
    maxDate = finalDf.selectExpr("max(timestamp)").alias(
        "timestamp").collect()[0]["max(timestamp)"]
    frequency = DayFrequency(1, sc)
    dtIndex = datetimeindex.DateTimeIndex.uniform(start=minDate, end=maxDate,
                                                  freq=frequency, sc=sc)
    tsRdd = timeseriesrdd.time_series_rdd_from_observations(
        dtIndex, finalDf, "timestamp", "symbol", "price")

    # Finally, run the prediction on each series (time-zone checks are skipped here)
    df = tsRdd.map_series(train_transform_func)
    finalDf.show()
    # SQLContext has no stop(); stop the underlying SparkContext instead
    sc.stop()

from pyspark import SparkContext, SparkConf, SQLContext

appName = "my-app01"
conf = SparkConf().setAppName(appName)
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sqlCtx = SQLContext.getOrCreate(sc)

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
print("Number of elements: " + str(distData.count()))
print("Maximum: " + str(distData.max()))

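# Follow-on sketch: the SQLContext created above (sqlCtx) is not used in the
# snippet; one way to exercise it is to put the same data into a DataFrame and
# compute the maximum with Spark SQL instead of the RDD API.
df = sqlCtx.createDataFrame([(x,) for x in data], ["value"])
df.createOrReplaceTempView("numbers")
sqlCtx.sql("SELECT MAX(value) AS max_value FROM numbers").show()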