def test_active_session_with_None_and_not_None_context(self):
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    sc = None
    session = None
    try:
        sc = SparkContext._active_spark_context
        self.assertEqual(sc, None)
        activeSession = SparkSession.getActiveSession()
        self.assertEqual(activeSession, None)
        sparkConf = SparkConf()
        sc = SparkContext.getOrCreate(sparkConf)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertFalse(activeSession.isDefined())
        session = SparkSession(sc)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertTrue(activeSession.isDefined())
        activeSession2 = SparkSession.getActiveSession()
        self.assertNotEqual(activeSession2, None)
    finally:
        if session is not None:
            session.stop()
        if sc is not None:
            sc.stop()
def test_get_active_session_when_no_active_session(self):
    active = SparkSession.getActiveSession()
    self.assertEqual(active, None)
    spark = SparkSession.builder.master("local").getOrCreate()
    active = SparkSession.getActiveSession()
    self.assertEqual(active, spark)
    spark.stop()
    active = SparkSession.getActiveSession()
    self.assertEqual(active, None)
def default_session() -> SparkSession:
    spark = SparkSession.getActiveSession()
    if spark is not None:
        return spark
    builder = SparkSession.builder.appName("pandas-on-Spark")
    return builder.getOrCreate()
def fill_missing_tbins_with_zero(pickup_bin_count_df: DataFrame, n_clusters):
    now = datetime.now()
    print("fill_missing_tbins_with_zero() - starting..")
    ss = SparkSession.getActiveSession()
    print("fill_missing_tbins_with_zero() - caching data...")
    pickup_bin_count_df_pd: pd.DataFrame = pickup_bin_count_df.toPandas()
    print("fill_missing_tbins_with_zero() - caching finished")
    for cluster_id in range(0, n_clusters):
        now_for_cluster = datetime.now()
        current_cluster_df = pickup_bin_count_df_pd.loc[pickup_bin_count_df_pd.pickup_cluster == cluster_id]
        time_bins = current_cluster_df["time_bin"].unique()
        for time_bin in range(4464):
            # todo: compute the number of bins per month dynamically
            # todo: check str type compatibility
            if time_bin not in time_bins:
                pickup_bin_count_df_pd = pickup_bin_count_df_pd.append({
                    "pickup_cluster": cluster_id,
                    "time_bin": time_bin,
                    "count": 0
                }, ignore_index=True)
        print("fill_missing_tbins_with_zero() - cluster {0} processing finished. time taken {1}".format(
            cluster_id, datetime.now() - now_for_cluster))
    pickup_bin_count_df_pd = pickup_bin_count_df_pd.loc[pickup_bin_count_df_pd.time_bin >= 0]
    print("fill_missing_tbins_with_zero() - time taken {}".format(datetime.now() - now))
    assert len(pickup_bin_count_df_pd.index) == 4464 * 30
    return ss.createDataFrame(pickup_bin_count_df_pd)
def import_trait_mappings() -> DataFrame:
    """Load the remote trait mappings file to a Spark dataframe."""
    remote_trait_mappings_url = (
        'https://raw.githubusercontent.com/opentargets/curation/master/mappings/disease/manual_string.tsv'
    )
    SparkSession.getActiveSession().sparkContext.addFile(remote_trait_mappings_url)
    return (
        SparkSession.getActiveSession()
        .read.csv(SparkFiles.get('manual_string.tsv'), header=True, sep='\t')
        .select(
            col('PROPERTY_VALUE').alias('diseaseFromSource'),
            col('SEMANTIC_TAG').alias('diseaseFromSourceMappedId')
        )
    )
def metric(data, metric):
    # If Spark has gone down for some reason (e.g. someone accidentally broke the
    # session while running another task), try to finish the remaining
    # computation locally on a single node.
    try:
        spark_context = SparkSession.getActiveSession().sparkContext
        SQLContext(spark_context).clearCache()
    except AttributeError:
        spark_context = SparkContext.getOrCreate(
            SparkConf().setMaster("local[*]"))
        spark = SparkSession \
            .builder \
            .getOrCreate()
    data = data.drop('probability')
    try:
        if metric == 'sil':
            res = -ClusteringEvaluator(
                predictionCol='labels',
                distanceMeasure='squaredEuclidean').evaluate(data)
        elif metric == 'ch':
            res = ChIndex().find(data, spark_context)
        elif metric == 'db':
            res = DaviesIndex().find(data, spark_context)
        return res
    except TypeError:
        print("\n\nTYPE ERROR OCCURRED IN Metric.py:\n\nDATA: {}\n\n".format(
            data))
        return 0
    except Py4JJavaError:
        print("\n\nPy4JJavaError ERROR OCCURRED IN Metric.py:\n\nDATA: {}\n\n".
              format(data.printSchema()))
        return sys.float_info.max
def createOrReplace(
    cls, sparkSession: Optional[SparkSession] = None
) -> "DeltaTableBuilder":
    """
    Return :class:`DeltaTableBuilder` object that can be used to specify
    the table name, location, columns, partitioning columns, table comment,
    and table properties to create or replace a Delta table
    (the same as SQL `CREATE OR REPLACE TABLE`).

    See :class:`DeltaTableBuilder` for a full description and examples of
    this operation.

    :param sparkSession: SparkSession to use for creating the table
    :return: an instance of DeltaTableBuilder
    :rtype: :py:class:`~delta.tables.DeltaTableBuilder`

    .. note:: Evolving
    """
    if sparkSession is None:
        sparkSession = SparkSession.getActiveSession()
    assert sparkSession is not None
    jvm: "JVMView" = sparkSession._sc._jvm  # type: ignore[attr-defined]
    jsparkSession: "JavaObject" = sparkSession._jsparkSession  # type: ignore[attr-defined]
    jdt = jvm.io.delta.tables.DeltaTable.createOrReplace(jsparkSession)
    return DeltaTableBuilder(sparkSession, jdt)
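# A minimal usage sketch (an assumption, not from the source): chaining the
# DeltaTableBuilder returned above. Requires delta-spark to be configured on the
# session; the table name and columns are illustrative.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
(DeltaTable.createOrReplace(spark)
    .tableName("events")                 # hypothetical table name
    .addColumn("id", "LONG")
    .addColumn("payload", "STRING")
    .execute())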
def test_active_session(self):
    spark = SparkSession.builder.master("local").getOrCreate()
    try:
        activeSession = SparkSession.getActiveSession()
        df = activeSession.createDataFrame([(1, "Alice")], ["age", "name"])
        self.assertEqual(df.collect(), [Row(age=1, name="Alice")])
    finally:
        spark.stop()
def _get_empty_result_df(reference_df, primary_column_list):
    """
    Function to create an empty dataframe containing Primary Key columns
    and an additional EFFULGE_VARIANCE_PROVOKER column

    Parameters:
        Name : reference_df
        Type : pyspark.sql.dataframe.DataFrame object

        Name : primary_column_list
        Type : list of String

    Return Type: pyspark.sql.dataframe.DataFrame object
    """
    _schema = reference_df.select(*primary_column_list).schema
    _schema.add("EFFULGE_VARIANCE_PROVOKER", ArrayType(StringType()))
    #
    empty_df = SparkSession.getActiveSession().createDataFrame(
        SparkSession.getActiveSession().sparkContext.emptyRDD(), _schema)
    #
    return empty_df
def __init__(self, use_pretrained):
    self.train_data = None
    self.hdfs_uri = HDFS_HOST + "/models/trained/gbt-regressor/{}".format(datetime.now().date())
    self.sc = SparkSession.getActiveSession()
    self.use_pretrained = use_pretrained
    if use_pretrained:
        self.model: GBTRegressor = self.__load_from_hdfs()
    else:
        self.model: GBTRegressor = GBTRegressor(featuresCol="features", maxIter=20, labelCol="target")
def __init__(self, use_pretrained):
    self.data = None
    self.hdfs_uri = HDFS_HOST + "/models/trained/kmeans/{}".format(
        datetime.now().date())
    self.sc = SparkSession.getActiveSession()
    if use_pretrained:
        self.model: KMeansModel = self.__load_from_hdfs()
    else:
        self.model: KMeansModel = None
def test_get_active_session_after_create_dataframe(self):
    session2 = None
    try:
        activeSession1 = SparkSession.getActiveSession()
        session1 = self.spark
        self.assertEqual(session1, activeSession1)
        session2 = self.spark.newSession()
        activeSession2 = SparkSession.getActiveSession()
        self.assertEqual(session1, activeSession2)
        self.assertNotEqual(session2, activeSession2)
        session2.createDataFrame([(1, 'Alice')], ['age', 'name'])
        activeSession3 = SparkSession.getActiveSession()
        self.assertEqual(session2, activeSession3)
        session1.createDataFrame([(1, 'Alice')], ['age', 'name'])
        activeSession4 = SparkSession.getActiveSession()
        self.assertEqual(session1, activeSession4)
    finally:
        if session2 is not None:
            session2.stop()
def _clean_cache_and_view(cached_dataframe_list, temporary_view_list):
    """
    Function to explicitly free dataframe cache and to remove temporary views

    Parameters:
        Name : cached_dataframe_list
        Type : list of pyspark.sql.dataframe.DataFrame objects

        Name : temporary_view_list
        Type : list of String

    Return Type: None
    """
    # clear previously cached list
    for d_f in cached_dataframe_list:
        d_f.unpersist(blocking=True)
    # clear temporary view
    for view in temporary_view_list:
        SparkSession.getActiveSession().catalog.dropTempView(view)
def _get_active_spark_session():
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        # Return None if user doesn't have PySpark installed
        return None
    try:
        # getActiveSession() only exists in Spark 3.0 and above
        return SparkSession.getActiveSession()
    except Exception:
        # Fall back to this internal field for Spark 2.x and below.
        return SparkSession._instantiatedSession
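# A minimal usage sketch (an assumption, not from the source): callers can treat
# the helper above as "give me a session if one exists, otherwise do nothing".
# The function name below is hypothetical.
def _log_spark_version_if_available():
    spark = _get_active_spark_session()
    if spark is None:
        return  # PySpark missing or no session has been created yet
    print("Active Spark version:", spark.version)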
def get_spark_session_or_start_new_with_repoconfig(
    store_config: SparkOfflineStoreConfig,
) -> SparkSession:
    spark_session = SparkSession.getActiveSession()
    if not spark_session:
        spark_builder = SparkSession.builder
        spark_conf = store_config.spark_conf
        if spark_conf:
            spark_builder = spark_builder.config(
                conf=SparkConf().setAll([(k, v) for k, v in spark_conf.items()]))
        spark_session = spark_builder.getOrCreate()
    spark_session.conf.set("spark.sql.parser.quotedRegexColumnNames", "true")
    return spark_session
def wrapper(self, *args, **kwargs):  # type: ignore
    session = SparkSession.getActiveSession()
    if not session:
        return f(self, *args, **kwargs)
    session.sparkContext.setJobGroup(name, name)  # type: ignore
    start_time = time.time()
    ret = f(self, *args, **kwargs)
    _logger.info(
        f"Elapsed time (name: {name}) is {time.time() - start_time}(s)"
    )
    _clear_job_group(session)
    return ret
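# A hedged sketch (an assumption, not from the source) of the enclosing decorator
# factory implied by `wrapper` above: `name` and `f` are closed over, and
# `_clear_job_group` resets the job group once the wrapped call finishes.
# The factory name `with_job_group` and the property-reset logic are illustrative.
import functools
import logging
import time

from pyspark.sql import SparkSession

_logger = logging.getLogger(__name__)


def _clear_job_group(session: SparkSession) -> None:
    # Clearing the local properties removes the job group set by setJobGroup().
    session.sparkContext.setLocalProperty("spark.jobGroup.id", None)
    session.sparkContext.setLocalProperty("spark.job.description", None)


def with_job_group(name):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(self, *args, **kwargs):
            session = SparkSession.getActiveSession()
            if not session:
                return f(self, *args, **kwargs)
            session.sparkContext.setJobGroup(name, name)
            start_time = time.time()
            ret = f(self, *args, **kwargs)
            _logger.info(
                f"Elapsed time (name: {name}) is {time.time() - start_time}(s)"
            )
            _clear_job_group(session)
            return ret
        return wrapper
    return decorator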
def default_session() -> SparkSession:
    spark = SparkSession.getActiveSession()
    if spark is None:
        spark = SparkSession.builder.appName("pandas-on-Spark").getOrCreate()

    # Turn ANSI off when testing the pandas API on Spark since
    # the behavior of pandas API on Spark follows pandas, not SQL.
    if is_testing():
        spark.conf.set("spark.sql.ansi.enabled", False)  # type: ignore[arg-type]
    if spark.conf.get("spark.sql.ansi.enabled"):
        log_advice(
            "The config 'spark.sql.ansi.enabled' is set to True. "
            "This can cause unexpected behavior "
            "from pandas API on Spark since pandas API on Spark follows "
            "the behavior of pandas, not SQL.")
    return spark
def load_as_spark(url: str) -> "PySparkDataFrame":  # noqa: F821
    """
    Load the shared table using the given url as a Spark DataFrame. `PySpark` must
    be installed, and the application must be a PySpark application with the
    Apache Spark Connector for Delta Sharing installed.

    :param url: a url under the format "<profile>#<share>.<schema>.<table>"
    :return: A Spark DataFrame representing the shared table.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        raise ImportError("Unable to import pyspark. `load_as_spark` requires PySpark.")

    spark = SparkSession.getActiveSession()
    assert spark is not None, (
        "No active SparkSession was found. "
        "`load_as_spark` requires running in a PySpark application."
    )
    return spark.read.format("deltaSharing").load(url)
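# A minimal usage sketch (not from the source); the profile path, share, schema,
# and table names below are hypothetical placeholders for the
# "<profile>#<share>.<schema>.<table>" format described above, and the Delta
# Sharing Spark connector is assumed to be installed.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("delta-sharing-example").getOrCreate()
df = load_as_spark("/path/to/profile.share#my_share.my_schema.my_table")
df.show(5)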
def get_table_query_string(self) -> str:
    """Returns a string that can directly be used to reference this table in SQL"""
    if self.table:
        # Backticks make sure that spark sql knows this is a table reference.
        return f"`{self.table}`"
    if self.query:
        return f"({self.query})"

    # If both the table query string and the actual query are null, we can load from file.
    spark_session = SparkSession.getActiveSession()
    if spark_session is None:
        raise AssertionError("Could not find an active spark session.")
    try:
        df = spark_session.read.format(self.file_format).load(self.path)
    except Exception:
        logger.exception("Spark read of file source failed.\n" + traceback.format_exc())
    tmp_table_name = get_temp_entity_table_name()
    df.createOrReplaceTempView(tmp_table_name)
    return f"`{tmp_table_name}`"
def createIfNotExists(cls, sparkSession=None):
    """
    Return :class:`DeltaTableBuilder` object that can be used to specify
    the table name, location, columns, partitioning columns, table comment,
    and table properties to create a Delta table, if it does not exist
    (the same as SQL `CREATE TABLE IF NOT EXISTS`).

    See :class:`DeltaTableBuilder` for a full description and examples of
    this operation.

    :param sparkSession: SparkSession to use for creating the table
    :return: an instance of DeltaTableBuilder
    :rtype: :py:class:`~delta.tables.DeltaTableBuilder`

    .. note:: Evolving
    """
    if sparkSession is None:
        sparkSession = SparkSession.getActiveSession()
    assert sparkSession is not None
    jdt = sparkSession._sc._jvm.io.delta.tables.DeltaTable.createIfNotExists(
        sparkSession._jsparkSession)
    return DeltaTableBuilder(sparkSession, jdt)
def fill_missing_tbins_with_zero_withoud_collecting(pickup_bin_count_df: DataFrame, n_clusters):
    now = datetime.now()
    print("fill_missing_tbins_with_zero() - starting..")
    ss = SparkSession.getActiveSession()
    print("fill_missing_tbins_with_zero() - caching data...")
    pickup_bin_count_df = pickup_bin_count_df.cache()
    print("fill_missing_tbins_with_zero() - caching finished")
    for cluster_id in range(0, n_clusters):
        print("fill_missing_tbins_with_zero() - processing cluster {0}. {1} - left".format(
            cluster_id, n_clusters - cluster_id))
        for time_bin in range(4464):
            # todo: compute the number of bins per month dynamically
            row = ss.createDataFrame([(cluster_id, time_bin, 0)],
                                     "pickup_cluster int, time_bin int, count int")
            pickup_bin_count_df = pickup_bin_count_df.union(row)

    from pyspark.sql.window import Window
    import pyspark.sql.functions as F
    from pyspark.sql.functions import col

    pickup_bin_count_df = pickup_bin_count_df.select(
        "pickup_cluster", "time_bin", "count",
        F.row_number().over(
            Window.partitionBy("count").orderBy(pickup_bin_count_df['count'])).alias("row_num")
    ).sort(col("count"))
    pickup_bin_count_df = pickup_bin_count_df.filter(pickup_bin_count_df.row_num == 1)
    pickup_bin_count_df.show()
    print("fill_missing_tbins_with_zero() - time taken {}".format(datetime.now() - now))
    return pickup_bin_count_df
def _spot_mismatch_variance(reference_view_name, received_view_name, prime_columns, non_prime_columns):
    """
    Function to identify the mismatching records and also to identify
    the columns responsible for mismatch

    Parameters:
        Name: reference_view_name
        Type: String

        Name: received_view_name
        Type: String

        Name: prime_columns
        Type: list/tuple of Strings

        Name: non_prime_columns
        Type: list/tuple of Strings

    Return Type: pyspark.sql.dataframe.DataFrame
    """
    df_mismatch = SparkSession.getActiveSession().sql("""
        select {0} from {1}
        MINUS
        select {0} from {2}
    """.format(", ".join((*prime_columns, *non_prime_columns)),
               reference_view_name, received_view_name)).select(prime_columns)

    # retains same column names for primary attributes
    # but renames non primary attributes to have "e_" prefix
    df_expected_with_renamed_columns = SparkSession.getActiveSession().sql("""
        select
            -- primary columns
            {},
            -- non primary columns with "e_" prefix
            {}
        from {}
    """.format(", ".join(prime_columns),
               ", ".join(["{0} as e_{0}".format(c) for c in non_prime_columns]),
               reference_view_name))

    # retains same column names for primary attributes
    # but renames non primary attributes to have "a_" prefix
    df_available_with_renamed_columns = SparkSession.getActiveSession().sql("""
        select
            -- primary columns
            {},
            -- non primary columns with "a_" prefix
            {}
        from {}
    """.format(", ".join(prime_columns),
               ", ".join(["{0} as a_{0}".format(c) for c in non_prime_columns]),
               received_view_name))

    df_mismatch_join = df_mismatch.join(df_expected_with_renamed_columns,
                                        prime_columns, "inner").join(
                                            df_available_with_renamed_columns,
                                            prime_columns, "inner")

    # for each mismatch record, compare and identify variance columns
    try:
        df_variance = df_mismatch_join\
            .rdd\
            .map(
                lambda r: _spot_corrupted_attributes(r, prime_columns, non_prime_columns, "e_", "a_")
            ).toDF()
    except ValueError as exp:
        if str(exp) == "RDD is empty":
            # create empty result set
            df_variance = _get_empty_result_df(
                df_expected_with_renamed_columns, prime_columns)
        else:
            # raise the same exception, when ValueError message is different
            raise exp

    return df_variance
from models.Kmeans import KMeansModelCustom
from models.GbtModel import GBTModelCustom
from pyspark.sql import SparkSession

k_means = KMeansModelCustom(True)
gbt = GBTModelCustom(True)
ss = SparkSession.getActiveSession()
def generate_data2(table_name="my_data"):
    df = SparkSession.getActiveSession().range(0, 10)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)
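# Hedged companion sketch (not from the source): reading back the Delta table
# written by generate_data2(). Assumes a session with Delta Lake support is
# already active; the table name "my_data" matches the default above.
from pyspark.sql import SparkSession

generate_data2("my_data")
spark = SparkSession.getActiveSession()
spark.read.table("my_data").show()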