Example #1
 def getEvaluationResults(self):
     """
     Returns the ROC curve with TPR, FPR.
     """
     ctx = SparkContext._active_spark_context
     sql_ctx = SQLContext.getOrCreate(ctx)
     return DataFrame(self._java_obj.getEvaluationResults(), sql_ctx)
Example #2
 def getScoredDataset(self):
     """
     Returns scored dataset for the best model.
     """
     ctx = SparkContext._active_spark_context
     sql_ctx = SQLContext.getOrCreate(ctx)
     return DataFrame(self._java_obj.getScoredDataset(), sql_ctx)
Example #3
 def getBestModelMetrics(self):
     """
     Returns all of the best model metrics results from the evaluator.
     """
     ctx = SparkContext._active_spark_context
     sql_ctx = SQLContext.getOrCreate(ctx)
     return DataFrame(self._java_obj.getBestModelMetrics(), sql_ctx)
Example #4
 def getAllModelMetrics(self):
     """
     Returns a table of metrics from all models compared from the evaluation comparison.
     """
     ctx = SparkContext._active_spark_context
     sql_ctx = SQLContext.getOrCreate(ctx)
     return DataFrame(self._java_obj.getAllModelMetrics(), sql_ctx)
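
Examples #1 through #4 all use the same idiom: a JVM-side call returns a Java DataFrame handle, which is re-wrapped into a Python DataFrame with the active SQLContext. Below is a minimal, self-contained sketch of that wrapping; a locally created frame's _jdf handle stands in for the JVM object, and the master setting and column names are illustrative only.

from pyspark import SparkContext, SQLContext
from pyspark.sql import DataFrame

sc = SparkContext("local[2]", "WrapJavaDataFrameSketch")
sql_ctx = SQLContext.getOrCreate(sc)

# Build a small frame, grab its underlying Java object, and re-wrap it,
# mirroring DataFrame(self._java_obj.get...(), sql_ctx) in the getters above.
df = sql_ctx.createDataFrame([(0.1, 0.9)], ["FPR", "TPR"])
jdf = df._jdf                       # py4j handle to the JVM-side DataFrame
wrapped = DataFrame(jdf, sql_ctx)   # same wrapping used in Examples #1-#4
wrapped.show()

sc.stop()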
Example #5
 def saveNativeModel(self, sparkSession, filename):
     """
     Save the booster in string format to a local or WASB remote location.
     """
     ctx = SparkContext.getOrCreate()
     sql_ctx = SQLContext.getOrCreate(ctx)
     jsession = sql_ctx.sparkSession._jsparkSession
     self._java_obj.saveNativeModel(jsession, filename)
Example #6
    def test_udf(self):
        self.spark.catalog.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType())
        [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
        self.assertEqual(row[0], 5)

        # This is to check if a deprecated 'SQLContext.registerFunction' can call its alias.
        sqlContext = SQLContext.getOrCreate(self.spark.sparkContext)
        sqlContext.registerFunction("oneArg", lambda x: len(x), IntegerType())
        [row] = sqlContext.sql("SELECT oneArg('test')").collect()
        self.assertEqual(row[0], 4)
Example #7
def read_data(path):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext.getOrCreate(sc)
    fc = ts.flint.FlintContext(sqlContext)

    data = (sqlContext.read.csv(path, header=True, inferSchema=True)
          .withColumn('time', date_parser('%Y-%m-%d %H:%M:%S')(col('date'))))
    df = fc.read.dataframe(data, is_sorted=False)  # pass a boolean, not the string 'False'

    return df
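
For comparison, the timestamp-parsing step can also be done with pyspark's built-in functions alone. The sketch below uses a toy in-memory row standing in for the CSV file; the column names mirror the example and the values are made up.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[2]").appName("DateParseSketch").getOrCreate()

# Toy row replacing sqlContext.read.csv(path, header=True, inferSchema=True)
raw = spark.createDataFrame([("2020-01-02 09:30:00", 100.0)], ["date", "value"])

# Parse the string column into a proper timestamp, as the custom date_parser does above
parsed = raw.withColumn("time", F.to_timestamp(F.col("date"), "yyyy-MM-dd HH:mm:ss"))
parsed.printSchema()

spark.stop()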
Example #8
 def test_get_or_create(self):
     sc = None
     sql_context = None
     try:
         sc = SparkContext('local[4]', "SQLContextTests")
         sql_context = SQLContext.getOrCreate(sc)
         assert (isinstance(sql_context, SQLContext))
     finally:
         if sql_context is not None:
             sql_context.sparkSession.stop()
         if sc is not None:
             sc.stop()
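
A stripped-down, runnable version of the same check (the master and app name are arbitrary choices):

from pyspark import SparkContext, SQLContext

sc = SparkContext("local[2]", "GetOrCreateSketch")
try:
    ctx1 = SQLContext.getOrCreate(sc)
    ctx2 = SQLContext.getOrCreate(sc)
    assert isinstance(ctx1, SQLContext)
    # getOrCreate is expected to hand back the existing context
    # rather than build a new one on every call
    assert ctx1 is ctx2
finally:
    sc.stop()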
Example #9
    def __init__(self, app_name="Spark App", spark_conf=None):
        self._app_name = app_name
        if isinstance(spark_conf, SparkConf):
            self._conf = spark_conf
        else:
            self._conf = SparkConf().setAppName(self._app_name)

        self._sc = SparkContext.getOrCreate(conf=self._conf)

        # sc.version is a string (e.g. "2.4.5"), so compare parsed
        # (major, minor) numbers instead of comparing the string to a float
        major, minor = (int(v) for v in self._sc.version.split(".")[:2])
        self._sqlContext = SQLContext(self._sc) \
            if (major, minor) < (1, 6) \
            else SQLContext.getOrCreate(self._sc)
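
On Spark 2.0 and later the same handles are usually obtained through the SparkSession builder instead of a manually managed SQLContext. A short sketch under that assumption (the app name and master are illustrative):

from pyspark import SQLContext
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local[2]")
         .appName("Spark App")
         .getOrCreate())

# Legacy handle for code that still expects an SQLContext
sql_context = SQLContext.getOrCreate(spark.sparkContext)

spark.stop()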
Example #10
File: test_udf.py  Project: lumiseven/spark
    def test_non_existed_udf(self):
        spark = self.spark
        self.assertRaisesRegex(
            AnalysisException,
            "Can not load class non_existed_udf",
            lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf"),
        )

        # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias.
        sqlContext = SQLContext.getOrCreate(self.spark.sparkContext)
        self.assertRaisesRegex(
            AnalysisException,
            "Can not load class non_existed_udf",
            lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf"),
        )
Example #11
def read_data(path):

    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    sql_context = SQLContext.getOrCreate(sc=sc)

    created_at = pd.read_csv(path)
    created_at = created_at.set_index('date')
    created_at.index = pd.to_datetime(created_at.index)
    ts = created_at['item_cnt_day'].resample('M').sum()
    ts_df = pd.Series.to_frame(ts)
    ts_df.reset_index(level=0, inplace=True)
    ts_df = ts_df.rename(columns={"date": "date", "item_cnt_day": "sales"})

    df = sql_context.createDataFrame(ts_df)

    return df
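
The pandas-to-Spark handoff at the end can be sketched on its own; the frame below is a toy stand-in for the resampled sales series, with column names chosen to match the example.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("PandasToSparkSketch").getOrCreate()

# Toy monthly series standing in for the resampled item_cnt_day data
ts_df = pd.DataFrame({
    "date": pd.date_range("2020-01-31", periods=3, freq="M"),
    "sales": [10.0, 12.5, 9.0],
})

df = spark.createDataFrame(ts_df)
df.show()

spark.stop()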
Example #12
    def test_udf_registration_returns_udf(self):
        df = self.spark.range(10)
        add_three = self.spark.udf.register("add_three", lambda x: x + 3, IntegerType())

        self.assertListEqual(
            df.selectExpr("add_three(id) AS plus_three").collect(),
            df.select(add_three("id").alias("plus_three")).collect(),
        )

        # This is to check if a 'SQLContext.udf' can call its alias.
        sqlContext = SQLContext.getOrCreate(self.spark.sparkContext)
        add_four = sqlContext.udf.register("add_four", lambda x: x + 4, IntegerType())

        self.assertListEqual(
            df.selectExpr("add_four(id) AS plus_four").collect(),
            df.select(add_four("id").alias("plus_four")).collect(),
        )
Example #13
    def apply_model(self, sc, model, data_frame):
        """
        Runs the model on a data frame
        :param model: PipelineModel from pyspark
        :param data_frame: Pyspark data frame
        :return: transformed pyspark data frame
        """
        from pyspark.ml.linalg import Vectors, VectorUDT
        sql_ctx = SQLContext.getOrCreate(sc)
        vector_scaled_df = self._vector_scale(
            data_frame)  # adds DenseVector for features and scaled_features
        transformed_data = model.transform(vector_scaled_df)  # adds prediction

        # udf's
        udf_cast_vector = F.udf(lambda x: Vectors.dense(x), VectorUDT())

        # Depending on the algorithm, different methods will extract the cluster centers
        if self._algorithm == 'GaussianMixture':
            # convert gaussian mean/covariance dataframe to pandas dataframe
            pandas_cluster_centers = (model.stages[-1].gaussiansDF.toPandas())
            centers = sql_ctx.createDataFrame(
                self.gen_gaussians_center(self._dict_parameters['k'],
                                          pandas_cluster_centers))
            merged_df = transformed_data.join(
                centers, self._dict_parameters['predictionCol'], 'inner')
            merged_df = merged_df.withColumn(
                'centers',
                udf_cast_vector('mean'))  # re-wrap 'mean' as a DenseVector after the pandas round trip
        else:
            np_centers = model.stages[-1].clusterCenters()
            centers = self.gen_cluster_center(k=self._dict_parameters['k'],
                                              centers=np_centers)
            broadcast_center = sc.broadcast(centers)

            # Create user defined function for added cluster centers to data frame
            udf_assign_cluster = F.udf(
                f=lambda x: Vectors.dense(broadcast_center.value[x]),
                returnType=VectorUDT())
            merged_df = transformed_data.withColumn(  # adds DenseVector of centers
                "centers",
                udf_assign_cluster(self._dict_parameters['predictionCol']))
        # return the result
        return merged_df
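
The broadcast-plus-UDF step in the non-Gaussian branch can be illustrated in isolation. The sketch below uses toy cluster centers and an illustrative "prediction" column name; it is not the original project's code.

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.master("local[2]").appName("CenterBroadcastSketch").getOrCreate()
sc = spark.sparkContext

# Toy cluster centers keyed by cluster id
centers = {0: [0.0, 0.0], 1: [5.0, 5.0]}
broadcast_center = sc.broadcast(centers)

df = spark.createDataFrame([(0,), (1,), (0,)], ["prediction"])

# Attach the matching center to each row as a DenseVector, as apply_model does above
udf_assign_cluster = F.udf(lambda k: Vectors.dense(broadcast_center.value[k]), VectorUDT())
df.withColumn("centers", udf_assign_cluster("prediction")).show(truncate=False)

spark.stop()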
Example #14
)

logger_data_import.addHandler(
    logger_file_handler_parameter
)
logger_file_handler_parameter.setFormatter(
    logger_formatter_parameter
)

from pyspark import SparkContext, SQLContext
from pyspark.sql import types as T
from IPython import display
from ipywidgets import widgets

sc = SparkContext.getOrCreate()
sql_context = SQLContext.getOrCreate(sc)


class GeneralDataImport(object):
    """
    Data object to handle importation of various types of data
    """
    counter = 0
    file_ending = {'txt': 'text',
                   'csv': 'csv',
                   'parquet': 'parquet',
                   'jdbc': 'jdbc',
                   'json': 'json'
                   }

    def __init__(self, path=None, **kwargs):
        ...

Example #15
 def getPerformanceStatistics(self):
     ctx = SparkContext._active_spark_context
     sql_ctx = SQLContext.getOrCreate(ctx)
     return DataFrame(self._java_obj.getPerformanceStatistics(), sql_ctx)
Example #16
def run_spark_application():
    # Create the SparkContext and the SQLContext used throughout

    sc = SparkContext(appName="Stocks")
    spark = SQLContext.getOrCreate(sc)

    amazonDataFrame = createDataFrame(spark, "amazon.csv")
    amazonInfo = selectInfoFromDataFrame(amazonDataFrame, "amazon")

    googDataFrame = createDataFrame(spark, "google.csv")
    googInfo = selectInfoFromDataFrame(googDataFrame, "google")

    facebookDataFrame = createDataFrame(spark, "facebook.csv")
    facebookInfo = selectInfoFromDataFrame(facebookDataFrame, "facebook")

    # Collect all dates and closing prices into one DataFrame
    dataTable = amazonInfo.join(
        googInfo, amazonInfo.amazonDate == googInfo.googleDate).select(
            "amazonDate", "closeAmazon", "closeGoogle")
    dataTable = dataTable.join(
        facebookInfo,
        dataTable.amazonDate == facebookInfo.facebookDate).select(
            dataTable["amazonDate"].alias("date"), "closeAmazon",
            "closeGoogle", "closeFacebook")

    # Reshape the data so that the first column is the date, the second the
    # symbol, and the third the closing price for that day
    amazFormatted = selectInfoAsNewNames(dataTable, "amazon")
    faceBookFormatted = selectInfoAsNewNames(dataTable, "facebook")
    googFormatted = selectInfoAsNewNames(dataTable, "google")
    # Union the per-symbol frames, then reorder the rows by date
    formattedDataTable = amazFormatted.union(faceBookFormatted).union(
        googFormatted)
    formattedDataTable = formattedDataTable.orderBy(
        formattedDataTable.date.asc())

    # We construct the final DataFrame
    # 1: We add timestamp and price as two new columns based on date and closing Price
    finalDf = formattedDataTable.withColumn(
        "timestamp", to_timestamp(formattedDataTable.date)).withColumn(
            "price", formattedDataTable["closingPrice"].cast("double"))
    # 2: Then drop the original date and closingPrice columns
    finalDf = finalDf.drop("date", "closingPrice").sort("timestamp")
    finalDf.registerTempTable("preData")
    finalDf.show()

    # We gather the necessary data to create a time series RDD
    minDate = finalDf.selectExpr(
        "min(timestamp)").collect()[0]["min(timestamp)"]
    maxDate = finalDf.selectExpr("max(timestamp)").alias(
        "timestamp").collect()[0]["max(timestamp)"]
    frequency = DayFrequency(1, sc)

    dtIndex = datetimeindex.DateTimeIndex.uniform(start=minDate,
                                                  end=maxDate,
                                                  freq=frequency,
                                                  sc=sc)
    tsRdd = timeseriesrdd.time_series_rdd_from_observations(
        dtIndex, finalDf, "timestamp", "symbol", "price")

    # Final step: run the prediction over the time series
    df = tsRdd.map_series(train_transform_func)

    # Time-zone handling is deliberately skipped on the Python side here
    finalDf.show()
    sc.stop()  # SQLContext has no stop(); shut down the underlying SparkContext instead
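
The wide-to-long reshaping described in the comments (one row per date and symbol with its closing price) can be sketched with plain pyspark. The single row below is made-up sample data, and the sparkts-specific steps are omitted.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[2]").appName("ReshapeSketch").getOrCreate()

wide = spark.createDataFrame(
    [("2020-01-02", 1898.0, 1367.0, 209.0)],
    ["date", "closeAmazon", "closeGoogle", "closeFacebook"],
)

# One (date, symbol, closingPrice) slice per symbol, unioned and ordered by date
long_df = None
for symbol, col_name in [("amazon", "closeAmazon"),
                         ("google", "closeGoogle"),
                         ("facebook", "closeFacebook")]:
    part = wide.select(
        F.col("date"),
        F.lit(symbol).alias("symbol"),
        F.col(col_name).alias("closingPrice"),
    )
    long_df = part if long_df is None else long_df.union(part)

long_df.orderBy("date").show()

spark.stop()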
Example #17
from pyspark import SparkContext, SparkConf, SQLContext

appName = "my-app01"
conf = SparkConf().setAppName(appName)

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")

sqlCtx = SQLContext.getOrCreate(sc)

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)

print("Nombre d'elements: " + str(distData.count()))
print("Maximum: " + str(distData.max()))