def _generate_data(self):
    df = self.spark.range(10)
    output = (df
              .select(
                  'id',
                  F.rand(42).alias('a'),
                  F.randn(1).alias('b'),
                  F.round(10 * F.rand(42)).alias('Prediction'),
                  F.rand().alias('distance'))
              .withColumn('is_outlier', F.when(F.col('distance') >= 0.7, 1.0).otherwise(0.0))
              .withColumn('computed_boundary', F.randn()))
    return output
def trick2(self):
    @F.udf('integer')
    def random(v):
        return Random().randint(0, 3)

    df = self.session.range(0, 100).withColumn("v", random(F.col("id"))).select(
        "id", "v",
        F.rand(seed=10).alias("uniform"),
        F.randn(seed=27).alias("normal"))

    @F.pandas_udf(df.schema, F.PandasUDFType.GROUPED_MAP)
    def subtract_mean(pdf):
        return pdf.assign(uniform=pdf.uniform - pdf.uniform.mean())

    df.groupby('v').apply(subtract_mean).show()

    @F.pandas_udf(
        StructType([
            StructField(name="v", dataType=IntegerType()),
            StructField(name="add_all", dataType=DoubleType())
        ]), F.PandasUDFType.GROUPED_MAP)
    def addAll(pdf):
        return pd.DataFrame(
            data={
                "v": pdf.v[0],
                'add_all': [pdf.uniform.sum() + pdf.normal.sum()]
            })

    df.groupby('v').apply(addAll).show()
def trick3(self):
    df = self.session.range(0, 1000000).select(
        "id",
        F.rand(seed=10).alias("uniform"),
        F.randn(seed=27).alias("normal"))
    # Less memory and faster conversion
    TimeProfile.profile(lambda: df.toPandas())()
    TimeProfile.print_prof_data(clear=True)
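# Hedged aside, not part of the original code: the "less memory and faster conversion" comment
# usually refers to Arrow-backed toPandas(). A minimal sketch of turning it on for a session
# (this key is the Spark 2.3/2.4 one; Spark 3.x uses "spark.sql.execution.arrow.pyspark.enabled"):
def enable_arrow(session):
    # Toggle Arrow-based columnar data transfer for DataFrame.toPandas()
    session.conf.set("spark.sql.execution.arrow.enabled", "true")
    return session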
def test_rand_functions(self):
    df = self.df
    from pyspark.sql import functions
    rnd = df.select('key', functions.rand()).collect()
    for row in rnd:
        assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
    rndn = df.select('key', functions.randn(5)).collect()
    for row in rndn:
        assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]
def test_rand_functions(self):
    df = self.df
    from pyspark.sql import functions
    rnd = df.select('key', functions.rand()).collect()
    for row in rnd:
        assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
    rndn = df.select('key', functions.randn(5)).collect()
    for row in rndn:
        assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]

    # If the specified seed is 0, we should use it.
    # https://issues.apache.org/jira/browse/SPARK-9691
    rnd1 = df.select('key', functions.rand(0)).collect()
    rnd2 = df.select('key', functions.rand(0)).collect()
    self.assertEqual(sorted(rnd1), sorted(rnd2))

    rndn1 = df.select('key', functions.randn(0)).collect()
    rndn2 = df.select('key', functions.randn(0)).collect()
    self.assertEqual(sorted(rndn1), sorted(rndn2))
def _transform(self, data):
    mean = self.getMean()
    stddev = self.getStddev()
    inputCol = self.getInputCol()
    outputCol = self.getOutputCol()
    df = data.withColumn(
        outputCol,
        when(col(inputCol).isNull(), stddev * randn() + mean)
        .otherwise(col(inputCol)))
    return df
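# Illustrative usage only (not from the original source): assuming the method above belongs to a
# Transformer-like class, here called RandomNormalImputer (a hypothetical name), with mean, stddev,
# inputCol and outputCol params, and that `spark` is an active SparkSession.
df = spark.createDataFrame([(1, 5.0), (2, None), (3, 7.0)], ["id", "value"])
imputer = RandomNormalImputer(mean=6.0, stddev=1.0, inputCol="value", outputCol="value_imputed")
# transform() dispatches to _transform(); nulls are replaced by draws from a normal
# distribution with the configured mean and standard deviation.
imputer.transform(df).show()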
def benchmark2():
    print("===Benchmark 2===")
    print("Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets")
    print("")
    emptyDatabase()

    print("creating dataframe 1: two random generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()
    print("benchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print("creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers")
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()
    print("benchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)")
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
def test_ks(sdf):
    # generates uniform
    sdf = sdf.withColumn('rand', F.rand(42))
    # compares with uniform, it should NOT reject
    pval = KolmogorovSmirnovTest(sdf, 'rand', dist='uniform').pValue
    npt.assert_equal(pval > .05, True)
    # compares with normal, it SHOULD reject
    pval = KolmogorovSmirnovTest(sdf, 'rand').pValue
    npt.assert_equal(pval < .05, True)

    # generates normal
    sdf = sdf.withColumn('rand', F.randn(42))
    # compares with normal, it should NOT reject
    pval = KolmogorovSmirnovTest(sdf, 'rand').pValue
    npt.assert_equal(pval > .05, True)
    # compares with uniform, it SHOULD reject
    pval = KolmogorovSmirnovTest(sdf, 'rand', dist='uniform').pValue
    npt.assert_equal(pval < .05, True)
def randnMultiGaussian(meanArray, covMatrix, seed=0):
    """
    Samples from a multivariate Gaussian as a vector

    :param meanArray: mean of the distribution, either List[Float] or numpy array
    :param covMatrix: covariance of the distribution, either List[List[Float]] (row major) or numpy 2d array
    :param seed: seed of the random generator
    :return: DenseVector column
    """
    root = np.linalg.cholesky(np.array(covMatrix))
    rows, columns = root.shape
    root = arrayToMatrix(
        F.lit(rows), F.lit(columns),
        F.array([F.lit(el) for el in root.reshape(int(rows * columns), order="F").tolist()]))
    mean = arrayToVector(F.array([F.lit(float(el)) for el in meanArray]))
    samples = arrayToVector(F.array([F.randn(seed=seed + el) for el in range(0, len(meanArray))]))
    return _function_factory([mean, root, samples], _spark_functions().scaleToMultiGaussian())
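# Illustrative usage only (not from the original source). Assumes an active SparkSession named
# `spark` and that the helper above is importable together with its arrayToMatrix/arrayToVector
# dependencies; the returned expression is used like any other Column.
mean = [1.0, 2.0]
cov = [[2.0, 0.5],
       [0.5, 1.0]]
samples_df = spark.range(1000).withColumn("sample", randnMultiGaussian(mean, cov, seed=42))
samples_df.show(5, truncate=False)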
def visualize_time_lines(patient_event, concept_id, num_patients=50):
    ra_patient = patient_event.where(F.col('standard_concept_id') == concept_id) \
        .groupBy('person_id', 'standard_concept_id').agg(F.min('date').alias('index_date')) \
        .withColumn('random_num', F.randn()) \
        .withColumn('rank', F.dense_rank().over(Window.orderBy('random_num'))) \
        .where(F.col('rank') <= num_patients)

    join_collection_udf = F.udf(
        lambda its: ' '.join(sorted([str(it[1]) for it in its], key=lambda x: (x[0], x[1]))),
        T.StringType())

    patient_timeline_pd = patient_event \
        .join(ra_patient, 'person_id') \
        .where(F.col('index_date').between(F.col('lower_bound'), F.col('upper_bound'))) \
        .withColumn('date_concept_id',
                    F.struct(F.col('index_date'), patient_event['standard_concept_id'])) \
        .groupBy('person_id').agg(
            join_collection_udf(F.collect_list('date_concept_id')).alias('sequence'),
            F.size(F.collect_list('date_concept_id')).alias('size')) \
        .where(F.col('size') > 1) \
        .select('person_id', 'sequence').toPandas()

    return patient_timeline_pd
def trick1(self):
    df = self.session.range(0, 1000000).select(
        "id",
        F.rand(seed=10).alias("uniform"),
        F.randn(seed=27).alias("normal"))

    @F.udf('double')
    def plus_one(v):
        return v + 1

    TimeProfile.profile(
        lambda: df.withColumn('v2', plus_one(df.uniform)).count())()
    TimeProfile.print_prof_data(clear=True)

    @F.pandas_udf('double', F.PandasUDFType.SCALAR)
    def pandas_plus_one(v):
        return v + 1

    TimeProfile.profile(
        lambda: df.withColumn('v2', pandas_plus_one(df.uniform)).count())()
    TimeProfile.print_prof_data(clear=True)
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)

from pyspark.sql.session import SparkSession

spark = SparkSession(sc)

# <font size=4,font style=arial>
# Let's create our dataset with 5 normally distributed (mean=0, std=1) columns.
# </font>

# In[3]:

df1 = sqlContext.range(0, 4000000).withColumn(
    'normal1', func.abs(func.round(100 * randn(seed=1), 2))).withColumn(
        'normal2', func.abs(func.round(100 * randn(seed=2), 2))).withColumn(
            'normal3', func.abs(func.round(100 * randn(seed=3), 2))).withColumn(
                'normal4', func.abs(func.round(100 * randn(seed=4), 2))).withColumn(
                    'normal5', func.abs(func.round(100 * randn(seed=5), 2)))

# <font size=4,font style=arial>
# The contents of the SparkContext can be seen below. Clicking the Spark UI link shows the jobs
# that have been run. The master is my own local machine.
# </font>

# In[4]:

sc
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, randn
from pyspark.sql.functions import mean, min, max

spark = SparkSession \
    .builder \
    .appName("Summary and descriptive statistics") \
    .getOrCreate()
sqlContext = SQLContext(spark.sparkContext)

# A slightly different way to generate the two random columns.
df = sqlContext.range(0, 10) \
    .withColumn("uniform", rand(seed=10)) \
    .withColumn("normal", randn(seed=27))

df.describe().show()

# If you have a DataFrame with a large number of columns, you can also run
# describe on a subset of the columns:
df.describe("uniform", "normal").show()

# Of course, while describe works well for quick exploratory data analysis,
# you can also control the list of descriptive statistics and the columns
# they apply to using the normal select on a DataFrame:
df.select([mean("uniform"), min("uniform"), max("uniform")]).show()

spark.stop()
def gridIsingModel(self, n, vStd=1.0, eStd=1.0):
    """Grid Ising model with random parameters.

    Ising models are probabilistic graphical models over binary variables x\ :sub:`i`.
    Each binary variable x\ :sub:`i` corresponds to one vertex, and it may take
    values -1 or +1. The probability distribution P(X) (over all x\ :sub:`i`) is
    parameterized by vertex factors a\ :sub:`i` and edge factors b\ :sub:`ij`:

       P(X) = (1/Z) * exp[ \sum_i a_i x_i + \sum_{ij} b_{ij} x_i x_j ]

    where Z is the normalization constant (partition function).
    See `Wikipedia <https://en.wikipedia.org/wiki/Ising_model>`__ for more information
    on Ising models.

    Each vertex is parameterized by a single scalar a\ :sub:`i`.
    Each edge is parameterized by a single scalar b\ :sub:`ij`.

    :param n: Length of one side of the grid. The grid will be of size n x n.
    :param vStd: Standard deviation of normal distribution used to generate
                 vertex factors "a". Default of 1.0.
    :param eStd: Standard deviation of normal distribution used to generate
                 edge factors "b". Default of 1.0.
    :return: GraphFrame. Vertices have columns "id" and "a".
             Edges have columns "src", "dst", and "b".
             Edges are directed, but they should be treated as undirected in
             any algorithms run on this model.
             Vertex IDs are of the form "i,j". E.g., vertex "1,3" is in the
             second row and fourth column of the grid.
    """
    assert n >= 1, \
        "Grid graph must have size >= 1, but was given invalid value n = {}".format(n)

    # create coordinates grid
    coordinates = self._sql.createDataFrame(
        itertools.product(range(n), range(n)), schema=('i', 'j'))

    # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
    # avoid Cartesian join due to SPARK-15425: use generator since n should be small
    toIDudf = sqlfunctions.udf(lambda i, j: '{},{}'.format(i, j))

    # create the vertex DataFrame
    # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
    vIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j'))
    # add random parameters generated from a normal distribution
    seed = 12345
    vertices = (coordinates.withColumn('id', vIDcol)
                .withColumn('a', sqlfunctions.randn(seed) * vStd))

    # create the edge DataFrame
    # create SQL expression for converting coordinates (i,j+1) and (i+1,j) to string IDs
    rightIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j') + 1)
    downIDcol = toIDudf(sqlfunctions.col('i') + 1, sqlfunctions.col('j'))
    horizontalEdges = (coordinates.filter(sqlfunctions.col('j') != n - 1)
                       .select(vIDcol.alias('src'), rightIDcol.alias('dst')))
    verticalEdges = (coordinates.filter(sqlfunctions.col('i') != n - 1)
                     .select(vIDcol.alias('src'), downIDcol.alias('dst')))
    allEdges = horizontalEdges.unionAll(verticalEdges)
    # add random parameters from a normal distribution
    edges = allEdges.withColumn('b', sqlfunctions.randn(seed + 1) * eStd)

    # create the GraphFrame
    g = GraphFrame(vertices, edges)

    # materialize graph as workaround for SPARK-13333
    g.vertices.cache().count()
    g.edges.cache().count()

    return g
def randn(df, c, mu=0.0, sigma=1.0, seed=None):
    return df.withColumn(c, F.randn(seed) * sigma + mu)
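# Illustrative usage only (not from the original source): the helper above rescales F.randn()
# to a normal distribution with mean mu and standard deviation sigma. Assumes `spark` is an
# active SparkSession.
df = spark.range(100)
df_noisy = randn(df, 'noise', mu=10.0, sigma=2.0, seed=7)
df_noisy.describe('noise').show()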
# Random data generation is useful for testing of existing algorithms and
# implementing randomized algorithms, such as random projection. We provide
# methods under sql.functions for generating columns that contain i.i.d.
# values drawn from a distribution, e.g., uniform (rand) and standard
# normal (randn).

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, randn

spark = SparkSession \
    .builder \
    .appName("Random data generation") \
    .getOrCreate()
sqlContext = SQLContext(spark.sparkContext)

# Create a DataFrame with one int column and 10 rows.
df = sqlContext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal
# distribution.
df.select("id",
          rand(seed=10).alias("uniform"),
          randn(seed=27).alias("normal")).show()

spark.stop()
#FITS
#gal=spark.read.format("fits").option("hdu",1)\
#    .load(os.environ['FITSDIR'])\
#    .select(F.col("RA"), F.col("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))

#PKT
PARQUET = "hdfs://134.158.75.222:8020/user/julien.peloton/LSST10Y_shuffled_uncomp"
gal = spark.read.parquet(PARQUET)\
    .select(F.col("RA"), F.col("DEC").alias("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))

gal.printSchema()
timer.step()
timer.print("load")

#######
gal = gal.withColumn("zrec", (gal.z + 0.03*(1 + gal.z)*randn()).astype('float'))
gal.show(5)
timer.step()
timer.print("show")

##cache
gal = gal.cache()
print("N={}".format(gal.count()))
timer.step()
timer.print("data loaded")

####
zshell = [0.0, 0.13, 0.27, 0.43, 0.63, 0.82, 1.05, 1.32, 1.61, 1.95, 2.32]
#zshell=[0.1,0.2,0.3,0.4,0.5]

#writemap
write = False
# Use the `range` method to generate a sequence of integers and add new
# columns as appropriate.

spark.range(1000).show(5)

# Use the `rand` function to generate a uniform random variable:

from pyspark.sql.functions import rand

df_uniform = spark \
    .range(1000) \
    .withColumn("uniform", rand(12345))
df_uniform.show(5)
df_uniform.describe("uniform").show()

# Or a Bernoulli random variable with $p = 0.25$:

df_bernoulli = spark \
    .range(1000) \
    .withColumn("bernoulli", (rand(12345) < 0.25).cast("int"))
df_bernoulli.show(5)
df_bernoulli.groupby("bernoulli").count().show()

# Use the `randn` function to generate a normal random variable:

from pyspark.sql.functions import randn

df_normal = spark.range(1000).withColumn("normal", 42 + 2 * randn(54321))
df_normal.show(5)
df_normal.describe("normal").show()

# ## Cleanup

# Stop the SparkSession:

spark.stop()
# OLS problem, states to be estimated are a, b and c
# z = a*x + b*y + c + w, where w ~ N(0, 1)
a = 0.5
b = 0.2
c = 1.2
noise_param = 1

label_expression = F.col("x") * a + F.col("y") * b + c + F.col("w")

input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
    .withColumn("mod", F.col("value") % num_states)\
    .withColumn("stateKey", F.col("mod").cast("String"))\
    .withColumn("x", (F.col("value")/num_states).cast("Integer").cast("Double"))\
    .withColumn("y", F.sqrt("x"))\
    .withColumn("bias", F.lit(1.0))\
    .withColumn("w", F.randn(0) * noise_param)\
    .withColumn("label", label_expression)

rls = RecursiveLeastSquaresFilter()\
    .setStateKeyCol("stateKey")\
    .setFeatureSize(3)\
    .setInitialEstimate(Vectors.dense([0.0, 0.0, 0.0]))\
    .setRegularizationMatrixFactor(10E6)\
    .setForgettingFactor(0.99)

assembler = VectorAssembler(inputCols=["x", "y", "bias"], outputCol="features")

measurements = assembler.transform(input_df)

query = rls.transform(measurements)\
    .writeStream\
def get_baseline_scores(train_df, val_df, evaluator, eval_name):
    stats_rating_df = (
        train_df
        .agg(
            F.avg('rating').alias('avg_rating'),
            F.stddev_samp('rating').alias('stddev_rating')
        )
    )

    stats_row = stats_rating_df.head()
    print('[plot_scores Train] Avg: {}'.format(stats_row[0]))
    print('[plot_scores Train] Std Dev: {}'.format(stats_row[1]))

    # Naive model: random normal rating centered on average rating and scaled
    # with standard deviation of training data.
    train_predict_df = (
        train_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select('user', 'item', 'rating', 'prediction')
    )

    val_predict_df = (
        val_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select('user', 'item', 'rating', 'prediction')
    )

    naive_score_train = evaluator.evaluate(train_predict_df)
    naive_score_val = evaluator.evaluate(val_predict_df)
    print('Train Naive {} score: {}'.format(eval_name, naive_score_train))
    print('Validation Naive {} score: {}'.format(eval_name, naive_score_val))

    estimator = Recommender(
        lambda_1=0.0,
        lambda_2=0.0,
        lambda_3=0.0,
        useALS=False,
        useBias=True,
        userCol='user',
        itemCol='item',
        ratingCol='rating'
    )

    model = estimator.fit(train_df)
    baseline_score_train = evaluator.evaluate(model.transform(train_df))
    baseline_score_val = evaluator.evaluate(model.transform(val_df))
    print('Train Baseline {} score: {}'.format(eval_name, baseline_score_train))
    print('Validation Baseline {} score: {}'.format(eval_name, baseline_score_val))

    return (
        naive_score_train,
        naive_score_val,
        baseline_score_train,
        baseline_score_val
    )
# <font size=4,font style=arial>
# <br>
# Let's create our dataset with the following columns:<br>
# normal1: normally distributed column<br>
# normal2: normally distributed column<br>
# normal3: normally distributed column<br>
# normal4: normally distributed column<br>
# normal5: normally distributed column<br>
# normal6: normally distributed column<br>
# Y: a column taking the values 0 and 1<br>
# </font>

# In[4]:

df1 = sqlContext.range(0, 1000000).withColumn(
    'normal1', func.abs(10 * func.round(randn(seed=1), 2))).withColumn(
        'normal2', func.abs(100 * func.round(randn(seed=2), 2))).withColumn(
            'normal3', func.abs(func.round(randn(seed=3), 2))).withColumn(
                'normal4', func.abs(func.round(randn(seed=4), 2))).withColumn(
                    'normal5', func.abs(func.round(randn(seed=5), 2))).withColumn(
                        'normal6', func.abs(func.round(randn(seed=6), 2)))
df1.cache()

# <font size=4,font style=arial>
# <br>
# Let's relate the Y variable to the other variables so that our output variable Y is
# meaningful and the model yields meaningful results.<br>
# </font>

# In[5]:
from pyspark.sql.functions import rand, randn

# Create a DataFrame with one int column and 10 rows.
df = sqlContext.range(0, 10)
df.show()

# COMMAND ----------

display(df)

# COMMAND ----------

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()

# COMMAND ----------

display(
    df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")))

# COMMAND ----------

# MAGIC %md ### Summary and Descriptive Statistics
# MAGIC
# MAGIC The first operation to perform after importing data is to get some sense of what it looks like. For numerical columns, knowing the descriptive summary statistics can help a lot in understanding the distribution of your data. The function `describe` returns a DataFrame containing information such as number of non-null entries (count), mean, standard deviation, and minimum and maximum value for each numerical column.

# COMMAND ----------
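# An illustrative cell (not from the original notebook export) showing the `describe` call
# referenced in the markdown above, applied to the same generated columns.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).describe().show()

# COMMAND ----------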
from pyspark.mllib.stat import Statistics

parallelData = sc.parallelize([1.0, 2.0, 5.0, 4.0, 3.0, 3.3, 5.5])

# run a KS test for the sample versus a standard normal distribution
testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
print(testResult)

from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

from pyspark.sql.functions import rand, randn

# Create a DataFrame with one int column and 10 rows.
df = sqlCtx.range(0, 10)
df.show()

df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()

df.describe().show()

df = sqlCtx.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
print(df.stat.corr('rand1', 'rand2'))
print(df.stat.corr('id', 'id'))

names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
df = sqlCtx.createDataFrame([(names[i % 3], items[i % 5]) for i in range(100)], ["name", "item"])
df.show(10)

df = sqlCtx.createDataFrame(
    [(1, 2, 3) if i % 2 == 0 else (i, 2 * i, i % 4) for i in range(100)], ["a", "b", "c"])
df.show(10)

freq = df.stat.freqItems(["a", "b", "c"], 0.4)
timer = Timer()
ddt = []

ana = "1: load(HDU)"
gal = spark.read.format("fits").option("hdu", 1)\
    .load(ff)\
    .select(F.col("RA"), F.col("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))
gal.printSchema()
ddt.append(timer.step())
timer.print(ana)

##### gauss
gal = gal.withColumn("zrec_g", (gal.z + 0.03 * (1 + gal.z) * F.randn()).astype('float'))

####full PZ
ana = "2b: PZ full + show(5)"
# read the inverse-cumulative file
cuminv = np.loadtxt('scripts/cuminv_gauss.txt')
#cuminv=np.loadtxt('scripts/cuminv_gauss.txt')
#cuminv=np.loadtxt('scripts/cuminv_bdt.txt')

# we know the binnings that were used
dz = 0.01
du = 1 / 1000.

#find indices and return the table values
@pandas_udf('float', PandasUDFType.SCALAR)
"hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000/") # generate the worker nodes. spark = SparkSession.builder.config(conf=sparkConf).getOrCreate() sc = spark.sparkContext from pyspark.sql.functions import randn, round as roundNum data = [(i, i) for i in range(10)] # random data columns = ['id', 'txt'] # add your columns label here df = spark.createDataFrame(data, columns) df = df.drop('txt') for i in range(10): df = df.withColumn('col' + str(i), roundNum(randn(), 3)) df.show() URI = sc._gateway.jvm.java.net.URI Path = sc._gateway.jvm.org.apache.hadoop.fs.Path FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration fs = FileSystem.get( URI("hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000"), Configuration()) status = fs.listStatus(Path('/')) for fileStatus in status: print(fileStatus.getPath())
"Usage: lkf_rate_source_llt.py <num_states> <measurements_per_sec>", file=sys.stderr) sys.exit(-1) num_states = int(sys.argv[1]) mps = int(sys.argv[2]) spark = SparkSession.builder.appName("RateSourceLKF").getOrCreate() spark.sparkContext.setLogLevel("WARN") noise_param = 1 input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\ .withColumn("mod", F.col("value") % num_states)\ .withColumn("stateKey", F.col("mod").cast("String"))\ .withColumn("trend", (F.col("value")/num_states).cast("Integer") + F.randn() * noise_param) lkf = LinearKalmanFilter(2, 1)\ .setStateKeyCol("stateKey")\ .setMeasurementCol("measurement")\ .setInitialCovariance(Matrices.dense(2, 2, [10000.0, 0.0, 0.0, 10000.0]))\ .setProcessModel(Matrices.dense(2, 2, [1.0, 0.0, 1.0, 1.0]))\ .setProcessNoise(Matrices.dense(2, 2, [0.0001, 0.0, 0.0, 0.0001]))\ .setMeasurementNoise(Matrices.dense(1, 1, [noise_param]))\ .setMeasurementModel(Matrices.dense(1, 2, [1.0, 0.0])) assembler = VectorAssembler(inputCols=["trend"], outputCol="measurement") measurements = assembler.transform(input_df) query = lkf.transform(measurements)\ .writeStream\
def gridIsingModel(self, n, vStd=1.0, eStd=1.0):
    """Grid Ising model with random parameters.

    Ising models are probabilistic graphical models over binary variables x\ :sub:`i`.
    Each binary variable x\ :sub:`i` corresponds to one vertex, and it may take
    values -1 or +1. The probability distribution P(X) (over all x\ :sub:`i`) is
    parameterized by vertex factors a\ :sub:`i` and edge factors b\ :sub:`ij`:

       P(X) = (1/Z) * exp[ \sum_i a_i x_i + \sum_{ij} b_{ij} x_i x_j ]

    where Z is the normalization constant (partition function).
    See `Wikipedia <https://en.wikipedia.org/wiki/Ising_model>`__ for more information
    on Ising models.

    Each vertex is parameterized by a single scalar a\ :sub:`i`.
    Each edge is parameterized by a single scalar b\ :sub:`ij`.

    :param n: Length of one side of the grid. The grid will be of size n x n.
    :param vStd: Standard deviation of normal distribution used to generate
                 vertex factors "a". Default of 1.0.
    :param eStd: Standard deviation of normal distribution used to generate
                 edge factors "b". Default of 1.0.
    :return: GraphFrame. Vertices have columns "id" and "a".
             Edges have columns "src", "dst", and "b".
             Edges are directed, but they should be treated as undirected in
             any algorithms run on this model.
             Vertex IDs are of the form "i,j". E.g., vertex "1,3" is in the
             second row and fourth column of the grid.
    """
    # check param n
    if n < 1:
        raise ValueError(
            "Grid graph must have size >= 1, but was given invalid value n = {}".format(n))

    # create coordinates grid
    coordinates = self._sql.createDataFrame(
        itertools.product(range(n), range(n)), schema=('i', 'j'))

    # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
    # avoid Cartesian join due to SPARK-15425: use generator since n should be small
    toIDudf = sqlfunctions.udf(lambda i, j: '{},{}'.format(i, j))

    # create the vertex DataFrame
    # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
    vIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j'))
    # add random parameters generated from a normal distribution
    seed = 12345
    vertices = (coordinates.withColumn('id', vIDcol)
                .withColumn('a', sqlfunctions.randn(seed) * vStd))

    # create the edge DataFrame
    # create SQL expression for converting coordinates (i,j+1) and (i+1,j) to string IDs
    rightIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j') + 1)
    downIDcol = toIDudf(sqlfunctions.col('i') + 1, sqlfunctions.col('j'))
    horizontalEdges = (coordinates.filter(sqlfunctions.col('j') != n - 1)
                       .select(vIDcol.alias('src'), rightIDcol.alias('dst')))
    verticalEdges = (coordinates.filter(sqlfunctions.col('i') != n - 1)
                     .select(vIDcol.alias('src'), downIDcol.alias('dst')))
    allEdges = horizontalEdges.unionAll(verticalEdges)
    # add random parameters from a normal distribution
    edges = allEdges.withColumn('b', sqlfunctions.randn(seed + 1) * eStd)

    # create the GraphFrame
    g = GraphFrame(vertices, edges)

    # materialize graph as workaround for SPARK-13333
    g.vertices.cache().count()
    g.edges.cache().count()

    return g
def benchmark(ff):
    timer = Timer()
    ddt = []

    ana = "1: load(HDU)"
    gal = spark.read.format("fits").option("hdu", 1)\
        .load(ff)\
        .select(F.col("RA"), F.col("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))
    #PARQUET="hdfs://134.158.75.222:8020/user/julien.peloton/LSST10Y_shuffled_uncomp"
    #gal=spark.read.parquet(PARQUET)\
    #    .select(F.col("RA"), F.col("DEC").alias("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))
    gal.printSchema()
    ddt.append(timer.step())
    timer.print(ana)

    #######
    ana = "2: gauss PZ + show(5)"
    gal = gal.withColumn("zrec", (gal.z + 0.03*(1 + gal.z)*F.randn()).astype('float'))
    gal.show(5)
    ddt.append(timer.step())
    timer.print(ana)

    ####
    ana = "3: cache (count)"
    gal = gal.cache()  #.persist(StorageLevel.MEMORY_ONLY_SER)
    print("N={}".format(gal.count()))
    ddt.append(timer.step())
    timer.print(ana)

    #####
    ana = "4: statistics z"
    gal.describe(['z']).show()
    ddt.append(timer.step())
    timer.print(ana)

    ana = "5: statistics all"
    # get all statistics on z
    gal.describe().show()
    ddt.append(timer.step())
    timer.print(ana)

    ana = "6: minmax"
    minmax = gal.select(F.min("z"), F.max("z")).first()
    zmin = minmax[0]
    zmax = minmax[1]
    Nbins = 100
    dz = (zmax - zmin)/Nbins
    ddt.append(timer.step())
    timer.print(ana)

    ###############
    ana = "7: histo df"
    # df on z
    #zbin=gal.select(gal.z,((gal['z']-zmin)/dz).astype('int').alias('bin'))
    zbin = gal.select(gal.z, ((gal['z']-zmin-dz/2)/dz).cast(IntegerType()).alias('bin'))
    h = zbin.groupBy("bin").count().orderBy(F.asc("bin"))
    p = h.select("bin", (zmin+dz/2+h['bin']*dz).alias('zbin'), "count").drop("bin").toPandas()
    #p.to_csv("p.csv")
    ddt.append(timer.step())
    timer.print(ana)

    #ana="histo p3"
    #import df_tools
    #p3=df_tools.hist_df(gal,"zrec",Nbins,bounds=minmax).toPandas()
    #p3.to_csv("prec3.csv")
    #timer.print(ana)
    #p3.to_csv("prec3.csv")

    #ana="histo p5 (on the fly)"
    #p5=df_tools.hist_df(gal.withColumn("zrec2",gal.z+0.05*randn()*(1+gal.z)),"zrec2",Nbins,bounds=minmax).toPandas()
    #timer.print(ana)
    #p5.to_csv("prec5.csv")

    #ana="8a: histo (UDF)"
    #binNumber_udf=F.udf(lambda z: int((z-zmin)/dz))
    #p_udf=gal.select(gal.z,binNumber_udf(gal.z).alias('bin')).groupBy("bin").count().orderBy(F.asc("bin")).toPandas()
    #ddt.append(timer.step())
    #timer.print(ana)

    ana = "8b: histo (pandas UDF)"

    @pandas_udf("float", PandasUDFType.SCALAR)
    def binFloat(z):
        return pd.Series((z-zmin)/dz)

    # don't know how to cast in pandas, so do it later
    p_udf = gal.select(gal.z, binFloat("z").astype('int').alias('bin')) \
        .groupBy("bin").count().orderBy(F.asc("bin")).toPandas()
    ddt.append(timer.step())
    timer.print(ana)

    # via rdd
    #ana="9: histo (rdd) reducebykey"
    #from operator import add
    #h=zbin.select("bin").rdd.map(lambda r:(r.bin,1)).reduceByKey(add).sortByKey().map(lambda x: (zmin+dz/2 +x[0]*dz,x[1]))
    #h=zbin.select("bin").rdd.map(lambda r:(r[0],1)).countByKey()
    #h.collect()
    #plt.plot(h.keys(),k,values())
    #ddt.append(timer.step())
    #timer.print(ana)

    ## ana="10: RDD histogram"
    ## #p_rdd=gal.select(gal.z).rdd.flatMap(list).histogram(Nbins)
    ## p_rdd=gal.select(gal.z).rdd.map(lambda r: r.z).histogram(Nbins)
    ## ddt.append(timer.step())
    ## timer.print(ana)

    ## ana="11:tomographie"
    ## shell=gal.filter(gal['zrec'].between(0.1,0.2))
    ## nside=512
    ## @pandas_udf('int', PandasUDFType.SCALAR)
    ## def Ang2Pix(ra,dec):
    ##     return pd.Series(hp.ang2pix(nside,np.radians(90-dec),np.radians(ra)))
    ## map=shell.select(Ang2Pix("RA","Dec").alias("ipix")).groupBy("ipix").count().toPandas()

    # back to python world
    #myMap = np.zeros(12 * nside**2)
    #myMap[map['ipix'].values]=map['count'].values
    #ddt.append(timer.step())
    #timer.print(ana)

    return ddt
## Run this to clear predicted quality tables, in case you want to try again
clear_for_demo()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Generate data for demo

# COMMAND ----------

df = spark.range(1, 8000)

# Setup Temperature, Pressure, Duration
df = df.select("id",
               F.rand(seed=10).alias("temp_raw"),
               F.randn(seed=27).alias("pressure_raw"),
               F.rand(seed=45).alias("duration_raw"),
               F.randn(seed=54).alias("temp_n"),
               F.randn(seed=78).alias("pressure_n"),
               F.randn(seed=96).alias("duration_n"),
               F.round(F.rand() * 7.5 * 60, 0).alias("timestamp_n"))
df = df.withColumn('pid', (100000 + df["id"]))
df = (df.withColumn("temp_raw", (10.0 * df["temp_raw"]) + 350)
        .withColumn("pressure_raw", (2.0 * df["pressure_raw"]) + 12)
        .withColumn("duration_raw", (4.0 * df["duration_raw"]) + 28.5)
        .withColumn("timestamp",
                    ((df["id"] * 7.5 * 60) + 1561939200 + df["timestamp_n"]).cast('timestamp')))
df = df.withColumn("process_time", df["timestamp"])
df = df.withColumn("qualitycheck_time", F.date_trunc("day", F.date_add(df["timestamp"], 2)))
# id: id column<br>
# uniform: uniformly distributed column<br>
# uniform1: uniformly distributed column<br>
# normal: normally distributed column<br>
# normal1: normally distributed column<br>
# Y: column containing 0/1 values<br>
# NOTE: func.round is used to round the random numbers we generate<br>
# NOTE: You can follow the Spark jobs at http://localhost:4040/jobs/
# </font>

# In[33]:

df1 = sqlContext.range(0, 1000000).withColumn(
    'uniform', func.round(rand(seed=10), 2)).withColumn(
        'uniform1', func.round(rand(seed=9), 2)).withColumn(
            'normal', func.round(randn(seed=22), 2)).withColumn(
                'normal1', func.round(randn(seed=23), 2)).withColumn(
                    'Y', when(rand() > 0.5, 1).otherwise(0))

# <font size=4,font style=arial>
# Number of rows in df1
# </font>

# In[34]:

# number of rows in df1
df1.count()

# <font size=4,font style=arial>
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark.sql.context import SQLContext
from pyspark.context import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("sparkDataFrame")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df = df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well.
# A negative number would mean that as one variable increases,
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')
def break_ties(target):
    return (funcs.col(target) +
            (funcs.randn(conf['random_state']) / funcs.lit(10000000000))).alias(target)
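# Illustrative usage only (not from the original source). Assumes `funcs` is
# pyspark.sql.functions, conf = {'random_state': 42}, and a DataFrame `scores_df`
# with an 'id' column and a numeric 'score' column.
from pyspark.sql.window import Window

ranked = (scores_df
          .select('id', break_ties('score'))  # add tiny seeded noise so exact ties order deterministically
          .withColumn('rank', funcs.dense_rank().over(Window.orderBy(funcs.desc('score')))))
ranked.show(5)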