from environment import spark
from pyspark.sql.types import IntegerType
from pyspark.sql.column import Column

df = spark.createDataFrame([(0, "Hi I heard about Spark"),
                            (1, "I wish Java could use case classes"),
                            (2, "Logistic,regression,models,are,neat")],
                           ["id", "sentence"])


class Coll(Column):
    # Thin Column wrapper; cast is left as an unimplemented stub here.
    def __init__(self, jc):
        super().__init__(jc)

    def cast(self):
        return


# Casting the text column to IntegerType runs, but yields null for non-numeric strings.
sen_col = df['sentence']
dd = df.withColumn('sentence', sen_col.cast(IntegerType()))
dd.show()
from environment import spark
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print(r1[0])
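# The same Correlation.corr call also takes a method name; "spearman" gives the
# Spearman rank correlation matrix (see pyspark.ml.stat.Correlation).
r2 = Correlation.corr(df, "features", "spearman").head()
print(r2[0])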
from environment import spark
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

data = [(0.0, Vectors.dense(0.5, 10.0)),
        (0.0, Vectors.dense(1.5, 20.0)),
        (1.0, Vectors.dense(1.5, 30.0)),
        (0.0, Vectors.dense(3.5, 30.0)),
        (0.0, Vectors.dense(3.5, 40.0)),
        (1.0, Vectors.dense(3.5, 40.0))]
df = spark.createDataFrame(data, ["label", "features"])

rr = ChiSquareTest.test(df, "features", "label")
rr.show()
# rr is a DataFrame; take its single row to read the actual p-values.
r = rr.head()
print(str(r.pValues))
from environment import spark
import json

df1 = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"),
                             (5, "f"), (6, "g"), (7, "h"), (8, "i"), (9, "j"),
                             (10, "k")], ["id", "words"])
rdd = df1.toJSON()
# print(df1.toJSON().first())
from environment import spark
from pyspark.sql.functions import col

df1 = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"),
                             (5, "f"), (6, "g"), (7, "h"), (8, "i"), (9, "j"),
                             (10, "k")], ["id1", "words"])
df2 = spark.createDataFrame([(0, "shanghai"), (1, "nanjing"), (3, "shanghai"),
                             (4, "suzhou"), (5, "pizhou"), (6, "ningbo"),
                             (7, "内蒙古"), (8, "广州"), (9, "厦门"),
                             (20, "哈尔滨")], ["id2", "address"])

# df1.filter(df1.id1 < 5).join(df2, df1.id1 == df2.id2).groupBy(df2.address).agg({"id1": "max"}).show()
# df1.agg({"id1": "max"}).show()
# df_as1 = df1.alias("df_as1")
# df_as2 = df1.alias("df_as2")
# joined_df = df_as1.join(df_as2, col("df_as1.id1") == col("df_as2.id1"), 'inner')
# joined_df.select("df_as1.words", "df_as2.words", "df_as2.id1").show()
# print(df1.coalesce(2).rdd.getNumPartitions())

df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"])
# print(df.colRegex("`(Col)?+.+`"))
df.select(df.colRegex("`(Col)[1|2]+`")).show()
from environment import spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I I I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")],
    ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show(truncate=False)
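# Optional sketch, reusing wordsData from above: HashingTF also has a `binary`
# flag that clips term counts to 0/1, which can be useful for very short texts.
binaryTF = HashingTF(inputCol="words", outputCol="binaryFeatures", numFeatures=100, binary=True)
binaryTF.transform(wordsData).show(truncate=False)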
from environment import spark
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

# json.createGlobalTempView("people")
# spark.sql("SELECT name FROM global_temp.people").show()
# spark.newSession().sql("SELECT name FROM global_temp.people").show()

# sc = spark.sparkContext
# lines = sc.textFile("hdfs://192.168.30.117:9000/tmp/txt")
# parts = lines.map(lambda l: l.split(','))
# people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
# schemaPeople = spark.createDataFrame(people)
# schemaPeople.createOrReplaceTempView("people")
# teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
# teenName = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
# for name in teenName:
#     print(name)

sc = spark.sparkContext
lines = sc.textFile("hdfs://192.168.30.117:9000/tmp/txt")
parts = lines.map(lambda l: l.split(','))
people = parts.map(lambda p: (p[0], p[1].strip()))

# Build the schema programmatically; age stays a string and is cast implicitly
# by Spark SQL in the comparison below.
schemaString = 'name age'
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaString.split()
]
schema = StructType(fields)

schemaPeople = spark.createDataFrame(people, schema)
schemaPeople.createOrReplaceTempView("people")
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")
teenagers.show()
from pyspark import SparkContext, SparkConf
from os import environ
from environment import spark
from pyspark.ml.feature import VectorAssembler

# environ['JAVA_HOME'] = 'D:\Program Files\Java\jdk1.8.0_181'
# environ['HADOOP_HOME'] = 'D:\hadoop-3.1.2'
# environ['SPARK_HOME'] = 'D:\spark-2.4.3-bin-hadoop2.7\spark-2.4.3-bin-hadoop2.7'
#
# conf = SparkConf() \
#     .setAppName("demo") \
#     .setMaster("spark://192.168.30.247:7077") \
#     .set("spark.driver.host", "192.168.30.109") \
#     .set("spark.cores.max", "4") \
#     .set("spark.executor.memory", "512m")
# sc = SparkContext(conf=conf)
#
# data = sc.parallelize(['a', 'a', 'd', 'd', 'b', 'c', 'd', 'e', 'f', 'g'])
# pairs = data.map(lambda s: (s, 1))
# counts = pairs.reduceByKey(lambda a, b: a + b)
# print(counts.sortByKey().collect())

df = spark.createDataFrame([(1, 11, 19)], ["id", "name", "age"])
ddf = VectorAssembler(inputCols=["name", "age"], outputCol='features').transform(df)
ddf.show()
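# Hedged sketch: VectorAssembler's handleInvalid option ("error"/"skip"/"keep",
# Spark 2.4+) controls what happens to rows containing nulls; the DataFrame
# below is made up for illustration only.
df_null = spark.createDataFrame([(1, None, 19), (2, 12, 20)], ["id", "name", "age"])
VectorAssembler(inputCols=["name", "age"], outputCol="features",
                handleInvalid="skip").transform(df_null).show()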
from environment import spark
from pyspark.ml.feature import VectorAssembler

df = spark.createDataFrame([(0, "a"), (0, "b"), (0, "c"), (0, "d"), (0, "e"),
                            (0, "f"), (0, "g"), (0, "h"), (0, "i"), (0, "j"),
                            (1, "k")], ["id", "words"])


def df2vector(df, features, label):
    if isinstance(features, str):
        features = features.split(',')
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    return assembler.transform(df).select("features", df[label].alias('label')), df


# dfs = df.randomSplit([0.3, 0.7])
v, _ = df2vector(df, "id", "id")
v.show()
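# Hedged usage sketch for df2vector with a comma-separated feature list; the
# small numeric DataFrame below is made up for illustration.
df_num = spark.createDataFrame([(1.0, 2.0, 0.0), (3.0, 4.0, 1.0)], ["f1", "f2", "y"])
v2, _ = df2vector(df_num, "f1,f2", "y")
v2.show()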
from environment import spark
from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(0, "a b c d".split(" ")),
                            (1, "a b b c a d".split(" "))], ["id", "words"])

# cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=100, minDF=2.0)
# model = cv.fit(df)
#
# df1 = spark.createDataFrame([
#     (0, "a b c d".split(" ")),
#     (1, "a b b c a d".split(" "))
# ], ["id", "words"])
#
# result = model.transform(df1)
# result.show(truncate=False)

# agg() needs at least one aggregate expression; aggregate the id column here.
df.agg({"id": "max"}).show()
from environment import spark
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)
model1 = lr.fit(training)

paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.

# You can combine paramMaps, which are python dictionaries.
paramMap2 = {lr.probabilityCol: "myProbability"}  # Change output column name
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)

# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)

test = spark.createDataFrame([(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
                              (0.0, Vectors.dense([3.0, 2.0, -0.1])),
                              (1.0, Vectors.dense([0.0, 2.2, -1.5]))],
                             ["label", "features"])

# Score the test set with model2; its probability column was renamed to myProbability above.
prediction = model2.transform(test)
prediction.select("features", "label", "myProbability", "prediction").show(truncate=False)
from environment import spark
from pyspark.ml.feature import Word2Vec

documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )],
    ["text"])

word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
result.show(truncate=False)
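# Hedged follow-up: the fitted Word2VecModel also exposes getVectors() and
# findSynonyms(); "Spark" is just a token taken from the toy corpus above.
model.getVectors().show(truncate=False)
model.findSynonyms("Spark", 2).show()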
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from environment import spark

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
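# Minimal continuation (following the Spark ML Pipeline docs pattern) for the
# step described by the comment above: transform the test set and print the
# id, text, probability and prediction columns.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, pred = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), pred))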
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from environment import spark

df = spark.createDataFrame([(1.0, Vectors.dense(1.0, 1.0, 1.0)),
                            (0.0, Vectors.dense(1.0, 2.0, 3.0))],
                           ["weight", "features"])

summarizer = Summarizer.metrics("mean", "count")
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)
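# Hedged extra: Summarizer.metrics also accepts other statistic names such as
# "variance", "max" and "numNonZeros" (see pyspark.ml.stat.Summarizer).
extra = Summarizer.metrics("variance", "max", "numNonZeros")
df.select(extra.summary(df.features)).show(truncate=False)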