Example #1
from environment import spark
from pyspark.sql.types import IntegerType
from pyspark.sql.column import Column

df = spark.createDataFrame([(0, "Hi I heard about Spark"),
                            (1, "I wish Java could use case classes"),
                            (2, "Logistic,regression,models,are,neat")],
                           ["id", "sentence"])


class Coll(Column):
    """Thin Column wrapper; cast() simply delegates to the parent implementation."""

    def __init__(self, jc):
        super().__init__(jc)

    def cast(self, dataType):
        return super().cast(dataType)


sen_col = df['sentence']
# Casting a non-numeric string column to IntegerType yields null values.
dd = df.withColumn('sentence', sen_col.cast(IntegerType()))
dd.show()
Example #2
from environment import spark
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
df = spark.createDataFrame(data, ["features"])
r1 = Correlation.corr(df, "features").head()
print(r1[0])
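
# Hedged sketch: Correlation.corr also accepts a method argument, so the Spearman
# rank correlation of the same column can presumably be obtained the same way.
r2 = Correlation.corr(df, "features", "spearman").head()
print(r2[0])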
Example #3
from environment import spark
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

data = [(0.0, Vectors.dense(0.5, 10.0)),
        (0.0, Vectors.dense(1.5, 20.0)),
        (1.0, Vectors.dense(1.5, 30.0)),
        (0.0, Vectors.dense(3.5, 30.0)),
        (0.0, Vectors.dense(3.5, 40.0)),
        (1.0, Vectors.dense(3.5, 40.0))]
df = spark.createDataFrame(data, ["label", "features"])
rr = ChiSquareTest.test(df, "features", "label")
rr.show()
# head() returns the single result Row, whose pValues field holds the actual values.
print(str(rr.head().pValues))
Example #4
from environment import spark
import json

df1 = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"),
                             (5, "f"), (6, "g"), (7, "h"), (8, "i"), (9, "j"),
                             (10, "k")], ["id", "words"])

# toJSON() converts each row to a JSON string and returns them as an RDD.
rdd = df1.toJSON()

print(rdd.first())
Example #5
from environment import spark
from pyspark.sql.functions import col

df1 = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"),
                             (5, "f"), (6, "g"), (7, "h"), (8, "i"), (9, "j"),
                             (10, "k")], ["id1", "words"])

df2 = spark.createDataFrame([(0, "shanghai"), (1, "nanjing"), (3, "shanghai"),
                             (4, "suzhou"), (5, "pizhou"), (6, "ningbo"),
                             (7, "内蒙古"), (8, "广州"), (9, "厦门"), (20, "哈尔滨")],
                            ["id2", "address"])

# df1.filter(df1.id1 < 5).join(df2, df1.id1 == df2.id2).groupBy(df2.address).agg({"id1": "max"}).show()
# df1.agg({"id1": "max"}).show()

# df_as1 = df1.alias("df_as1")
# df_as2 = df1.alias("df_as2")
# joined_df = df_as1.join(df_as2, col("df_as1.id1") == col("df_as2.id1"), 'inner')
# joined_df.select("df_as1.words", "df_as2.words", "df_as2.id1").show()

# print(df1.coalesce(2).rdd.getNumPartitions())

df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"])
# print(df.colRegex("`(Col)?+.+`"))
# colRegex selects columns whose names match the given (Java) regular expression.
df.select(df.colRegex("`(Col)[1|2]+`")).show()
Example #6
from environment import spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I I I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show(truncate=False)
Example #7
from environment import spark
from pyspark.sql.types import StructType, StructField, StringType
# json.createGlobalTempView("people")
# spark.sql("SELECT name FROM global_temp.people").show()
# spark.newSession().sql("SELECT name FROM global_temp.people").show()

# sc = spark.sparkContext
# lines = sc.textFile("hdfs://192.168.30.117:9000/tmp/txt")
# parts = lines.map(lambda l: l.split(','))
# people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
# schemaPeople = spark.createDataFrame(people)
# schemaPeople.createOrReplaceTempView("people")
# teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <=19")
# teenName = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
# for name in teenName:
#     print(name)

sc = spark.sparkContext
lines = sc.textFile("hdfs://192.168.30.117:9000/tmp/txt")
parts = lines.map(lambda l: l.split(','))
people = parts.map(lambda p: (p[0], p[1].strip()))
schemaString = 'name age'
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaString.split()
]
schema = StructType(fields)

schemaPeople = spark.createDataFrame(people, schema)
schemaPeople.createOrReplaceTempView("people")
# Note: age is a StringType column here; Spark implicitly casts it for the comparison.
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")
teenagers.show()
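
# Hedged sketch: instead of relying on implicit string-to-number coercion in the SQL
# above, the age column could be cast explicitly first (typedPeople is an illustrative name).
from pyspark.sql.functions import col
typedPeople = schemaPeople.withColumn("age", col("age").cast("int"))
typedPeople.filter(col("age").between(13, 19)).show()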
Example #8
from pyspark import SparkContext, SparkConf
from os import environ
from environment import spark

from pyspark.ml.feature import VectorAssembler

# environ['JAVA_HOME'] = 'D:\Program Files\Java\jdk1.8.0_181'
# environ['HADOOP_HOME'] = 'D:\hadoop-3.1.2'
# environ['SPARK_HOME'] = 'D:\spark-2.4.3-bin-hadoop2.7\spark-2.4.3-bin-hadoop2.7'
#
# conf = SparkConf() \
#     .setAppName("demo") \
#     .setMaster("spark://192.168.30.247:7077") \
#     .set("spark.driver.host", "192.168.30.109") \
#     .set("spark.cores.max", "4") \
#     .set("spark.executor.memory", "512m")
# sc = SparkContext(conf=conf)
#
# data = sc.parallelize(['a', 'a', 'd', 'd', 'b', 'c', 'd', 'e', 'f', 'g'])
# pairs = data.map(lambda s: (s, 1))
# counts = pairs.reduceByKey(lambda a, b: a + b)
# print(counts.sortByKey().collect())

df = spark.createDataFrame([(1, 11, 19)], ["id", "name", "age"])
ddf = VectorAssembler(inputCols=["name", "age"],
                      outputCol='features').transform(df)
ddf.show()
Example #9
from environment import spark
from pyspark.ml.feature import VectorAssembler

df = spark.createDataFrame([(0, "a"), (0, "b"), (0, "c"), (0, "d"), (0, "e"),
                            (0, "f"), (0, "g"), (0, "h"), (0, "i"), (0, "j"),
                            (1, "k")], ["id", "words"])


def df2vector(df, features, label):
    """Assemble the given feature columns into a single 'features' vector column
    and return (vectorized DataFrame with a 'label' column, original DataFrame)."""
    if isinstance(features, str):
        features = features.split(',')

    assembler = VectorAssembler(inputCols=features, outputCol="features")

    return assembler.transform(df).select("features",
                                          df[label].alias('label')), df


# dfs = df.randomSplit([0.3, 0.7])
v, _ = df2vector(df, "id", "id")
v.show()
Example #10
from environment import spark
from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(0, "a b c d".split(" ")),
                            (1, "a b b c a d".split(" "))], ["id", "words"])

# cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=100, minDF=2.0)
# model = cv.fit(df)
#
# df1 = spark.createDataFrame([
#     (0, "a b c d".split(" ")),
#     (1, "a b b c a d".split(" "))
# ], ["id", "words"])
#
# result = model.transform(df1)
# result.show(truncate=False)

# agg() requires at least one aggregate expression; an empty call raises an error.
df.agg({"id": "max"}).show()
Example #11
from environment import spark
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])
lr = LogisticRegression(maxIter=10, regParam=0.01)
model1 = lr.fit(training)

paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({
    lr.regParam: 0.1,
    lr.threshold: 0.55
})  # Specify multiple Params.

# You can combine paramMaps, which are python dictionaries.
paramMap2 = {lr.probabilityCol: "myProbability"}  # Change output column name
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)

# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)

test = spark.createDataFrame([(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
                              (0.0, Vectors.dense([3.0, 2.0, -0.1])),
                              (1.0, Vectors.dense([0.0, 2.2, -1.5]))],
                             ["label", "features"])
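
# Hedged sketch of the natural next step: score the test rows with model2. The
# "myProbability" column name is assumed from paramMapCombined above.
prediction = model2.transform(test)
prediction.select("features", "label", "myProbability", "prediction").show(truncate=False)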
Example #12
from environment import spark
from pyspark.ml.feature import Word2Vec

documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )], ["text"])

word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="text",
                    outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
result.show(truncate=False)
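
# Hedged sketch: Word2VecModel also exposes findSynonyms(word, num); "Spark" should be
# in the vocabulary here because minCount=0 keeps every token.
model.findSynonyms("Spark", 2).show()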
Example #13
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from environment import spark

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
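# A hedged sketch of that step: transform the test documents with the fitted pipeline
# and print the columns named above.
prediction = model.transform(test)
for row in prediction.select("id", "text", "probability", "prediction").collect():
    print("(%d, %s) --> prob=%s, prediction=%f" %
          (row.id, row.text, str(row.probability), row.prediction))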
Example #14
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from environment import spark

df = spark.createDataFrame([(1.0, Vectors.dense(1.0, 1.0, 1.0)),
                            (0.0, Vectors.dense(1.0, 2.0, 3.0))],
                           ["weight", "features"])
summarizer = Summarizer.metrics("mean", "count")
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
df.select(summarizer.summary(df.features)).show(truncate=False)
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
df.select(Summarizer.mean(df.features)).show(truncate=False)
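
# Hedged sketch: Summarizer exposes further metrics (e.g. variance, max, min in
# recent Spark versions); they follow the same pattern as mean above.
df.select(Summarizer.variance(df.features), Summarizer.max(df.features)).show(truncate=False)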