Example #1
# Assumed imports; `sc` (SparkContext), `sqlContext`, and `cluster` (a list of
# input file names) are defined elsewhere in the original program.
from pyspark.ml.feature import RegexTokenizer, Word2Vec
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType

# Read each column file and keep (value, count) pairs; the count is cast to
# int up front so keys that occur only once keep the same value type after
# reduceByKey (the original left them as strings, breaking schema inference).
inp = sc.emptyRDD()
for file in cluster:
    filePath = "/user/hm74/NYCColumns/" + file.replace("'", "")
    tmp = sc.textFile(filePath).map(lambda row: row.split("\t")).map(
        lambda x: (str(x[0]), int(x[1])))
    inp = sc.union([inp, tmp])

# Sum the counts for duplicate values
inp = inp.reduceByKey(lambda x, y: x + y)
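# `resub` is used below but never defined in this snippet; a plausible sketch,
# assuming it is a regex-cleaning UDF built on re.sub (the pattern here is
# hypothetical, not from the source):
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
resub = udf(lambda s: re.sub(r"[^0-9A-Za-z ]+", " ", s), StringType())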

df = sqlContext.createDataFrame(inp, ['inp', 'count'])
df = df.withColumn("sentence", resub(df.inp))  # clean raw values into tokenizable text

# tokenize the cleaned sentences into words
regexTokenized = RegexTokenizer(inputCol="sentence",
                                outputCol="words").transform(df)
regexTokenized = regexTokenized.select("sentence", "words", "count")

# embed the token lists as fixed-length vectors with Word2Vec
print("word2vec")
word2vec = Word2Vec(inputCol="words", outputCol="features").setMinCount(0)
model = word2vec.fit(regexTokenized)
result = model.transform(regexTokenized)
result.createOrReplaceTempView("result")

# duplicate each row `count` times so k-means weights values by frequency
print("flat")
n_to_array = F.udf(lambda n: [1] * int(n), ArrayType(IntegerType()))
df2 = result.withColumn("n", n_to_array(result["count"]))
flat = df2.withColumn('n', F.explode(df2.n))
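# Design note: on Spark 3.0+, pyspark.ml.clustering.KMeans accepts a weightCol,
# which would avoid materializing `count` copies of every row; the explode
# trick above is the workaround for older versions.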

# fit k-means and save the model
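# The original example is truncated at this point; a minimal sketch of the
# step the comment above describes, assuming pyspark.ml.clustering.KMeans with
# a placeholder k and save path (both are assumptions, not from the source):
from pyspark.ml.clustering import KMeans
kmeans_model = KMeans(featuresCol="features", k=5, seed=1).fit(flat)
kmeans_model.write().overwrite().save("kmeans_model")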
Example #2
    # Assumed imports; `data` (a DataFrame with "message" and "label" columns)
    # is defined elsewhere in the original program.
    from pyspark.sql.functions import lower, length
    from pyspark.ml.feature import (RegexTokenizer, StopWordsRemover, HashingTF,
                                    IDF, StringIndexer, VectorAssembler)
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # Lowercase the message text and add its character length as an extra feature
    feature_prep = data.select(lower(data["message"]).alias("message"),
                               length(data["message"]).alias("length"),
                               "label")

    # Tokenize on runs of non-word characters
    feature_prep = RegexTokenizer(inputCol="message", outputCol="words",
                                  pattern="\\W+").transform(feature_prep)

    # Drop common English stop words
    feature_prep = StopWordsRemover(inputCol="words",
                                    outputCol="stop_words_removed").transform(feature_prep)

    # Hash tokens into a 4000-dimensional term-frequency vector
    feature_prep = HashingTF(inputCol="stop_words_removed", outputCol="hashing_tf",
                             numFeatures=4000).transform(feature_prep)

    # Reweight term frequencies by inverse document frequency
    feature_prep = IDF(inputCol="hashing_tf",
                       outputCol="tf_idf").fit(feature_prep).transform(feature_prep)

    # Encode the string label as a numeric index
    feature_prep = StringIndexer(inputCol="label",
                                 outputCol="label_indexed").fit(feature_prep).transform(feature_prep)

    # Combine the TF-IDF vector and the message length into one feature vector
    feature_prep = VectorAssembler(inputCols=["tf_idf", "length"],
                                   outputCol="features").transform(feature_prep)

    final_data = feature_prep.select("label_indexed", "features")


    # Split data into train and test sets
    train_data, test_data = final_data.randomSplit([0.7,0.3])

    # Model training
    classifier = RandomForestClassifier(featuresCol="features", labelCol="label_indexed",
                                        numTrees=100, maxDepth=25)
    model = classifier.fit(train_data)


    # Transform the test data using the model to get predictions
    predicted_test_data = model.transform(test_data)


    # Evaluate the model performance
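    # The snippet ends here; a plausible completion of the evaluation step,
    # assuming MulticlassClassificationEvaluator (imported above) and the F1
    # metric (both are assumptions, not from the source):
    evaluator = MulticlassClassificationEvaluator(labelCol="label_indexed",
                                                  predictionCol="prediction",
                                                  metricName="f1")
    f1 = evaluator.evaluate(predicted_test_data)
    print("Test set F1 score: {:.4f}".format(f1))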