from pyspark.ml.feature import RegexTokenizer, Word2Vec
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType

# Union every column file in the cluster into a single (value, count) RDD
inp = sc.emptyRDD()
for file in cluster:
    filePath = "/user/hm74/NYCColumns/" + file.replace("'", "")
    tmp = sc.textFile(filePath) \
        .map(lambda row: row.split("\t")) \
        .map(lambda x: (str(x[0]), x[1]))
    inp = sc.union([inp, tmp])

# Sum the counts for values that appear in more than one file
# (counts are read as strings, hence the int() casts)
inp = inp.reduceByKey(lambda x, y: int(x) + int(y))
df = sqlContext.createDataFrame(inp, ['inp', 'count'])
df = df.withColumn("sentence", resub(df.inp))  # resub: text-cleaning UDF defined elsewhere

# Tokenize words
regexTokenized = RegexTokenizer(inputCol="sentence", outputCol="words").transform(df)
regexTokenized = regexTokenized.select("sentence", "words", "count")

# Transform words to vectors
print("word2vec")
word2vec = Word2Vec(inputCol="words", outputCol="features").setMinCount(0)
model = word2vec.fit(regexTokenized)
result = model.transform(regexTokenized)
result.createOrReplaceTempView("result")

# Duplicate each row `count` times so k-means sees the true frequencies
print("flat")
n_to_array = F.udf(lambda n: [1] * int(n), ArrayType(IntegerType()))
df2 = result.withColumn("n", n_to_array(result["count"]))
flat = df2.withColumn('n', F.explode(df2.n))

# Fit k-means and save the model
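# The original snippet ends at the "fit k-means" comment above. A minimal
# sketch of that step, assuming pyspark.ml.clustering.KMeans trained on the
# exploded `flat` DataFrame; the cluster count k=5, the seed, and the save
# path are placeholders, not values from the original:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(featuresCol="features", k=5, seed=42)  # k and seed assumed
kmeans_model = kmeans.fit(flat)
kmeans_model.save("kmeans_model")  # hypothetical output path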
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import (HashingTF, IDF, RegexTokenizer,
                                StopWordsRemover, StringIndexer, VectorAssembler)
from pyspark.sql.functions import length, lower

# Feature engineering: lowercase the message and keep its length as an extra feature
feature_prep = data.select(lower(data["message"]).alias("message"),
                           length(data["message"]).alias("length"),
                           "label")
feature_prep = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W+").transform(feature_prep)
feature_prep = StopWordsRemover(inputCol="words", outputCol="stop_words_removed").transform(feature_prep)
feature_prep = HashingTF(inputCol="stop_words_removed", outputCol="hashing_tf", numFeatures=4000).transform(feature_prep)
feature_prep = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(feature_prep).transform(feature_prep)
feature_prep = StringIndexer(inputCol="label", outputCol="label_indexed").fit(feature_prep).transform(feature_prep)
feature_prep = VectorAssembler(inputCols=["tf_idf", "length"], outputCol="features").transform(feature_prep)
final_data = feature_prep.select("label_indexed", "features")

# Split data into train and test sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Model training
classifier = RandomForestClassifier(featuresCol="features", labelCol="label_indexed", numTrees=100, maxDepth=25)
model = classifier.fit(train_data)

# Transform the test data using the model to get predictions
predicted_test_data = model.transform(test_data)

# Evaluate the model performance
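# The snippet stops at the evaluation comment above. A minimal sketch of that
# step using MulticlassClassificationEvaluator; the choice of accuracy as the
# metric is an assumption, not from the original:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label_indexed",
                                              predictionCol="prediction",
                                              metricName="accuracy")  # metric assumed
accuracy = evaluator.evaluate(predicted_test_data)
print("Test accuracy: {:.3f}".format(accuracy))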