def get_cv_model(self):
    """Return broadcast dictionaries mapping items and action types to ids.

    Loads the persisted CountVectorizer model when ``self.has_cv`` is set,
    otherwise fits a fresh one on the training data and saves it. Ids are
    assigned from 1 upward in vocabulary order.
    """
    model_file = os.path.join(model_pth, model_name)
    if self.has_cv:
        from pyspark.ml.feature import CountVectorizerModel
        cv_model = CountVectorizerModel.load(model_file)
    else:
        from pyspark.ml.feature import CountVectorizer
        training_data = self._fit()
        vectorizer = CountVectorizer(inputCol='item_seq',
                                     outputCol='item_seq_enc',
                                     vocabSize=1 << 20,
                                     minTF=0,
                                     minDF=0)
        cv_model = vectorizer.fit(training_data)
        cv_model.write().overwrite().save(model_file)

    vocab = cv_model.vocabulary  # 579012
    action_vocab = [
        'clickout item', 'interaction item deals', 'interaction item image',
        'interaction item info', 'search for item', 'interaction item rating'
    ]
    # ids start at 1 so 0 can stay reserved (e.g. for padding)
    item2id = {token: idx for idx, token in enumerate(vocab, start=1)}
    action2id = {name: idx for idx, name in enumerate(action_vocab, start=1)}
    spark_ctx = self.sqlContext.sparkContext
    bitem2id = spark_ctx.broadcast(item2id)
    baction2id = spark_ctx.broadcast(action2id)
    print("Item size:", len(item2id))
    return bitem2id, baction2id
def transform_model(sqlContext, modelDataframe):
    """Apply the persisted CountVectorizer model to *modelDataframe*.

    :param sqlContext: unused here; kept for interface compatibility
    :param modelDataframe: dataframe with the column the model was fit on
    :return: dataframe with the count-vector feature column appended
    """
    cv_model = CountVectorizerModel.load("models/cvModel")
    return cv_model.transform(modelDataframe)
def loadModel(path):
    """Load a fitted CountVectorizer model from *path* and return it.

    :param path: filesystem/HDFS location of the saved model
    :return: the loaded CountVectorizerModel
    """
    return CountVectorizerModel.load(path)
def __init__(self):
    """Load the article database and the fitted LDA / CountVectorizer models."""
    self.NUM_TOPICS = 3
    db_path = "/home/hadoop/csce678-project/LDA/db.csv"
    # de-duplicate articles on their ID column before handing to Spark
    records = pd.read_csv(db_path, lineterminator='\n').drop_duplicates(['ID'])
    self.db = spark.createDataFrame(records)
    self.ldaModel = LocalLDAModel.load(
        "/home/hadoop/csce678-project/LDA/lda_model")
    self.model = CountVectorizerModel.load(
        "/home/hadoop/csce678-project/LDA/token")
def createCV(df, sc, col_name='text'):
    """Return (vectorized_df, model, vectorizer), reusing cached artifacts.

    First tries to load a previously written vectorized dataframe, the
    CountVectorizer estimator and the fitted model from disk; on any load
    failure it fits from *df* and persists all three for the next run.

    :param df: dataframe with a token-array column named *col_name*
    :param sc: Spark session used to read/write parquet
    :param col_name: name of the input token column
    :return: tuple (df_r, model, cv)
    """
    cv = CountVectorizer(inputCol=col_name, outputCol="features", minDF=5)
    try:
        df_r = sc.read.parquet("data/vectorized.parquet")
        cv = CountVectorizer.load("data/vectorized_cv.parquet")
        model = CountVectorizerModel.load("data/vectorized_model.parquet")
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit / KeyboardInterrupt. Any load failure (missing or
        # corrupt artifacts) falls back to fitting from scratch.
        model = cv.fit(df)
        df_r = model.transform(df)
        df_r.write.parquet("data/vectorized.parquet")
        cv.save("data/vectorized_cv.parquet")
        model.save("data/vectorized_model.parquet")
    return (df_r, model, cv)
def _write_topic_report(review_path, cv_path, lda_path, out_path):
    """Write one 'topic N : score*term + ...' report file for a model set."""
    # Read kept for parity with the original code even though the result
    # was never used there either.
    spark.read.parquet(review_path)
    cv_model = CountVectorizerModel.load(cv_path)
    ldamodel = LocalLDAModel.load(lda_path)
    topics = ldamodel.describeTopics(
        maxTermsPerTopic=10).rdd.map(lambda x: list(x)).collect()
    vocabulary = cv_model.vocabulary
    # with-statement guarantees the report file is closed even on error
    # (the original left the handle open if any line above raised).
    with open(out_path, "w+") as fout:
        for topic in range(len(topics)):
            fout.write("topic {} : \n".format(topic))
            words = topics[topic][1]   # term indices into the vocabulary
            scores = topics[topic][2]  # matching term weights
            stri = ''
            for word in range(len(words)):
                stri += str(scores[word]) + "*" + vocabulary[words[word]] + " + "
            # strip the trailing " + " separator
            fout.write(stri[:-3] + "\n")


def main():
    """Dump the top-10 weighted terms of every LDA topic for both sentiments.

    The original duplicated the whole report-writing body for the positive
    and negative model sets; both now share ``_write_topic_report``.
    """
    _write_topic_report("topic_modelling/review_topics_pos",
                        "topic_modelling/cvmodel_pos",
                        "topic_modelling/ldamodel_pos",
                        "topic_modelling/postive_topics")
    _write_topic_report("topic_modelling/review_topics_neg",
                        "topic_modelling/cvmodel_neg",
                        "topic_modelling/ldamodel_neg",
                        "topic_modelling/negative_topics")
def countVectorizer(self, infoData):
    """Re-apply a persisted CountVectorizer encoding to the current dataset.

    Looks up the saved model path for the original column, drops any stale
    encoded column, transforms the dataset, then re-runs feature assembly.
    Returns the updated *infoData* mapping.
    """
    srcColumn = infoData.get(pc.ORIGINALCOLMNAME)
    dataset = infoData.get(pc.DATASET)
    modelPath = infoData.get(pc.ONEHOTENCODERPATHMAPPING).get(srcColumn)
    vectorizer = CountVectorizerModel.load(modelPath)
    # drop the previous encoding before re-transforming so names don't clash
    staleColumn = infoData.get(pc.ENCODEDCOLM)
    dataset = vectorizer.transform(dataset.drop(staleColumn))
    infoData.update({pc.DATASET: dataset})
    return pu.featureAssembler(infoData)
def topicPredict(inputs):
    """Return the documents most similar to *inputs* by LDA topic cosine.

    Tokenizes and cleans the query text, projects it through the saved
    TF/IDF and LDA models, then ranks the stored documents by cosine
    similarity of topic distributions and returns the top *n* rows.
    """
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  # number of similar document to return
    feature = "abstract"  # feature to compare

    # single-row dataframe holding the query text
    query_df = sc.parallelize([(0, query)]).toDF(["id", feature])
    tokenizer = RegexTokenizer(inputCol=feature, outputCol="words",
                               pattern="\\P{Alpha}+")
    tokenized = tokenizer.transform(query_df)
    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    without_stopwords = remover.transform(tokenized)
    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    prepared = without_stopwords.withColumn(
        "words3", udf_remove_words(without_stopwords.words2))

    # text to feature vector - TF_IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    tf_frame = countTF_model.transform(prepared)
    idf_model = IDFModel.load(output_path + "/idf_model")
    tfidf_frame = idf_model.transform(tf_frame)

    # LDA model: topic distribution for the query document
    lda_model = LocalLDAModel.load(output_path + "/lda_model")
    with_topics = lda_model.transform(tfidf_frame)
    feature_vector = with_topics.select("id", "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    # rank the existing documents by cosine similarity to the query vector
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(
        lambda x_vector: cosineSimilarity(x_vector, feature_vector),
        FloatType())
    ranked = df_Document.withColumn(
        "similarity", udf_cosineSimilarity("topicDistribution"))
    ranked = ranked.sort(desc("similarity"))
    return ranked.limit(n).select("_id", "title", "abstract", "url",
                                  "topicDistribution").collect()
def run(sc, args):
    """Hash a paragraph dataset with a pre-trained VDSH model.

    args layout: [ds_path, output_path, model_path, model_name,
                  original_dim, hidden_dim, latent_dim]
    Reads a pickled dataframe, converts paragraphs to TF-IDF with the
    saved CountVectorizer model, and hashes each partition, saving the
    result as a pickle file.
    """
    ds_path = args[0]
    output_path = args[1]
    model_path = args[2]
    model_name = args[3]
    original_dim = int(args[4])
    hidden_dim = int(args[5])
    latent_dim = int(args[6])
    # CountVectorizer model fitted during the pipeline's training stage
    cv_model_path = 'text-reuse/pipeline/vsh-hashing/cv_model'
    cv_model = CountVectorizerModel.load(cv_model_path)
    # Pre-computed binarization thresholds, one per latent dimension.
    # NOTE(review): any latent_dim other than 8 or 16 falls through to
    # the 32-dim array — confirm only 8/16/32 are valid inputs.
    threshold = None
    if latent_dim == 8:
        threshold = np.array([
            0.1457837, 0.061413, -0.03391605, 0.04686656, -0.14745404,
            -0.08641829, -0.04190724, -0.05972087])
    elif latent_dim == 16:
        threshold = np.array([
            0.00231892, -0.00791987, 0.00027306, 0.07018767, -0.07945273,
            0.01763633, 0.01450929, 0.04488222, -0.0289745, 0.02851318,
            0.01496754, 0.00133035, -0.00523619, -0.10513094, 0.07906742,
            -0.07930097])
    else:
        threshold = np.array([
            -0.01227623, -0.00382998, -0.00029179, -0.04484864, -0.02657753,
            0.01505825, 0.00319679, -0.01186464, -0.03057225, 0.02324941,
            0.01272652, -0.01289577, -0.02995954, 0.04656317, -0.01781761,
            -0.01934269, 0.1332021, 0.00064231, 0.01289176, -0.00131864,
            0.02279386, -0.06245026, -0.02096441, 0.01817522, 0.02722896,
            0.0211685, 0.01392594, -0.06448705, 0.00062385, 0.02365676,
            -0.01207885, 0.02566718])
    # loader wraps the trained variational deep semantic hashing model
    vdsh_loader = VSH.VDSHLoader(model_path, model_name, threshold,
                                 original_dim, hidden_dim, latent_dim)
    df = sc.pickleFile(ds_path).toDF()
    tfidf_df = tfidf(df, cv_model, 'paragraph', 'tfidf')
    # repartition so hashing work is spread over many small partitions
    tfidf_rdd = tfidf_df.rdd.repartition(8000)
    tfidf_rdd = tfidf_rdd.mapPartitions(
        lambda p: hash_partition(p, vdsh_loader))
    tfidf_rdd.saveAsPickleFile(output_path)
def main(context):
    """Main function takes a Spark SQL context."""
    # Cached inputs, written as parquet by an earlier run.
    comments_df = context.read.parquet("comments.parquet")
    submissions_df = context.read.parquet("submissions.parquet")
    labeled_data_df = context.read.parquet("labeled_data.parquet")
    if path.exists("df_label.parquet"):
        labeled_df = context.read.parquet("df_label.parquet")
        # NOTE(review): this branch calls cleanedCommentDF while the else
        # branch calls cleanedCommentsDF — confirm both helpers exist and
        # which spelling is intended.
        comments_df = cleanedCommentDF(comments_df)
    else:
        labeled_df = createLabeledDF(comments_df, labeled_data_df)
        comments_df = cleanedCommentsDF(comments_df)
    # Reuse persisted models when present; otherwise train from scratch.
    if path.exists("cvModel"):
        cvModel = CountVectorizerModel.load("cvModel")
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
    else:
        cvModel, posModel, negModel = train(labeled_df)
    # the "final join" without actually joining!
    output = cvModel.transform(comments_df)
    output = output.drop('score')
    output = output.drop('ngrams_combined')
    output = output.drop('link_id_cleaned')
    # Score with the positive classifier; keep only its probability column.
    posResult = posModel.transform(output)
    posResult = posResult.drop('rawPrediction')
    posResult = posResult.drop('prediction')
    posResult = posResult.withColumnRenamed('probability', 'pos_prob')
    # Then run the negative classifier on top of the positive result.
    fullResult = negModel.transform(posResult)
    fullResult = fullResult.withColumnRenamed('probability', 'neg_prob')
    fullResult = fullResult.drop('rawPrediction')
    fullResult = fullResult.drop('prediction')
    # Binarize the probabilities (thresholds: neg > 0.25, pos > 0.2).
    fullResult = fullResult.withColumn(
        'neg',
        when(get_probability_udf(fullResult.neg_prob) > 0.25, 1).otherwise(0))
    fullResult = fullResult.withColumn(
        'pos',
        when(get_probability_udf(fullResult.pos_prob) > 0.2, 1).otherwise(0))
    fullResult.write.parquet("resulting_df.parquet")
    #fullResult_df = context.read.parquet("resulting_df.parquet")
    print(fullResult.count())
'''
4、计算N篇文章数据的TFIDF值
步骤:
4.1、获取两个模型相关参数,计算并保存所有的13万文章中的关键字对应的idf值和索引。
为什么要保存这些值?并且存入数据库当中?
后续计算tfidf画像需要使用,避免放入内存中占用过多,持久化使用
Hive中建立表:idf_keywords_values
CREATE TABLE idf_keywords_values(
keyword STRING comment "article_id",
idf DOUBLE comment "idf",
index INT comment "index");
'''
# Load the fitted CountVectorizer (vocabulary) and IDF (weights) models
# from HDFS; the commented lines are the older model locations.
from pyspark.ml.feature import CountVectorizerModel
# cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/countVectorizerOfArticleWords.model")
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")
from pyspark.ml.feature import IDFModel
# idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDFOfArticleWords.model")
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")
# Pair every vocabulary term with its IDF weight; the position in this
# list is the term's feature index.
keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))


def func(data):
    # Mutates *data* in place: each (keyword, idf) pair becomes the list
    # [keyword, float(idf), index], matching the Hive table schema above.
    for index in range(len(data)):
        data[index] = list(data[index])
        data[index].append(index)
        data[index][1] = float(data[index][1])


print(len(keywords_list_with_idf))
func(keywords_list_with_idf)
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED
    start = time.time()
    # task 1: on the first run read the raw JSON/CSV and cache as parquet,
    # afterwards read the cached copies.
    if(read_raw):
        comments = sqlContext.read.json('comments-minimal.json.bz2')
        submissions = sqlContext.read.json('submissions.json.bz2')
        label = sqlContext.read.load('labeled_data.csv', format = 'csv', sep = ',',header="true")
        print("load done")
        comments.write.parquet('comments')
        submissions.write.parquet('submissions')
        label.write.parquet('label')
    else:
        comments = context.read.load('comments')
        submissions = context.read.load('submissions')
        label = context.read.load('label')
    print("task 1 complete: read data")
    #result.show()
    if(training):
        # task 2: keep only comments that have a human label
        associate = associated(comments, label).select(col('id'), col('body'), col('labeldjt'))
        print("task 2 complete: associate data")
        # task 4, 5: sanitize bodies into n-gram token lists
        newColumn = associate.withColumn('ngrams', sanitize_udf(associate['body']))
        print("task 4, 5 complete: generate unigrams")
        # task 6A: binary bag-of-words features
        cv = CountVectorizer(inputCol = 'ngrams', outputCol = "features", binary = True)
        model = cv.fit(newColumn)
        tmp = model.transform(newColumn)
        print("task 6A complete: cv model")
        # task 6B: derive separate positive / negative binary labels
        result = tmp.withColumn('poslabel', F.when(col('labeldjt') == 1, 1).otherwise(0))
        result = result.withColumn('neglabel', F.when(col('labeldjt') == -1, 1).otherwise(0))
        pos = result.select(col('poslabel').alias('label'), col('features'))
        neg = result.select(col('neglabel').alias('label'), col('features'))
        print("task 6B complete: relabel data")
        # task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        neglr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(
            estimator = poslr,
            evaluator = posEvaluator,
            estimatorParamMaps = posParamGrid,
            numFolds = 5)
        negCrossval = CrossValidator(
            estimator = neglr,
            evaluator = negEvaluator,
            estimatorParamMaps = negParamGrid,
            numFolds = 5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        negModel.save("neg.model")
        model.save("cv.model")
        print("task 7 complete: training")
        # posModel = CrossValidatorModel.load('pos.model')
        # negModel = CrossValidatorModel.load('neg.model')
        # point 7: evaluate both classifiers on their held-out halves via ROC
        pos_trans = posModel.transform(posTest)
        neg_trans = negModel.transform(negTest)
        pos_results = pos_trans.select(['probability', 'label'])
        pos_trans_collect = pos_results.collect()
        # build (P(class 0), 1 - label) pairs for the ROC computation
        pos_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in pos_trans_collect]
        pos_scoreAndLabels = sc.parallelize(pos_trans_results_list)
        pos_metrics = metric(pos_scoreAndLabels)
        print("The ROC score of positive results is: ", pos_metrics.areaUnderROC)
        neg_results = neg_trans.select(['probability', 'label'])
        neg_trans_collect = neg_results.collect()
        neg_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in neg_trans_collect]
        neg_scoreAndLabels = sc.parallelize(neg_trans_results_list)
        neg_metrics = metric(neg_scoreAndLabels)
        print("The ROC score of negative results is: ", neg_metrics.areaUnderROC)
        plot_ROC(pos_trans_results_list, 'positive_results')
        plot_ROC(neg_trans_results_list, 'negative_results')
        print("point 7 complete: ROC")
    else:
        # inference-only path: reuse the persisted models
        model = CountVectorizerModel.load('cv.model')
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
        print("model loaded")
    # task 8: join every comment with its submission via the cleaned link_id
    comments_tmp = comments.select(col('id'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('score').alias('com_score'))
    comments_full = comments_tmp.withColumn('link_id', process_id_udf(comments_tmp['link_id']))
    submissions_full = submissions.select(col('id').alias('sub_id'), col('title'), col('score').alias('sub_score'))
    if(joinFull):
        com_sub = comments_full.join(submissions_full, comments_full.link_id == submissions_full.sub_id, 'inner')
        com_sub = com_sub.select(col('id'), col('title'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('com_score'), col('sub_score'))
        com_sub.write.parquet('com_sub')
    else:
        com_sub = context.read.load('com_sub')# .sample(False, 0.01, None)
    print('task 8 complete: comment with submission')
    # task 9: drop sarcasm (/s) and quoted comments, featurize, predict
    filtered = com_sub.filter("body NOT LIKE '%/s%' and body NOT LIKE '>%'")
    filtered_result = filtered.withColumn('ngrams', sanitize_udf(filtered['body']))
    feaResult = model.transform(filtered_result).select(col('id'), col('link_id'), col('created_utc'), \
        col('features'), col('author_flair_text'), col('com_score'), col('sub_score'), col('title'))
    posResult = posModel.transform(feaResult)
    negResult = negModel.transform(feaResult)
    print("transformed")
    # binarize each classifier's probability with its threshold UDF
    pos = posResult.withColumn('pos', threshold_pos_udf(posResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'pos', 'com_score', 'sub_score', 'title')
    neg = negResult.withColumn('neg', threshold_neg_udf(negResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'neg', 'com_score', 'sub_score', 'title')
    #final_probs = pos.join(neg, pos.id == neg.id_neg, 'inner').select('id', 'created_utc', 'author_flair_text', 'title', 'pos', 'neg')
    #final_probs.show()
    #pos.write.parquet('pos')
    #neg.write.parquet('neg')
    print('task 9 complete: predict')
    # task 10
    # compute 1: overall share of positive / negative comments
    num_rows = pos.count()
    pos_filtered = pos.filter(pos.pos == 1)
    neg_filtered = neg.filter(neg.neg == 1)
    num_pos = pos_filtered.count()
    num_neg = neg_filtered.count()
    print('Percentage of positive comments: {}'.format(num_pos / num_rows))
    print('Percentage of negative comments: {}'.format(num_neg / num_rows))
    print('finish compute 1')
    # compute 2: daily sentiment time series
    pos_time = pos.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))
    neg_time = neg.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))
    num_pos_time = pos_time.groupBy('time').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('time')
    num_neg_time = neg_time.groupBy('time').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('time')
    num_pos_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_pos_time')
    num_neg_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_neg_time')
    print('finish compute 2')
    # compute 3: sentiment by US state (author flair), restricted to real states
    state = sqlContext.createDataFrame(states, StringType())
    pos_state = pos.groupBy('author_flair_text').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive'))
    neg_state = neg.groupBy('author_flair_text').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative'))
    pos_state = pos_state.join(state, pos_state.author_flair_text == state.value, 'inner')
    pos_state = pos_state.na.drop(subset=['value'])
    pos_state = pos_state.select(col('author_flair_text').alias('state'), col('Percentage of positive').alias('Positive'))
    neg_state = neg_state.join(state, neg_state.author_flair_text == state.value, 'inner')
    neg_state = neg_state.na.drop(subset=['value'])
    neg_state = neg_state.select(col('author_flair_text').alias('state'), col('Percentage of negative').alias('Negative'))
    pos_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_state')
    neg_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_state')
    print('finish compute 3')
    # compute 4: sentiment vs comment score and vs submission score
    pos_com_score = pos.groupBy('com_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('com_score')
    pos_sub_score = pos.groupBy('sub_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('sub_score')
    neg_com_score = neg.groupBy('com_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('com_score')
    neg_sub_score = neg.groupBy('sub_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('sub_score')
    pos_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_com_score')
    pos_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_sub_score')
    neg_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_com_score')
    neg_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_sub_score')
    print('finish compute 4')
    # compute 5: top-10 most positive / negative stories by title
    pos_story = pos.groupBy('title').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy(F.desc('Percentage of positive')).limit(10)
    neg_story = neg.groupBy('title').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy(F.desc('Percentage of negative')).limit(10)
    pos_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_story')
    neg_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_story')
    print('finish compute 5')
    end = time.time()
    print('time consumed: {}'.format(end - start))
def read_parquet(parquet_path):
    """Load the listings parquet, clean numeric columns and assemble features.

    :param parquet_path: HDFS path of the input parquet
    :return: (assembled_df, columns_list) — assembled_df has columns
             'features'/'label'; columns_list names every expanded feature
             slot (vocabulary terms / indexer labels / raw column names).
    """
    parquet_df = spark.read.parquet(parquet_path)
    # NOTE(review): 'agency_nameVec' and 'districtVec' are dropped here,
    # which makes their branches in the mapping below dead — confirm.
    for dropped in ('id', 'one_area_price', 'agency_nameVec',
                    'districtVec', 'room_type'):
        parquet_df = parquet_df.drop(dropped)
    parquet_df.show(truncate=False)
    print('parquet_df.count()==========11', parquet_df.count(),
          parquet_df.columns)
    # Clean every raw (non-vector) column: fill nulls with 0, drop literal
    # 'NULL' strings, cast to float and drop rows the cast turned null.
    for i in parquet_df.columns:
        if ('Vec' not in i) and ('facilities_vectors' not in i):
            if parquet_df.filter(parquet_df[i].isNull()).count() > 0:
                parquet_df = parquet_df.na.fill(0, i)
            elif parquet_df.filter(parquet_df[i] == 'NULL').count() > 0:
                parquet_df = parquet_df.filter(parquet_df[i] != 'NULL')
            parquet_df = parquet_df.select(
                '*', parquet_df[i].cast('float').alias('tmp_name')).drop(i)
            parquet_df = parquet_df.withColumnRenamed('tmp_name', i)
            parquet_df = parquet_df.filter(parquet_df[i].isNotNull())
            print('parquet_df.count()==========22', i, parquet_df.count())
    columns = parquet_df.columns
    columns.remove('price')
    from pyspark.ml.feature import OneHotEncoder, StringIndexer, StringIndexerModel
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    model_path = "/user/limeng/ganji_daxing_save_models/"
    # Map each vectorized column to its persisted StringIndexer model name
    # (replaces the original copy-pasted six-way elif chain).
    indexer_models = {
        'rent_typeVec': 'stringIndexer_modelrent_type',
        'agency_nameVec': 'stringIndexer_modelagency_name',
        'directionVec': 'stringIndexer_modeldirection',
        'zoneVec': 'stringIndexer_modelzone',
        'pay_typeVec': 'stringIndexer_modelpay_type',
        'districtVec': 'stringIndexer_modeldistrict',
    }
    columns_list = []
    for i in columns:
        if i == 'facilities_vectors':
            # expand to one slot per vocabulary term
            loadedCountVectorizerModel = CountVectorizerModel.load(
                model_path + 'count-vectorizer-model')
            columns_list.extend(loadedCountVectorizerModel.vocabulary)
        elif i in indexer_models:
            # expand to one slot per category label
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + indexer_models[i])
            columns_list.extend(loadedStringIndexerModel.labels)
        else:
            columns_list.append(i)
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    parquet_df = vecAssembler.transform(parquet_df).select('features', 'price')
    parquet_df = parquet_df.withColumnRenamed('price', 'label')
    return parquet_df, columns_list
def main(context):
    """Run the full sentiment pipeline: load data, train/reuse models, report."""
    # dem(context)
    # gop(context)
    # SAVED PARQUETS
    # comments is the comments-minimal.json
    # submissions is the submissions.json
    # task7 is the result of the count vectorizer
    # commentsFull is the comments-minimal.json joined with submissions with the sarcasm removed and the > removed
    #TASK 1
    # Read from JSON
    #comments = sqlContext.read.json("comments-minimal.json.bz2")
    #comments.registerTempTable("commentsTable")
    #submissions = sqlContext.read.json("submissions.json.bz2")
    #submissions.registerTempTable("submissionsTable")
    # Write the Parquets
    #comments.write.parquet("comments.parquet")
    #submissions.write.parquet("submissions.parquet")
    # Read the parquets
    comments = sqlContext.read.parquet("comments.parquet")
    comments.registerTempTable("commentsTable")
    submissions = sqlContext.read.parquet("submissions.parquet")
    submissions.registerTempTable("submissionsTable")
    # Read the CSV
    labels = sqlContext.read.format('csv').options(header='true', inferSchema='true').load("labeled_data.csv")
    labels.registerTempTable("labelsTable")
    #TASK 2: keep only comments that have a human label
    dfTask2 = sqlContext.sql("SELECT commentsTable.* FROM commentsTable INNER JOIN labelsTable ON commentsTable.id = labelsTable.Input_id")
    #TASK 4 and TASK 5: sanitize comment bodies into token lists
    def do_something(text):
        return parser.sanitize(text)
    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask4 = dfTask2.withColumn("udf_results", udf_func(col("body")))
    #TASK 6A and Task 6B: fit (or reuse) the CountVectorizer, attach labels
    if(not os.path.exists("cvModel")):
        cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
        model = cv.fit(dfTask4)
        model.write().overwrite().save("cvModel")
    model = CountVectorizerModel.load("cvModel")
    dfTask6A = model.transform(dfTask4)
    dfTask6A.registerTempTable("dfTask6ATable")
    dfTask6B = sqlContext.sql("SELECT dfTask6ATable.*, IF(labelsTable.labeldjt=1, 1, 0) AS pos_label, if(labelsTable.labeldjt=-1, 1, 0) AS neg_label FROM dfTask6ATable INNER JOIN labelsTable ON dfTask6ATable.id = labelsTable.Input_id")
    dfTask6B.registerTempTable("dfTask6BTable")
    pos = sqlContext.sql('select pos_label as label, features from dfTask6BTable')
    neg = sqlContext.sql('select neg_label as label, features from dfTask6BTable')
    # Train both classifiers only if either saved model is missing.
    if(not os.path.exists("www/neg.model") or not os.path.exists("www/pos.model")):
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
        neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(
            estimator=poslr,
            evaluator=posEvaluator,
            estimatorParamMaps=posParamGrid,
            numFolds=2)
        negCrossval = CrossValidator(
            estimator=neglr,
            evaluator=negEvaluator,
            estimatorParamMaps=negParamGrid,
            numFolds=2)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save("www/pos.model")
        negModel.write().overwrite().save("www/neg.model")
    # TO LOAD BACK IN
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")
    # Task 8: join comments to submissions (link_id suffix == submission id)
    dfTask8 = sqlContext.sql('SELECT commentsTable.id, commentsTable.body, commentsTable.created_utc, commentsTable.author_flair_text, submissionsTable.title, commentsTable.score AS comment_score, submissionsTable.score AS story_score FROM commentsTable INNER JOIN submissionsTable ON RIGHT(commentsTable.link_id, 6)=submissionsTable.id')
    dfTask8 = dfTask8.sample(False, 0.1, None)
    #TASK 4 and TASK 5: same sanitizer applied to the full joined data
    def do_something(text):
        return parser.sanitize(text)
    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask9_1 = dfTask8.withColumn("udf_results", udf_func(col("body")))
    #TASK 6A and Task 6B
    model = CountVectorizerModel.load("cvModel")
    dfTask9_2 = model.transform(dfTask9_1)
    dfTask9_2.registerTempTable("dfTask9_2Table")
    # Task 9: drop sarcastic (/s) and quoted (>) comments, score both models
    dfTask9_3 = sqlContext.sql("SELECT * FROM dfTask9_2Table WHERE dfTask9_2Table.body NOT LIKE '%/s%' AND dfTask9_2Table.body NOT LIKE '>%'")
    dfTask9_3.registerTempTable("dfTask9_3Table")
    posResult_1 = posModel.transform(dfTask9_3)
    posResult_1.registerTempTable("posResult_1Table")
    posResult_2 = sqlContext.sql("SELECT posResult_1Table.id, posResult_1Table.body, posResult_1Table.author_flair_text, posResult_1Table.created_utc, posResult_1Table.title, posResult_1Table.comment_score, posResult_1Table.story_score, posResult_1Table.features, posResult_1Table.prediction AS pos FROM posResult_1Table")
    finalResult_1 = negModel.transform(posResult_2)
    finalResult_1.registerTempTable("finalResult_1Table")
    finalResult_2 = sqlContext.sql("SELECT finalResult_1Table.id, finalResult_1Table.body, finalResult_1Table.created_utc, finalResult_1Table.author_flair_text, finalResult_1Table.title, finalResult_1Table.comment_score, finalResult_1Table.story_score, finalResult_1Table.pos, finalResult_1Table.prediction AS neg FROM finalResult_1Table")
    finalResult_2.registerTempTable("finalResult_2Table")
    if(not os.path.exists("final.parquet")):
        finalResult_2.write.parquet("final.parquet")
    final = sqlContext.read.parquet("final.parquet")
    final.registerTempTable("finalTable")
    # Task 10: aggregate sentiment percentages; dump each answer as one CSV
    if(not os.path.exists("question1.csv")):
        question1 = sqlContext.sql("SELECT (100 * sum(pos) / COUNT(*)) AS percent_pos, (100 * sum(neg) / COUNT(*)) AS percent_neg FROM finalTable")
        question1.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question1.csv")
    if(not os.path.exists("question2.csv")):
        question2 = sqlContext.sql("SELECT DATE(from_unixtime(finalTable.created_utc)) AS date, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY date ORDER BY date")
        question2.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question2.csv")
    if(not os.path.exists("question3.csv")):
        question3 = sqlContext.sql("SELECT finalTable.author_flair_text AS place, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY place ORDER BY place")
        question3.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question3.csv")
    if(not os.path.exists("question4_comment.csv")):
        question4_comment = sqlContext.sql("SELECT finalTable.comment_score AS comment_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY comment_score ORDER BY comment_score")
        question4_comment.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_comment.csv")
    if(not os.path.exists("question4_story.csv")):
        question4_story = sqlContext.sql("SELECT finalTable.story_score AS story_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY story_score ORDER BY story_score")
        question4_story.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_story.csv")
item_info = item_info.filter("impression_freqs>%d" % min_TF) # reference_list is NULL 表示需要有上下文信息 train_data = sqlContext.sql( "select * from cmp_tmp_rec_train_agg where reference_list is not NULL " ) test_data = sqlContext.sql( "select * from cmp_tmp_rec_test_agg where action_type=='clickout item' and reference_list is not NULL" ) has_cv = True if has_cv: from pyspark.ml.feature import CountVectorizerModel cv = CountVectorizerModel() cv_model = cv.load(os.path.join(model_pth, model_name)) else: cv = CountVectorizer(inputCol="item_seq", outputCol='item_seq_enc') cv_model = cv.fit(train_data) vocab = cv_model.vocabulary import pandas as pd vocab = pd.DataFrame(np.array([range(len(vocab)), vocab]).T, columns=['impression_id', 'impression']) df_vocab = sqlContext.createDataFrame(vocab) test_data = data_transform(test_data, cv_model) train_data = data_transform(train_data, cv_model)
# Build (article_id, channel_id, words) rows by running the segmentation
# function over each partition of the raw article dataframe.
words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(
    ['article_id', 'channel_id', 'words'])
# print(words_df.collect())
try:
    # Probe HDFS for an existing CountVectorizer model; textFile raises
    # when the path does not exist.
    print('正在判断CV模型是否存在')
    ktt.textFile('hdfs://master:9000/headlines/model/CV.model')
    # print(cv_model)
    print('模型存在')
except Exception as e:
    print(e)
    print('不存在模型,启动训练')
    # Training is currently disabled (commented out): the fitted model is
    # expected to already exist on HDFS.
    cv = CountVectorizer(inputCol='words',
                         outputCol='countFeatures',
                         vocabSize=200 * 10000,
                         minDF=1.0)
    # cv_model = cv.fit(words_df)
    # cv_model.write().overwrite().save('hdfs://master:9000/headlines/model/CV.model')
finally:
    print('读取模型:。。。。')
    cv_model = CountVectorizerModel.load(
        'hdfs://master:9000/headlines/model/CV.model')
print('将模型的词频统计转化为词的向量')
cv_result = cv_model.transform(words_df)
print('利用计算的词向量训练向量模型并保存')
idf = IDF(inputCol='countFeatures', outputCol='idfFeatures')
idfmode = idf.fit(cv_result)
idfmode.write().overwrite().save('hdfs://master:9000/headlines/model/IDF.model')
print('IDF模型训练并保存成功')
print('查看cv_model模型效果:')
print(cv_model.vocabulary)
print('查看IDF模型效果:')
# BUG FIX: DenseVector.toArray is a method — the original sliced the bound
# method object (`toArray[:20]`), a guaranteed TypeError. Call it, then slice.
print(idfmode.idf.toArray()[:20])
queries = idf_model.transform(queries) queries = scalerModel.transform(queries) preds = model.transform(queries) preds.select('payload', 'prediction').show() except: print('No data') APP_NAME = "BigData" conf = pyspark.SparkConf().setAll([('spark.app.name', APP_NAME), ('spark.executor.memory', '8g'), ('spark.cores.max', '2'), ('spark.driver.memory', '8g')]) sc = SparkContext(conf=conf) sqlc = SQLContext(sc) ngrams = udf(to_ngram, StringType()) tokenizer = Tokenizer.load('models/Tokenizer') vectorizer = CountVectorizerModel.load('models/Vectorizer') idf_model = IDFModel.load('models/idf') scalerModel = StandardScalerModel.load('models/scalerModel') model = LogisticRegressionModel.load('models/Logistic_Regression_Model') ssc = StreamingContext(sc, batchDuration=3) lines = ssc.socketTextStream("localhost", 9999) lines.foreachRDD(get_prediction) ssc.start() ssc.awaitTermination()
def main(sqlContext): """Main function takes a Spark SQL context.""" # YOUR CODE HERE # YOU MAY ADD OTHER FUNCTIONS AS NEEDED # load files label = sqlContext.read.load("labeled_data.csv", format="csv", sep=",", inferSchema="true", header="true") if (flag): comments = sqlContext.read.json("comments-minimal.json.bz2") submissions = sqlContext.read.json("submissions.json.bz2") print("loading done") comments.write.parquet("comments_data") submissions.write.parquet("submissions_data") print("writing done") else: comments = sqlContext.read.parquet("comments") submissions = sqlContext.read.parquet("submissions") print("loading done") comments.show() exit() if (save): # task 7 starts here associated = join(comments, label) withngrams = associated.withColumn("ngrams", makeNgrams_udf(associated['body'])) withplabels = withngrams.withColumn("poslabel", pLabel_udf(withngrams['labeldjt'])) withpnlabels = withplabels.withColumn( "neglabel", nLabel_udf(withplabels['labeldjt'])).select( "id", "ngrams", "poslabel", "neglabel") # withpnlabels.show() cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features") model = cv.fit(withpnlabels) model.save("cv.model") # model.transform(withpnlabels).show() pos = model.transform(withpnlabels).select( "id", col("poslabel").alias("label"), "features") neg = model.transform(withpnlabels).select( "id", col("neglabel").alias("label"), "features") # pos.show() # neg.show() poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10) neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10) posEvaluator = BinaryClassificationEvaluator() negEvaluator = BinaryClassificationEvaluator() posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build() negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build() posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator, estimatorParamMaps=posParamGrid, numFolds=2) # for test negCrossval = 
CrossValidator(estimator=neglr, evaluator=negEvaluator, estimatorParamMaps=negParamGrid, numFolds=2) # for test posTrain, posTest = pos.randomSplit([0.5, 0.5]) negTrain, negTest = neg.randomSplit([0.5, 0.5]) print("Training positive classifier...") posModel = posCrossval.fit(posTrain) print("Training negative classifier...") negModel = negCrossval.fit(negTrain) posModel.save("pos.model") negModel.save("neg.model") print("trained") else: # comments.show() # submissions.show() posModel = CrossValidatorModel.load("pos.model") negModel = CrossValidatorModel.load("neg.model") model = CountVectorizerModel.load("cv.model") # withngrams = comments.withColumn("ngrams", makeNgrams_udf(comments['body'])) # cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features") # model = cv.fit(withngrams) print("model loaded") if (predict == 0): # task 8 starts here temp_comments = comments.select("id", "link_id", "author_flair_text", "created_utc", "body") clean_comments = temp_comments.withColumn( "true_id", getLinkid_udf(temp_comments['link_id'])) # print(clean_comments.count()) clean_submissions = submissions.select( col("id").alias("sub_id"), "title") # clean_comments.show() # clean_submissions.show() com_sub = clean_comments.join( clean_submissions, clean_comments.true_id == clean_submissions.sub_id, "inner") com_sub.write.parquet("com_sub") else: # task 9 starts here com_sub = sqlContext.read.parquet("com_sub") com_sub = com_sub.sample(False, 0.0001, None) filtered = com_sub.filter( "body NOT LIKE '%/s%' and body NOT LIKE '>%'") # print(filtered.count()) filtered_ngrams = filtered.withColumn( "ngrams", makeNgrams_udf(filtered['body'])) # filtered_ngrams = filtered_ngrams.sample(False, 0.01, None) print("prepared") featuredata = model.transform(filtered_ngrams).select( "id", "author_flair_text", "created_utc", "sub_id", "title", "features") posResult = posModel.transform(featuredata) negResult = negModel.transform(featuredata) # posResult.show() # negResult.show() 
poslabel = posResult.withColumn( "positive", posTh_udf(posResult['probability']) ) # .select("id", "author_flair_text", "created_utc", "title", "positive") neglabel = negResult.withColumn( "negtive", negTh_udf(negResult['probability']) ) # .select(col("id").alias("nid"), "author_flair_text", "created_utc", "title", "negtive") print("predict done") # poslabel.show() # neglabel.show() # how to combine these 2 tables??? # task 10 starts here # c_all = poslabel.count() all_day = poslabel.withColumn( "date", from_unixtime('created_utc').cast( DateType())).groupby("date").count() pos_posts = poslabel.filter("positive = 1") # c_pos_posts = pos_posts.count() # p_pos_posts = c_pos_posts/c_all # print(p_pos_posts) # neg_posts = neglabel.filter("negtive = 1") # c_neg_posts = neg_posts.count() # p_neg_posts = c_neg_posts/c_all # print(p_neg_posts) pos_day = pos_posts.withColumn( "pos_date", from_unixtime('created_utc').cast( DateType())).groupby("pos_date").count().withColumnRenamed( "count", "pos_count") p_pos_day = all_day.join(pos_day, all_day.date == pos_day.pos_date, "left").withColumn( "pos_per", pos_count / count).show() print("end")
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.ml.linalg import Vector, Vectors
import numpy as np
from pyspark.sql.types import DoubleType
from pyspark.sql import functions as F
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.sql.types import *
from pyspark.sql.functions import udf

# Previously trained LDA topic model and its CountVectorizer vocabulary.
ldaModel_path = "lda_model"
ldaModel = LocalLDAModel.load(ldaModel_path)
model_path = "token"
model = CountVectorizerModel.load(model_path)

# Sample input: a list of (id, raw text) pairs.
l_test = [(1, "I f*****g hate covid-19")]


def test(text, ldaModel, model):
    # Build a DataFrame of (idd, tokenized Text) rows from (id, text) pairs.
    # NOTE(review): relies on the globals `sc` and `spark` being in scope.
    rdd_ = sc.parallelize(text)
    data = rdd_.map(lambda kv: Row(idd=kv[0], Text=kv[1].split(" ")))
    docDF = spark.createDataFrame(data)
    # Vectorize with the loaded CountVectorizerModel — "vectors" is
    # presumably the outputCol the model was trained with; verify.
    result = model.transform(docDF)
    # Re-pack each row's vector as an explicit SparseVector in an
    # (id, features) frame — the input format expected by LDA.
    corpus = result.select("idd", "vectors").rdd.map(lambda xy: [
        xy[0], Vectors.sparse(xy[1].size, xy[1].indices, xy[1].values)
    ]).cache()
    columns = ['id', 'features']
    corpus = corpus.toDF(columns)
    # NOTE(review): no return statement is visible here — the function body
    # may continue beyond this chunk.
def get_cv_model(self):
    """Load and return the persisted CountVectorizerModel at ``self.cv_path``."""
    # Imported lazily so pyspark is only required when this accessor is used.
    from pyspark.ml.feature import CountVectorizerModel
    return CountVectorizerModel.load(self.cv_path)
# tdidf # 词频,即tf from pyspark.ml.feature import CountVectorizer # vocabSize是总词汇的大小,minDF是文本中出现的最少次数 cv = CountVectorizer(inputCol="words", outputCol="countFeatures", vocabSize=200 * 10000, minDF=1.0) # 训练词频统计模型 cv_model = cv.fit(words_df) cv_model.write().overwrite().save("models/CV.model") from pyspark.ml.feature import CountVectorizerModel cv_model = CountVectorizerModel.load("models/CV.model") # 得出词频向量结果 cv_result = cv_model.transform(words_df) # idf from pyspark.ml.feature import IDF idf = IDF(inputCol="countFeatures", outputCol="idfFeatures") idf_model = idf.fit(cv_result) idf_model.write().overwrite().save("models/IDF.model") # tf-idf from pyspark.ml.feature import IDFModel idf_model = IDFModel.load("models/IDF.model") tfidf_result = idf_model.transform(cv_result)
    # Tail of a text-preprocessing loop (its header is above this chunk):
    # keep non-stopword tokens, then record (doc index, tokens) for the
    # current document.
    for word in s:
        if word not in stopwords and word != '\r':
            wordlist.append(word)
    texts[len(texts) - 1].append(count)
    texts[len(texts) - 1].append(wordlist)
    count += 1

spark = SparkSession.builder.appName("dataFrame").getOrCreate()
df = spark.createDataFrame(texts, ["id", "words"])
# sc =SparkContext()
# dataset = sc.parallelize(texts)
# dataset = dataset.zipWithIndex()
#cv=CountVectorizer(inputCol="words",outputCol="features",vocabSize=1000,minDF=2.0)
# Load this user's persisted CountVectorizer model from HDFS and vectorize.
model = CountVectorizerModel.load("hdfs:/User/" + user + "/" + user +
                                  "_cv.model")
result = model.transform(df)
#result.show()
voclist = model.vocabulary
# for x in df.collect():
#     print("#######################################################")
#     print(x)
#lda = LDA(k=3,maxIter=10)
#lda.save("hdfs:/"+user+"_text.model")
# print("ss")
#ldamodel =lda.fit(result)
# Load the matching pre-trained LDA model for this user.
ldamodel = LocalLDAModel.load("hdfs:/User/" + user + "/" + user +
                              "_lda.model")
# ll=model.logLikelihood(dataset)
# lp=model.logPerplexity(dataset)
# print("ll"+str(ll))
def main(context):
    """Main Function takes a Spark SQL Context.

    Runs the project tasks end-to-end: load cached parquet data, train a
    binary CountVectorizer plus two cross-validated logistic-regression
    sentiment models, score the unseen comments, and write the task-10
    aggregate CSVs.
    """
    #---------------------------------------------------------------------------
    # TASK 1
    # Code for task 1... (one-time JSON -> parquet conversion, now cached)
    # df = context.read.csv('labeled_data.csv')
    # df.write.parquet("labeled_data.parquet")
    # comments = context.read.json("comments-minimal.json.bz2")
    # comments.write.parquet("comments.parquet")
    # submissions = context.read.json("submissions.json.bz2")
    # submissions.write.parquet("submissions.parquet")
    labeled_data = context.read.parquet('labeled_data.parquet')
    # Rename the positional CSV columns to meaningful names.
    labeled_data = labeled_data.withColumnRenamed("_c0", "Input_id")\
        .withColumnRenamed("_c1", "labeldem")\
        .withColumnRenamed("_c2", "labelgop")\
        .withColumnRenamed("_c3", "labeldjt")
    # labeled_data.show()
    comments = context.read.parquet('comments.parquet')
    # comments.show()
    submissions = context.read.parquet('submissions.parquet')
    # submissions.show()
    #---------------------------------------------------------------------------
    # TASK 2
    # Code for task 2... attach each label to its comment body.
    labeled_comments = labeled_data.join(comments,
                                         comments.id == labeled_data.Input_id)
    labeled_comments = labeled_comments.select('Input_id', 'labeldjt', 'body')
    # labeled_comments.show()
    #---------------------------------------------------------------------------
    # TASK 4
    # Code for task 4... sanitize() (defined elsewhere) tokenizes a body.
    sanitize_udf = udf(sanitize, ArrayType(StringType()))
    #---------------------------------------------------------------------------
    # TASK 5
    # Code for task 5...
    sanitized_labeled_comments = labeled_comments.select(
        'Input_id', 'labeldjt', sanitize_udf('body').alias('raw'))
    #---------------------------------------------------------------------------
    # TASK 6A
    # Code for task 6A... binary bag-of-tokens features, persisted to disk.
    cv = CountVectorizer(binary=True,
                         minDF=10.0,
                         inputCol="raw",
                         outputCol="features")
    model = cv.fit(sanitized_labeled_comments)
    sanitized_labeled_comments = model.transform(sanitized_labeled_comments)
    sanitized_labeled_comments.show(truncate=False)
    countVectorizerPath = "count_vectorizer_model"
    model.save(countVectorizerPath)
    #---------------------------------------------------------------------------
    # TASK 6B
    # Code for task 6B...
    # Labels: {1, 0, -1, -99}
    # Positive target: 1 stays 1, everything else is mapped to 0.
    pos = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    pos = pos.withColumnRenamed("labeldjt", "label")
    pos = pos.replace(-1, 0)
    pos = pos.replace(-99, 0)
    # pos.show()
    # Negative target: -1 becomes 1, everything else 0.
    neg = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    neg = neg.withColumnRenamed("labeldjt", "label")
    neg = neg.replace(1, 0)
    neg = neg.replace(-99, 0)
    neg = neg.replace(-1, 1)
    # neg.show()
    #---------------------------------------------------------------------------
    # TASK 7
    # Code for task 7...
    # ... MACHINE LEARNING PORTION TO TRAIN MODELS - Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")
    # Positive Model: posModel
    # Negative Model: negModel
    #---------------------------------------------------------------------------
    # TASK 8
    # Code for task 8...
    # ... Make Final Deliverable for Unseen Data - We don't need labeled_data anymore
    strip_t3_udf = udf(strip_t3, StringType())
    sarcastic_or_quote_udf = udf(sarcastic_or_quote, BooleanType())
    # Get Unseen Data; the filter drops sarcastic ("/s") and quoted comments.
    sanitized_final_deliverable = comments.select('created_utc',
                                                  strip_t3_udf(comments.link_id).alias('link_id'),
                                                  'author_flair_text',
                                                  'id',
                                                  'body',
                                                  'gilded',
                                                  sanitize_udf('body').alias('raw'),
                                                  comments.score.alias('c_score'))\
        .filter(sarcastic_or_quote_udf(comments['body']))
    #F.when(comments["body"].rlike('^>|\/s'), False).otherwise(True))
    # sanitized_final_deliverable.show()
    #---------------------------------------------------------------------------
    # TASK 9
    # Code for task 9...
    # Load models that we saved on previous runs of this script
    model = CountVectorizerModel.load("count_vectorizer_model")
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")
    # Sanitize TASK 8 - Run the CountVectorizerModel on TASK 8 Relation
    sanitized_final_deliverable = model.transform(sanitized_final_deliverable)
    # Run classifier on unseen data to get positive labels
    posResult = posModel.transform(sanitized_final_deliverable)
    # Rename the 3 new columns to prevent name conflicts
    posResult = posResult.withColumnRenamed("probability", "probability_pos")\
        .withColumnRenamed("rawPrediction", "rawPrediction_pos")\
        .withColumnRenamed("prediction", "prediction_pos")
    # Run the classifier on previous positive result to get negative labels too
    result = negModel.transform(posResult)
    # Rename the 3 new columns to make it easier to see which is which
    result = result.withColumnRenamed("probability", "probability_neg")\
        .withColumnRenamed("rawPrediction", "rawPrediction_neg")\
        .withColumnRenamed("prediction", "prediction_neg")
    # UDF functions for predicting label based on thresholds
    predict_pos_udf = udf(predict_pos, IntegerType())
    predict_neg_udf = udf(predict_neg, IntegerType())
    # Make predictions based on probability and threshold:
    result = result.select('created_utc', 'author_flair_text', 'link_id', 'id', 'c_score', 'gilded',\
                           predict_pos_udf(result.probability_pos).alias('pos'),\
                           predict_neg_udf(result.probability_neg).alias('neg'))
    result.write.parquet("result.parquet")
    # result.show()
    #---------------------------------------------------------------------------
    # TASK 10
    # Code for task 10...
    # ... Perform Analysis on the Predictions
    result = context.read.parquet("result.parquet")
    submissions = submissions.select('id', 'title',
                                     submissions.score.alias('s_score'))
    result = result.join(submissions, result.link_id == submissions.id)  # .explain()
    result.show()
    context.registerDataFrameAsTable(result, "result")
    # 1. Percentage of Comments that Were Positive/Negative Across ALL Submissions
    task_10_1 = context.sql(
        "SELECT title, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY title"
    )
    task_10_1.show()
    task_10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_1.csv")
    # 2. Percentage of Comments that Were Positive/Negative Across ALL Days
    task_10_2 = context.sql(
        "SELECT FROM_UNIXTIME(created_utc, 'Y-M-d') AS day, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY day ORDER BY day asc"
    )
    task_10_2.show()
    task_10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_2.csv")
    # 3. Percentage of Comments that Were Positive/Negative Across ALL States
    context.registerFunction("check_state_udf", check_state, BooleanType())
    task_10_3 = context.sql(
        "SELECT author_flair_text AS state, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result WHERE check_state_udf(author_flair_text) = True GROUP BY state"
    )
    task_10_3.show()
    task_10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_3.csv")
    # 4A. Percentage of Comments that Were Positive/Negative Across ALL Comments
    task_10_4A = context.sql(
        "SELECT c_score AS comment_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY comment_score"
    )
    task_10_4A.show()
    task_10_4A.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4A.csv")
    # 4B. Percentage of Comments that Were Positive/Negative Across ALL Story Scores
    task_10_4B = context.sql(
        "SELECT s_score AS submission_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY submission_score"
    )
    task_10_4B.show()
    task_10_4B.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4B.csv")
    #---------------------------------------------------------------------------
    # Extra Credit (Task 10)
    # 1. Percentage of Comments that Were Positive/Negative For Gilded and Non-Gilded Comments
    task_10_extra_credit = context.sql(
        "SELECT gilded, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY gilded"
    )
    task_10_extra_credit.show()
    task_10_extra_credit.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task_10_extra_credit.csv")
def main(context):
    """Run the full pipeline: load data, train or restore the
    CountVectorizer and the positive/negative classifiers, score the joined
    comment-submission data, and write the task-10 aggregate CSVs.

    Each stage is wrapped in try/except so results cached on disk are
    reused when present.
    NOTE(review): the excepts are bare and `sqlContext` is a global — any
    unrelated failure silently triggers the recompute path.
    """
    # TASK 1
    try:
        commentsDF = context.read.load('comments.parquet')
        submissionsDF = context.read.load('submissions.parquet')
        labeled_dataDF = context.read.load('label.parquet')
    except:
        # First run: parse the raw files and cache them as parquet.
        commentsDF = sqlContext.read.json('comments-minimal.json.bz2')
        submissionsDF = sqlContext.read.json('submissions.json.bz2')
        labeled_dataDF = sqlContext.read.load('labeled_data.csv',
                                              format='csv',
                                              sep=',',
                                              header="true")
        commentsDF.write.parquet('comments.parquet')
        submissionsDF.write.parquet('submissions.parquet')
        labeled_dataDF.write.parquet('label.parquet')
    # TASK 2: attach the labels to their comment bodies.
    joined_data = commentsDF.join(labeled_dataDF,
                                  commentsDF.id == labeled_dataDF.Input_id,
                                  'inner').select(col('id'), col('body'),
                                                  col('labeldjt'))
    # TASK 4,5: n-gram tokenization of the body text.
    ngrams_udf = udf(get_ngrams, ArrayType(StringType()))
    joined_col = joined_data.withColumn('ngrams',
                                        ngrams_udf(joined_data['body']))
    try:
        model = CountVectorizerModel.load('cv.model')
    except:
        # task 6A: train a binary CountVectorizer over the n-grams.
        cv = CountVectorizer(inputCol='ngrams',
                             outputCol="features",
                             binary=True)
        model = cv.fit(joined_col)
        vectors = model.transform(joined_col)
        # task 6B: derive binary positive/negative label columns.
        positive_udf = udf(lambda x: 1 if x == '1' else 0, IntegerType())
        negative_udf = udf(lambda x: 1 if x == '-1' else 0, IntegerType())
        vectors = vectors.withColumn('positive', positive_udf(col('labeldjt')))
        vectors = vectors.withColumn('negative', negative_udf(col('labeldjt')))
        pos = vectors.select(col('positive').alias('label'), col('features'))
        neg = vectors.select(col('negative').alias('label'), col('features'))
        pos.write.parquet('positive_ROC.parquet')
        neg.write.parquet('negative_ROC.parquet')
        model.save('cv.model')
    try:
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
    except:
        # Task 7
        # NOTE(review): `pos`/`neg` are only defined when the cv.model
        # branch above retrained — if cv.model exists but pos/neg.model
        # don't, the randomSplit calls below raise NameError.
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                  [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                  [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=5)
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.save("neg.model")
    # Task 8,9
    try:
        finalDF = context.read.load('final.parquet')
    except:
        # Join comments to their submissions and score a sample.
        extract_id_udf = udf(lambda x: x[3:], StringType())
        comments = commentsDF.select(
            col('id').alias('comment_id'),
            extract_id_udf(col('link_id')).alias('link_id'),
            col('created_utc'), col('body'), col('author_flair_text'),
            col('score').alias('comment_score'))
        submissions = submissionsDF.select(
            col('id').alias('submission_id'), col('title'),
            col('score').alias('submission_score'))
        finalDF = comments.join(submissions,
                                comments.link_id == submissions.submission_id,
                                'inner')
        #sampling 20%
        # NOTE(review): 0.02 is a 2% sample, not 20% as the comment says.
        finalDF = finalDF.sample(False, 0.02, None)
        # Label thresholds: P(positive) > 0.2, P(negative) > 0.25.
        pos_threshold_udf = udf(lambda x: 1 if x[1] > 0.2 else 0,
                                IntegerType())
        neg_threshold_udf = udf(lambda x: 1 if x[1] > 0.25 else 0,
                                IntegerType())
        # Drop sarcastic ("/s") comments and quotes (lines starting ">").
        finalDF = finalDF.filter(
            "body NOT LIKE '%/s%' and body NOT LIKE '>%'")
        finalDF = finalDF.withColumn('ngrams', ngrams_udf(finalDF['body']))
        finalDF = model.transform(finalDF)
        posResult = posModel.transform(finalDF)
        temp = posResult.withColumn(
            'pos', pos_threshold_udf(posResult['probability']))
        temp = temp.select(col('comment_id'), col('link_id'),
                           col('created_utc'), col('body'),
                           col('author_flair_text'), col('comment_score'),
                           col('submission_id'), col('title'),
                           col('submission_score'), col('ngrams'), col('pos'))
        # Re-vectorize: the select above dropped the features column.
        temp = model.transform(temp)
        negResult = negModel.transform(temp)
        temp = negResult.withColumn(
            'neg', neg_threshold_udf(negResult['probability']))
        finalDF = temp.select(col('comment_id'), col('link_id'),
                              col('created_utc'), col('body'),
                              col('author_flair_text'), col('comment_score'),
                              col('submission_id'), col('title'),
                              col('submission_score'), col('ngrams'),
                              col('pos'), col('neg'))
        finalDF.write.parquet('final.parquet')
    # Task 10
    # percentage of positive and negative comments
    try:
        task1 = context.read.load('percentage_value.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        total_rows = finalDF.count()
        total_pos_comments = finalDF.filter(col('pos') == '1').count()
        total_neg_comments = finalDF.filter(col('neg') == '1').count()
        pos_percentage = total_pos_comments / total_rows
        neg_percentage = total_neg_comments / total_rows
        values = [{
            'Total Rows': total_rows,
            'Percentage of Positive Comments': pos_percentage,
            'Percentage of Negative Comments': neg_percentage
        }]
        task1 = sqlContext.createDataFrame(values)
        task1.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("percentage_value.csv")
    #percent over date
    try:
        task2 = context.read.load('time_data.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        task2 = finalDF.withColumn(
            'date', F.from_unixtime(col('created_utc')).cast(DateType()))
        task2 = task2.groupBy('date').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task2.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("time_data.csv")
    #percent over states
    try:
        task3 = context.read.load('state_data.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        # `states` is a module-level list of US state names.
        state = sqlContext.createDataFrame(states, StringType())
        task3 = finalDF.groupBy('author_flair_text').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task3 = task3.join(state, task3.author_flair_text == state.value,
                           'inner').na.drop(subset=['value']).select(
                               col('author_flair_text').alias('state'),
                               col('Positive'), col('Negative'))
        task3.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("state_data.csv")
    #percent over submission score
    try:
        task4 = context.read.load('submission_score.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        task4 = finalDF.groupBy('submission_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task4.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("submission_score.csv")
    #percent over commet score
    try:
        task5 = context.read.load('comment_score.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        task5 = finalDF.groupBy('comment_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task5.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("comment_score.csv")
    #list top 10 stories of each sentiment
    try:
        top_positive = context.read.load('top_positive.csv/*.csv',
                                         format='csv',
                                         sep=',',
                                         header="true")
        top_negative = context.read.load('top_negative.csv/*.csv',
                                         format='csv',
                                         sep=',',
                                         header="true")
    except:
        top_positive = finalDF.groupBy('title').agg(
            (F.sum('pos') / F.count('pos')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_negative = finalDF.groupBy('title').agg(
            (F.sum('neg') / F.count('neg')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_positive.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_positive.csv")
        top_negative.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_negative.csv")
for row in partition: sentence = re.sub("<.*?>", "", row.sentence) # 替换掉标签数据 words = cut_sentence(sentence) yield row.article_id, row.channel_id, words # 分词 sqlContext.sql("use article") articleDF = sqlContext.sql("select * from article_data") # words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(["article_id", "channel_id", "words"]) wordsDF = articleDF.rdd.mapPartitions(segmentation, 5).toDF( ["article_id", "channel_id", "words"]) cv_model = CountVectorizerModel.load( "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/CV.model" ) idf_model = IDFModel.load( "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/IDF.model" ) cv_result = cv_model.transform(wordsDF) tfidf_result = idf_model.transform(cv_result) def func(partition): TOPK = 20 for row in partition: # 找到索引与IDF值并进行排序 _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values)) _ = sorted(_, key=lambda x: x[1], reverse=True)
def loadData(data_ingestion,
             train_cv=1,
             binarize=True,
             minDF=3,
             TFIDF_b=False,
             PCA_b=False,
             PCA_k=1000):
    """Load tab-separated (score, comment) files and return a (score, X) DataFrame.

    Args:
        data_ingestion: iterable of input file paths ("score<TAB>comment" lines).
        train_cv: truthy -> fit a new CountVectorizer and persist it to
            ``cvModelPath``; falsy -> load the previously saved model.
        binarize: collapse scores to binary labels (4-5 -> 1, 0-3 -> 0).
        minDF: minimum document frequency for the CountVectorizer.
        TFIDF_b: apply IDF weighting to the count vectors.
        PCA_b: reduce the feature vectors to PCA_k dimensions.
        PCA_k: number of PCA components when PCA_b is set.

    Returns:
        A DataFrame with columns ('score', 'X').
    """
    from functools import reduce

    tokenizer = Tokenizer(inputCol="comment", outputCol="words")

    # Empty (score, X) frame returned as-is when no input files are given.
    schema = StructType([
        StructField('score', IntegerType(), True),
        StructField('X', VectorUDT(), True),
    ])
    X = spark.createDataFrame(sc.emptyRDD(), schema)

    # Read and tokenize every input file; keep the tokenized frames so the
    # CountVectorizer can be fit ONCE on the full corpus. (BUG FIX: the
    # original refit per file, producing incompatible vocabularies across
    # the unioned vectors.)
    tokenized = []
    for filePath in data_ingestion:
        file = sc.textFile(filePath)
        data = file.map(lambda line: line.split("\t")).toDF()
        data = data.withColumnRenamed('_2', 'comment')
        data = data.withColumnRenamed('_1', 'score')
        data = data.withColumn('score', data['score'].cast(IntegerType()))
        tokenized.append(tokenizer.transform(data))

    if tokenized:
        corpus = reduce(lambda a, b: a.union(b), tokenized)
        if train_cv:
            estimator = CountVectorizer(inputCol='words',
                                        outputCol='X',
                                        minDF=minDF)
            cv = estimator.fit(corpus)
            # Replace any stale persisted model (ignore_errors makes the
            # bare try/except of the original redundant).
            shutil.rmtree(cvModelPath, ignore_errors=True)
            cv.save(cvModelPath)
        else:
            cv = CountVectorizerModel.load(cvModelPath)
        # BUG FIX: the original never applied the loaded model when
        # train_cv was falsy, so the 'X' column did not exist.
        X = X.union(cv.transform(corpus).select('score', 'X'))

    if binarize:
        # Scores 4-5 are positive (1); 0-3 are negative (0).
        X_1 = X.where((X.score == 4) | (X.score == 5)).withColumn(
            'score', lit(1))
        X_0 = X.where((X.score == 0) | (X.score == 1) | (X.score == 2)
                      | (X.score == 3)).withColumn('score', lit(0))
        X = X_1.union(X_0)

    if TFIDF_b:
        idf = IDF(inputCol="X", outputCol="X_TFIDF")
        model = idf.fit(X)
        X = model.transform(X)
        X = X.select('score', 'X_TFIDF')
        X = X.withColumnRenamed('X_TFIDF', 'X')

    if PCA_b:
        pca = PCA(k=PCA_k, inputCol="X", outputCol="X_PCA")
        model = pca.fit(X)
        # BUG FIX: the original selected 'X_PCA' without ever calling
        # transform, so the column did not exist.
        X = model.transform(X)
        X = X.select('score', 'X_PCA')
        X = X.withColumnRenamed('X_PCA', 'X')

    return (X)
# Test-time scoring pipeline: vectorize the preprocessed test set with the
# saved CountVectorizer, persist the transformed data, then load the
# random-forest classifier for prediction.
print(
    '********************* after preprocessing testing files *****************'
)
print(
    '********************* after preprocessing testing files *****************'
)
#converting the testing rdd to df
testing_data = rddToDf_testing(testing_data)
print('********************* after converting to df *****************')
print('********************* after converting to df *****************')
print('********************* after converting to df *****************')
#reading the saved countvector model
cv = CountVectorizerModel.load(args.model_path + '/countvector_model')
#transforming test data to count vector
testing_data = cv.transform(testing_data)
#saving the transformed data as parquet file
testing_data.write.parquet(args.model_path + '/testingdata.parquet')
print(
    '********************* after cv transformation *****************')
print(
    '********************* after cv transformation *****************')
print(
    '********************* after cv transformation *****************')
#reading the saved random forest model
rfModel = RandomForestClassificationModel.load(args.model_path + '/rfmodel')
def main(context):
    """End-to-end sentiment pipeline.

    Joins labeled comments with submissions, trains one positive and one
    negative logistic-regression classifier (cached on disk), scores a 5%
    sample of the full corpus, and writes aggregate statistics to CSV.
    """
    # ---- Raw inputs, exposed as SQL temp tables -----------------------
    comments = sqlContext.read.json("comments-minimal.json.bz2")
    comments.registerTempTable("commentsTable")
    submissions = sqlContext.read.json("submissions.json.bz2")
    submissions.registerTempTable("submissionsTable")

    labels = sqlContext.read.format('csv').options(
        header='true', inferSchema='true').load("labeled_data.csv")
    labels.registerTempTable("labelsTable")

    # Keep only comments that carry a manual label.
    df = sqlContext.sql("SELECT commentsTable.* FROM commentsTable INNER JOIN labelsTable ON commentsTable.id = labelsTable.Input_id")

    # n-gram extraction (unigrams/bigrams/trigrams) done by a helper module.
    def unigrams_bigrams_trigrams(text):
        return parsetext.clean_up(text)

    udf_function = udf(unigrams_bigrams_trigrams, ArrayType(StringType()))
    df_2 = df.withColumn("udf_results", udf_function(col("body")))

    # Fit the binary CountVectorizer once; afterwards always reload from disk.
    if not os.path.exists("cvModel"):
        cv = CountVectorizer(inputCol="udf_results", outputCol="features",
                             binary=True, minDF=5.0)
        cv.fit(df_2).write().overwrite().save("cvModel")
    model = CountVectorizerModel.load("cvModel")
    df_3A = model.transform(df_2)
    df_3A.registerTempTable("df_3ATable")

    # Derive the two binary targets from the three-way djt label.
    df_3B = sqlContext.sql("SELECT df_3ATable.*, IF(labelsTable.labeldjt=1, 1, 0) AS pos_label, if(labelsTable.labeldjt=-1, 1, 0) AS neg_label FROM df_3ATable INNER JOIN labelsTable ON df_3ATable.id = labelsTable.Input_id")
    df_3B.registerTempTable("df_3BTable")
    pos = sqlContext.sql('select pos_label as label, features from df_3BTable')
    neg = sqlContext.sql('select neg_label as label, features from df_3BTable')

    if not os.path.exists("www/neg.model") or not os.path.exists("www/pos.model"):
        # Train one logistic-regression model per polarity, each tuned with
        # 2-fold cross-validation over a one-point regParam grid.
        poslr = LogisticRegression(labelCol="label", featuresCol="features",
                                   maxIter=10).setThreshold(0.2)
        neglr = LogisticRegression(labelCol="label", featuresCol="features",
                                   maxIter=10).setThreshold(0.25)
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        posCrossval = CrossValidator(
            estimator=poslr,
            evaluator=posEvaluator,
            estimatorParamMaps=posParamGrid,
            numFolds=2)
        negCrossval = CrossValidator(
            estimator=neglr,
            evaluator=negEvaluator,
            estimatorParamMaps=negParamGrid,
            numFolds=2)
        # 50/50 train/test split for each polarity.
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        posModel.write().overwrite().save("www/pos.model")
        negModel.write().overwrite().save("www/neg.model")

    # Always reload from disk so cached and freshly trained runs agree.
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")

    # Attach each comment to its story (link_id suffix == submission id)
    # and keep a 5% sample for scoring.
    df_4 = sqlContext.sql('SELECT commentsTable.id, commentsTable.body, commentsTable.created_utc, commentsTable.author_flair_text, submissionsTable.title, submissionsTable.pinned, commentsTable.score AS comment_score, submissionsTable.score AS story_score FROM commentsTable INNER JOIN submissionsTable ON RIGHT(commentsTable.link_id, 6)=submissionsTable.id')
    df_4 = df_4.sample(False, 0.05, None)

    # Same featurization as the training data (n-gram UDF + CountVectorizer).
    df_5_1 = df_4.withColumn("udf_results", udf_function(col("body")))
    model = CountVectorizerModel.load("cvModel")
    df_5_2 = model.transform(df_5_1)
    df_5_2.registerTempTable("df_5_2Table")

    # Drop sarcastic ("/s") comments and quoted replies (">...").
    df_5_3 = sqlContext.sql("SELECT * FROM df_5_2Table WHERE df_5_2Table.body NOT LIKE '%/s%' AND df_5_2Table.body NOT LIKE '>%'")
    df_5_3.registerTempTable("df_5_3Table")

    # Score positives first, rename the prediction, then score negatives.
    posResult_1 = posModel.transform(df_5_3)
    posResult_1.registerTempTable("posResult_1Table")
    posResult_2 = sqlContext.sql("SELECT posResult_1Table.id, posResult_1Table.body, posResult_1Table.author_flair_text, posResult_1Table.created_utc, posResult_1Table.title, posResult_1Table.comment_score, posResult_1Table.story_score, posResult_1Table.features, posResult_1Table.pinned, posResult_1Table.prediction AS pos FROM posResult_1Table")
    finalResult_1 = negModel.transform(posResult_2)
    finalResult_1.registerTempTable("finalResult_1Table")
    finalResult_2 = sqlContext.sql("SELECT finalResult_1Table.id, finalResult_1Table.body, finalResult_1Table.created_utc, finalResult_1Table.author_flair_text, finalResult_1Table.title, finalResult_1Table.comment_score, finalResult_1Table.story_score, finalResult_1Table.pos, finalResult_1Table.pinned, finalResult_1Table.prediction AS neg FROM finalResult_1Table")
    finalResult_2.registerTempTable("finalResult_2Table")

    # Cache the fully scored sample as parquet, then query from that copy.
    if not os.path.exists("final.parquet"):
        finalResult_2.write.parquet("final.parquet")
    final = sqlContext.read.parquet("final.parquet")
    final.registerTempTable("finalTable")

    def _dump_csv(result, path):
        # Single-part CSV with header, via the databricks csv writer.
        result.repartition(1).write.format(
            "com.databricks.spark.csv").option("header", "true").save(path)

    # ---- Aggregate statistics (each computed only once) ----------------
    if not os.path.exists("submissions.csv"):
        question1 = sqlContext.sql("SELECT (100 * sum(pos) / COUNT(*)) AS percent_pos, (100 * sum(neg) / COUNT(*)) AS percent_neg FROM finalTable")
        _dump_csv(question1, "submissions.csv")
    if not os.path.exists("days.csv"):
        question2 = sqlContext.sql("SELECT DATE(from_unixtime(finalTable.created_utc)) AS date, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY date ORDER BY date")
        _dump_csv(question2, "days.csv")
    if not os.path.exists("states.csv"):
        question3 = sqlContext.sql("SELECT finalTable.author_flair_text AS place, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY place ORDER BY place")
        _dump_csv(question3, "states.csv")
    if not os.path.exists("comment.csv"):
        question4_comment = sqlContext.sql("SELECT finalTable.comment_score AS comment_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY comment_score ORDER BY comment_score")
        _dump_csv(question4_comment, "comment.csv")
    if not os.path.exists("story.csv"):
        question4_story = sqlContext.sql("SELECT finalTable.story_score AS story_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY story_score ORDER BY story_score")
        _dump_csv(question4_story, "story.csv")