def predict_with_multiple_version(df, versions, model_date, spid):
    # Build one feature vector column per model version.
    for version_name in versions:
        version_infor = MODEL_VERSION_INFO[version_name]
        convmaps = get_convmap_dics(version_name, model_date)
        for k in convmaps[str(spid)].keys():
            df = df.withColumn(
                k + '_' + version_name,
                categorical_conv(convmaps[str(spid)][k])(col(k)))
        name_features = version_infor['func_feature_names'](df)
        name_features = convert_name_features(name_features, version_name,
                                              list(convmaps[str(spid)]))
        df = VectorAssembler(
            inputCols=name_features,
            outputCol='features_%s' % version_name).transform(df)
    print(df.columns)

    # Score every version, keeping only the positive-class probability.
    predicted_list = []
    for version_name in versions:
        model = get_model(version_name, spid, model_date)
        prob_col_name = 'prob_%s' % version_name
        df = df.withColumn('features', col('features_%s' % version_name))
        df = model.transform(df).withColumn(
            prob_col_name,
            UserDefinedFunction(lambda x: x.tolist()[1],
                                DoubleType())(col('probability')))
        predicted_list.append(version_name)
        # Prune the transformer outputs so the next version's transform does
        # not clash with the prediction/probability columns just created.
        df = df.select(['is_click', 'dsp_id'] +
                       ['prob_%s' % v for v in predicted_list] +
                       ['features_%s' % v for v in versions])
    df = df.select(['is_click', 'dsp_id'] + ['prob_%s' % v for v in versions])
    return df

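# Illustrative call of the multi-version scorer above; the input DataFrame,
# version names, model_date and spid below are placeholders, not values from
# the original project.
scored = predict_with_multiple_version(
    df=events_df, versions=['v1', 'v2'], model_date='20240101', spid=17)
scored.select('is_click', 'dsp_id', 'prob_v1', 'prob_v2').show(5)
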
def mutual_info(sdf, colnames):
    check_columns(sdf, colnames)
    n = len(colnames)
    probs = []
    for i in range(n):
        probs.append(distribution(sdf, colnames[i]))

    res = np.zeros(shape=(n, n))
    for i in range(n):
        for j in range(i, n):
            tdf = VectorAssembler(inputCols=[colnames[i], colnames[j]],
                                  outputCol='__vectors').transform(sdf)
            tdf = distribution(tdf, '__vectors')
            tdf = disassemble(dense_to_array(tdf, '__col', '__features'),
                              '__features')
            tdf = tdf.join(probs[i].toDF('__features_0', '__p0'),
                           on='__features_0')
            tdf = tdf.join(probs[j].toDF('__features_1', '__p1'),
                           on='__features_1')
            mi = tdf.select(
                F.sum(
                    F.expr(
                        'log2(__probability / (__p0 * __p1)) * __probability'))
            ).take(1)[0][0]
            res[i, j] = mi
            res[j, i] = mi
    return pd.DataFrame(res, index=colnames, columns=colnames)

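# Minimal usage sketch for mutual_info; assumes a SparkSession named `spark`
# and that the helper functions used above (check_columns, distribution,
# dense_to_array, disassemble) are defined in the same module. The data and
# column names are made up for illustration.
toy = spark.createDataFrame(
    [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0), (1.0, 1.0), (1.0, 1.0)],
    ['clicked', 'converted'])
mi_matrix = mutual_info(toy, ['clicked', 'converted'])
print(mi_matrix)  # symmetric pandas DataFrame indexed by column name
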
def predictionFromValues(query):
    # Split to values
    values = query.split(",")

    # Prepare dictionary for feature dataframe from web form values
    features_dict = [{
        "level_index": float(values[0]),
        "gender_index": float(values[1]),
        "thumbs_up_sum": int(values[2]),
        "thumbs_down_sum": int(values[3]),
        "nextsong_sum": int(values[4]),
        "downgrade_sum": int(values[5]),
        "length_sum": float(values[6]),
        "sessionId_count": int(values[7]),
    }]

    # Create a user row to use in VectorAssembler
    df_user_row = spark.createDataFrame(features_dict)

    # Create feature dataframe with VectorAssembler
    df_features = VectorAssembler(
        inputCols=["level_index", "gender_index", "thumbs_up_sum",
                   "thumbs_down_sum", "nextsong_sum", "downgrade_sum",
                   "length_sum", "sessionId_count"],
        outputCol="features").transform(df_user_row)

    # Select features
    df_features = df_features.select("features")

    # Predict on model
    prediction = model.transform(df_features)
    return prediction.select("prediction").collect()[0][0]

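# Illustrative call; the comma-separated query follows the order of the eight
# features assembled above, and the values here are made up.
sample_query = "1.0,0.0,12,3,250,1,61234.5,18"
print(predictionFromValues(sample_query))  # e.g. 0.0 or 1.0
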
def encoding_data(df):
    column_names = df.schema.names
    column_indexes = [item + "_index" for item in column_names]
    indexers = [StringIndexer(inputCol=col, outputCol=col + "_index").fit(df)
                for col in column_names]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df).cache()
    df = VectorAssembler(inputCols=column_indexes,
                         outputCol="corr_vec").transform(df).cache()
    string_indexer_df = VectorAssembler(
        inputCols=column_indexes[1:],
        outputCol="feature_vec").transform(df).cache()
    strong_index = get_correlation_matrix(df, column_names, 0.6)
    dc_column_indexes = copy.copy(column_indexes)
    for corr_pair in strong_index:
        if corr_pair[0] != 0:
            # This correlated pair is between two features (not the label);
            # drop one of the two indexed columns from the feature list.
            dc_column_indexes.pop(corr_pair[0])
    column_vecs = [item + "_vec" for item in dc_column_indexes]
    df = OneHotEncoderEstimator(
        inputCols=dc_column_indexes,
        outputCols=column_vecs).fit(df).transform(df).cache()
    df = VectorAssembler(inputCols=column_vecs,
                         outputCol="features").transform(df).cache()
    return df.select("class_index", "features"), string_indexer_df

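# Portability note (an assumption about the Spark version, not part of the
# original code): OneHotEncoderEstimator is the Spark 2.3/2.4 API. On Spark
# 3.x the same step, with the same arguments, is spelled:
#     from pyspark.ml.feature import OneHotEncoder
#     encoder = OneHotEncoder(inputCols=dc_column_indexes, outputCols=column_vecs)
#     df = encoder.fit(df).transform(df)
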
# In[54]:

#List of all the columns
col = netflow.columns
col.remove("class_attack")
col

# ## Vectorization

# In[55]:

#Vectorizing the input features
from pyspark.ml.feature import VectorAssembler

conn_vect = VectorAssembler(inputCols=col,
                            outputCol="features").transform(netflow)
conn_vect.select("features", "class_attack").limit(5).toPandas()

# ## Scaling

# In[56]:

#Applying Min-Max scaling
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features")

# In[57]:

mm = scaler.fit(conn_vect)
conn_scale = mm.transform(conn_vect)
conn_scale.select("minmax_scaled_features", "class_attack").limit(5).toPandas()

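# A minimal alternative sketch (not in the original notebook): the same
# vectorize-and-scale steps chained in a single Pipeline, reusing `netflow`
# and the `col` list from the cells above.
from pyspark.ml import Pipeline

prep_pipeline = Pipeline(stages=[
    VectorAssembler(inputCols=col, outputCol="features"),
    MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features"),
])
conn_scale_alt = prep_pipeline.fit(netflow).transform(netflow)
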
                                    maxDepth=10)
model = classifier.fit(train_data)

# Transform the test data using the model to get predictions
predicted_test_data = model.transform(test_data)

# Evaluate the model performance
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='gender', predictionCol='prediction', metricName='f1')
print("F1 score: {}".format(evaluator_f1.evaluate(predicted_test_data)))

evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol='gender', predictionCol='prediction', metricName='accuracy')
print("Accuracy: {}".format(evaluator_accuracy.evaluate(predicted_test_data)))

# Predict some new records
# In a real case, use VectorAssembler to transform the df into a features column
data_to_predict = final_data.select("features").limit(10)
model.transform(data_to_predict).show()

# Save the model
model.save("hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

# Read the saved model
model_reloaded = RandomForestClassificationModel.load(
    "hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

# Predict some new records
# In a real case, use VectorAssembler to transform the df into a features column
data_to_predict = final_data.select("features").limit(10)
model_reloaded.transform(data_to_predict).show()

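# Hedged sketch of the "use VectorAssembler" comment above for raw incoming
# records; `new_logs_df` and its column names are hypothetical placeholders,
# not columns from the original dataset.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["age", "page_views", "session_length"],  # assumed raw columns
    outputCol="features")
new_records = assembler.transform(new_logs_df).select("features")
model_reloaded.transform(new_records).select("prediction").show()
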
def annotate_pval_dataset(self, cur_df):
    import pyspark
    try:
        # Re-use previously materialized train/test sets if they exist.
        tr_inst = self.spark.read.parquet(self.training_temp_dir)
        te_inst = self.spark.read.parquet(self.testing_temp_dir)
        return tr_inst, te_inst
    except pyspark.sql.utils.AnalysisException as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        self.logger.info(message)
        self.logger.info("PROCESS")
        self.logger.debug("NOTEXISTS ANNOTATE_FILE")
        self.logger.debug("RUN_PROCESS")
    except:
        self.logger.info("TEST_PURPOSE")

    from pyspark.ml.feature import VectorAssembler
    postfix = self.postfix.format(self.sel_top)
    obs_df = cur_df
    cur_cols = obs_df.columns
    for i in self.non_feature_column:
        cur_cols.remove(i)
    self.logger.debug("feature_columns")
    cur_cols = sorted(cur_cols)
    self.logger.debug(cur_cols)

    import json
    json.dump({"non_demo_features": cur_cols},
              open(self.json_feature_dump_loc, "w"))

    obs_df = VectorAssembler(
        inputCols=cur_cols,
        outputCol="features_imputed").transform(obs_df)
    cur_time_list = obs_df.select("ID", "TIME_SPAN")
    of_annotated = obs_df
    of_excl_training = dict()
    demo_feature = self.add_demo()
    of_annotated = VectorAssembler(
        inputCols=["features_imputed", "demo_feature"],
        outputCol="features").transform(
            of_annotated.join(demo_feature, "ID"))
    of_annotated.show()

    from pyspark.sql.functions import col, lit, when
    self.logger.debug("ANNOTATED")
    cur_test_ids = self.get_target_test_id()
    self.logger.debug(cur_test_ids)
    # TODO CHECK why I put 'why 0' comment over here?
    self.logger.debug(len(cur_test_ids))
    tr_inst, te_inst = self.cur_annotator.prep_TR_TE(
        of_annotated, test_id_list=cur_test_ids)
    self.logger.debug("IDS")
    self.logger.debug(
        tr_inst.select("ID").distinct().count(),
        te_inst.select("ID").distinct().count())
    self.logger.debug("TR_TE_CNT:{0}_{1}".format(tr_inst.count(),
                                                 te_inst.count()))
    train_data_ID = tr_inst.select("ID").distinct().rdd.flatMap(
        list).collect()
    testing_data_ID = te_inst.select("ID").distinct().rdd.flatMap(
        list).collect()
    self.action_df.show()
    train_action_df = self.action_df.where(
        col("ID").isin(train_data_ID)).persist()
    self.logger.debug(train_action_df.select("ID").distinct().count())
    train_terminal_outcome = self.terminal_outcome.where(
        col("ID").isin(train_data_ID)).persist()
    self.logger.debug(
        train_terminal_outcome.select("ID").distinct().count())
    intv_w_p_val = self.identify_relevant_action(
        train_action_df, train_terminal_outcome,
        tr_inst.select("ID").distinct().count())
    intv_w_p_val.join(
        self.def_df.where(col("SOURCE").isin(["CPT", "MED", "PROC"])),
        self.itemid).orderBy("p_val").show(100, truncate=False)

    from pyspark.sql.functions import sum, rand, max, lit
    from pyspark.ml.feature import VectorAssembler
    cur_annot_topk = self.sel_top
    self.action_df.show()
    self.terminal_outcome.show()
    annot_df = self.action_df.join(self.terminal_outcome, "ID").persist()
    annot_df.show()
    pos_inst_dict = dict()

    from pyspark.sql.functions import count
    for cur_of in [self.target_disch_col]:
        # For debug purposes, skip if the target outcome is not identified
        self.logger.debug(cur_of)
        intv_w_p_val.where("DISCH_DX == '{0}'".format(cur_of)).orderBy(
            col("p_val").cast("double")).show(50, truncate=False)
        target_annot_criteria = intv_w_p_val.where(
            "DISCH_DX == '{0}'".format(cur_of)).orderBy(
                col("p_val").cast("double")).limit(cur_annot_topk)
        target_annot_criteria.write.save(self.annot_intv_dir.format(
            cur_of, cur_annot_topk), mode="overwrite")
        target_annot_criteria = target_annot_criteria.select(
            self.itemid).rdd.flatMap(list).collect()
        if len(target_annot_criteria) == 0:
            self.logger.info(
                "NO TERMINAL DX {0} identified from pts".format(cur_of))
            pos_inst_dict[cur_of] = None
            continue
        self.logger.debug(target_annot_criteria)
        self.logger.debug(len(target_annot_criteria))
        self.logger.debug("selected intv!!")
        self.def_df.where(col(
            self.itemid).isin(target_annot_criteria)).show(cur_annot_topk,
                                                           truncate=False)
        pos_inst_dict[cur_of] = annot_df.where(
            (col(self.itemid).isin(target_annot_criteria)) &
            (col("DISCH_DX") == cur_of))\
            .select("ID",
                    col("TIME_OBS").cast("date").alias("TIME_OBS"),
                    lit("1").cast("double").alias(
                        "{0}_label".format(cur_of)))\
            .distinct().persist()
        pos_inst_dict[cur_of].groupBy("{0}_label".format(cur_of)).agg(
            count("*")).show()

        from pyspark.sql.functions import broadcast
        true_inst = annot_df.where(
            (col(self.itemid).isin(target_annot_criteria)) &
            (col("DISCH_DX") == cur_of))
        # Patients who had the target discharge diagnosis but none of the
        # selected interventions are excluded from labeling.
        excl_id = annot_df\
            .withColumn("IS_TARGET_OF",
                        when(col("DISCH_DX") == cur_of,
                             lit("1").cast("double"))
                        .otherwise(lit("0").cast("double")))\
            .withColumn("IS_REL_INTV",
                        when(col(self.itemid).isin(target_annot_criteria),
                             lit("1").cast("double"))
                        .otherwise(lit("0").cast("double")))\
            .groupBy("ID")\
            .agg(sum("IS_TARGET_OF").alias("SUM_IS_TARGET_OF"),
                 sum("IS_REL_INTV").alias("SUM_IS_REL_INTV"))\
            .where("(SUM_IS_TARGET_OF <> 0) AND (SUM_IS_REL_INTV == 0)")\
            .select("ID").distinct().rdd.flatMap(list).collect()
        self.logger.debug("NUM_PTS_EXCLUDED:{0}".format(len(excl_id)))
        self.logger.debug("TRAINING_INST_COUNT:{0}".format(tr_inst.count()))
        tr_inst = tr_inst\
            .withColumn("TIME_OBS", col("TIME_SPAN.TIME_TO").cast("date"))\
            .withColumn("{0}_excl".format(cur_of),
                        col("ID").isin(excl_id).cast("double"))\
            .repartition("ID", "TIME_OBS")\
            .join(broadcast(pos_inst_dict[cur_of]), ["ID", "TIME_OBS"],
                  "left_outer")\
            .fillna(value=0.0, subset=["{0}_label".format(cur_of)])\
            .persist()
        print(tr_inst.count())
        tr_inst.groupBy("{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()
        te_inst = te_inst\
            .withColumn("TIME_OBS", col("TIME_SPAN.TIME_TO").cast("date"))\
            .withColumn("{0}_excl".format(cur_of),
                        col("ID").isin(excl_id).cast("double"))\
            .repartition("ID", "TIME_OBS")\
            .join(broadcast(pos_inst_dict[cur_of]), ["ID", "TIME_OBS"],
                  "left_outer")\
            .fillna(value=0.0, subset=["{0}_label".format(cur_of)])\
            .persist()
        print(te_inst.count())
        te_inst.groupBy("{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()
        tr_inst.groupBy("ID").agg(
            max("{0}_label".format(cur_of)).alias("{0}_label".format(cur_of)),
            max("{0}_excl".format(cur_of)).alias("{0}_excl".format(cur_of))
        ).groupBy("{0}_label".format(cur_of),
                  "{0}_excl".format(cur_of)).agg(count("*")).show()
        te_inst.groupBy("ID").agg(
            max("{0}_label".format(cur_of)).alias("{0}_label".format(cur_of)),
            max("{0}_excl".format(cur_of)).alias("{0}_excl".format(cur_of))
        ).groupBy("{0}_label".format(cur_of),
                  "{0}_excl".format(cur_of)).agg(count("*")).show()

    tr_inst.write.save(self.training_temp_dir, mode="overwrite")
    te_inst.write.save(self.testing_temp_dir, mode="overwrite")
    tr_inst = self.spark.read.parquet(self.training_temp_dir)
    te_inst = self.spark.read.parquet(self.testing_temp_dir)
    # te_inst.show()
    return (tr_inst, te_inst)

def get_user_weights(row):
    # Pad the user weights to a fixed length of 10; fall back to zeros if the
    # row has no usable weights.
    try:
        user_weights = list(row.user_weights +
                            [0] * (10 - len(row.user_weights)))
        weights = sorted(user_weights)[:10]
    except Exception as e:
        weights = [0.0] * 10
    return row.article_id, row.user_id, row.channel_id, Vectors.dense(
        row.articlevector), Vectors.dense(weights), Vectors.dense(
            row.article_weights), int(row.clicked)


train_vector = train_data.rdd.map(get_user_weights).toDF(columns)
train = VectorAssembler().setInputCols(
    columns[2:6]).setOutputCol("features").transform(train_vector)
# train.show()
df = train.select(['user_id', 'article_id', 'clicked', 'features'])
df_array = df.collect()
df = pd.DataFrame(df_array)


def write_to_tfrecords(click_batch, feature_batch):
    # initialize writer
    writer = tf.io.TFRecordWriter(
        "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/TFRecords"
    )
    # Loop over all samples, wrap each one into an Example and write it to this file
    for i in range(len(click_batch)):
        click = click_batch[i]
        feature = feature_batch[i].tostring()
        print(len(feature))

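# A minimal sketch (an assumption, not the project's exact schema) of how each
# (click, feature) pair above could be wrapped into a tf.train.Example and
# written with the TFRecordWriter created in write_to_tfrecords:
import tensorflow as tf

def make_example(click, feature_bytes):
    # Encode the label as an int64 feature and the serialized vector as bytes.
    return tf.train.Example(features=tf.train.Features(feature={
        "label": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(click)])),
        "feature": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[feature_bytes])),
    }))

# Inside the loop: writer.write(make_example(click, feature).SerializeToString())
# and writer.close() once the loop finishes.
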
segment = preds_join
segment = segment.withColumnRenamed("prediction", "churn_prediction")

features = ('Total_Subscriptions', 'Customer_Lifetime',
            'Avg_Subscription_Period', 'Avg_Subscription_Period_inMonths',
            'Total_meals_Regular', 'Total_meals_Exceptional',
            'Avg_Meal_Price', 'sum(ProductDiscount)', 'sum(TotalDiscount)',
            'Total_Price', 'Avg_Price', 'Avg_Credit', 'Total_Credit',
            'Total_Subscriptions_14', 'Total_Subscriptions_15',
            'Total_Subscriptions_16', 'Total_Subscriptions_17',
            'Total_Subscriptions_18', 'Total_Subscriptions_19',
            'Count_Formula_DirectMail', 'Total_Formula_Duration_DirectMail',
            'Count_Formula_Reg', 'Total_Formula_Duration_Reg',
            'count(ComplaintID)')

segment_join = VectorAssembler()\
    .setInputCols(features)\
    .setOutputCol("features")\
    .transform(segment)

#Selecting the features column for clustering
segment_features = segment_join.select("features")

#Creating the kmeans clustering model
kmeans = KMeans().setK(3).setSeed(1)
KM_model = kmeans.fit(segment_join)
clusters = KM_model.clusterCenters()

churn_segmentation = KM_model\
    .transform(segment_join)\
    .select('CustomerID', 'Total_Subscriptions', 'Customer_Lifetime',
            'Avg_Subscription_Period', 'Avg_Subscription_Period_inMonths',
            'Total_meals_Regular', 'Total_meals_Exceptional',
            'Avg_Meal_Price', 'sum(ProductDiscount)', 'sum(TotalDiscount)',
            'Total_Price', 'Total_Credit', 'Region',
            'Total_Subscriptions_14', 'Total_Subscriptions_15',
            'Total_Subscriptions_16', 'Total_Subscriptions_17',
            'Total_Subscriptions_18', 'Total_Subscriptions_19',
            'Count_Formula_DirectMail', 'Total_Formula_Duration_DirectMail',
            'Count_Formula_Reg', 'Total_Formula_Duration_Reg',
            'Avg_Price', 'Avg_Credit', 'count(ComplaintID)',
            col("prediction").alias("clusters"))

# COMMAND ----------

#Visualizing clusters
display(churn_segmentation)

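# Optional sanity check (not in the original notebook): silhouette score for
# the K=3 model above, assuming Spark >= 2.3 where ClusteringEvaluator is
# available.
from pyspark.ml.evaluation import ClusteringEvaluator

clustered = KM_model.transform(segment_join)
silhouette = ClusteringEvaluator(featuresCol="features",
                                 predictionCol="prediction").evaluate(clustered)
print("Silhouette with squared Euclidean distance:", silhouette)
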
outputCol="indexedFeatures", maxCategories=5).fit(test) # indexedLabel -> label labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels) # lr standard scale standardscaler = StandardScaler().setInputCol("features").setOutputCol( "Scaled_features") train = standardscaler.fit(train).transform(train) test = standardscaler.fit(test).transform(test) # modify unbalance train_size = train.select("label").count() negative_num = train.select("label").where("label==0").count() balance_ratio = float(float(negative_num) / float(train_size)) train = train.withColumn( "classWeights", when(train.label == 1, balance_ratio).otherwise(1 - balance_ratio)) #-------------------train-----------------------# #build models md = LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures", weightCol="classWeights", maxIter=50, regParam=0.02) # md = LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10, regParam=0.05)
#List of all the columns
colum = window_dataset.columns
colum.remove("success_failure")
colum

# ### Vectorizing

# In[59]:

#Vectorizing the set of input features
from pyspark.ml.feature import VectorAssembler

df_vect = VectorAssembler(inputCols=colum,
                          outputCol="features").transform(window_dataset)
df_vect.select("features", "success_failure").limit(5).toPandas()

# ### Scaling

# In[60]:

#Applying Min-Max scaling
from pyspark.ml.feature import MinMaxScaler

mm_scaler = MinMaxScaler(inputCol="features",
                         outputCol="minmax_scaled_features")

# In[61]:
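# A minimal continuation (an assumption, mirroring the Min-Max scaling cell in
# the earlier netflow notebook): fit the scaler and apply it to the vectorized
# data.
mm = mm_scaler.fit(df_vect)
df_scale = mm.transform(df_vect)
df_scale.select("minmax_scaled_features", "success_failure").limit(5).toPandas()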