Example #1
def predict_with_multiple_version(df, versions, model_date, spid):
    # Build one assembled feature vector column per model version.
    for version_name in versions:
        version_info = MODEL_VERSION_INFO[version_name]
        convmaps = get_convmap_dics(version_name, model_date)
        # Map each categorical column through the version-specific conversion dict.
        for k in convmaps[str(spid)].keys():
            df = df.withColumn(
                k + '_' + version_name,
                categorical_conv(convmaps[str(spid)][k])(col(k)))
        name_features = version_info['func_feature_names'](df)
        name_features = convert_name_features(name_features, version_name,
                                              list(convmaps[str(spid)]))
        df = VectorAssembler(inputCols=name_features,
                             outputCol='features_%s' %
                             version_name).transform(df)
    print(df.columns)
    predicted_list = []
    for version_name in versions:
        model = get_model(version_name, spid, model_date)
        prob_col_name = 'prob_%s' % version_name
        df = df.withColumn('features', col('features_%s' % version_name))
        df = model.transform(df).withColumn(
            prob_col_name,
            UserDefinedFunction(lambda x: x.tolist()[1],
                                DoubleType())(col('probability')))
        predicted_list.append(version_name)
        df = df.select(['is_click', 'dsp_id'] +
                       ['prob_%s' % v for v in predicted_list] +
                       ['features_%s' % v for v in versions])
    df = df.select(['is_click', 'dsp_id'] + ['prob_%s' % v for v in versions])
    return df
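A hypothetical call sketch for the function above; the input DataFrame, the version names, and the project helpers (MODEL_VERSION_INFO, get_convmap_dics, get_model) are assumptions and are not defined in this snippet.

# Hypothetical usage; assumes a Spark DataFrame carrying the raw feature columns
# plus 'is_click' and 'dsp_id', and the project-level registries used above.
scored_df = predict_with_multiple_version(events_df,
                                          versions=['v1', 'v2'],
                                          model_date='2020-01-01',
                                          spid=101)
scored_df.show(5)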
Example #2
def mutual_info(sdf, colnames):
    # Pairwise mutual information (in bits) between the given columns, estimated
    # from their empirical joint and marginal distributions.
    check_columns(sdf, colnames)
    n = len(colnames)
    probs = []
    for i in range(n):
        probs.append(distribution(sdf, colnames[i]))
    res = np.zeros(shape=(n, n))
    for i in range(n):
        for j in range(i, n):
            tdf = VectorAssembler(inputCols=[colnames[i], colnames[j]],
                                  outputCol='__vectors').transform(sdf)
            tdf = distribution(tdf, '__vectors')
            tdf = disassemble(dense_to_array(tdf, '__col', '__features'),
                              '__features')
            tdf = tdf.join(probs[i].toDF('__features_0', '__p0'),
                           on='__features_0')
            tdf = tdf.join(probs[j].toDF('__features_1', '__p1'),
                           on='__features_1')
            mi = tdf.select(
                F.sum(
                    F.expr(
                        'log2(__probability / (__p0 * __p1)) * __probability'))
            ).take(1)[0][0]
            res[i, j] = mi
            res[j, i] = mi
    return pd.DataFrame(res, index=colnames, columns=colnames)
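A minimal usage sketch, assuming the helper functions used above (check_columns, distribution, dense_to_array, disassemble) are importable and that the columns hold discrete values; the column names are placeholders.

# Hypothetical usage; returns a symmetric pandas DataFrame of pairwise MI scores.
mi = mutual_info(sdf, ['protocol', 'service', 'flag'])
print(mi)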
Example #3
File: run.py  Project: thomasrobertz/mooc
def predictionFromValues(query):

	# Split to values
	values = query.split(",")

	# Prepare dictionary for feature dataframe from web form values
	features_dict = [{
		"level_index": float(values[0]),
		"gender_index": float(values[1]),
		"thumbs_up_sum": int(values[2]),
		"thumbs_down_sum": int(values[3]),
		"nextsong_sum": int(values[4]),
		"downgrade_sum": int(values[5]),
		"length_sum": float(values[6]),
		"sessionId_count": int(values[7]),
	}]

	# Create a user row to use in VectorAssembler
	df_user_row = spark.createDataFrame(features_dict)

	# Create feature dataframe with VectorAssembler
	df_features = VectorAssembler(
		inputCols=["level_index", "gender_index", "thumbs_up_sum", "thumbs_down_sum",
				   "nextsong_sum", "downgrade_sum", "length_sum", "sessionId_count"],
		outputCol="features").transform(df_user_row)

	# Select features
	df_features = df_features.select("features")

	# Predict on model
	prediction = model.transform(df_features)
	return prediction.select("prediction").collect()[0][0]
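A usage sketch for the handler above, assuming run.py has already created the global spark session and loaded the fitted model; the sample values are made up.

# Hypothetical usage; the query string carries the eight feature values in order.
query = "1.0,0.0,25,3,480,1,98765.4,34"
print(predictionFromValues(query))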
Example #4
def encoding_data(df):
    column_names = df.schema.names
    column_indexes = [item + "_index" for item in column_names]
    indexers = [StringIndexer(inputCol=c, outputCol=c + "_index").fit(df) for c in column_names]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df).cache()

    df = VectorAssembler(inputCols=column_indexes, outputCol="corr_vec").transform(df).cache()
    string_indexer_df = VectorAssembler(inputCols=column_indexes[1:], outputCol="feature_vec").transform(df).cache()

    strong_index = get_correlation_matrix(df, column_names, 0.6)
    # A pair whose first index is non-zero is a correlation between two features
    # (index 0 is the class column); drop one column from each such pair. Collect
    # the indexes first so that removals do not shift later positions.
    drop_indexes = {corr_pair[0] for corr_pair in strong_index if corr_pair[0] != 0}
    dc_column_indexes = [c for i, c in enumerate(column_indexes) if i not in drop_indexes]
    column_vecs = [item + "_vec" for item in dc_column_indexes]
    df = OneHotEncoderEstimator(inputCols=dc_column_indexes, outputCols=column_vecs).fit(df).transform(df).cache()
    df = VectorAssembler(inputCols=column_vecs, outputCol="features").transform(df).cache()
    return df.select("class_index", "features"), string_indexer_df
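A possible call sketch, assuming an all-categorical input DataFrame whose first column is the class label and that get_correlation_matrix is defined elsewhere in the project.

# Hypothetical usage; 'raw_df' is an assumed categorical DataFrame.
encoded_df, indexed_df = encoding_data(raw_df)
train_df, test_df = encoded_df.randomSplit([0.8, 0.2], seed=42)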
Example #5
# In[54]:

#List of all the columns
col = netflow.columns
col.remove("class_attack")
col

# ## Vectorization

# In[55]:

#Vectorizing the input features
from pyspark.ml.feature import VectorAssembler
conn_vect = VectorAssembler(inputCols=col,
                            outputCol="features").transform(netflow)
conn_vect.select("features", "class_attack").limit(5).toPandas()

# ## Scaling

# In[56]:

#Applying Min-Max scaling
from pyspark.ml.feature import MinMaxScaler
scaler = MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features")

# In[57]:

mm = scaler.fit(conn_vect)
conn_scale = mm.transform(conn_vect)
conn_scale.select("minmax_scaled_features", "class_attack").limit(5).toPandas()
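The two steps above can also be chained in a Pipeline so the fitted scaler is reusable on new batches of netflow data; a minimal sketch, assuming the same DataFrame and column list:

# Sketch only: wrap assembler + scaler in a single Pipeline.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

pipeline = Pipeline(stages=[
    VectorAssembler(inputCols=col, outputCol="features"),
    MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features"),
])
scaled = pipeline.fit(netflow).transform(netflow)
scaled.select("minmax_scaled_features", "class_attack").limit(5).toPandas()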
Example #6
                                        maxDepth=10)
    model = classifier.fit(train_data)

    # Transform the test data using the model to get predictions
    predicted_test_data = model.transform(test_data)

    # Evaluate the model performance
    evaluator_f1 = MulticlassClassificationEvaluator(
        labelCol='gender', predictionCol='prediction', metricName='f1')
    print("F1 score: {}", evaluator_f1.evaluate(predicted_test_data))

    evaluator_accuracy = MulticlassClassificationEvaluator(
        labelCol='gender', predictionCol='prediction', metricName='accuracy')
    print("Accuracy: {}", evaluator_accuracy.evaluate(predicted_test_data))

    # Predict some new records
    # In real case, use VectorAssembler to transform df for features column
    data_to_predict = final_data.select("features").limit(10)
    model.transform(data_to_predict).show()

    # Save the model
    model.save("hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

    # Read the saved model
    model_reloaded = RandomForestClassificationModel.load(
        "hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

    # Predict some new records
    # In real case, use VectorAssembler to transform df for features column
    data_to_predict = final_data.select("features").limit(10)
    model_reloaded.transform(data_to_predict).show()
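The comments above note that, in a real case, new records would first need VectorAssembler to build the features column. A sketch of that step, with made-up column names and values (the actual feature set for this model is not shown in the fragment):

# Sketch only: assemble hypothetical raw columns into 'features' before predicting.
from pyspark.ml.feature import VectorAssembler

new_records = spark.createDataFrame(
    [(3, 12.5, 0.8), (7, 2.0, 0.1)],
    ["page_views", "session_minutes", "purchase_ratio"])  # hypothetical columns
assembled = VectorAssembler(
    inputCols=["page_views", "session_minutes", "purchase_ratio"],
    outputCol="features").transform(new_records)
model_reloaded.transform(assembled).show()  # assumes the model was trained on these columns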
Example #7
    def annotate_pval_dataset(self, cur_df):
        import pyspark
        try:
            tr_inst = self.spark.read.parquet(self.training_temp_dir)
            te_inst = self.spark.read.parquet(self.testing_temp_dir)
            return tr_inst, te_inst
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")
            self.logger.debug("NOTEXISTS ANNOTATE_FILE")
            self.logger.debug("RUN_PROCESS")
        except Exception:
            self.logger.info("TEST_PURPOSE")

        from pyspark.ml.feature import VectorAssembler
        postfix = self.postfix.format(self.sel_top)
        obs_df = cur_df

        cur_cols = obs_df.columns
        for i in self.non_feature_column:
            cur_cols.remove(i)
            self.logger.debug("feature_columns")
        cur_cols = sorted(cur_cols)
        self.logger.debug(cur_cols)
        import json

        json.dump({"non_demo_features": cur_cols},
                  open(self.json_feature_dump_loc, "w"))

        obs_df = VectorAssembler(
            inputCols=cur_cols, outputCol="features_imputed").transform(obs_df)

        cur_time_list = obs_df.select("ID", "TIME_SPAN")
        of_annotated = obs_df
        of_excl_training = dict()

        demo_feature = self.add_demo()

        of_annotated = VectorAssembler(
            inputCols=["features_imputed", "demo_feature"],
            outputCol="features").transform(
                of_annotated.join(demo_feature, "ID"))

        of_annotated.show()

        from pyspark.sql.functions import col, lit, when
        self.logger.debug("ANNOTATED")

        cur_test_ids = self.get_target_test_id()
        self.logger.debug(cur_test_ids)
        # TODO CHECK why I put 'why 0' comment over here?
        self.logger.debug(len(cur_test_ids))
        tr_inst, te_inst = self.cur_annotator.prep_TR_TE(
            of_annotated, test_id_list=cur_test_ids)

        self.logger.debug("IDS")
        self.logger.debug("TR/TE distinct ID counts: {0}/{1}".format(
            tr_inst.select("ID").distinct().count(),
            te_inst.select("ID").distinct().count()))

        self.logger.debug("TR_TE_CNT:{0}_{1}".format(tr_inst.count(),
                                                     te_inst.count()))

        train_data_ID = tr_inst.select("ID").distinct().rdd.flatMap(
            list).collect()

        testing_data_ID = te_inst.select("ID").distinct().rdd.flatMap(
            list).collect()

        self.action_df.show()

        train_action_df = self.action_df.where(
            col("ID").isin(train_data_ID)).persist()

        self.logger.debug(train_action_df.select("ID").distinct().count())

        train_terminal_outcome = self.terminal_outcome.where(
            col("ID").isin(train_data_ID)).persist()

        self.logger.debug(
            train_terminal_outcome.select("ID").distinct().count())

        intv_w_p_val = self.identify_relevant_action(
            train_action_df, train_terminal_outcome,
            tr_inst.select("ID").distinct().count())
        intv_w_p_val.join(
            self.def_df.where(col("SOURCE").isin(["CPT", "MED", "PROC"])),
            self.itemid).orderBy("p_val").show(100, truncate=False)

        from pyspark.sql.functions import sum, rand, max, lit
        from pyspark.ml.feature import VectorAssembler
        cur_annot_topk = self.sel_top

        self.action_df.show()
        self.terminal_outcome.show()

        annot_df = self.action_df.join(self.terminal_outcome, "ID").persist()
        annot_df.show()
        pos_inst_dict = dict()
        from pyspark.sql.functions import count
        for cur_of in [self.target_disch_col]:
            # For debug purpose, pass if target_of is not identified
            self.logger.debug(cur_of)
            intv_w_p_val.where("DISCH_DX == '{0}'".format(cur_of)).orderBy(
                col("p_val").cast("double")).show(50, truncate=False)
            target_annot_criteria = intv_w_p_val.where(
                "DISCH_DX == '{0}'".format(cur_of)).orderBy(
                    col("p_val").cast("double")).limit(cur_annot_topk)
            target_annot_criteria.write.save(self.annot_intv_dir.format(
                cur_of, cur_annot_topk),
                                             mode="overwrite")
            target_annot_criteria = target_annot_criteria.select(
                self.itemid).rdd.flatMap(list).collect()
            if len(target_annot_criteria) == 0:
                self.logger.info(
                    "NO TERMINAL DX {0} identified from pts".format(cur_of))
                pos_inst_dict[cur_of] = None
                continue
            self.logger.debug(target_annot_criteria)
            self.logger.debug(len(target_annot_criteria))
            self.logger.debug("selected intv!!")
            self.def_df.where(col(
                self.itemid).isin(target_annot_criteria)).show(cur_annot_topk,
                                                               truncate=False)
            pos_inst_dict[cur_of] = annot_df.where((col(self.itemid).isin(target_annot_criteria)) & (col("DISCH_DX") == cur_of))\
                .select("ID", col("TIME_OBS").cast("date").alias("TIME_OBS"), lit("1").cast("double").alias("{0}_label".format(cur_of)))\
                .distinct().persist()
            pos_inst_dict[cur_of].groupBy("{0}_label".format(cur_of)).agg(
                count("*")).show()
            from pyspark.sql.functions import broadcast

            true_inst = annot_df.where(
                (col(self.itemid).isin(target_annot_criteria))
                & (col("DISCH_DX") == cur_of))
            excl_id = annot_df.withColumn("IS_TARGET_OF",when(col("DISCH_DX") ==cur_of,lit("1").cast("double")).otherwise(lit("0").cast("double")))\
                .withColumn("IS_REL_INTV", when(col(self.itemid).isin(target_annot_criteria), lit("1").cast("double")).otherwise(lit("0").cast("double")))\
                .groupBy("ID").agg(sum("IS_TARGET_OF").alias("SUM_IS_TARGET_OF"),sum("IS_REL_INTV").alias("SUM_IS_REL_INTV"))\
                .where("(SUM_IS_TARGET_OF <> 0) AND (SUM_IS_REL_INTV == 0)").select("ID").distinct().rdd.flatMap(list).collect()
            self.logger.debug("NUM_PTS_EXCLUDED:{0}".format(len(excl_id)))
            self.logger.debug("TRAINING_INST_COUNT:{0}".format(
                tr_inst.count()))
            tr_inst = tr_inst.withColumn("TIME_OBS",col("TIME_SPAN.TIME_TO").cast("date"))\
                .withColumn("{0}_excl".format(cur_of), col("ID").isin(excl_id).cast("double")).repartition("ID","TIME_OBS")\
                .join(broadcast(pos_inst_dict[cur_of]),["ID","TIME_OBS"],"left_outer").fillna(value=0.0,subset=["{0}_label".format(cur_of)]).persist()
            print(tr_inst.count())
            tr_inst.groupBy("{0}_label".format(cur_of),
                            "{0}_excl".format(cur_of)).agg(count("*")).show()
            te_inst = te_inst.withColumn("TIME_OBS",col("TIME_SPAN.TIME_TO").cast("date"))\
                .withColumn("{0}_excl".format(cur_of), col("ID").isin(excl_id).cast("double")).repartition("ID","TIME_OBS")\
                .join(broadcast(pos_inst_dict[cur_of]),["ID","TIME_OBS"],"left_outer").fillna(value=0.0, subset=["{0}_label".format(cur_of)]).persist()
            print(te_inst.count())
            te_inst.groupBy("{0}_label".format(cur_of),
                            "{0}_excl".format(cur_of)).agg(count("*")).show()

            tr_inst.groupBy("ID").agg(
                max("{0}_label".format(cur_of)).alias(
                    "{0}_label".format(cur_of)),
                max("{0}_excl".format(cur_of)).alias(
                    "{0}_excl".format(cur_of))).groupBy(
                        "{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()
            te_inst.groupBy("ID").agg(
                max("{0}_label".format(cur_of)).alias(
                    "{0}_label".format(cur_of)),
                max("{0}_excl".format(cur_of)).alias(
                    "{0}_excl".format(cur_of))).groupBy(
                        "{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

        tr_inst.write.save(self.training_temp_dir, mode="overwrite")
        te_inst.write.save(self.testing_temp_dir, mode="overwrite")

        tr_inst = self.spark.read.parquet(self.training_temp_dir)
        te_inst = self.spark.read.parquet(self.testing_temp_dir)
        #te_inst.show()

        return (tr_inst, te_inst)
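A standalone sketch of the assembly pattern used in this method: VectorAssembler accepts vector-typed input columns, so the imputed feature vector and the demographics vector can be concatenated into one "features" column. The toy data and the spark session here are assumptions.

# Sketch only: concatenating two vector columns with VectorAssembler.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

obs = spark.createDataFrame(
    [(1, Vectors.dense([0.2, 3.0])), (2, Vectors.dense([1.5, 0.0]))],
    ["ID", "features_imputed"])
demo = spark.createDataFrame(
    [(1, Vectors.dense([65.0, 1.0])), (2, Vectors.dense([42.0, 0.0]))],
    ["ID", "demo_feature"])
combined = VectorAssembler(
    inputCols=["features_imputed", "demo_feature"],
    outputCol="features").transform(obs.join(demo, "ID"))
combined.show(truncate=False)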
Example #8
        user_weights = list(row.user_weights + [0] *
                            (10 - len(row.user_weights)))
        weights = sorted(user_weights)[:10]
    except Exception as e:
        weights = [0.0] * 10
    return row.article_id, row.user_id, row.channel_id, Vectors.dense(
        row.articlevector), Vectors.dense(weights), Vectors.dense(
            row.article_weights), int(row.clicked)


train_vector = train_data.rdd.map(get_user_weights).toDF(columns)
train = VectorAssembler().setInputCols(
    columns[2:6]).setOutputCol("features").transform(train_vector)
# train.show()

df = train.select(['user_id', 'article_id', 'clicked', 'features'])
df_array = df.collect()
df = pd.DataFrame(df_array)


def write_to_tfrecords(click_batch, feature_batch):
    # initialize writer
    writer = tf.io.TFRecordWriter(
        "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/TFRecords"
    )

    # Loop over all samples, wrap each one into an Example, and write it to this file
    for i in range(len(click_batch)):
        click = click_batch[i]
        feature = feature_batch[i].tobytes()  # .tostring() is deprecated in NumPy
        print(len(feature))
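        # The loop body is cut off in this excerpt. A typical continuation (an
        # assumption, not the original project's code) wraps each label/feature
        # pair into a tf.train.Example and writes it through the TFRecord writer.
        example = tf.train.Example(features=tf.train.Features(feature={
            "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[int(click)])),
            "feature": tf.train.Feature(bytes_list=tf.train.BytesList(value=[feature])),
        }))
        writer.write(example.SerializeToString())
    writer.close()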
Example #9
segment = preds_join
segment = segment.withColumnRenamed("prediction", "churn_prediction")

features = ('Total_Subscriptions', 'Customer_Lifetime','Avg_Subscription_Period', 'Avg_Subscription_Period_inMonths', 'Total_meals_Regular', 'Total_meals_Exceptional', 'Avg_Meal_Price',
            'sum(ProductDiscount)', 'sum(TotalDiscount)', 'Total_Price', 'Avg_Price', 'Avg_Credit', 'Total_Credit', 'Total_Subscriptions_14', 'Total_Subscriptions_15', 'Total_Subscriptions_16',
            'Total_Subscriptions_17', 'Total_Subscriptions_18', 'Total_Subscriptions_19', 'Count_Formula_DirectMail', 'Total_Formula_Duration_DirectMail', 'Count_Formula_Reg',
            'Total_Formula_Duration_Reg', 'count(ComplaintID)')

segment_join =  VectorAssembler()\
                            .setInputCols(features)\
                            .setOutputCol("features")\
                            .transform(segment)

#Selecting the features column for clustering
segment_features = segment_join.select("features")

#Creating the kmeans clustering model
kmeans = KMeans().setK(3).setSeed(1)
KM_model = kmeans.fit(segment_join)
clusters = KM_model.clusterCenters()
churn_segmentation = KM_model.transform(segment_join).select(
    'CustomerID', 'Total_Subscriptions', 'Customer_Lifetime', 'Avg_Subscription_Period',
    'Avg_Subscription_Period_inMonths', 'Total_meals_Regular', 'Total_meals_Exceptional',
    'Avg_Meal_Price', 'sum(ProductDiscount)', 'sum(TotalDiscount)', 'Total_Price',
    'Total_Credit', 'Region', 'Total_Subscriptions_14', 'Total_Subscriptions_15',
    'Total_Subscriptions_16', 'Total_Subscriptions_17', 'Total_Subscriptions_18',
    'Total_Subscriptions_19', 'Count_Formula_DirectMail', 'Total_Formula_Duration_DirectMail',
    'Count_Formula_Reg', 'Total_Formula_Duration_Reg', 'Avg_Price', 'Avg_Credit',
    'count(ComplaintID)', col("prediction").alias("clusters"))


# COMMAND ----------

#Visualizing clusters
display(final_segmentation)
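A small follow-up sketch (not part of the original notebook): inspect the cluster sizes and compute a silhouette score for the chosen K with Spark's ClusteringEvaluator.

# Sketch only; reuses the fitted KM_model and the assembled segment_join DataFrame.
from pyspark.ml.evaluation import ClusteringEvaluator

churn_segmentation.groupBy("clusters").count().show()
silhouette = ClusteringEvaluator(featuresCol="features",
                                 predictionCol="prediction").evaluate(
                                     KM_model.transform(segment_join))
print("Silhouette (squared euclidean):", silhouette)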
Example #10
                                    outputCol="indexedFeatures",
                                    maxCategories=5).fit(test)

# indexedLabel -> label
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# lr standard scale
standardscaler = StandardScaler().setInputCol("features").setOutputCol(
    "Scaled_features")
# Fit the scaler on the training data only, then apply it to both splits.
scaler_model = standardscaler.fit(train)
train = scaler_model.transform(train)
test = scaler_model.transform(test)

# modify unbalance
train_size = train.select("label").count()
negative_num = train.select("label").where("label==0").count()
balance_ratio = float(float(negative_num) / float(train_size))
train = train.withColumn(
    "classWeights",
    when(train.label == 1, balance_ratio).otherwise(1 - balance_ratio))

#-------------------train-----------------------#
#build models
md = LogisticRegression(labelCol="indexedLabel",
                        featuresCol="indexedFeatures",
                        weightCol="classWeights",
                        maxIter=50,
                        regParam=0.02)
# md = LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10, regParam=0.05)
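A minimal continuation sketch (an assumption, not the original script): fit the weighted logistic regression and check the area under ROC on the held-out split.

# Sketch only; assumes 'train' and 'test' already carry the indexedLabel and
# indexedFeatures columns produced by the (elided) indexers above.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_model = md.fit(train)
predictions = lr_model.transform(test)
auc = BinaryClassificationEvaluator(labelCol="indexedLabel",
                                    rawPredictionCol="rawPrediction").evaluate(predictions)
print("Test AUC:", auc)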
#List of all the columns
colum=window_dataset.columns
colum.remove("success_failure")
colum


# ### Vectorizing

# In[59]:


#Vectorizing the set of input features
from pyspark.ml.feature import VectorAssembler
df_vect = VectorAssembler(inputCols = colum, outputCol="features").transform(window_dataset)
df_vect.select("features", "success_failure").limit(5).toPandas()


# ### Scaling

# In[60]:


#Applying Min-Max scaling
from pyspark.ml.feature import MinMaxScaler
mm_scaler = MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features")


# In[61]:
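The cell above is cut off here; presumably it mirrors the earlier netflow example, fitting the scaler and applying it (a sketch, under that assumption):

# Sketch only, by analogy with In[56]-In[57] above.
mm = mm_scaler.fit(df_vect)
df_scaled = mm.transform(df_vect)
df_scaled.select("minmax_scaled_features", "success_failure").limit(5).toPandas()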