Example #1
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data= get_labeled_points(start1, end2, df, sc, sql_context)
    print lp_data.count()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in  sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print label2index

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3<end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2>start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled ==0).filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print each
Example #2
def mapClickCategoricalFeatures():
		

	indexed = ""

	df = getDataFrame(CLICKS_HDPFILEPATH)
	
	df.persist(StorageLevel.DISK_ONLY)

	print df.columns
	
	#select columns to be mapped
	click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]

	for col in click_cols:

		if(indexed == ""):	
			indexed = df
	
		print indexed
		outcol = col+"Index"
		indexer = StringIndexer(inputCol=col, outputCol=outcol)
		indexed = indexer.fit(indexed).transform(indexed)

	indexed.show()

	indexed.persist(StorageLevel.DISK_ONLY)

	#indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(PATH+"extraction/clicks1.csv")


	indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(HADOOPDIR+"data/click_fraud/extraction/clicks_23feb12.csv")
Example #3
def testClassification(data):
    # Train a GradientBoostedTrees model.

    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel",seed=13)

    trainData,testData = td.randomSplit([0.8,0.2],13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.collect():
        print row

    scoresAndLabels = predictionDF\
       .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print sl
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print metric
def build_decisionTree(path):

    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1,2,3,5,6,8,10]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
def build_randomForest(path):
    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1,2,3,5,6,8,10])\
                            .addGrid(rdf.numTrees,[1,5,10,30,50,100,200]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = rdf.fit(df)

    prediction = cvModel.transform(df)
    prediction.show()

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
Example #6
def train_random_forest(df):
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=int(random.random()))
    return rf, rf.fit(td)
Example #7
def mapPublisherCategoricalFeatures():
	
	indexed = ""

	df = getDataFrame(PUBLISHERS_HDPFILEPATH)

	df.persist(StorageLevel.DISK_ONLY)

	print df.columns
	
	publisher_cols = ["C0", "C1", "C2", "C3"]
	
	for col in publisher_cols:

		if(indexed == ""):	
			indexed = df

		print indexed
		outcol = col+"Index"
		#StringIndexer maps each value in the input column to a double index and creates a new column in the dataframe
		indexer = StringIndexer(inputCol=col, outputCol=outcol)
		#fit and transform the columns using indexer		
		indexed = indexer.fit(indexed).transform(indexed)

	indexed.show()

	indexed.persist(StorageLevel.DISK_ONLY)

	indexed.select('C0Index', 'C1Index', 'C2Index', "C3Index").write.format('com.databricks.spark.csv').save(HADOOPDIR+"data/click_fraud/extraction/publishers_23feb12.csv")
Example #8
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only
Example #9
File: ml.py Project: ribonj/lsir
def label(df, column):
    """
    Create a labeled column.
    """
    indexer = StringIndexer(inputCol=column, outputCol=column+'_label')
    df = indexer.fit(df).transform(df)
    return df
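A minimal usage sketch for this helper, assuming an active SparkSession named spark; the column name and rows are illustrative, not from the original project:

# Hypothetical usage of the label() helper above.
df = spark.createDataFrame([("cat",), ("dog",), ("cat",)], ["animal"])
df = label(df, "animal")   # adds an "animal_label" column of double indices
df.show()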
Example #10
def indexStringColumns(df, cols):
    #variable newdata will be updated several times
    newdata = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c+"-x")
        sm = si.fit(newdata)
        newdata = sm.transform(newdata).drop(c)
        newdata = newdata.withColumnRenamed(c+"-x", c)
    return newdata
Example #11
def events(df,column_name):
    i = column_name+"I"
    v = column_name+"V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
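A minimal usage sketch, assuming an active SparkSession spark and the StringIndexer/OneHotEncoder imports used above; the column name and rows are illustrative:

# Hypothetical usage of events(): index the "event_type" strings, then one-hot encode the indices.
df = spark.createDataFrame([("click",), ("view",), ("click",)], ["event_type"])
encoded = events(df, "event_type")
encoded.select("event_type", "event_typeI", "event_typeV").show()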
Example #12
def indexStringColumns(df, cols):
    from pyspark.ml.feature import StringIndexer
    #variable newdf will be updated several times
    newdf = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        sm = si.fit(newdf)
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf
Example #13
    def oneHotEncoding(self, df, input_col):
        stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
        model = stringInd.fit(df)
        td = model.transform(df)
        encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
        final_encoding = encoder.transform(td).select(df.id, 'features').cache()
        
        conv_udf = udf(lambda line: Vectors.dense(line).tolist())
        final_encoding = final_encoding.select(df.id,conv_udf(final_encoding.features).alias("num_"+input_col)).cache()

        return final_encoding
Example #14
    def test_string_indexer_handle_invalid(self):
        df = self.spark.createDataFrame([
            (0, "a"),
            (1, "d"),
            (2, None)], ["id", "label"])

        si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                            stringOrderType="alphabetAsc")
        model1 = si1.fit(df)
        td1 = model1.transform(df)
        actual1 = td1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
        self.assertEqual(actual1, expected1)

        si2 = si1.setHandleInvalid("skip")
        model2 = si2.fit(df)
        td2 = model2.transform(df)
        actual2 = td2.select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
        self.assertEqual(actual2, expected2)
Example #15
def base_features_gen_pipeline(input_descript_col="descript", input_category_col="category", output_feature_col="features", output_label_col="label"):
    indexer=StringIndexer(inputCol=input_category_col,outputCol=output_label_col)
    wordtokenizer=Tokenizer(inputCol=input_descript_col,outputCol="words")
    counter=CountVectorizer(inputCol="words",outputCol=output_feature_col)
    pipeline=Pipeline(stages=[indexer,wordtokenizer,counter])
    return pipeline
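A minimal sketch of fitting the returned pipeline, assuming an active SparkSession spark and the imports used by the function above; the two rows are illustrative:

# Hypothetical usage of base_features_gen_pipeline() on a toy DataFrame.
toy = spark.createDataFrame(
    [("cheap red wine", "wine"), ("hoppy pale ale", "beer")],
    ["descript", "category"])
model = base_features_gen_pipeline().fit(toy)
model.transform(toy).select("features", "label").show(truncate=False)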
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeClassificationExample")\
        .getOrCreate()

    # $example on$
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(
        "data/mllib/sample_libsvm_data.txt")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)
    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
def make_regr_model(data, sc, model_path, model_name, target, ml_model='default', save=True):

    t0 = time()
    # Stages for pipline
    stages = []

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables
    catCols = [x for (x, dataType) in trainingData.dtypes if ((dataType == "string") | (dataType == "boolean"))]

    numCols = [x for (x, dataType) in trainingData.dtypes if (((dataType == "int") | (dataType == "bigint")
                                                                 | (dataType == "float") | (dataType == "double"))
               & (x != "target"))]

    # OneHotEncode categorical variables
    indexers = [StringIndexer(inputCol=column, outputCol=column + "-index", handleInvalid="keep") for column in catCols]

    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
    )
    assembler_cat = VectorAssembler(
        inputCols=encoder.getOutputCols(),
        outputCol="categorical-features",
        handleInvalid="skip"
    )

    stages += indexers
    stages += [encoder, assembler_cat]



    assembler_num = VectorAssembler(
        inputCols=numCols,
        outputCol="numerical-features",
        handleInvalid="skip"
    )

    # Standardize numerical variables
    scaler = StandardScaler(inputCol="numerical-features", outputCol="numerical-features_scaled")

    # Combine all features in one vector
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip"
    )

    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model.
    if ml_model == 'default':
        rf = RandomForestRegressor(labelCol="target", featuresCol="features")
    else:
        rf = ml_model

    stages += [rf]

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=stages)

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    #predictions.select("prediction", "target", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="target", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("RMSE = %g" % (0.0 + rmse))

    if save:
        # Final model saving and statistics writing
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)

        cluster = Cluster(['127.0.0.1'], "9042")
        session = cluster.connect("models")
        query = ("INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)") % ("models_statistics")
        query = query + " VALUES (%s, %s, %s, %s, %s, %s)"
        session.execute(query, (model_name, timestamp, target, tt, model_path, rmse))
        session.shutdown()
        cluster.shutdown()

        # Stop spark session
        sc.stop()

    if not save:
        return model, sc
Example #18
spark = SparkSession.builder.master('spark://node01:7077').appName(
    'learn_ml').getOrCreate()

# Load the data
df = spark.read.csv('hdfs://node01:9000/mushrooms.csv',
                    header=True,
                    inferSchema=True,
                    encoding='utf-8')

# First use StringIndexer to convert the string columns to numeric indices, then assemble the features
from pyspark.ml.feature import StringIndexer, VectorAssembler
old_columns_names = df.columns
print(old_columns_names)
new_columns_names = [name + '-new' for name in old_columns_names]
for i in range(len(old_columns_names)):
    indexer = StringIndexer(inputCol=old_columns_names[i],
                            outputCol=new_columns_names[i])
    df = indexer.fit(df).transform(df)
vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')
df = vecAss.transform(df)
# Rename the label column
df = df.withColumnRenamed(new_columns_names[0], 'label')

# Create a new table with only label and features
data = df.select(['label', 'features'])

# Data overview
print(data.show(5, truncate=0))

# Split the dataset into training and test sets
train_data, test_data = data.randomSplit([4.0, 1.0], 100)
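The snippet stops after the split; a hedged continuation that trains a decision tree on the indexed features (the classifier choice and parameters are assumptions, not part of the original):

# Hypothetical continuation: fit a DecisionTreeClassifier and report test accuracy.
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dt = DecisionTreeClassifier(labelCol='label', featuresCol='features', maxDepth=5)
dt_model = dt.fit(train_data)
pred = dt_model.transform(test_data)
acc = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction',
                                        metricName='accuracy').evaluate(pred)
print('test accuracy: {:.3f}'.format(acc))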
def get_all_cat_cols_indexed(t_data, index_col):
    indexer = StringIndexer(inputCol=t_data.columns[0],
                            outputCol=index_col).setHandleInvalid("keep")
    t_data = indexer.fit(t_data).transform(t_data)
    t_data = t_data.select(index_col)
    return t_data
# there are some null values in this dataset; remove them
df = df.dropna()

'''
Let's consider a flight to be "delayed" if it arrives more than 15 minutes after its scheduled time.
Also, the mile column holds distances in miles, so let's convert this to KM.
'''
df_withKM = df.withColumn('KM', round(df.mile * 1.60934, 0)).drop('mile')

# create an extra column called label: 1 if the flight was delayed, 0 otherwise
flight_data = df_withKM.withColumn('label', (df_withKM.delay >= 15).cast('integer'))

flight_data = flight_data.drop('delay')

# Convert the categorical text columns that will be used in ML into categorical numeric columns
indexers = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# one-hot encode the categorical numeric columns
onehot = OneHotEncoderEstimator(
    inputCols=['dow', 'carrier_idx'],
    outputCols=['dow_d', 'carrier_d']
)

# Extract the features columns
assembler = VectorAssembler(inputCols=['mon', 'dom', 'dow_d', 'carrier_d', 'KM', 'duration'],
                            outputCol='features')


# split data into train and test using 80% train and 20% test; also set the random seed
x_train, x_test = flight_data.randomSplit([0.80, 0.20], seed=42)
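The snippet ends before any model is fitted; a hedged sketch of chaining the indexer, encoder, and assembler defined above with a classifier (the LogisticRegression choice is an assumption, not from the original):

# Hypothetical continuation: compose the stages above into a Pipeline and fit it.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[indexers, onehot, assembler, lr])
model = pipeline.fit(x_train)
model.transform(x_test).select('label', 'prediction', 'probability').show(5)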
Example #21
        .map(lambda words: Row(label=words[0], words=words[1:]))
    return spark.createDataFrame(rdd)


# load dataframes
train_data = load_dataframe("Spark/data/20ng-train-all-terms.txt")
test_data = load_dataframe("Spark/data/20ng-test-all-terms.txt")

# Count word frequency
vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(train_data)
train_bag_of_words = vectorizer_transformer.transform(train_data)
test_bag_of_words = vectorizer_transformer.transform(test_data)

# Create numeric labels
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
label_indexer_transformer = label_indexer.fit(train_bag_of_words)

train_bag_of_words = label_indexer_transformer.transform(train_bag_of_words)
test_bag_of_words = label_indexer_transformer.transform(test_bag_of_words)

# Create classifier
classifier = NaiveBayes(labelCol="label_index",
                        featuresCol="bag_of_words",
                        predictionCol="label_index_predicted")

# Train model
classifier_transformer = classifier.fit(train_bag_of_words)
test_predicted = classifier_transformer.transform(test_bag_of_words)

test_predicted.select("label_index", "label_index_predicted").limit(10).show()
Example #22
# If any field in a row is null, filter the row out:
q_l_df.na.drop()
q_l_df.na.drop(thresh=1)  # filter out a row as soon as one field is null

# Or fill in the missing values
q_l_df_fix = q_l_df.na.fill({'county': "未收集"})
# Check the result
q_l_df.na.fill({'county': "未收集"}). \
    where(f.col("county").isNull()).count()

# Actually some lengths are null as well
# q_l_df.where(f.col("Residential").isNull()).select(f.length("Residential")).show()

# Convert a column into a number
string_index = StringIndexer(inputCol="county", outputCol="county_number")
q_l2_df = string_index.fit(q_l_df_fix).transform(q_l_df_fix)
# .select(f.col("section_number"), f.col("section"))

# Look at the current schema
q_l2_df.printSchema()

# Hmm, I want to know whether the two columns are correlated
q_l2_df.corr("q_l", "county_number")
# Compute the covariance of the two columns
q_l2_df.cov("q_l", "county_number")

# I want to know the distribution of a column
q_l2_df.groupBy("county").agg(f.count("county").alias("c")).orderBy(
    f.desc("c"))
# Median distribution
    "inferSchema", True).option("delimiter", ",").load("adult.csv")
data = data.withColumnRenamed("age", "label").select(
    "label",
    col("education-num").alias("education-num"),
    col(" hours-per-week").alias("hours-per-week"),
    col(" education").alias("education"),
    col(" fnlwgt").alias("fnlwgt"),
    col(" sex").alias("sex"),
    col(" relationship").alias("relationship"))
data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week", "education", "sex", "fnlwgt",
                   "relationship")

new_data = data.toDF("label", "education-num", "hours-per-week", "education",
                     "sex", "fnlwgt", "relationship")
indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)

indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)

indexer2 = StringIndexer(inputCol="relationship", outputCol="new_rel")
indexed2 = indexer2.fit(indexed1).transform(indexed1)

indexed2 = indexed2.drop("sex", "education", "relationship")
indexed2.show()

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:],
                            outputCol="features")
data = assembler.transform(indexed2)
Example #24
# The following code does three things with pipeline:
# 
# * **`StringIndexer`** all categorical columns
# * **`OneHotEncoder`** all categorical index columns
# * **`VectorAssembler`** all feature columns into one vector column

# ### Categorical columns

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

# categorical columns
categorical_columns = cuse.columns[0:3]
stage_string = [StringIndexer(inputCol= c, outputCol= c+"_string_encoded") for c in categorical_columns]
stage_one_hot = [OneHotEncoder(inputCol= c+"_string_encoded", outputCol= c+ "_one_hot") for c in categorical_columns]

ppl = Pipeline(stages=stage_string + stage_one_hot)
df = ppl.fit(cuse).transform(cuse)
df.toPandas().to_csv('cuse_afterTransform.csv')
df.select("age", 'age_string_encoded').distinct().sort(F.asc("age_string_encoded")).show()
df.select("education").distinct().show()
df.select("wantsMore").distinct().show()

# In[2]:
# ### Build VectorAssembler stage
df.columns

assembler = VectorAssembler(
  inputCols=['age_one_hot',
Example #25
df_node2 = df_node1.dropna(subset=[
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp",
    "Parch", "Ticket", "Fare", "Cabin", "Embarked"
],
                           how="any",
                           thresh=12)

df_node3 = df_node2.randomSplit(seed=1234, weights=[0.7, 0.3])

mmi_value_0_node4 = ["Sex", "Embarked", "Survived"]
mmi_value_1_node4 = ["indexedSex", "indexedEmbarked", "indexedSurvived"]
stages_node4 = []
for i in range(len(mmi_value_0_node4)):
    stages_node4.append(
        StringIndexer(inputCol=mmi_value_0_node4[i],
                      outputCol=mmi_value_1_node4[i],
                      handleInvalid="error",
                      stringOrderType="frequencyDesc"))

mmi_value_0_node5 = ["indexedSex", "indexedEmbarked"]
mmi_value_1_node5 = ['sexVec', 'embarkedVec']
stages_node5 = []
for i in range(len(mmi_value_0_node5)):
    stages_node5.append(
        OneHotEncoder(inputCol=mmi_value_0_node5[i],
                      outputCol=mmi_value_1_node5[i]))

pipeline_stage_node6 = VectorAssembler(
    outputCol="features",
    inputCols=["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"])
pipeline_stage_node7 = RandomForestClassifier(featureSubsetStrategy="auto",
                                              numTrees=20,
    CASE
     WHEN (pickup_hour <= 6 OR pickup_hour >= 20) THEN "Night" 
     WHEN (pickup_hour >= 7 AND pickup_hour <= 10) THEN "AMRush" 
     WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
     WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
    END as TrafficTimeBins
    FROM taxi_test 
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures) # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
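The same index-then-encode pattern repeats three times above; a minimal sketch of the equivalent written as a single Pipeline (the loop and output column names are illustrative, not from the original):

# Hypothetical Pipeline equivalent of the manual StringIndexer/OneHotEncoder chain above.
from pyspark.ml import Pipeline
cat_cols = ["vendor_id", "rate_code", "payment_type"]
stages = []
for c in cat_cols:
    stages.append(StringIndexer(inputCol=c, outputCol=c + "Index"))
    stages.append(OneHotEncoder(dropLast=False, inputCol=c + "Index", outputCol=c + "Vec"))
encoded_df = Pipeline(stages=stages).fit(taxi_df_test_with_newFeatures) \
                                    .transform(taxi_df_test_with_newFeatures)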
Example #27
# In[326]:

print "Creating sparse vectors for all data based on this new dictionary"
t0 = time()
dfTrainSelect=dfTrain.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(schema)
dfTestSelect=dfTest.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(schema)
dfTrainSelect.take(1)
dfTestSelect.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[328]:

from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)


# In[329]:

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)


# In[330]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
Example #28
def main():
    # Unzip file on a temporary folder
    unzip_files()

    if files_source == "hdfs":
        training_data = spark.read.load(
            "/tmp/training.1600000.processed.noemoticon.csv", format="csv")
    elif files_source == "local":
        training_data = spark.read.load(
            "tmp/training.1600000.processed.noemoticon.csv", format="csv")
    training_data = training_data.withColumnRenamed("_c0", "label") \
        .withColumnRenamed("_c1", "tweet_id") \
        .withColumnRenamed("_c2", "date") \
        .withColumnRenamed("_c3", "query") \
        .withColumnRenamed("_c4", "user") \
        .withColumnRenamed("_c5", "tweet")

    # Load the number of rows given by the sample_size arg for training; if zero, use the whole dataset
    if sample_size > 0:
        training_data = training_data.sample(sample_size /
                                             training_data.count())

    ## Preprocess dataset
    training_data = training_data.select(functions.col("label"),
                                         functions.col("tweet"))

    # Run the cleansing UDF for tweet column
    udf_cleansing = functions.udf(cleansing)
    training_data = training_data.withColumn(
        "tweet_cleansed", udf_cleansing(functions.col("tweet")))

    # Tokenizing
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
    training_data = tokenizer.transform(training_data)

    # Generating features
    from pyspark.ml.feature import HashingTF
    hashingTF = HashingTF(inputCol="words", outputCol="features")
    training_data = hashingTF.transform(training_data)

    # Generate label indexes
    from pyspark.ml.feature import StringIndexer
    stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
    model = stringIndexer.fit(training_data)
    training_data = model.transform(training_data)

    # Split dataset into training and test according to test_size_frac arg
    training, test = training_data.randomSplit(
        [1 - test_size_fraction, test_size_fraction])

    # Training the model
    from pyspark.ml.classification import NaiveBayes
    #Naive bayes
    nb = NaiveBayes(featuresCol="features",
                    labelCol="labelIndex",
                    predictionCol="NB_pred",
                    probabilityCol="NB_prob",
                    rawPredictionCol="NB_rawPred")
    nbModel = nb.fit(training)
    cv = nbModel.transform(test)
    total = cv.count()
    correct = cv.where(cv['labelIndex'] == cv['NB_pred']).count()
    accuracy = correct / total

    # Saving trained model for usage in a Pipeline (so you don't need to re-train everytime you need to use it)
    model_folder = os.path.join(base_folder, 'saved_models')
    print(model_folder)

    if not os.path.exists(model_folder):
        os.makedirs(model_folder)

    model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
    if files_source == "hdfs":
        model_full_path = "file://" + model_full_path
    nbModel.write().overwrite().save(model_full_path)

    # Save Labels reference table
    labels = cv.select("labelIndex", "label").distinct() \
        .withColumnRenamed("label", "label_predicted") \
        .withColumnRenamed("labelIndex", "label_id")

    labels.toPandas().to_csv(os.path.join(model_folder, "labels.csv"),
                             index=False)

    # Save evaluations
    sys.stdout = open(os.path.join(model_folder, "evaluation.txt"), "w")
    print("\nTotal:", total, "\nCorrect:", correct, "\nAccuracy:", accuracy)
    sys.stdout.close()

    # Delete temporary folder
    if os.path.exists(temporary_folder):
        shutil.rmtree(temporary_folder)
Example #29
    .master("local") \
    .appName("KaggleLab") \
    .getOrCreate()

trainingData = spark.read.csv(
    "/Users/thai-anthantrong/Documents/MS_BIG_DATA/Cours/SD701/KaggleLab/train-data.csv", header=True, inferSchema=True)

testData = spark.read.csv(
    "/Users/thai-anthantrong/Documents/MS_BIG_DATA/Cours/SD701/KaggleLab/test-data.csv", header=True, inferSchema=True)

# stages in our Pipeline
stages = []

# Index labels, adding metadata to the label column.
# Fit on whole trainingData to include all labels in index.
labelIndexer = StringIndexer(inputCol="Cover_Type", outputCol="label").fit(trainingData)
stages += [labelIndexer]

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="Cover_Type_pred", labels=labelIndexer.labels)

# All columns
all_cols = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
            "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3",
            "Wilderness_Area4", "Soil_Type1", "Soil_Type2", "Soil_Type3", "Soil_Type4", "Soil_Type5", "Soil_Type6",
            "Soil_Type7", "Soil_Type8", "Soil_Type9", "Soil_Type10", "Soil_Type11", "Soil_Type12", "Soil_Type13",
            "Soil_Type14", "Soil_Type15", "Soil_Type16", "Soil_Type17", "Soil_Type18", "Soil_Type19", "Soil_Type20",
            "Soil_Type21", "Soil_Type22", "Soil_Type23", "Soil_Type24", "Soil_Type25", "Soil_Type26", "Soil_Type27",
            "Soil_Type28", "Soil_Type29", "Soil_Type30", "Soil_Type31", "Soil_Type32", "Soil_Type33", "Soil_Type34",
            "Soil_Type35", "Soil_Type36", "Soil_Type37", "Soil_Type38", "Soil_Type39", "Soil_Type40"]
Example #30
# In[16]:

print "Creating feature vectors"
t0 = time()
dfTrainVec=dfTrain.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema)
dfTestVec=dfTest.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema)
tt = time() - t0
print "Dataframe created in {} second".format(round(tt,3))


# In[19]:

print "Indexing labels"
t0 = time()
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainVec)
dfTrainIdx = string_indexer_model.transform(dfTrainVec)
dfTrainIdx.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[20]:

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='featureVectors', labelCol='target_indexed', maxDepth=10)


# In[21]:
# MAGIC
# MAGIC For simplicity's sake, we will use One-Hot Encoding to convert all categorical variables into binary vectors. We will use a combination of StringIndexer and OneHotEncoderEstimator to convert the categorical variables. The `OneHotEncoderEstimator` will return a `SparseVector`.
# MAGIC
# MAGIC Since we will have more than 1 stage of feature transformations, we use a Pipeline to tie the stages together. This simplifies our code.

# COMMAND ----------

# MAGIC %md The ML package needs the label and feature vector to be added as columns to the input dataframe. We set up a pipeline to pass the data through transformers in order to extract the features and label. We index each categorical column using the `StringIndexer` to a column of number indices, then convert the indexed categories into one-hot encoded variables with at most a single one-value. These binary vectors are appended to the end of each row. Encoding categorical features allows decision trees to treat categorical features appropriately, improving performance. We then use the `StringIndexer` to encode our labels to label indices.

# COMMAND ----------

categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for compatibility reasons when operationalizing within the DSVM
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                            outputCol=categoricalCol + "classVec")
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# MAGIC %md Now we need to use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.
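The notebook is cut off at this point; a minimal sketch of that assembler stage, with the numeric column names as assumptions rather than the notebook's actual columns:

# Hypothetical assembler stage (the numeric column names are illustrative, not from the notebook).
from pyspark.ml.feature import VectorAssembler
numericCols = ["CRSDepHour", "Distance"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]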
Example #32

from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField,DoubleType

schema = StructType([StructField('label',DoubleType(),True),StructField('bigramVectors',VectorUDT(),True)])


features=dfBigram.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)

print "Features from bigrams created"

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"


dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol=string_indexer.getOutputCol(), maxDepth=10)


from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
Example #33
# we can remove the header by setting a value equal to the first row of the rdd
# and then filter it out.  then we define schema from the rdd and use that to make
# a dataframe
data = sc.textFile(
    "hdfs://ch1-hadoop-ns/user/bcraft/boosted_trees_test_data.csv").map(
        lambda line: line.split(','))
header = data.first()
data = data.filter(lambda row: row != header)
schema = data.map(lambda x: Row(id=x[0], make=x[1], vdps=x[2], label=x[3]))
df = sqlContext.createDataFrame(schema)

# string indexer for our categorical features
# this indexes each categorical feature and we will
# save them in a data frame that maps the make name to the string
# for persistence purposes
indexer = StringIndexer(inputCol="make", outputCol="makeIDX")
df = indexer.fit(df).transform(df)
make_idx_mappings = df.select('make', 'makeIDX').distinct().show()

# one hot encoder
# this will convert the indexed strings to sparse one hot vectors
# think of this as dummy feature creation
encoder = OneHotEncoder(inputCol="makeIDX", outputCol="make_sparse_vect")
df = encoder.transform(df)

# spark models expect to see a feature vector and a prediction column
# so we need to put all our features into a vector, in this case
# the sparse vector and vdp count, we also have to do some
# data type transformations from string to double
df = df.withColumn("vdp_int", df["vdps"].cast("double"))
df = df.withColumn("label_int", df["label"].cast("double"))
Example #34
		 r.stars, r.attributes['Price Range'],
		 False if r.attributes['Good For'] is None else r.attributes['Good For']['dinner'],
		 False if r.attributes['Good For'] is None else r.attributes['Good For']['lunch'],
		 False if r.attributes['Good For'] is None else r.attributes['Good For']['breakfast'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
		 False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
		 False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
	).toDF(clustering_columns)

# drop row with null values
lv_clustering_data = lv_clustering_data.dropna()

#Neighborhood feature engineering
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

#initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

#expanded feature set
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
Example #35
                       ("null", "unknow", "", "None") or x == None else x)
#splitCalUDF = F.udf(lambda x : float(x.split("*")[0])*float(x.split("*")[1]), returnType=StringType())
# handle missing values
data = data.withColumn("gender", cleanStringUDF("gender"))
#                  .withColumn("religiousness",splitCalUDF("religiousness"))
# cast column types
feature1_list = [
    'age', 'label', 'religiousness', 'education', 'occupation', 'rating'
]
feature2_list = ['gender', 'children']
for c in feature1_list:
    data = data.withColumn(c, data[c].cast(DoubleType()))

indexers = [
    StringIndexer(inputCol=c,
                  outputCol='{0}_indexed'.format(c),
                  handleInvalid='error') for c in feature2_list
]
encoders = [
    OneHotEncoder(dropLast=True,
                  inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler = VectorAssembler(inputCols=feature1_list +
                            [encoder.getOutputCol() for encoder in encoders],
                            outputCol="features")
feature_pipeline = Pipeline(stages=indexers + encoders + [assembler])
feature_model = feature_pipeline.fit(data)

#index y
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

categoricalColumns = ["UNIQUE_CARRIER", "ORIGIN", "DEST"]
numericalColumns = ["DISTANCE"]

# Convert string categorical columns to indexed integers
indexers = [
	StringIndexer(inputCol=c, outputCol ="{0}_indexed".format(c))
	for c in categoricalColumns
]

# OneHot Encoding
encoders = [
	OneHotEncoder(
		inputCol=indexer.getOutputCol(),
		outputCol ="{0}_encoded".format(indexer.getOutputCol())
		)
	for indexer in indexers
]

# Assembler for categorical columns
assemblerCategorical = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders], outputCol= "cat")
stages = indexers+encoders+ [assemblerCategorical]
pipelineCategorical = Pipeline(stages=stages)
df = pipelineCategorical.fit(df).transform(df)

# Assembler for Numerical columns
assemblerNumerical = VectorAssembler(inputCols = numericalColumns, outputCol = "num")
pipelineNumerical = Pipeline(stages = [assemblerNumerical])
df = pipelineNumerical.fit(df).transform(df)
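A hedged final step, combining the two intermediate vectors into a single feature column (the output column name "features" is an assumption, not from the original snippet):

# Hypothetical final assembly of the categorical ("cat") and numerical ("num") vectors.
assemblerFinal = VectorAssembler(inputCols=["cat", "num"], outputCol="features")
df = assemblerFinal.transform(df)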
Example #37
        data.append(tmp)

    print(len(data))
    df = sqlContext.createDataFrame(data, schema=["category", "text"])

    # regular expression tokenizer
    regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

    # stop words
    stop_words = list(set(stopwords.words('english')))

    stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

    # bag of words count
    count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    label_string_index = StringIndexer(inputCol="category", outputCol="label")
    label_string_index.setHandleInvalid("keep")

    pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors, label_string_index])
    (training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
    pipeline_fit = pipeline.fit(training_data)
    pipeline_fit.save("rf_pipeline")

    training_data_set = pipeline_fit.transform(training_data)
    training_data_set.show(5)

    # stages = pipeline_fit.stages
    # vec = [s for s in stages if isinstance(s, CountVectorizerModel)]
    # v1 = vec[0].vocabulary
    # print(len(v1))
Example #38
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file) # This is an RDD, which will later be transformed to a data frame
    data = data.filter(lambda x:x.split(',')[0] != 'label').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_'+str(line[0]),int(line[0])) )
    else:
        # Test data gets dummy labels. We need the same structure as in Train data
        data = data.map( lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),'class_'+str(line[0]),int(line[0])) ) 
    return sqlcontext.createDataFrame(data, ['features', 'category','label'])

train_df = load_data_frame("train.csv")
test_df = load_data_frame("test.csv", shuffle=False, train=False)
from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol="category", outputCol="index_category")
fitted_indexer = string_indexer.fit(train_df)
indexed_df = fitted_indexer.transform(train_df)

from distkeras.transformers import *
from pyspark.ml.feature import OneHotEncoder
####OneHot
nb_classes = 9
encoder = OneHotTransformer(nb_classes, input_col='label', output_col="label_encoded")
dataset_train = encoder.transform(indexed_df)
dataset_test = encoder.transform(test_df)

###encoder
from pyspark.ml.feature import MinMaxScaler
transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
                                o_min=0.0, o_max=250.0, \
Example #39
print("The data contains %d records." % cars.count(), '\n')

cars = cars.withColumnRenamed("ncyl", "cyl")
cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))

cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))

cars = cars.withColumn('avg_mpg', round((cars.city_mpg + cars.hwy_mpg) / 2, 1)) \
            .drop("city_mpg", "hwy_mpg")

cars = cars.withColumn(
    'consumption', round((100 * 3.785411784) / (cars.avg_mpg * 1.609344), 2))

print("Cars with null cyl", cars.filter('cyl IS NULL').count(), '\n')

indexer = StringIndexer(inputCol='origin', outputCol='origin_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
print(cars.toPandas().sample(12))

print(indexer)
# View the first five records
cars.sample(False, .25).show()

# Check column data types
Example #40
from pyspark.sql.types import StringType

df = df.withColumn("Pclass", df["Pclass"].cast(StringType()))
df = df.withColumn("SibSp", df["SibSp"].cast(StringType()))
df = df.withColumn("Parch", df["Parch"].cast(StringType()))
df = df.withColumn("Relatives", df["Relatives"].cast(StringType()))

from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
# Categorical features
categoricalColumns = [
    'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Relatives'
]
stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# Numeric features
numericCols = ['Age', 'Fare']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Pipeline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features tools in with pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append this run's average feature importances to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
Exemplo n.º 42
0


from pyspark.sql.functions import date_format
#data_3 = data_2.withColumn("new_dates",date_format("dates","yyyy-MM-dd HH"))
# note: lowercase "yyyy" is the calendar year; uppercase "YYYY" is the week-based year and can roll over early around New Year
data_3 = data_2.withColumn("new_dates",date_format("dates","yyyy-MM-dd"))
data_3.createOrReplaceTempView("data_3")
data_3.take(1)
[Row(Agency='NYPD', Complaint Type='Noise - Commercial', 
Descriptor='Loud Music/Party', Location Type='Club/Bar/Restaurant', 
Incident Zip='11238', City='BROOKLYN', Borough='BROOKLYN', 
Latitude='40.677476821236894', Longitude='-73.96893730309779', 
dates=datetime.datetime(2010, 3, 6, 23, 38, 30), new_dates='2010-03-06 23')]

>>> from pyspark.ml.feature import StringIndexer
>>> indexer = StringIndexer(inputCol = "Agency",outputCol = "Agency_onehot")
>>> indexed = indexer.fit(data_3).transform(data_3)
>>> indexed.take(5)

# finding the unique values in a column
data_3.select("Agency").distinct().show()
agency_uniq = data_3.select("Agency").distinct().rdd.map(lambda r: r[0]).collect()
len(agency_uniq)
29

data_3.select("Complaint Type").distinct().show()
complaint_uniq = data_3.select("Complaint Type").distinct().rdd.map(lambda r: r[0]).collect()
len(complaint_uniq) 
279
complaint_uniq_list = ['Traffic Signal Condition', 'Cranes and Derricks', 'SAFETY', 'ELECTRIC', 'Tanning', 'DOOR/WINDOW', 'Comments', 'Noise - Helicopter', 'STRUCTURAL', 'Broken Parking Meter', 'Window Guard', 'Broken Muni Meter', 'Highway Condition', 'Street Condition', 'FLOORING/STAIRS', 'Hazardous Materials', 'DOF Literature Request', 'Vending', 'Ferry Permit', 'PAINT - PLASTER', 'Taxi Report', 'OUTSIDE BUILDING', 'Advocate-Prop Refunds/Credits', 'Drinking Water', 'UNSANITARY CONDITION', 'Public Toilet', 'Bus Stop Shelter Complaint', 'GENERAL CONSTRUCTION', 'Municipal Parking Facility', 'DOF Property - RPIE Issue', 'Mosquitoes', 'DOF Property - Reduction Issue', 'Taxi Compliment', 'Animal in a Park', 'Animal Abuse', 'Advocate-Business Tax', 'Smoking', 'Illegal Animal Kept as Pet', 'Parking Card', 'Injured Wildlife', 'Noise - House of Worship', 'AGENCY', 'DHS Advantage -Landlord/Broker', 'Asbestos/Garbage Nuisance', 'Advocate - Levy', 'MOLD', 'Sanitation Condition', 'Special Natural Area District (SNAD)', 'Home Delivered Meal Complaint', 'Illegal Parking', 'APPLIANCE', 'Building Condition', 'Noise - Residential', 'Portable Toilet', 'Illegal Animal - Sold/Kept', 'Sewer', 'Drug Activity', 'Registration and Transfers', 'Killing/Trapping Pigeon', 'DOF Parking - DMV Clearance', 'Noise Survey', 'Noise - Commercial', 'Highway Sign - Dangling', 'Water System', 'Adopt-A-Basket', 'Squeegee', 'Air Quality', 'Advocate-Co-opCondo Abatement', 'Lead', 'Street Sign - Missing', 'Home Delivered Meal - Missed Delivery', 'Lost Property', 'Health', 'SG-99', 'DRIE', 'DCA / DOH New License Application Request', 'CONSTRUCTION', 'Derelict Vehicle', 'ELEVATOR', 'OEM Literature Request', 'NONCONST', 'DOF Property - Property Value', 'LinkNYC', 'Senior Center Complaint', 'Sweeping/Missed-Inadequate', 'Utility Program', 'DOF Property - City Rebate', 'X-Ray Machine/Equipment', 'Water Maintenance', 'Advocate-Commercial Exemptions', 'Blocked Driveway', 'Beach/Pool/Sauna Complaint', 'Homeless Encampment', 'Housing - Low Income Senior', 'Bike/Roller/Skate Chronic', 'Taxi Complaint', 'Sidewalk Condition', 'HEAT/HOT WATER', 'Unspecified', 'Meals Home Delivery Required', 'HEAP Assistance', 'Litter Basket / Request', 'Lifeguard', 'Stalled Sites', 'DOF Parking - Address Update', 'Maintenance or Facility', 'Consumer Complaint', 'For Hire Vehicle Complaint', 'Water Conservation', 'Research Questions', 'HPD Literature Request', 'Illegal Tree Damage', 'CST', 'Advocate - Lien', 'DOF Parking - Tax Exemption', 'Request Xmas Tree Collection', 'Benefit Card Replacement', 'Indoor Sewage', 'Weatherization', 'Asbestos', 'Unsanitary Animal Pvt Property', 'Discipline and Suspension', 'Water Quality', 'Derelict Bicycle', 'Sweeping/Missed', 'Eviction', 'GENERAL', 'Standing Water', 'Noise - Park', 'Construction', 'Cooling Tower', 'Bus Stop Shelter Placement', 'DOR Literature Request', 'Poison Ivy', 'Missed Collection (All Materials)', 'Disorderly Youth', 'Highway Sign - Damaged', 'PAINT/PLASTER', 'Bike Rack Condition', 'Non-Residential Heat', 'Illegal Animal Sold', 'Forensic Engineering', 'Home Care Provider Complaint', 'Other Enforcement', 'Found Property', 'Homeless Person Assistance', 'Posting Advertisement', 'Legal Services Provider Complaint', 'Scaffold Safety', 'Miscellaneous Categories', 'Recycling Enforcement', 'LEAD', 'Noise', 'Home Repair', 'Elder Abuse', 'Advocate - Other', 'New Tree Request', 'Boilers', 'DOF Property - Update Account', 'Industrial Waste', 'Sweeping/Inadequate', 'DOF Property - Owner Issue', 'Tattooing', 
"Alzheimer's Care", 'Dead/Dying Tree', 'Forms', 'Mold', 'Collection Truck Noise', 'SNW', 'Street Light Condition', 'Plumbing', 'Calorie Labeling', 'Ferry Complaint', 'DOF Parking - Payment Issue', 'Elevator', 'Day Care', 'Building/Use', 'DOF Property - Request Copy', 'Homebound Evacuation 4', 'Trans Fat', 'Advocate-UBT', 'Bridge Condition', 'Drinking', 'Housing Options', 'Request Large Bulky Item Collection', 'Public Payphone Complaint', 'Transportation Provider Complaint', 'Summer Camp', 'PLUMBING', 'BEST/Site Safety', 'NORC Complaint', 'Case Management Agency Complaint', 'Taxpayer Advocate Inquiry', 'No Child Left Behind', 'Emergency Response Team (ERT)', 'Question', 'Animal Facility - No Permit', 'Advocate - RPIE', 'Trapping Pigeon', 'FHE', 'Standpipe - Mechanical', 'Root/Sewer/Sidewalk Condition', 'City Vehicle Placard Complaint', 'Parent Leadership', 'DHS Advantage - Third Party', 'Street Sign - Damaged', 'Investigations and Discipline (IAD)', 'Safety', 'Food Poisoning', 'Non-Emergency Police Matter', 'Unlicensed Dog', 'General Construction/Plumbing', 'Panhandling', 'Teaching/Learning/Instruction', 'HEATING', 'Street Sign - Dangling', 'DOF Parking - Request Status', 'Dead Tree', 'Damaged Tree', 'Advocate-SCRIE/DRIE', 'Select Message Type...', 'SCRIE', 'Noise - Vehicle', 'Special Projects Inspection Team (SPIT)', 'Interior Demo', 'Traffic/Illegal Parking', 'Overflowing Recycling Baskets', 'Snow', 'Rodent', 'Radioactive Material', 'Foam Ban Enforcement', 'Highway Sign - Missing', 'Unsanitary Animal Facility', 'Overflowing Litter Baskets', 'Harboring Bees/Wasps', 'Bottled Water', 'Hazardous Material', 'Illegal Fireworks', 'Unleashed Dog', 'Traffic', 'Food Establishment', 'Derelict Vehicles', 'WATER LEAK', 'Advocate-Personal Exemptions', 'Graffiti', 'VACANT APARTMENT', 'DPR Internal', 'OEM Disabled Vehicle', 'Noise - Street/Sidewalk', 'Dirty Conditions', 'Plant', 'FCST', 'Electronics Waste', 'Curb Condition', 'Violation of Park Rules', 'Tunnel Condition', 'Indoor Air Quality', 'SRDE', 'DOF Property - State Rebate', 'Bereavement Support Group', 'For Hire Vehicle Report', 'DOF Parking - Request Copy', 'Urinating in Public', 'Ferry Inquiry', 'Unsanitary Pigeon Condition', 'Vacant Lot', 'DHS Income Savings Requirement', 'General Question', 'Overgrown Tree/Branches', 'DOF Property - Payment Issue', 'Advocate-Prop Class Incorrect', 'FATF', 'Damaged or Dead Tree', 'School Maintenance', 'DHS Advantage - Tenant', 'ATF', 'Advocate-Property Value', 'Electrical', 'Pet Shop']
Exemplo n.º 43
0
    os.makedirs(output_dir)


    features = sqc.read.parquet(input_features)
    features = features.filter(features['cls']!='None')\
                        .select(['cls', 'features'])\
                        .cache()
    print features

    features = sqc.createDataFrame(features.map(normalizer))
    print features


    training, valid = features.randomSplit([0.75, 0.25])

    labelIndexer = StringIndexer(inputCol="cls", outputCol="label")

    model = labelIndexer.fit(training)
    training = model.transform(training).rdd.map(lambda row: LabeledPoint(row.label, row.features))
    valid = model.transform(valid).rdd.map(lambda row: LabeledPoint(row.label, row.features))

    print training.first()
    #lr = LogisticRegression()
    #pipeline = Pipeline(stages=[labelIndexer,lr])



    # fit
    model = LogisticRegressionWithLBFGS.train(training, numClasses=10)

    #model = pipeline.fit(training)
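
    # A minimal sketch (not in the original snippet): evaluate the trained mllib model
    # on the held-out `valid` RDD of LabeledPoints created above.
    predictions_and_labels = valid.map(
        lambda lp: (float(model.predict(lp.features)), lp.label))
    accuracy = predictions_and_labels.filter(
        lambda pl: pl[0] == pl[1]).count() / float(valid.count())
    print "validation accuracy:", accuracy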
Exemplo n.º 44
0
    spark = SparkSession.builder.appName('Create tf-idf').getOrCreate()

    data = spark.read.load(sys.argv[1])

    df = data.filter((col('date') >= '1895') & (col('seq') =='1')) \
            .select(year('date').alias('year'), 'id', 'text')

    # https://danvatterott.com/blog/2018/07/08/aggregating-sparse-and-dense-vectors-in-pyspark/
    def dense_to_array(v):
        new_array = list([float(x) for x in v])
        return new_array

    dense_to_array_udf = udf(dense_to_array, ArrayType(FloatType()))

    indexer = StringIndexer(inputCol="id", outputCol="label")
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    vectorizer = CountVectorizer(inputCol="tokens", outputCol="rawFeatures")
    idf = IDF(inputCol="rawFeatures", outputCol="vector", minDocFreq=1)

    pipeline = Pipeline(stages=[indexer, tokenizer, vectorizer, idf])
    model = pipeline.fit(df)

    # df already carries the 'year' column, so select it directly and convert the
    # tf-idf vector to a plain array with the UDF defined above
    results = model.transform(df) \
        .select('year', 'label', 'vector') \
        .withColumn('vector', dense_to_array_udf('vector'))

    results.write \
        .partitionBy('year') \
Exemplo n.º 45
0
    return (row.label,SparseVector(len(dico),vector_dict))


from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField,DoubleType

schema = StructType([StructField('label',DoubleType(),True),StructField('Vectors',VectorUDT(),True)])


features=dfTrainTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)

print "Features created"

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest=dfTestTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)
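
# A minimal sketch (not in the original snippet): score the indexed test set with the
# fitted model and measure it with the evaluator defined above.
df_test_pred = lr_model.transform(testIndexed)
print evaluator.evaluate(df_test_pred)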
Exemplo n.º 46
0
train_df = spark.sql(query)
train_df = train_df.withColumn('id', F.col('id') - 1)

query = """
select category, text, row_number() over (order by id1) as id
from test_df
"""

test_df = spark.sql(query)
test_df = test_df.withColumn('id', F.col('id') - 1)
test_df.show(5)

########################################################################################################
# Build pipeline and run
indexer = StringIndexer(inputCol="category", outputCol="label")
tokenizer = RegexTokenizer(pattern=u'\W+',
                           inputCol="text",
                           outputCol="words",
                           toLowercase=False)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Builing model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(
    train_df
)  #if you give new names to your indexed datasets, make sure to make adjustments here
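
# A minimal sketch (not in the original snippet): score the held-out test_df with the
# fitted pipeline and report accuracy, reusing the column names defined above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

test_predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print(evaluator.evaluate(test_predictions))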
Exemplo n.º 47
0
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
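
# A minimal sketch (not in the original snippet) of how the grid above is typically
# plugged into a CrossValidator together with the pipeline and evaluator defined above.
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
cv_model = cv.fit(dfTrain)
print evaluator.evaluate(cv_model.transform(dfTest))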
Exemplo n.º 48
0
strat_train_df = train_df.stat.sampleBy('delayed', fractions, seed=rnd_seed)
strat_train_df.groupBy("delayed").count().show()
# count of delayed=0.0
count_not_delayed = strat_train_df.groupBy("delayed").count().where("delayed = 0.0").select(["count"]).first()[0]

# count of delayed=1.0
count_delayed = strat_train_df.groupBy("delayed").count().where("delayed = 1.0").select(["count"]).first()[0]

total = count_not_delayed + count_delayed

print("Not Delayed: {0}%, Delayed: {1}%".format(np.round(100 * float(count_not_delayed) / total, 2), np.round(100 * float(count_delayed) / total, 2)))



colName ="carrier"
carrierIndexer = StringIndexer(inputCol=colName, outputCol="{0}_indexed".format(colName)).fit(strat_train_df)

indexed_df = carrierIndexer.transform(strat_train_df)



# create a new "carrier_indexed" column
(indexed_df.select(["origin", "dest", "carrier", "carrier_indexed"]).sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())



# check the encoded carrier values
carrierIndexer.labels


# check the carrier code and index mapping
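# A minimal sketch (not in the original snippet): the fitted indexer's labels are ordered
# by index, so enumerating them gives the carrier code -> index mapping.
carrier_mapping = {label: index for index, label in enumerate(carrierIndexer.labels)}
print(carrier_mapping)
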
pandas_df['dayofweek'] = pandas_df['Dates'].dt.dayofweek
pandas_df['week'] = pandas_df['Dates'].dt.weekofyear
pandas_df['x_sim'] = pandas_df['X'].str[1:8]
pandas_df['X'] = pandas_df['X'].str[1:8]
pandas_df['y_sim'] = pandas_df['Y'].str[0:6]
pandas_df['X'] = pd.to_numeric(pandas_df['X'])
pandas_df['Y'] = pd.to_numeric(pandas_df['Y'])
pandas_df['x_sim'] = pd.to_numeric(pandas_df['x_sim'])
pandas_df['y_sim'] = pd.to_numeric(pandas_df['y_sim'])

#send back to the RDD
data_df = sqlContext.createDataFrame(pandas_df)

#encode the police dept as a feature

stringIndexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Index")
model = stringIndexer.fit(data_df)
indexed = model.transform(data_df)
encoder = OneHotEncoder(dropLast=False, inputCol="PdDistrict_Index", outputCol="pd")
encoded = encoder.transform(indexed)

#remove data_df from memory
data_df.unpersist() 

#encode the dependent variable - category_predict
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)


Exemplo n.º 50
0
dataset = dataset.withColumn(
    "classWeights",
    when(dataset.default_payment_next_month == 1,
         BalancingRatio).otherwise(1 - BalancingRatio))
dataset.select("classWeights").show(5)

display(dataset.show(3))

cols = dataset.columns

categoricalColumns = ["SEX", "EDUCATION", "MARRIAGE"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="default_payment_next_month",
                                outputCol="label")
stages += [label_stringIdx]

# Transform all features into a vector using VectorAssembler
numericCols = [
    "LIMIT_BAL", "AGE", "PAY_0", "PAY_2", "PAY_3", "PAY_3", "PAY_4", "PAY_5",
      StructField("device_conn_type", StringType(), True),
      StructField("C14", DoubleType(), True),
      StructField("C15", DoubleType(), True),
      StructField("C16", DoubleType(), True),
      StructField("C17", DoubleType(), True),
      StructField("C18", DoubleType(), True),
      StructField("C19", DoubleType(), True),
      StructField("C20", DoubleType(), True),
      StructField("C21", DoubleType(), True)
    ])
    

from pyspark.ml.feature import StringIndexer
## Index labels, adding metadata to the label column.
## Fit on whole dataset to include all labels in index.
data = StringIndexer(inputCol="click", outputCol="label").fit(data).transform(data)
data.show()
## transform() can produce a new DataFrame; .transform(data) does not have to be applied to the same DataFrame that was used for fitting
#labelIndexer  ===> data


# RFormula
from pyspark.ml.feature import RFormula
## RFormula: string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
## the feature columns to use are specified by the formula string
formula = RFormula(
    formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7
    featuresCol="features",
    labelCol="label")
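
# A minimal sketch (not in the original snippet): fitting the RFormula above yields the
# one-hot encoded "features" vector and the "label" column it describes.
output = formula.fit(data).transform(data)
output.select("features", "label").show(5)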
Exemplo n.º 52
0
def analyze(spark):
    # loading input files - pre-processed, load all csv file
    # path = "../data/pre-processed/*.csv"
    # allcsv = glob.glob(path)
    # input_file = allcsv
    # path = "preprocessed_data.csv"
    # allcsv = glob.glob(path)
    # input_file = allcsv
    # create spark session
    # spark = SparkSession.builder.appName("TestRBM").getOrCreate()
    # spark.sparkContext.setCheckpointDir("checkpoint/")
    # spark.sparkContext.setLogLevel("WARN")
    #
    # # read input files
    # df = spark.read \
    #     .option("header", "true") \
    #     .option("treatEmptyValuesAsNulls", "true") \
    #     .option("inferSchema", "true") \
    #     .option("charset", "UTF-8") \
    #     .csv(input_file)
    # df = df.select("MASV1", "F_MAMH", "F_MAKH", "TKET")
    # df = df.filter(df["F_MAKH"] == "MT")
    # # print(df.count())
    # df = df.withColumn("MASV1", df["MASV1"].cast(DoubleType()))
    # df = df.withColumn("MASV1", df["MASV1"].cast(LongType()))
    # df = df.withColumn("TKET", df["TKET"].cast(DoubleType()))
    # df.show()
    # print(df.rdd.getNumPartitions())
    # need to split train and test by MASV (student ID)
    spark.sparkContext.setCheckpointDir("hdfs://node3:54311/")
    # spark.sparkContext.setLogLevel("INFO")

    print("#####################Split train test######################")

    # train_df, test_df = get_train_test(df, spark)
    # test_input_output_df = test_df.randomSplit([0.8, 0.2])  # [0] is the input, [1] is the held-out output we join with the RBM output to check the results
    # train_df.coalesce(1).write.csv('train_df.csv')
    # test_input_output_df[0].coalesce(1).write.csv("test_input_df.csv")
    # test_input_output_df[1].coalesce(1).write.csv("test_output_df.csv")
    # train_df.toPandas().to_csv('train_df1.csv')
    # test_input_output_df[0].toPandas().to_csv("test_input_df1.csv")
    # test_input_output_df[1].toPandas().to_csv("test_output_df1.csv")

    train_df = load_csv_file("data/train_df1.csv", spark)
    test_input_output_df = [
        load_csv_file("data/test_input_df1.csv", spark),
        load_csv_file("data/test_output_df1.csv", spark)
    ]

    # train_df.show()
    # preprocess input
    # TKET to int (double score)
    print(
        "#####################Double Score To Index SoftMax######################"
    )
    train_input_rbm_df = train_df.withColumn("TKET", round(col("TKET") * 2).cast(LongType()))\
        .drop("F_MAKH")
    test_input_rbm_df = test_input_output_df[0].withColumn("TKET", round(col("TKET") * 2).cast(LongType()))\
        .drop("F_MAKH")

    # print(train_input_rbm_df.count())
    # print(train_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count())
    # print(train_input_rbm_df.select("MASV1", "F_MAMH").distinct().count())
    train_input_rbm_df = train_input_rbm_df.groupBy("MASV1", "F_MAMH").agg(collect_list("TKET").alias("list_TKET"))\
        .withColumn("TKET", col("list_TKET")[0])
    # print(train_input_rbm_df.count())
    # print(train_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count())
    # print(train_input_rbm_df.select("MASV1", "F_MAMH").distinct().count())

    # print(test_input_rbm_df.count())
    # print(test_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count())
    # print(test_input_rbm_df.select("MASV1", "F_MAMH").distinct().count())
    test_input_rbm_df = test_input_rbm_df.groupBy("MASV1", "F_MAMH").agg(collect_list("TKET").alias("list_TKET"))\
        .withColumn("TKET", col("list_TKET")[0])
    # print(test_input_rbm_df.count())
    # print(test_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count())
    # print(test_input_rbm_df.select("MASV1", "F_MAMH").distinct().count())
    # train_input_rbm_df = train_input_rbm_df.withColumn("SoftmaxIndex", col("TKET").cast(LongType()))\
    #                         .withColumn("Active", lit(1))
    # train_input_rbm_df.show()

    # train_input_rbm_df.cache()
    # #to softmax
    print("#####################To Binary SoftMax######################")
    value_to_binary_softmax_model = ValueToBinarySoftMaxModel(spark)\
        .setItemCol("F_MAMH")\
        .setOutputCol("Active")\
        .setSoftMaxUnit(21)\
        .setValueCol("TKET")
    train_input_rbm_df = value_to_binary_softmax_model.transform(
        train_input_rbm_df)
    train_input_rbm_df.printSchema()
    train_input_rbm_df.show()
    test_input_rbm_df = value_to_binary_softmax_model.transform(
        test_input_rbm_df)
    test_input_rbm_df.show()

    item_df = train_input_rbm_df.select("F_MAMH").distinct()
    number_of_item = item_df.count()
    print("Number of item:" + str(number_of_item))
    item_indexer = StringIndexer().setInputCol("F_MAMH").setOutputCol(
        "F_MAMH_index")
    item_index_model = item_indexer.fit(item_df)
    item_index_mapping = item_index_model.transform(item_df).withColumn(
        "F_MAMH_index",
        col("F_MAMH_index").cast(LongType())).cache()

    train_input_rbm_df = item_index_model.transform(
        train_input_rbm_df).withColumn("F_MAMH_index",
                                       col("F_MAMH_index").cast(LongType()))
    train_input_rbm_df.printSchema()
    test_input_rbm_df = test_input_rbm_df.join(item_index_mapping, ["F_MAMH"])
    group_user_model = GroupUserModel(spark, number_of_item)\
        .setItemCol("F_MAMH_index")\
        .setUserCol("MASV1")\
        .setValueCol("Active")
    train_input_rbm_df = group_user_model.transform(train_input_rbm_df)
    train_input_rbm_df.printSchema()
    train_input_rbm_df.show()
    test_input_rbm_df = group_user_model.transform(test_input_rbm_df)
    train_input_rbm_df.printSchema()
    test_input_rbm_df.show()

    print("#####################Training phase######################")
    # print(train_input_rbm_df.count())
    #Create RBM Model
    rbm_model = RBM(spark, number_of_item)\
        .setUserCol("MASV1")\
        .setSoftMaxUnit(21)\
        .setValueCol("Active")\
        .setLearningRate(0.1)\
        .setNumberOfHiddenNode(30)\
        .setIterNum(1)\
        .setOutputCol("prediction_prob")\
        .fit(train_input_rbm_df)

    print("#####################Predict phase######################")
    #transform output to expectation ("Active" column holds the probability)
    prob_to_expect_model = ProbabilitySoftMaxToExpectationModel(spark).setUserCol("MASV1")\
        .setItemCol("F_MAMH_index")\
        .setValueCol("prediction_prob")\
        .setOutputCol("prediction")

    #predict
    output_rbm_df = rbm_model.transform(test_input_rbm_df)
    output_rbm_df.show()
    predict_expectation_df = prob_to_expect_model.transform(output_rbm_df)\
        .withColumn("prediction",  col("prediction") / 2)\
        .join(item_index_mapping, ["F_MAMH_index"])
    predict_expectation_df.show()
    predict_test_df = test_input_output_df[1].join(predict_expectation_df,
                                                   ["MASV1", "F_MAMH"])
    predict_test_df.show()
    #calculate error
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="TKET",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predict_test_df)
    print("Root-mean-square error = " + str(rmse))
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
Exemplo n.º 54
0
def oneHot(df, base_col_name, col_name):
    from pyspark.sql import SparkSession
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
    import os
    import time

    #os.environ['SPARK_HOME'] = '/root/spark-2.1.1-bin'

    sparkConf = SparkConf() \
        .setAppName('pyspark rentmodel') \
        .setMaster('local[*]')
    sc = SparkContext.getOrCreate(sparkConf)

    sc.setLogLevel('WARN')

    spark = SparkSession(sparkContext=sc)

    df = df.select(base_col_name, col_name)
    df = df.filter(df[base_col_name].isNotNull())
    # the Python version of StringIndexer's handleInvalid does not support 'keep', so null values are filled with a placeholder string first
    null_col_name = col_name + '_null'
    df = df.na.fill(null_col_name, col_name)
    df_NULL = df.filter(df[col_name] == 'NULL')

    df = df.filter(df[col_name].isNotNull())
    df = df.filter(df[col_name] != '')
    print('one-hot=======', col_name, df.count())

    temp_path = '/data/20180621/ALL_58_beijing_save_models/'

    if df_NULL.count() > 0:

        def udf_NULL(s):
            return null_col_name

        udf_transf = udf(udf_NULL)

        df_NULL = df_NULL.select('*',
                                 udf_transf(col_name).alias('tmp_col_name'))
        df_NULL = df_NULL.na.fill(null_col_name, 'tmp_col_name')
        df_NULL = df_NULL.drop(col_name)
        df_NULL = df_NULL.withColumnRenamed('tmp_col_name', col_name)

        df_no_NULL = df.filter(df[col_name] != 'NULL')
        df_no_NULL = df_no_NULL.withColumn('tmp_col_name', df[col_name])
        df_no_NULL = df_no_NULL.drop(col_name)
        df_no_NULL = df_no_NULL.withColumnRenamed('tmp_col_name', col_name)
        df = df_no_NULL.union(df_NULL)
        del df_no_NULL

    index_name = col_name + 'Index'
    vector_name = col_name + 'Vec'
    """
        StringIndexer可以设置handleInvalid='skip',但是不可以设置handleInvalid='keep'.
        设置这个会删除需要跳过的这一行,这样会导致用户体验差,因为用户输入
        一条数据,就直接给删了,什么都没有。因此暂不设置,新数据输入时,如果没有,
        可以在已经有的字符串中随机选择一个来替换没有的这个新字符串.
    """
    stringIndexer = StringIndexer(inputCol=col_name, outputCol=index_name)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(dropLast=False,
                            inputCol=index_name,
                            outputCol=vector_name)
    encoded = encoder.transform(indexed)

    #save
    stringIndexer.save(temp_path + 'stringIndexer' + col_name)
    model.save(temp_path + 'stringIndexer_model' + col_name)

    # StringIndexer(inputCol=col_name, outputCol=index_name)
    # onehotEncoderPath = temp_path + col_name
    # loadedEncoder = OneHotEncoder.load(onehotEncoderPath)
    # loadedEncoder.setParams(inputCol=index_name, outputCol=vector_name)
    # encoded = loadedEncoder.transform(df)
    # encoded.show()

    onehotEncoderPath = temp_path + col_name + '_new'
    encoder.save(onehotEncoderPath)

    sub_encoded = encoded.select(base_col_name, vector_name)

    return sub_encoded
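
# A minimal sketch (not part of the original function) of the workaround described in the
# oneHot() docstring above: replace categories that the saved StringIndexerModel has never
# seen with a randomly chosen known label before transforming new data.
import random
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

def replace_unseen(df_new, string_indexer_model, col_name):
    labels = list(string_indexer_model.labels)  # plain Python list, safe to ship to executors
    known = set(labels)
    pick_known = udf(lambda v: v if v in known else random.choice(labels), StringType())
    return df_new.withColumn(col_name, pick_known(col(col_name)))
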
    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)

    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()
Exemplo n.º 56
0
cars = cars.withColumnRenamed("ncyl", "cyl")
cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))

cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))

cars = cars.withColumn('avg_mpg', round((cars.city_mpg + cars.hwy_mpg) / 2, 1)) \
            .drop("city_mpg", "hwy_mpg")

cars = cars.withColumn(
    'consumption', round((100 * 3.785411784) / (cars.avg_mpg * 1.609344), 2))

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(8), '\n')

indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

# Check column data types
print('\n', cars.dtypes, '\n')

kars = cars.select('name', 'weight_kg', 'cyl', 'consumption', 'type',
                   'type_idx')

#print(kars.toPandas().sample(12))

onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
# COMMAND ----------

# MAGIC %md
# MAGIC In this dataset, we have ordinal variables like education (Preschool - Doctorate), and also nominal variables like relationship (Wife, Husband, Own-child, etc). For simplicity's sake, we will use One-Hot Encoding to convert all categorical variables into binary vectors. It might be possible here to improve prediction accuracy by converting each categorical column with an appropriate method.
# MAGIC 
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

###One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
  
categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
  # Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  model = stringIndexer.fit(dataset)
  indexed = model.transform(dataset)
  # Use OneHotEncoder to convert categorical variables into binary SparseVectors
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  encoded = encoder.transform(indexed)
  dataset = encoded

print dataset.take(1)

# COMMAND ----------

# MAGIC %md
# MAGIC The above code indexes each categorical column using the StringIndexer and then converts the indexed categories into one-hot encoded variables. The resulting binary vectors are appended to the end of each row.

# COMMAND ----------
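
# MAGIC %md
# MAGIC As a quick sanity check (a minimal sketch, not part of the original notebook), each `classVec` column created above holds the category as a one-hot SparseVector:

# COMMAND ----------

dataset.select("workclass", "workclassIndex", "workclassclassVec").show(5, truncate=False)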
Exemplo n.º 58
0
import pickle
import cdsw

spark = SparkSession.builder \
      .appName("Telco Customer Churn") \
      .getOrCreate()
    
schemaData = StructType([
    StructField("state", StringType(), True),
    StructField("account_length", DoubleType(), True),
    StructField("area_code", StringType(), True),
    StructField("phone_number", StringType(), True),
    StructField("intl_plan", StringType(), True),
    StructField("voice_mail_plan", StringType(), True),
    StructField("number_vmail_messages", DoubleType(), True),
    StructField("total_day_minutes", DoubleType(), True),
    StructField("total_day_calls", DoubleType(), True),
    StructField("total_day_charge", DoubleType(), True),
    StructField("total_eve_minutes", DoubleType(), True),
    StructField("total_eve_calls", DoubleType(), True),
    StructField("total_eve_charge", DoubleType(), True),
    StructField("total_night_minutes", DoubleType(), True),
    StructField("total_night_calls", DoubleType(), True),
    StructField("total_night_charge", DoubleType(), True),
    StructField("total_intl_minutes", DoubleType(), True),
    StructField("total_intl_calls", DoubleType(), True),
    StructField("total_intl_charge", DoubleType(), True),
    StructField("number_customer_service_calls", DoubleType(), True),
    StructField("churned", StringType(), True)
])
churn_data = spark.read.schema(schemaData).csv('/tmp/churn.all')

reduced_churn_data= churn_data.select("account_length", "number_vmail_messages", "total_day_calls",
                     "total_day_charge", "total_eve_calls", "total_eve_charge",
                     "total_night_calls", "total_night_charge", "total_intl_calls", 
                    "total_intl_charge","number_customer_service_calls")

label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')
plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed')
pipeline = Pipeline(stages=[plan_indexer, label_indexer])
indexed_data = pipeline.fit(churn_data).transform(churn_data)

(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])

pdTrain = train_data.toPandas()
pdTest = test_data.toPandas()
features = ["intl_plan_indexed","account_length", "number_vmail_messages", "total_day_calls",
                     "total_day_charge", "total_eve_calls", "total_eve_charge",
                     "total_night_calls", "total_night_charge", "total_intl_calls", 
                    "total_intl_charge","number_customer_service_calls"]

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
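
# A minimal sketch (not in the original snippet): the command-line hyperparameters above
# would typically feed a scikit-learn model trained on the pandas frames built from Spark.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=param_numTrees, max_depth=param_maxDepth)
clf.fit(pdTrain[features], pdTrain['label'])
print("test accuracy: %.3f" % clf.score(pdTest[features], pdTest['label']))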
Exemplo n.º 59
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StringIndexer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StringIndexerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    indexed = indexer.fit(df).transform(df)
    indexed.show()
    # $example off$

    sc.stop()
Exemplo n.º 60
0
data = spark.read.csv(base_path + file_name, header=True, inferSchema=True)
data.show()
data.printSchema()
#print(data.columns)

cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
]

data_cols = data.select(cols)
data_cols.show()
final_data = data_cols.na.drop()

# Transform the categorical columns into numbers
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')

# A B C
# 0 1 2
# One hot encode ----> this is mapping everyting into [1, 0, 0] [0, 1, 0] etc.
gender_encoder = OneHotEncoder(
    inputCol='SexIndex', outputCol='SexVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex', outputCol='EmbarkedVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec']
assembler = VectorAssembler(inputCols=new_cols, outputCol='features')
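
# A minimal sketch (not in the original snippet): the indexers, encoders and assembler above
# are typically chained in a Pipeline with a classifier and fit on a train/test split.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

log_reg = LogisticRegression(featuresCol='features', labelCol='Survived')
pipeline = Pipeline(stages=[gender_indexer, gender_encoder,
                            embark_indexer, embark_encoder,
                            assembler, log_reg])
train_data, test_data = final_data.randomSplit([0.7, 0.3])
fitted_model = pipeline.fit(train_data)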