Example #1
train, test = df_for_ml.randomSplit([0.8,0.2], seed = 42)


assembler1 = VectorAssembler(
        inputCols=['Action_ph', 'Session_ph', 'Nextsong_ph',
        'Downgrade_ph', 'Upgrade_ph', 'ThumbDown_ph', 'ThumbUp_ph', 'Home_ph',
        'Adv_ph','Addtolist_ph','Set_ph','Addfriend_ph','Error_ph', 'Help_ph',
        'Action_toSession', 'Nextsong_toAct', 'Downgrade_toAct', 'Upgrade_toAct',
        'ThumbDown_toAct', 'ThumbUp_toAct','Home_toAct','Adv_toAct',
        'Addtolist_toAct', 'Set_toAct','Addfriend_toAct','Error_toAct',
        'Help_toAct', 'Action_trend','Nextsong_trend','Nextsong_betweenHome',
        ],
        outputCol='NumFeatures')

scaler = Normalizer(inputCol='NumFeatures',
                    outputCol='ScaledNumFeatures',
                    p = 1.0)

assembler2 = VectorAssembler(inputCols = ['gender_num','level_num','ScaledNumFeatures'],
                             outputCol = 'features')

rf = RandomForestClassifier(featuresCol="features", labelCol="label")

pipeline = Pipeline(stages = [assembler1, scaler, assembler2, rf])


paramGrid = ParamGridBuilder()\
        .addGrid(scaler.p,[1.0,2.0])\
        .addGrid(rf.maxDepth,[5, 10]) \
        .addGrid(rf.numTrees, [20, 50]) \
        .addGrid(rf.minInstancesPerNode, [1, 10]) \
        .build()
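# The excerpt ends with the grid; a hedged sketch of how it would typically be wired up
# (the evaluator metric and fold count below are assumptions, not from the source):
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=3)
cv_model = crossval.fit(train)
predictions = cv_model.transform(test)
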
def main(argv=None):
    if argv is None:
        inputs_train = sys.argv[1]
        inputs_test = sys.argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-word2vec-cluster')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    #read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall',
                        'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letter
    tokenizer = RegexTokenizer(inputCol="reviewText",
                               outputCol="words",
                               pattern="\\P{Alpha}+")

    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

    pipeline_data_processing = Pipeline(stages=[tokenizer, remover])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## INTERMEDIATE STEP TO GET WORD VOCABULARY AND VECTOR
    # word2vec
    word2Vec = Word2Vec(inputCol="filtered_words",
                        outputCol="word2vec_features")
    model_word2Vec = word2Vec.fit(train_processed)
    # Dataframe dictionary of Word-vectors
    vocabulary = model_word2Vec.getVectors()
    vocabulary.cache()

    ## ML PIPELINE
    # WordCluster Features
    wordcluster = WordCluster(inputCol="filtered_words", predictionCol="cluster", \
                              k=3, vocabulary=vocabulary)

    # get vector of cluster frequency for each document
    count_vectorizer = CountVectorizer(inputCol="cluster", outputCol="count")

    # normalized cluster frequency vector for each document
    normalizer = Normalizer(inputCol="count", outputCol="features", p=1.0)

    # linear Regression Model
    lr = LinearRegression(maxIter=20, regParam=0.1)

    # Final Pipeline
    pipeline = Pipeline(stages=[wordcluster, count_vectorizer, normalizer, lr])

    ## FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: numFeatures and regParam
    paramGrid = ParamGridBuilder() \
            .addGrid(wordcluster.k, [1000, 5000, 10000, 20000]) \
            .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
            .build()

    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()
    vocabulary.unpersist()

    ## TEST DATA
    #read test json file and process data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall',
                       'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print Result
    result = "MODEL WITH Word Clustering features - best k = " \
          + str(model.bestModel.stages[0].getK()) + ":\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
Example #3
    def featurize(self, col_name, option, skip=False):
        '''
        featurize a specific numerical feature

        Args:

            col_name: name of the target feature (string)
            option: featurization option (int), 1 for Standardization, 2 for L2-Normalization, 3 for Min-Max transformation
            skip: whether to skip a non-numerical feature, default False
        '''
        n_null = self.data.where(self.data[col_name].isNull()).count()
        if n_null > 0:
            print("Drop {} null values in {}!".format(n_null, col_name))
            self.data = self.data.dropna(subset=[col_name])

        def ith_(v, i):
            '''
            helper function
            '''
            try:
                return float(v[i])
            except ValueError:
                return None

        if not isinstance(col_name, str):
            raise TypeError(
                'column name must be a string object. your input type is {}'.
                format(type(col_name)))
        if isinstance(option, int):
            options = [1, 2, 3]
            if option not in options:
                raise ValueError(
                    "option should be 1(Standarization), 2(L2-Normalization) or 3(Min-Max), your input is: {}"
                    .format(option))
        else:
            raise TypeError(
                'option must be an int object. your input type is {}'.format(
                    type(option)))
        df = self.data
        temp = df.select(col_name)
        types = [f.dataType for f in temp.schema.fields]
        type_list = ["IntegerType", "LongType", "DoubleType"]
        if str(types[0]) not in type_list:
            if not skip:
                raise TypeError(
                    'The column you try to featurize is {}, which is not a valid data type for this function. You may want to use the function to_double() to cast the column first.'
                    .format(types[0]))
            else:
                warnings.warn("you are skipping a non-numerical feature!")

        if option == 1:
            df_stats = df.select(
                F.mean(F.col(col_name)).alias('mean'),
                F.stddev(F.col(col_name)).alias('std')).collect()
            mean = df_stats[0]['mean']
            std = df_stats[0]['std']
            data = df.withColumn(col_name, (df[col_name] - mean) / std)
            data_stats = data.select(
                F.mean(F.col(col_name)).alias('mean'),
                F.stddev(F.col(col_name)).alias('std')).collect()
            new_mean = data_stats[0]['mean']
            new_std = data_stats[0]['std']
            print("Standarization on {} is successful!".format(col_name))
            print("new mean: {}, new std: {}".format(new_mean, new_std))

        elif option == 2:
            assembler = VectorAssembler(inputCols=[col_name],
                                        outputCol="feature")
            assembled = assembler.transform(df)
            normalizer = Normalizer(inputCol="feature",
                                    outputCol="l2normFeature")
            l2NormData = normalizer.transform(assembled).drop("feature").drop(
                col_name)
            data = l2NormData.withColumnRenamed("l2normFeature", col_name)
            print("L2-Normalization on {} is successful!".format(col_name))
            ith = F.udf(ith_, DoubleType())
            data = data.withColumn(col_name, ith(col_name, F.lit(0)))

        elif option == 3:
            col_max = df.agg({col_name: "max"}).collect()[0][0]
            col_min = df.agg({col_name: "min"}).collect()[0][0]
            data = df.withColumn(col_name, (df[col_name] - col_min) /
                                 (col_max - col_min))
            new_max = data.agg({col_name: "max"}).collect()[0][0]
            new_min = data.agg({col_name: "min"}).collect()[0][0]
            print(
                "Min-Max Transformation on {} is successful!".format(col_name))
            print("new lower bound: {}, new upper bound: {}".format(
                new_min, new_max))

        self.data = data
        return
def process_dataframe(spark_session, hdfs_dir_input, hdfs_dir_output):

    numeric_features = ['bounces', 'events', 'page_views', 'sessions']

    add_one = pandas_udf(add_one_func, returnType=FloatType())
    box_cox = pandas_udf(box_cox_func, returnType=FloatType())
    log = pandas_udf(log1p_func, returnType=FloatType())

    df = spark_session.read.parquet(hdfs_dir_input)

    df_first_part_plus_one = (df.select(
        'client_id',
        add_one(col('bounces')).alias('bounces'),
        add_one(col('events')).alias('events'),
        add_one(col('page_views')).alias('page_views'),
        add_one(col('sessions')).alias('sessions'),
    ))

    df_first_part_box_cox = (df_first_part_plus_one.select(
        'client_id',
        box_cox(col('bounces')).alias('bounces'),
        box_cox(col('events')).alias('events'),
        box_cox(col('page_views')).alias('page_views'),
        box_cox(col('sessions')).alias('sessions'),
    ))

    second_part_columns = [
        'client_id', 'churned', 'is_desktop', 'is_mobile', 'is_tablet'
    ] if TRAINING_OR_PREDICTION == 'training' else [
        'client_id', 'is_desktop', 'is_mobile', 'is_tablet'
    ]

    df_second_part = (df.select(
        *second_part_columns,
        log(col('session_duration')).alias('session_duration')))

    imputer = Imputer(
        inputCols=numeric_features,
        outputCols=numeric_features).setStrategy("mean").setMissingValue(0.0)

    assembler = VectorAssembler(inputCols=numeric_features,
                                outputCol="features")

    normalizer = Normalizer(inputCol="features",
                            outputCol="normFeatures",
                            p=2.0)

    scaler = StandardScaler(inputCol="normFeatures",
                            outputCol="scaledFeatures",
                            withStd=True,
                            withMean=True)

    pipeline = Pipeline(stages=[imputer, assembler, normalizer, scaler])

    model = pipeline.fit(df_first_part_box_cox)

    result = model.transform(df_first_part_box_cox).drop(
        'bounces', 'events', 'page_views', 'sessions', 'features',
        'normFeatures', 'scaled')

    final_df_first_part = (result.rdd.map(extract).repartition(32).toDF(
        ['client_id']).withColumnRenamed('_2', 'bounces').withColumnRenamed(
            '_3',
            'events').withColumnRenamed('_4', 'page_views').withColumnRenamed(
                '_5', 'sessions').drop('scaledFeatures'))

    final_df = final_df_first_part.join(df_second_part, 'client_id')

    if TRAINING_OR_PREDICTION == 'training':
        final_df.drop('client_id').repartition(
            numPartitions=32).write.parquet(hdfs_dir_output)
    else:
        final_df.repartition(numPartitions=32).write.parquet(hdfs_dir_output)
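# A hedged sketch (assumption, not shown in the source) of the element-wise helpers wrapped
# with pandas_udf above; each receives and returns a pandas Series, as pandas_udf requires.
import numpy as np

BOXCOX_LAMBDA = 0.0  # assumption: a fixed lambda chosen upstream; 0 reduces Box-Cox to log(x)

def add_one_func(s):
    # shift counts so the log/Box-Cox transforms below never see zero
    return s + 1.0

def log1p_func(s):
    return np.log1p(s)

def box_cox_func(s):
    if BOXCOX_LAMBDA == 0.0:
        return np.log(s)
    return (s ** BOXCOX_LAMBDA - 1.0) / BOXCOX_LAMBDA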
Example #5
                trim(col("ARR_DELAY_FNUMBER_BEFORE")).cast(IntegerType()).alias("ARR_DELAY_FNUMBER_BEFORE"),
                trim(col("CUM_ARR_FNUMBER_DELAY")).cast(IntegerType()).alias("CUM_ARR_FNUMBER_DELAY"),
                trim(col("DEP_DELAY_FNUMBER_BEFORE")).cast(IntegerType()).alias("DEP_DELAY_FNUMBER_BEFORE"),
                trim(col("CUM_DEP_FNUMBER_DELAY")).cast(IntegerType()).alias("CUM_DEP_FNUMBER_DELAY")                        
               )

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

# Generate a vector from the label column and the feature columns
ignore = ['label']
assembler = VectorAssembler(inputCols=[x for x in df.columns if x not in ignore], outputCol='features_without_norm')
df = assembler.transform(df).select(["MONTH", 'label', 'features_without_norm']) 

# Normalize the data
normalizer = Normalizer(inputCol="features_without_norm", outputCol="features")
df_normalized = normalizer.transform(df).select(["MONTH", 'label', 'features'])

# Define the model training data ("train") and the prediction data ("test")
train=df_normalized.where(df.MONTH!=12)
test=df_normalized.where(df.MONTH==12)


# Split the train set further into train (90%) and evaluation (10%)
(train, evaluation) = train.randomSplit((0.9, 0.1))

# COMMAND ----------

# Train and tune the model (on the already normalized data) by adjusting its internal parameters and checking its results on the evaluation set

from pyspark.sql.types import *
Example #6
    .appName("similairityExample")\
    .getOrCreate()

df = sqlContext.read.json('/home/sl4401/AA/wiki_**')
df = df.limit(1000)
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)
tokenized_data = regexTokenizer.transform(df)
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = stopWordsRemover.transform(tokenized_data)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filtered_data)
idf= IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
featurized_data = idfModel.transform(featurizedData)
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(featurized_data)
import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType())
s=data.alias("i").join(data.alias("j"), psf.col("i.id") < psf.col("j.id"))\
      .select(
           psf.col("i.id").alias("src"), 
           psf.col("j.id").alias("dst"), 
           dot_udf("i.norm", "j.norm").alias("relationship"))\
      .sort("src", "dst")


v = featurized_data.select("id","features")
e = s.filter("relationship > 0.95")
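# A hedged follow-up (assumption: the graphframes package is installed): the vertex and edge
# frames above are already in the shape GraphFrames expects, so a similarity graph can be built.
from graphframes import GraphFrame

g = GraphFrame(v, e)
g.degrees.show(5)  # how many near-duplicate articles each document is connected to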
#Concatenate all numeric features in one big vector
predictors = data_train.drop('StreetID','DaysWithoutSub','max(EndDate)', 'min(StartDate)','CustomerID', 'label')
va = VectorAssembler().setInputCols(predictors.columns).setOutputCol("va_features")

#Feature Reduction using Principal Components Analysis (PCA)
pca = PCA().setInputCol("va_features").setOutputCol("pcafeatures").setK(10)

#Chi Square Selector 
chisq = ChiSqSelector().setFeaturesCol("va_features").setOutputCol("chi_features").setLabelCol("label").setNumTopFeatures(30)


# #Standardization
stand = StandardScaler(inputCol="chi_features",outputCol="features",withStd=True, withMean=True)

# #Normalisation
normalizer =  Normalizer().setInputCol("va_features").setOutputCol("normFeatures").setP(1.0)

#Use the first pipeline when running the big benchmark
pipeline_train = Pipeline(stages=[va, pca, chisq, stand, normalizer])
pipeline_model_train = pipeline_train.fit(train)
train = pipeline_model_train.transform(train)


#vector assembler for validation and standardizing it
va_test = VectorAssembler().setInputCols(predictors.columns).setOutputCol("va_features_validation")
stand_test = StandardScaler(inputCol='va_features_validation',outputCol="features",withStd=True, withMean=True)
Pipeline_test = Pipeline(stages=[va_test, stand_test])
validation = Pipeline_test.fit(validation).transform(validation)

# COMMAND ----------

#Getting the most relevant features selected by Chi^2 
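# A hedged sketch of what likely follows (not in the excerpt): the fitted ChiSqSelectorModel is
# the third stage of pipeline_model_train above and exposes the indices of the features it kept.
chisq_model = pipeline_model_train.stages[2]
selected_idx = chisq_model.selectedFeatures              # indices into the va_features vector
print([predictors.columns[i] for i in selected_idx])     # map indices back to column names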
Example #8
def normalize_vectors(vectors):
    normalizer = Normalizer(inputCol="features", outputCol="nfeatures", p=2)
    return normalizer\
        .transform(vectors)\
        .select("nfeatures")\
        .withColumnRenamed("nfeatures", "features")
Example #9
                                 outputCol="scaledFeatures",
                                 withStd=False,
                                 withMean=True)
min_max_scaler = MinMaxScaler(inputCol="Features", outputCol="scaledFeatures")
max_abs_scaler = MaxAbsScaler(inputCol="Features", outputCol="scaledFeatures")

norm_standard_scaler = StandardScaler(inputCol="normFeatures",
                                      outputCol="scaledFeatures",
                                      withStd=False,
                                      withMean=True)
norm_min_max_scaler = MinMaxScaler(inputCol="normFeatures",
                                   outputCol="scaledFeatures")
norm_max_abs_scaler = MaxAbsScaler(inputCol="normFeatures",
                                   outputCol="scaledFeatures")

normalizer = Normalizer(inputCol="Features", outputCol="normFeatures")

######END PIPELINE

from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier, DecisionTreeClassifier
import json

#Create a pipeline testing object and run tests
pt = PipelineTester(df)

#Logistic regression
pt.lr(base=indexers + encoders + target_indexer, fcs=fcs)
pt.lr(base=indexers + encoders + target_indexer,
      scaler=[standard_scaler],
      fcs=fcs[1:4])
pt.lr(base=indexers + encoders + target_indexer,
sdf_books = sqlContext.createDataFrame(pdf_books)
hashingTF = HashingTF(inputCol="words_stem", outputCol="words_rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(sdf_books)

# use idf to inverse frequency
idf = IDF(inputCol="words_rawFeatures", outputCol="words_features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("book_title", "words_stem","words_features").show()

# COMMAND ----------


# 3.2 l2 normalize the feature vector
normalizer = Normalizer(inputCol="words_features", outputCol="words_norm")
normalizedData = normalizer.transform(rescaledData)

normalizedData.select("book_title", "words_stem","words_features","words_norm").show()

# COMMAND ----------

# 4. apply kmeans clustering method on the normalized book description words

# Train a k-means model, splitting all the books into 10 clusters according to the similarities between book descriptions
kmeans = KMeans().setK(10).setSeed(1).setFeaturesCol("words_norm")
model = kmeans.fit(normalizedData)

# Make predictions; the clusters are represented by numbers from 0 to 9
clusters = model.transform(normalizedData)
clusters.select("book_title", "prediction").show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.feature import Normalizer
import random
import time
from functools import reduce

numHashTables=5

spark = SparkSession \
    .builder \
    .getOrCreate()

df = spark.read.parquet("user_score")
normalizer = Normalizer(inputCol="user_score", outputCol="normFeatures", p=2.0)
extended_user_df = normalizer.transform(df)
extended_user_df.cache()
# seed_user_df = extended_user_df.sample(0.1, False)

# print("no seed users: ",seed_user_df.count(),"   no of extended users:  ",extended_user_df.count())

# LSH Algorithm
start_time=time.time()
brp = BucketedRandomProjectionLSH(inputCol="normFeatures", outputCol="hashes", bucketLength=10000.0, numHashTables=numHashTables)
brp.setSeed(random.randint(0, 2**31 - 1))
model = brp.fit(extended_user_df)

# Get the hashes for the users and convert them into a cluster ID number.
df_users = model.transform(extended_user_df)
df_users = df_users.withColumn('cluster_id', udf(lambda input: reduce(lambda x, y: x | y, [ 0x1 << i if value[0] != 0.0 else 0 for i, value in enumerate(input) ]), IntegerType())(df_users.hashes))
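# Aside (not in the source): the fitted LSH model can also be queried directly for approximate
# neighbours; the distance threshold below is purely illustrative.
neighbors = model.approxSimilarityJoin(df_users, df_users, threshold=5000.0, distCol="dist")
neighbors.select("datasetA.normFeatures", "datasetB.normFeatures", "dist").show(5)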
def preprocess_test(test, model=None):
    # test = test.dropna(axis=1, how='all', inplace=False)
    # for c in test.columns:
    #     if test.filter(col(c).isNotNull()).count() == 0:
    #         test = test.drop(c)

    print('Length of test : ' + str(len(test.columns)))

    if model == 'xgb':
        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        test = clip(test, cols)

        # test = test.resample('H').mean()

        # test = test.rolling(window=50).mean()

        test = get_mean_of_cyl_values(test)
        test = test.fillna(0)

        return test

    elif model == 'lstm':

        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))
        test = clip(test, cols)

        test = get_mean_of_cyl_values(test)

        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))
        print(test.schema)
        test = test.fillna(0)

        cols = [x for x in test.columns if x not in ['datetime']]


        assembler = VectorAssembler().setInputCols \
            (cols).setOutputCol("features")
        print('assembler')
        transformed = assembler.transform(test)

        # Normalize each Vector using $L^1$ norm.
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        l1NormData = normalizer.transform(transformed)

        scaler = StandardScaler(inputCol="normFeatures",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        # Compute summary statistics by fitting the StandardScaler
        scalerModel = scaler.fit(l1NormData)
        # Normalize each feature to have unit standard deviation.
        scaledData = scalerModel.transform(l1NormData)
        # train = scaledData.drop(*cols)
        del test, transformed, l1NormData

        n_components_ = 50

        pca = PCA(k=n_components_,
                  inputCol="scaledFeatures",
                  outputCol="pcaFeatures")
        model = pca.fit(scaledData)

        vds_5 = model.transform(scaledData).select(['pcaFeatures', 'datetime'])
        print(vds_5)

        def extract(row):
            return (row.datetime, ) + tuple(row.pcaFeatures.toArray().tolist())

        vds_5 = vds_5.rdd.map(extract).toDF(["datetime"])
        print(vds_5)

        vds_5 = vds_5.drop(*['pcaFeatures', 'datetime'])

        return vds_5

    elif model == 'svm':
        # test = test.toPandas()
        # test = clip_data(test)
        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        test = clip(test, cols)

        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        # test = test.toPandas()
        # test_max = test.resample('H').max().add_suffix('_max')
        # test_min = test.resample('H').min().add_suffix('_min')
        # test_std = test.resample('H').std().add_suffix('_std')
        # test = test.resample('H').mean()
        #
        # test = pd.concat([test, test_max], axis=1, sort=False)
        # test = pd.concat([test, test_min], axis=1, sort=False)
        # test = pd.concat([test, test_std], axis=1, sort=False)
        # del test_max, test_min,
        # gc.collect()

        # test = test.toHandy()

        test = get_mean_of_cyl_values(test)

        # vds_5 = test
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        # vds_5 = vds_5.replace(to_replace=0, value=1)

        # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill')

        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column,ffill(col(column)))
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column,bfill(col(column)))

        test = test.fillna(0)
        # vds_5 = vds_5.fillna(method='ffill')
        # vds_5 = vds_5.fillna(method='bfill')

        return test

    elif model == 'perm':

        # test = test.resample('H').mean()
        # test = test.rolling(window=20).mean()

        cols = [x for x in test.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        test = test.fillna(0)
        test = clip(test, cols)

        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     test = test.withColumn(column,ffill(col(column)))
        #
        # for column in cols:
        #     test = test.withColumn(column,bfill(col(column)))

        test = test.fillna(0)

        return test
def preprocess_train(train, model=None, spark=None):
    if model == 'xgb':
        # train = train.dropna(axis=1, how='all', inplace=False)
        cols = [x for x in train.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        train = clip(train, cols)

        # train = train.resample('H').mean()

        train = get_mean_of_cyl_values(train)

        train = train.fillna(0)

        # train.show(n=5)

        return train

    elif model == 'lstm':

        # train = train.dropna(axis=1, how='all', inplace=False)

        cols = [x for x in train.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        train = clip(train, cols)

        train = get_mean_of_cyl_values(train)

        # train_max = train.resample('H').max().add_suffix('_max')
        # train_min = train.resample('H').min().add_suffix('_min')
        # train_std = train.resample('H').std().add_suffix('_std')
        # train = train.resample('H').mean()
        #
        # train = pd.concat([train, train_max], axis=1, sort=False)
        # train = pd.concat([train, train_min], axis=1, sort=False)
        # train = pd.concat([train, train_std], axis=1, sort=False)
        # del train_max, train_min,
        # gc.collect()

        # train = train.rolling(window=150).mean()

        cols = [x for x in train.columns if x not in ['datetime']]
        # function to calculate number of seconds from number of days
        days = lambda i: i * 86400
        #
        # train = train.withColumn('datetime', train.datetime.cast('timestamp'))
        #
        # # create window by casting timestamp to long (number of seconds)
        # w = (Window.orderBy('datetime').rowsBetween(-50, 0))
        # for column in cols:
        #     train = train.withColumn(column, avg(train[column]).over(w))

        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        print(train.schema)
        train = train.fillna(0)
        #
        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     train = train.withColumn(column, ffill(col(column)))
        #
        # for column in cols:
        #     train = train.withColumn(column, bfill(col(column)))

        # train = train.fillna(0)
        #
        # vds_5 = train

        # del train
        # gc.collect()
        #
        # vds_5 = vds_5.replace(to_replace=0, value=1)
        #
        # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill')
        # #
        # vds_5 = vds_5.fillna(method='ffill')
        # vds_5 = vds_5.fillna(method='bfill')
        cols = [x for x in train.columns if x not in ['datetime']]
        # vds_55 = normalize(vds_5)
        # vds_55 = scale(vds_55)

        assembler = VectorAssembler().setInputCols \
            (cols).setOutputCol("features")
        print('assembler')
        transformed = assembler.transform(train)

        # Normalize each Vector using $L^1$ norm.
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        l1NormData = normalizer.transform(transformed)

        scaler = StandardScaler(inputCol="normFeatures",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        # Compute summary statistics by fitting the StandardScaler
        scalerModel = scaler.fit(l1NormData)
        # Normalize each feature to have unit standard deviation.
        scaledData = scalerModel.transform(l1NormData)
        # train = scaledData.drop(*cols)
        del train, transformed, l1NormData

        n_components_ = 50
        # pca = FastICA(n_components=n_components_)
        #
        # dump(pca, 'pca.joblib')
        #
        # pca2_results = pca.fit_transform(scaledData)
        # # n_comp=pca.n_components_
        # n_comp = n_components_
        # print('Number of componeds : ' + str(n_comp))
        # print(pca2_results)
        # print (len(pca2_results[:, 1]))

        # for i in range(0, n_comp):
        #     vds_5['pca_' + str(i)] = 0
        #     # print(len(vds_5['pca_' + str(i)]))
        #     # print(len(pca2_results[:, i]))
        #     vds_5['pca_' + str(i)] = pca2_results[:, i]

        # pca_columns = [x for x in vds_5.columns if x.startswith('pca')]
        # vds_5 = vds_5[pca_columns]

        pca = PCA(k=n_components_,
                  inputCol="scaledFeatures",
                  outputCol="pcaFeatures")
        model = pca.fit(scaledData)

        vds_5 = model.transform(scaledData).select(['pcaFeatures', 'datetime'])
        print(vds_5)

        def extract(row):
            return (row.datetime, ) + tuple(row.pcaFeatures.toArray().tolist())

        vds_5 = vds_5.rdd.map(extract).toDF(["datetime"])
        print(vds_5)

        vds_5 = vds_5.drop(*['pcaFeatures', 'datetime'])

        return vds_5

    elif model == 'svm':

        cols = [x for x in train.columns if x not in ['datetime']]
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))
        train = clip(train, cols)

        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))

        # train_max = train.resample('H').max().add_suffix('_max')
        # train_min = train.resample('H').min().add_suffix('_min')
        # train_std = train.resample('H').std().add_suffix('_std')
        # train = train.resample('H').mean()
        #
        # train = pd.concat([train, train_max], axis=1, sort=False)
        # train = pd.concat([train, train_min], axis=1, sort=False)
        # train = pd.concat([train, train_std], axis=1, sort=False)
        # del train_max, train_min,
        # gc.collect()

        train = get_mean_of_cyl_values(train)

        vds_5 = train
        print('Test Columns : ' + str(len(train.columns)))
        print('Test Rows : ' + str(train.count()))

        # vds_5 = vds_5.replace(to_replace=0, value=1)

        # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill')

        # window = Window.orderBy('datetime') \
        #     .rowsBetween(-sys.maxsize, 0)
        #
        # def ffill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # def bfill(column):
        #     return last(column, ignorenulls=True).over(window)
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column, ffill(col(column)))
        #
        # for column in cols:
        #     vds_5 = vds_5.withColumn(column, bfill(col(column)))

        vds_5 = vds_5.fillna(0)
        # vds_5 = vds_5.fillna(method='ffill')
        # vds_5 = vds_5.fillna(method='bfill')

        return vds_5
Example #14
spark = SparkSession \
    .builder \
    .getOrCreate()

df = spark.read.parquet('C:/*****/hmp.parquet')
df.createOrReplaceTempView('df')

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer, MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categoryVec")
vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

minmax_scaler = MinMaxScaler(inputCol="features", outputCol="features_minmax")

pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, minmax_scaler])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer, MinMaxScaler, OneHotEncoderEstimator
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoderEstimator(inputCols=["classIndex"], outputCols=["categoryVec"])
vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
Example #15
def main(train_data, test_data, sc, sqlContext, output):
    text = sqlContext.read.json(train_data)

    train_df = text.select(text.reviewText, text.overall.alias("label"))

    #Regextokenizer to split the words
    regexTokenizer = RegexTokenizer(inputCol="reviewText",
                                    outputCol="words",
                                    pattern="\\W")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")

    hashingTF = HashingTF(inputCol="filtered",
                          outputCol="rawFeatures",
                          numFeatures=1000)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    normalizer = Normalizer(inputCol="features",
                            outputCol="normFeatures",
                            p=1.0)

    lr = LinearRegression(maxIter=20, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(
        stages=[regexTokenizer, remover, hashingTF, idf, normalizer, lr])

    paramGrid = (ParamGridBuilder().addGrid(
        hashingTF.numFeatures, [1000, 5000, 10000]).addGrid(
            lr.regParam,
            [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).build())

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(),
                              numFolds=5)  # 5 fold cross validation

    cv_model = crossval.fit(train_df)

    # Training Data Evaluation
    train_prediction = cv_model.transform(train_df)
    train_prediction.show()
    train_evaluator = RegressionEvaluator(metricName="rmse",
                                          labelCol="label",
                                          predictionCol="prediction")
    train_rmse = train_evaluator.evaluate(train_prediction)

    text_test = sqlContext.read.json(test_data)
    test_df = text_test.select(text_test.reviewText,
                               text_test.overall.alias("label"))

    # Test Data Evaluation
    test_prediction = cv_model.transform(test_df)
    test_prediction.show()
    test_evaluator = RegressionEvaluator(metricName="rmse",
                                         labelCol="label",
                                         predictionCol="prediction")
    test_rmse = test_evaluator.evaluate(test_prediction)

    print("Training Root mean square error = " + str(train_rmse))
    print("Testing Root mean square error = " + str(test_rmse))

    #output written to file
    out_file = open(output, 'w')
    out_file.write(str(train_rmse) + "\n")
    out_file.write(str(test_rmse) + "\n")
    out_file.close()
Example #16
    layer3 = tf.layers.dense(layer2, 256, activation=tf.nn.sigmoid)
    layer4 = tf.layers.dense(layer3, 784, activation=tf.nn.sigmoid)
    loss = tf.losses.mean_squared_error(layer4, x)
    return loss

if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[8]').config('spark.driver.memory', '4g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
    na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

    #demonstration of options. Not all are required
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel=None,
        tfOutput='out:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted',
        partitions=3,
        miniBatchSize=256,
        verbose=1)
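    # Presumably followed by fitting on the normalized features (a hedged guess; the excerpt is
    # truncated here). SparkAsyncDL appears to follow the pyspark.ml Estimator interface.
    fitted_model = spark_model.fit(na)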
Example #17
    def trainModel(self):

        logger.info("Training the model...")

        query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

        def SQLtoURL(query):

            data = query.replace('\n', ' ').replace('\t', ' ').replace(
                '   ', ' ').replace('  ', ' ')
            return data

        def QueryXXXXX(query, file=None):

            session = Session()
            response = session.post(
                data={
                    'tq': query,
                },
                url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
                headers={
                    'Authorization':
                    'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'
                },
            )
            return response.content

        table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),
                                         'utf-8'))['table']
        title_list = [x['c'] for x in table['rows']]
        table_cols = [d['label'] for d in table['cols']]

        def convert_row(row):
            rowlist = [d['v'] for d in row]
            return rowlist

        rd = self.sc.parallelize(title_list).map(convert_row)
        titleData = self.spark.createDataFrame(rd, table_cols)
        titleData = titleData.dropna()

        hebrew_stopwords = stop_words()

        def rmv(words):
            for punc in punctuation:
                words = words.replace(punc, "")
            for hword in hebrew_stopwords:
                words = words.replace(hword, " ")
            return words

        self.spark.udf.register("rmv", rmv, StringType())
        titleData.registerTempTable("wordstable")
        cleanedSentenceData = self.spark.sql(
            "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable"
        )
        tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
        wordsData = tokenizer.transform(cleanedSentenceData)

        cv = CountVectorizer(inputCol="words",
                             outputCol="rawFeatures",
                             minDF=2.0)
        cvModel = cv.fit(wordsData)
        featurizedData = cvModel.transform(wordsData)

        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)

        lda = LDA(k=100)
        ldaModel = lda.fit(rescaledData)
        postFactorizedData = ldaModel.transform(rescaledData)

        norm = Normalizer(inputCol="topicDistribution",
                          outputCol="normTopicDist")
        scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

        self.model = scaledFactorizedNormalizedData

        logger.info("model is built!")
# In[212]:

features_vectorized = vectorAssembler.transform(data)

# In[213]:

features_vectorized.show()

# In[214]:

from pyspark.ml.feature import Normalizer

# In[247]:

normalizer = Normalizer(inputCol='features', outputCol='features_norm', p=2.0)

# In[248]:

normalized_data = normalizer.transform(features_vectorized)

# In[250]:

normalized_data.show()

# In[251]:

#Normalization does not work for Naive Bayes. This implies we should use other standardization methods

# In[218]:
Example #19
# MAGIC * here we are using countVectorizer so we can work with actual words and see how the topics are described later on

# COMMAND ----------

tokenizer = Tokenizer(inputCol='message', outputCol='words')

stopWordsRemover = StopWordsRemover(inputCol='words', outputCol='noStopWords')

countVectorizer = CountVectorizer(vocabSize=1000,
                                  inputCol='noStopWords',
                                  outputCol='tf',
                                  minDF=1)

idf = IDF(inputCol='tf', outputCol='idf')

normalizer = Normalizer(inputCol='idf', outputCol='features')

lda = LDA(k=7, maxIter=10)

pipeline = Pipeline(stages=[
    tokenizer, stopWordsRemover, countVectorizer, idf, normalizer, lda
])

model = pipeline.fit(pages)
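# A hedged aside (not in the source notebook): because CountVectorizer is the third stage,
# LDA topic term indices can be mapped back to actual words from its vocabulary.
cv_model = model.stages[2]      # CountVectorizerModel
lda_model = model.stages[5]     # LDAModel
vocab = cv_model.vocabulary
for row in lda_model.describeTopics(5).collect():
    print(row.topic, [vocab[i] for i in row.termIndices])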

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ### Apply the model on the data
# MAGIC
Example #20

#Feature Vector ngrams+word
def concat(type):
    def concat_(*args):
        return list(chain.from_iterable((arg if arg else [] for arg in args)))

    return udf(concat_, ArrayType(type))


concat_arrays_udf = concat(StringType())
df_feature = df_stop.select("user", concat_arrays_udf("stop_words",
                                                      "com_skips"))

# Hash the combined feature tokens into a fixed-length term-frequency vector
hashingTF = HashingTF(numFeatures=285,
                      inputCol='concat_(stop_words, com_skips)',
                      outputCol='features')
tf1 = hashingTF.transform(df_feature)

# Normalize the counts so that they are a percentage of total counts of the features
tf_norm1 = Normalizer(inputCol="features", outputCol="features_norm",
                      p=1).transform(tf1)

# Standardize the vector based on average use of each feature among all users
stdscaler = StandardScaler(inputCol='features_norm',
                           outputCol='scaled',
                           withMean=True)
scale_fit1 = stdscaler.fit(tf_norm1)
scaled1 = scale_fit1.transform(tf_norm1)
# A DataFrame is an immutable object, so there is no point trying to modify a column in place;
# instead we create a second DataFrame to which we add the column we want.
dfVect = dfBigram.withColumn("words", udfVectorizeUni("words"))
# The words have indeed been replaced here by the sparse vectors
print("DataFrame(1-gram): the words have been replaced by the sparse vectors")
dfVect.show()

udfVectorizeBi = UserDefinedFunction(lambda x: vectorizeBi(x), VectorUDT())
dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print "DataFrame(bi-gram): On a bien remplacé ici du coup les mots par les vecteurs sparse"
dfVect2.show()

# For natural-language processing it is customary to L2-normalize
# the feature vectors: apparently that is what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words', 'normWords').show()
# The difference does not show in the table, since there is only room to display the indices
# of the non-zero elements, not their values
# Moving on to TF-IDF
# Of course, by choosing the right DataFrame among those above, these computations can be applied
# to any column (bigrams, with or without stop words, ...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
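# The excerpt stops right after the IDF import; a minimal hedged continuation on the TF
# DataFrame built above (the output column name is an assumption):
idf = IDF(inputCol='wordsTF', outputCol='wordsTFIDF')
dfTrainTFIDF = idf.fit(dfTrainTF).transform(dfTrainTF)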
Example #22
def transform(df, spark, sql_query = None, numerical_features = [], categorical_features = [],\
              normalize = True, normalize_p=2):

    # Apply SQL query
    if sql_query is not None:

        df.createOrReplaceTempView("netlytics")
        # Execute Query
        result_df = spark.sql(sql_query)
        df = result_df

    # Transform Strings in OneHot
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in categorical_features:

        # Replaces None
        k = col(feature)
        df = df.withColumn(feature, when(k.isNull(), "__NA__").otherwise(k))

        stringIndexer = StringIndexer(inputCol=feature,
                                      outputCol=feature + "_indexed",
                                      handleInvalid="skip")
        model = stringIndexer.fit(df)
        df = model.transform(df)

        encoder = OneHotEncoder(inputCol=feature + "_indexed",
                                outputCol=feature + "_encoded")
        df = encoder.transform(df)

    # Extract Features
    def extract_features(row, numerical_features, feat_to_type):
        output_features = {}

        fields = list(row.asDict().keys())
        for field in fields:
            if field in numerical_features and feat_to_type[
                    field] != "StringType":
                output_features[field] = float(row[field])
            if field.endswith("_encoded"):
                output_list = list(row[field])
                for i, v in enumerate(output_list):
                    tmp_field = field + "_" + str(i)
                    output_features[tmp_field] = float(v)

        features = [
            v for k, v in sorted(output_features.items(),
                                 key=operator.itemgetter(0))
        ]

        old_dict = row.asDict()
        old_dict["features"] = DenseVector(features)
        new_row = Row(**old_dict)
        return new_row

    rdd = df.rdd.map(
        lambda row: extract_features(row, numerical_features, feat_to_type))
    df = spark.createDataFrame(rdd, samplingRatio=1, verifySchema=False)

    # Normalize
    if normalize:
        normalizer = Normalizer(inputCol="features",
                                outputCol="featuresNorm",
                                p=normalize_p)
        df = normalizer.transform(df)
        df = df.drop("features")
        df = df.withColumnRenamed("featuresNorm", "features")

    # Delete intermediate columns:
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in feat_to_type:
        if feat_to_type[feature] != "StringType":
            if feature.endswith("_encoded") or feature.endswith("_indexed"):
                df = df.drop(feature)

    return df
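# A hedged usage sketch with hypothetical column names (not from the source): one-hot encode
# "protocol", keep "bytes" and "packets" as numeric features, and L2-normalize the result.
transformed_df = transform(df, spark,
                           numerical_features=["bytes", "packets"],
                           categorical_features=["protocol"],
                           normalize=True, normalize_p=2)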
Example #23
def build_model(df_ml):
    '''
    Function builds a classification model based on the user features
    
    INPUT:
        df_ml 
        
    OUTPUT:
        model - final trained model
    '''

    # split into train, test and validation sets (70% - 15% - 15%)
    df_ml = df_ml.withColumnRenamed("churn", "label")

    train, test_valid = df_ml.randomSplit([0.7, 0.3], seed=2048)
    test, validation = test_valid.randomSplit([0.5, 0.5], seed=2048)

    # index and encode categorical features gender, level and state

    stringIndexerGender = StringIndexer(inputCol="gender",
                                        outputCol="genderIndex",
                                        handleInvalid='skip')
    stringIndexerLevel = StringIndexer(inputCol="last_level",
                                       outputCol="levelIndex",
                                       handleInvalid='skip')
    stringIndexerState = StringIndexer(inputCol="last_state",
                                       outputCol="stateIndex",
                                       handleInvalid='skip')

    encoder = OneHotEncoderEstimator(
        inputCols=["genderIndex", "levelIndex", "stateIndex"],
        outputCols=["genderVec", "levelVec", "stateVec"],
        handleInvalid='keep')

    # create vector for features
    features = [
        'genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs',
        'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend'
    ]
    assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")

    # normalize features
    normalizer = Normalizer(inputCol="rawFeatures",
                            outputCol="features",
                            p=1.0)

    # initialize random forest classifier with tuned hyperparameters
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                numTrees=120,
                                impurity='gini',
                                maxDepth=5,
                                featureSubsetStrategy='sqrt')

    # assemble pipeline
    pipeline = Pipeline(stages=[
        stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder,
        assembler, normalizer, rf
    ])

    # fit model
    model = pipeline.fit(train)

    # predict churn
    pred_train = model.transform(train)
    pred_test = model.transform(test)
    pred_valid = model.transform(validation)

    # evaluate results
    predictionAndLabels = pred_train.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # print F1-score
    print("Train F1: %s" % metrics.fMeasure())

    predictionAndLabels = pred_test.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # F1 score
    print("Test F1: %s" % metrics.fMeasure())

    predictionAndLabels = pred_valid.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # F1 score
    print("Validation F1: %s" % metrics.fMeasure())

    return model
OHEStage = OneHotEncoderEstimator(
    inputCols = [col + '_INDEX' for col in COLUMNS_OHE],
    outputCols = [col + '_VEC' for col in COLUMNS_OHE]
)
pipelineStages += [OHEStage]

sparseVectorCols = [col + '_VEC' for col in COLUMNS_OHE] + [col + '_INDEX' for col in COLUMNS_HIGH_CARD]
assembler = VectorAssembler(
    inputCols = sparseVectorCols, 
    outputCol = 'features'
)
pipelineStages += [assembler]

normalizer = Normalizer(
    inputCol = 'features',
    outputCol = 'normFeatures'
)
pipelineStages += [normalizer]


pipeline = Pipeline(stages = pipelineStages)
pipelineModel = pipeline.fit(train_df)
train_df = pipelineModel.transform(train_df)

for col in COLUMNS_OHE:
    train_df = train_df.drop(col, col + '_VEC')

for col in COLUMNS_HIGH_CARD:
    train_df = train_df.drop(col, col + '_INDEX')

train_df = train_df.drop('features')
Example #25
# split the desc field
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
#df.show()
#df.select('desc_words').show(10)

# compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
#print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10,truncate=False))

# data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
#tfidf.select("id", "norm").show(6)

# compute similarity between jobs and resume
import pyspark.sql.functions as psf 
from pyspark.sql.types import DoubleType
print('\nCompute the similarity between jobs and resume...')
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType()) # define dot-product function
tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") == 0)\
        .select(
            psf.col("a1.job"),
            psf.col("a1.id").alias("id1"), 
            psf.col("a2.id").alias("id2"), 
            dot_udf("a1.norm", "a2.norm").alias("similarity"))
#tfidf.show(10)
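# A hedged follow-up (not in the excerpt): with id 0 being the resume, rank the jobs by
# cosine similarity to it.
top_matches = tfidf.filter(psf.col("id2") != 0).orderBy(psf.col("similarity").desc())
top_matches.select("id2", "similarity").show(10, truncate=False)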
Example #26
def runModel(regressionMethodName,
             stationID,
             stationDataFrame,
             featureInputCols,
             normalize,
             splitMethod='random'):
    print("=" * 80)
    print('Station:{0}'.format(stationID))
    print(
        'Model:{0}, Normalize:{1}, LinkFunction:{2}, train/test splitMethod:{3}'
        .format(regressionMethodName, normalize, labelLinkFunction,
                splitMethod))
    print(featureInputCols)

    oneHot = OneHotEncoderEstimator(
        inputCols=["hourOfDay", "dayOfWeek"],
        outputCols=["hourOfDayVector", "dayOfWeekVector"])

    stationSummaryAll = stationDataFrame.groupBy('station_id').agg(
        count('label'), sum('label'), avg("label"), stddev_pop("label"))
    stationAvg = stationSummaryAll.select('avg(label)').where(
        col('station_id') == stationID).collect()
    stationSum = stationSummaryAll.select('sum(label)').where(
        col('station_id') == stationID).collect()
    stationStd = stationSummaryAll.select('stddev_pop(label)').where(
        col('station_id') == stationID).collect()
    stationNonZeroCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == stationID).collect()
    stationCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == "None").collect()

    featureInputCols.extend(["hourOfDayVector", "dayOfWeekVector"])
    assembler = VectorAssembler(inputCols=featureInputCols,
                                outputCol='features')

    if normalize == True:
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        featureName = "normFeatures"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            'rf', featureName)
        pipeline = Pipeline(
            stages=[oneHot, assembler, normalizer, regressionMethod])
    else:
        featureName = "features"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            'rf', featureName)
        pipeline = Pipeline(stages=[oneHot, assembler, regressionMethod])

    trainingDates = ['2016-10-01 00:00:00', '2017-9-30 23:59:59']

    testDates = ['2017-10-01 00:00:00', '2017-10-31 23:59:59']

    dates = {'train': trainingDates, 'test': testDates}

    if splitMethod == 'random':
        # Split the data into training and test sets (40% held out for testing)
        (trainingData, testData) = stationDataFrame.randomSplit([0.6, 0.4])

    else:
        (trainingData, testData) = timeSeriesTestTrain(stationDataFrame, dates)

    #fit model and make predictions
    model = pipeline.fit(trainingData)
    predictedData = model.transform(testData)
    #predictedData.select("prediction", "label", featureName).show(5)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    evaluator2 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="r2")
    evaluator3 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="explainedVariance")

    rmse = evaluator.evaluate(predictedData)
    rSquared = evaluator2.evaluate(predictedData)
    varianceExplained = evaluator3.evaluate(predictedData)

    print(
        "RMSE, R2, and variance explained on test data = {0:6.3f}, {1:6.3f}, {2:6.3f}"
        .format(rmse, rSquared, varianceExplained))
    print()
    basetime = 1541216769
    experimentTimeStamp = int((time.time() - basetime) / 6)
    experiment = {
        experimentTimeStamp: {
            "station": stationID,
            'stationNonZeroCount': stationNonZeroCount,
            'stationCount': stationCount,
            'stationSum': stationSum,
            'stationAvg': stationAvg,
            'stationStd': stationStd,
            'regressionMethodName': regressionMethodName,
            'normalize': normalize,
            'linkFunctionLabel': labelLinkFunction,
            'featureInputCols': featureInputCols,
            'rmse': rmse,
            'rSquared': rSquared,
            'varianceExplained': varianceExplained,
            'version': "Added OneHotEncode for hOD, dOW",
            'trainSplitMethod': splitMethod
        }
    }
    experiments.update(experiment)
    with open(pathFigure + "experiments.json", "w") as f:
        json.dump(experiments, f)
    return ()
for i in range(len(pred)):
    if pred[i] == list(Y_test)[i]:
        corr += 1
print('Accuracy: ' + str(corr * 1.0 / len(pred)))

#Build a BP (multilayer perceptron) neural network model on Spark
from pyspark.sql import Row
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
lines = sc.textFile("hdfs:///lushun/a.txt")
parts = lines.map(lambda l: l.split(" "))
# features must be a Vector (not a list of strings) for Normalizer to work
df = parts.map(lambda p: Row(features=Vectors.dense([float(x) for x in p[:-1]]), label=int(p[-1])))
df = spark.createDataFrame(df)
df.createOrReplaceTempView("df")
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(df).select("label", "normFeatures")
l1NormData.show()
from pyspark.ml.classification import MultilayerPerceptronClassifier  
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
splits = l1NormData.randomSplit([0.7, 0.3])
train = splits[0]  
test = splits[1]
layers = [36300, 200, 200, 6]  # input size, two hidden layers, number of classes
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, seed=1234,
                                         featuresCol="normFeatures", labelCol="label")
model = trainer.fit(train)  
# compute accuracy on the test set  
result = model.transform(test)  
predictionAndLabels = result.select("prediction", "label")  
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")  
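# Likely final step (hedged; the excerpt stops at the evaluator): report test-set accuracy.
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))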
Example #28
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import Normalizer
manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import StringIndexer
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------
Example #29
def normalize_vectors(input_col: str, output_col: str, df: DataFrame):
    """Normalize a column of vectors so they all have a magnitude of 1"""
    normalizer = Normalizer(inputCol=input_col, outputCol=output_col)
    return normalizer.transform(df)
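# A hedged usage sketch (points_df and its columns are assumptions, not from the source):
# assemble raw columns into a vector, then scale every row to unit L2 norm.
from pyspark.ml.feature import VectorAssembler

assembled = VectorAssembler(inputCols=["x", "y", "z"], outputCol="features").transform(points_df)
unit_df = normalize_vectors("features", "unit_features", assembled)
unit_df.select("features", "unit_features").show(5, truncate=False)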
Example #30
tokenizer = Tokenizer(inputCol="doc_text", outputCol="doc_words")
df = tokenizer.transform(df)
df.show()

# Compute TF-IDF for each document
hashingTF = HashingTF(inputCol='doc_words', outputCol="doc_words_tf")
#hashingTF = HashingTF()
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='doc_words_tf', outputCol="doc_words_tfidf").fit(tf)
tfidf = idf.transform(tf).cache()
print('\nTF-IDF for each document')
tfidf.select('doc_words_tfidf').show(truncate=False)

# Normalize the data (the default is the L2 norm)
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="doc_words_tfidf",
                        outputCol="norm")  # defaults to .setP(2.0)
tfidf = normalizer.transform(tfidf)
tfidf.select('norm').show(truncate=False)

import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") < psf.col("a2.id"))\
    .select(
        psf.col("a1.id").alias("id1"),
        psf.col("a2.id").alias("id2"),
        dot_udf("a1.norm", "a2.norm").alias("dot"))\
    .sort("id1", "id2")\
    .show()