def prepare_df_for_prediction(self, dataframe: DataFrame, label_to_predict,
                              categorical_features, continuous_features):
    # stages = self.build_pipeline_stages(categorical_features, continuous_features)
    # pipelined_dataframe = self.pipeline_dataframe(stages, dataframe)
    hasher = FeatureHasher(
        inputCols=[*categorical_features, *continuous_features],
        outputCol='features')
    featurized = hasher.transform(dataframe)
    # keep only the label and the hashed feature vector, renaming the label to 'label'
    label_features = featurized.select(label_to_predict,
                                       'features').withColumnRenamed(
                                           label_to_predict, 'label')
    return label_features
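A minimal usage sketch for the method above; the object name `prep`, the source DataFrame `raw_df`, and the column and label names are illustrative assumptions, not part of the original:

from pyspark.ml.classification import LogisticRegression

# hypothetical caller: `prep` owns the method above, `raw_df` is the source DataFrame
train_df = prep.prepare_df_for_prediction(raw_df, 'churned',
                                          categorical_features=['plan', 'region'],
                                          continuous_features=['age', 'tenure'])
lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(train_df)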
Example #2
from pyspark.ml.feature import FeatureHasher
from pyspark.sql.functions import when


def ml_transformer(df, feature_all, response_feature):
    '''
    Preprocess the data for the logistic regression model.
    '''
    # list.remove() mutates in place and returns None, so build a new list instead
    feature_only = [c for c in feature_all if c != response_feature]
    hasher = FeatureHasher(inputCols=feature_only, outputCol="features")
    df_featurized = hasher.transform(df)
    df_train, df_test = df_featurized.randomSplit([0.8, 0.2], seed=12345)
    df_size = float(df_train.select(response_feature).count())
    num_positives = df_train.select(response_feature).where('{}==1'.format(response_feature)).count()
    num_negatives = df_train.select(response_feature).where('{}==0'.format(response_feature)).count()
    balance_ratio = 1 - num_positives / df_size
    # weight the minority (positive) class more heavily to offset class imbalance
    df_train = df_train.withColumn(
        "classWeights",
        when(df_train[response_feature] == 1, balance_ratio).otherwise(1 - balance_ratio))
    return df_train, df_test
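A hedged usage sketch: the classWeights column produced above can be passed to LogisticRegression through weightCol. The DataFrame `df_raw` and the column names below are assumptions:

from pyspark.ml.classification import LogisticRegression

df_train, df_test = ml_transformer(df_raw, ['age', 'income', 'clicked'], 'clicked')
lr = LogisticRegression(featuresCol='features', labelCol='clicked',
                        weightCol='classWeights')
lr_model = lr.fit(df_train)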
Example #3
def featurehasher(request):
    print("into featurehasher")
    value = "featurehasher"
    file_id = request.GET['fileid']
    print(file_id)
    spark = sparkSession(request)
    print("Created Spark Session")
    spark.sql('use hivedb')
    formFile = get_object_or_404(CSVFile, id=file_id)
    filePath = BASE_DIR + '\\' + str(formFile.file)
    filename = filePath
    projectid = formFile.project_fk.id

    csvpath = filename
    # take the last path component and strip the extension to recover the csv name
    filepart = csvpath.split('/')[-1]
    splitcsv = filepart.split('.')
    csvname = splitcsv[0]
    fid = str(file_id)
    pid = str(projectid)
    tablename = csvname + '_' + fid + '_' + pid
    print(tablename)
    datapreprocess = tablename + '_prerocessing'

    #spark.sql("select * from "+str(datapreprocess)+"").show()
    df = spark.table(datapreprocess)
    header = df.columns
    print(header)
    hasher = FeatureHasher(inputCols=header, outputCol="features")

    featurized = hasher.transform(df)
    featurized.show(truncate=False)
    dff = featurized

    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=True,
                            withMean=False)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dff)

    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dff)
    scaledData.show(truncate=False)

    return JsonResponse({"success": True}, status=200)
Example #4
def create_featureHasher(input_col: str, nq: int) -> FeatureHasher:
    """
    Create a FeatureHasher for a single column.
    The output column is named input_col + "_encoded"; string values are hashed
    to indices, which behaves similarly to a one-hot encoding.

    Parameters
    ----------
    input_col: str
        Name of the input column
    nq: int
        Number of hash buckets (numFeatures), e.g. the number of quantiles

    Returns
    -------
    FeatureHasher
    """
    output_col = input_col + "_encoded"
    return FeatureHasher(numFeatures=nq,
                         inputCols=[input_col],
                         outputCol=output_col)
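A hedged usage sketch for the helper above; the DataFrame `df`, the column name "city", and the bucket count are illustrative assumptions:

city_hasher = create_featureHasher("city", nq=32)
df_encoded = city_hasher.transform(df)  # adds a sparse "city_encoded" vector column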
result = model.transform(df)
result.show(truncate=False)

# COMMAND ----------

### Feature hashing

from pyspark.ml.feature import FeatureHasher

dataset = spark.createDataFrame([(2.2, True, "1", "foo"),
                                 (3.3, False, "2", "bar"),
                                 (4.4, False, "3", "baz"),
                                 (5.5, False, "4", "foo")],
                                ["real", "bool", "stringNum", "string"])

hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")

featurized = hasher.transform(dataset)
featurized.show(truncate=False)

# COMMAND ----------

#### Feature transformer (transforming sentences into words)
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

sentenceDataFrame = spark.createDataFrame(
    [(0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"),
     (2, "Logistic,regression,models,are,neat")], ["id", "sentence"])
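The snippet is cut off before the tokenizers are applied; a likely continuation, mirroring the standard Spark Tokenizer example, is sketched below (an assumption, not part of the original excerpt):

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

# count tokens per row with a small UDF
countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)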
Example #6
df.cache()

total_detections = df.select("HasDetections").where(df.HasDetections==1).count()

print("Total Rows: {0}".format(df.count()))
print("Total HasDetections: {0}".format(total_detections))
print("******    Crosstabulations   ******")
df.crosstab("HasDetections", "SkuEdition").show(truncate=False)
df.crosstab("HasDetections", "ProductName").show(truncate=False)
df.crosstab("HasDetections", "AVProductsEnabled").show(truncate=False)
df.crosstab("HasDetections", "IsBeta").show(truncate=False)
df.crosstab("HasDetections", "Platform").show(truncate=False)
df.crosstab("HasDetections", "Census_DeviceFamily").show(truncate=False)
df.crosstab("HasDetections", "Census_OSInstallTypeName").show(truncate=False)

all_columns = df.columns
label_col = ["HasDetections"]
meta_cols = ["MachineIdentifier"]
#feature_cols = ["SkuEdition", "ProductName", "AVProductsEnabled", "IsBeta", "Platform", "Census_DeviceFamily", "Census_OSInstallTypeName"]
feature_cols = list(set(all_columns) - set(label_col) - set(meta_cols))
ordered_cols = list(label_col + meta_cols + feature_cols)

hasher = FeatureHasher(numFeatures=len(feature_cols), inputCols=feature_cols, outputCol="features",  categoricalCols=feature_cols)
# sample half the rows for speed, then keep only the ordered columns
df_features = df.sample(fraction=0.50, seed=3).select(*ordered_cols)
df_features = hasher.transform(df_features)

chi_test = ChiSquareTest.test(df_features, "features", "HasDetections")
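ChiSquareTest.test returns a single-row DataFrame; one way to inspect it is sketched below, using the standard output columns pValues, degreesOfFreedom and statistics:

result = chi_test.head()
print("pValues: {0}".format(result.pValues))
print("degreesOfFreedom: {0}".format(result.degreesOfFreedom))
print("statistics: {0}".format(result.statistics))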

Example #7
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FeatureHasher
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("FeatureHasherExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame([
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo")
    ], ["real", "bool", "stringNum", "string"])

    hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                           outputCol="features")

    featurized = hasher.transform(dataset)
    featurized.show(truncate=False)
    # $example off$

    spark.stop()
Example #8
drop_cols = list(
    set(df.columns) - set(meta_cols) - set(continuous_cols) -
    set(categorical_cols) - set(feature_cols))

df = df.drop(*drop_cols)
df.cache()

print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(feature_cols)))
print("Selected Features: {0}".format(feature_cols))

print("Building Pipeline")
categorical_hasher = FeatureHasher(inputCols=categorical_cols,
                                   outputCol="categorical_features",
                                   categoricalCols=categorical_cols)
continuous_vector = VectorAssembler(inputCols=continuous_cols,
                                    outputCol="continuous_vector")
scaler = MinMaxScaler(min=0.0,
                      max=1.0,
                      inputCol=continuous_vector.getOutputCol(),
                      outputCol="continuous_features")
features = VectorAssembler(inputCols=feature_cols, outputCol="features")
classifier = LogisticRegression(regParam=0.01,
                                maxIter=100,
                                aggregationDepth=2,
                                fitIntercept=True,
                                family="binomial",
                                elasticNetParam=0.0)
one_rest = OneVsRest(classifier=classifier,
Example #9
df = df.withColumnRenamed("click", "label")

df_train, df_test = df.randomSplit([0.7, 0.3], 42)

df_train.cache()

df_test.cache()

categorical = df_train.columns
categorical.remove('label')
print(categorical)

from pyspark.ml.feature import FeatureHasher

hasher = FeatureHasher(numFeatures=10000,
                       inputCols=categorical,
                       outputCol="features")

hasher.transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [hasher, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
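A hedged continuation sketch: fit the pipeline on the training split and score the held-out split. The evaluator choice below is an assumption, not part of the original snippet:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

pipeline_model = pipeline.fit(df_train)
predictions = pipeline_model.transform(df_test)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
print("AUC: {0}".format(evaluator.evaluate(predictions)))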
Example #10
]
drop_cols = list(set(df.columns) - set(selected_cols) - set(meta_cols))

print("Performing Cleanup")
df = df.drop(*drop_cols)
df.cache()

print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(selected_cols)))
print("Selected Features: {0}".format(selected_cols))

print("Building Pipeline")
hasher = FeatureHasher(numFeatures=1024,
                       inputCols=selected_cols,
                       outputCol="features",
                       categoricalCols=selected_cols)
forest = RandomForestClassifier(featuresCol="features",
                                labelCol="HasDetections",
                                predictionCol="prediction",
                                probabilityCol="probability")

pipeline = Pipeline(stages=[hasher, forest])
evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")

print("Configuring Validation")
params = ParamGridBuilder() \
   .addGrid(hasher.numFeatures, [1024]) \
   .addGrid(forest.maxDepth, [30]) \
Example #11
ratio = 1.0
counts = train.select(f'_c{label_idx}').groupBy(
    f'_c{label_idx}').count().collect()
higher_bound = counts[1][1]
threshold = int(ratio * float(counts[0][1]) / counts[1][1] * higher_bound)

rand_gen = lambda x: randint(0, higher_bound) if x == 0 else -1
udf_rand_gen = udf(rand_gen, IntegerType())
train = train.withColumn('rand_idx', udf_rand_gen('_c0'))
train_subsample = train.filter(train['rand_idx'] < threshold)
train_subsample = train_subsample.drop('rand_idx')

train_subsample.select(f'_c{label_idx}').groupBy(
    f'_c{label_idx}').count().show(n=5)

# Build the pipeline
hasher = FeatureHasher(numFeatures=262144,
                       inputCols=real_features + category_features,
                       outputCol='features',
                       categoricalCols=category_features)
lr = LogisticRegression(featuresCol='features', labelCol=f'_c{label_idx}')
pipeline = Pipeline(stages=[hasher, lr])

model = pipeline.fit(train_subsample)
print(model.stages[-1].coefficients)

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol=f'_c{label_idx}',
                                              metricName='f1')
f1 = evaluator.evaluate(predictions)
print(f'f1 = {f1}')
from pyspark.ml.feature import FeatureHasher
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

from pyspark.sql import Column
import pyspark.sql.functions as F

data = [(0, 2.0, True, "1", "foo"), (1, 3.0, False, "2", "bar"),
        (0, 2.0, True, "1", "foo"), (1, 3.0, False, "2", "bar")]
cols = ["label", "real", "bool", "stringNum", "string"]
feature_cols = ["real", "bool", "stringNum", "string"]
x = spark.createDataFrame(data, cols)
h = FeatureHasher(numFeatures=4,
                  inputCols=feature_cols,
                  outputCol="features",
                  categoricalCols=feature_cols)
x_df = h.transform(x)
x_df.show(truncate=False)

s = ChiSqSelector(numTopFeatures=2,
                  labelCol="label",
                  featuresCol="features",
                  outputCol="selectedFeatures")
m = s.fit(x_df)
m_df = m.transform(x_df)
m_df.show(truncate=False)
# note: ChiSqSelectorModel.selectedFeatures holds indices into the hashed
# "features" vector, not DataFrame column positions, so mapping them back to
# DataFrame columns as done here is only illustrative
s_df = m_df.select(*(m_df.columns[column_index]
                     for column_index in m.selectedFeatures))
s_df.show(truncate=False)
print(m.selectedFeatures)
    "AMT_PAYMENT").cache()
# joining to the master table
df_train = df_train.join(df_prev_app_means, ['SK_ID_CURR'], how='left').cache()
df_train = df_train.join(df_payment_means, ['SK_ID_CURR'], how='left').cache()
# filling nan values
df_train = df_train.na.fill(0).cache()

logger.info("# Rows:" + str(df_train.count()))
logger.info("# Cols:" + str(len(df_train.columns)))
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(df_train)
labeled = labelIndexer.transform(df_train)
feature_columns = [column for column in df_train.columns if column != 'label']
hasher = FeatureHasher(inputCols=feature_columns,
                       outputCol="indexedFeatures",
                       numFeatures=len(feature_columns))
featurized = hasher.transform(df_train)

# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df_train.randomSplit([0.7, 0.3], seed=1234)

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=20,
                            maxDepth=15)

# Chain indexers and forest in a Pipeline
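A hedged completion sketch for the final comment above: chain the (already fitted) label indexer, the hasher and the forest, then fit on the training split. The stage order and the Pipeline import are assumptions based on the column names used earlier:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[labelIndexer, hasher, rf])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)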