Example #1
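A unit test that one-hot-encodes three categorical features, trains a linear regression, and checks that the join- and union-based slice enumerators return identical results. A minimal sketch of the imports this fragment relies on, assuming Spark 2.x (OneHotEncoderEstimator was merged into OneHotEncoder in Spark 3); spark_utils, spark_slicer and spark_union_slicer are project-local modules:

 from pyspark import SparkConf, SparkContext
 from pyspark.sql import SQLContext
 from pyspark.sql import functions as sf
 from pyspark.ml import Pipeline
 from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, IndexToString
 from pyspark.ml.regression import LinearRegression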
 def test_attr_spark(self):
     conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
     num_partitions = 2
     enumerator = "join"
     model_type = "regression"
     label = 'target'
     sparkContext = SparkContext(conf=conf)
     sqlContext = SQLContext(sparkContext)
     train_df = sqlContext.read.csv("toy_train.csv", header='true',
                                    inferSchema='true')
     test_df = sqlContext.read.csv("toy.csv", header='true',
                                   inferSchema='true')
     # initializing stages of main transformation pipeline
     stages = []
     # list of categorical features for further hot-encoding
     cat_features = ['a', 'b', 'c']
     for feature in cat_features:
         string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
         encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
         encoder.setDropLast(False)
         stages += [string_indexer, encoder]
     assembler_inputs = [feature + "_vec" for feature in cat_features]
     assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
     stages += [assembler]
     assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
     stages += [assembler_final]
     pipeline = Pipeline(stages=stages)
     # fit the pipeline on the training split only, then reuse the fitted
     # model on both splits so train and test share one index mapping
     pipeline_model = pipeline.fit(train_df)
     train_df_transformed = pipeline_model.transform(train_df)
     test_df_transformed = pipeline_model.transform(test_df)
     train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
     test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
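     # build a lookup from each global one-hot position to
     # (feature number, category value, per-feature index, global position)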
     decode_dict = {}
     counter = 0
     cat = 0
     for feature in cat_features:
         colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
         colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
         for item in colIdx:
             decode_dict[counter] = (cat, item, colIdx[item], counter)
             counter = counter + 1
         cat = cat + 1
     train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
     test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
     lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
     lr_model = lr.fit(train_df_transform_fin)
     eval_result = lr_model.evaluate(test_df_transform_fin)
     f_l2 = eval_result.meanSquaredError
     pred = eval_result.predictions
     pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
     predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
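     # note: this IndexToString converter is never used below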
     converter = IndexToString(inputCol='features', outputCol='cats')
     all_features = list(decode_dict)
     predictions = predictions.collect()
     spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="join")
     spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="union")
     self.assertEqual(3, len(spark_join.slices))
     print("check1")
     self.assertEqual(spark_join.min_score, spark_union.min_score)
     print("check2")
     self.assertEqual(spark_join.keys, spark_union.keys)
     print("check3")
     self.assertEqual(len(spark_join.slices), len(spark_union.slices))
     print("check4")
     for idx, sliced in enumerate(spark_join.slices):
         self.assertEqual(sliced.score, spark_union.slices[idx].score)
     print("check5")
Example #2
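The same slice-finding flow with a RandomForestClassifier: the overall loss is 1 - accuracy, a per-row error column is added via spark_utils.calc_loss, and the join-based enumerator is dispatched through join_data_parallel.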
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol="target",
                                numTrees=10)
    rf_model = rf.fit(train_df)
    predictions = rf_model.transform(test_df)
    # Select example rows to display.
    predictions.select("id", "features", "target", "prediction", "model_type").show(5)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="target",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    loss = 1.0 - accuracy
    pred_df_fin = predictions.withColumn(
        'error',
        spark_utils.calc_loss(predictions["target"], predictions['prediction'],
                              predictions['model_type']))
    predictions = pred_df_fin.select('id', 'features',
                                     'error').repartition(num_partitions)
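    # assumes the first row's 'id' value equals the number of features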
    all_features = range(predictions.toPandas().values[0][0])

    if enumerator == "join":
        join_data_parallel.parallel_process(all_features,
                                            predictions,
                                            loss,
                                            sparkContext,
                                            debug=debug,
                                            alpha=alpha,
                                            k=k,
                                            w=w,
                                            loss_type=loss_type)
    elif enumerator == "union":
Example #3
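A variant that splits the data with scikit-learn's train_test_split, rebuilds Spark DataFrames from the pandas splits, and dispatches to either the join or the union enumerator.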
    train, test = train_test_split(df_transform_fin, test_size=0.3, random_state=0)
    train_df = sqlContext.createDataFrame(train)
    test_df = sqlContext.createDataFrame(test)
    decode_dict = {}
    counter = 0
    cat = 0
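    # map each global one-hot position to (feature number, category value, per-feature index)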
    for feature in cat_features:
        colIdx = dataset_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
        colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
        for item in colIdx:
            decode_dict[counter] = (cat, item, colIdx[item])
            counter = counter + 1
        cat = cat + 1
    lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.3, elasticNetParam=0.8)
    lr_model = lr.fit(train_df)
    eval_result = lr_model.evaluate(test_df)
    f_l2 = eval_result.meanSquaredError
    pred = eval_result.predictions
    pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
    predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
    converter = IndexToString(inputCol='features', outputCol='cats')
    all_features = range(predictions.toPandas().values[0][0].size)
    predictions = predictions.collect()
    k = 10
    if enumerator == "join":
        spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext, debug=debug, alpha=alpha, k=k, w=w,
                                      loss_type=loss_type, enumerator=enumerator)
    elif enumerator == "union":
        spark_union_slicer.process(all_features, predictions, f_l2, sparkContext, debug=debug, alpha=alpha, k=k, w=w,
                                   loss_type=loss_type, enumerator=enumerator)
Example #4
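As in Example #3, but the prediction output keeps an 'id' column and the feature count is inferred from the size of a row's feature vector.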
                                test_size=0.3,
                                random_state=0)
 train_df = sqlContext.createDataFrame(train)
 test_df = sqlContext.createDataFrame(test)
 lr = LinearRegression(featuresCol='features',
                       labelCol='target',
                       maxIter=10,
                       regParam=0.3,
                       elasticNetParam=0.8)
 lr_model = lr.fit(train_df)
 eval_result = lr_model.evaluate(test_df)
 f_l2 = eval_result.meanSquaredError
 pred = eval_result.predictions
 pred_df_fin = pred.withColumn(
     'error',
     spark_utils.calc_loss(pred['target'], pred['prediction'],
                           pred['model_type']))
 predictions = pred_df_fin.select('id', 'features',
                                  'error').repartition(num_partitions)
 converter = IndexToString(inputCol='features', outputCol='cats')
 all_features = range(predictions.toPandas().values[1][1].size)
 k = 10
 if enumerator == "join":
     join_data_parallel.parallel_process(all_features,
                                         predictions,
                                         f_l2,
                                         sparkContext,
                                         debug=debug,
                                         alpha=alpha,
                                         k=k,
                                         w=w,
                                         loss_type=loss_type)
Example #5
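The tail of a variant like Example #3: here the feature universe is taken from the keys of the decode dictionary rather than from the feature-vector size.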
     for item in colIdx:
         decode_dict[counter] = (cat, item, colIdx[item])
         counter = counter + 1
     cat = cat + 1
 lr = LinearRegression(featuresCol='features',
                       labelCol=label,
                       maxIter=10,
                       regParam=0.3,
                       elasticNetParam=0.8)
 lr_model = lr.fit(train_df)
 eval_result = lr_model.evaluate(test_df)
 f_l2 = eval_result.meanSquaredError
 pred = eval_result.predictions
 pred_df_fin = pred.withColumn(
     'error',
     spark_utils.calc_loss(pred[label], pred['prediction'],
                           pred['model_type']))
 predictions = pred_df_fin.select('features',
                                  'error').repartition(num_partitions)
 converter = IndexToString(inputCol='features', outputCol='cats')
 all_features = list(decode_dict.keys())
 predictions = predictions.collect()
 if enumerator == "join":
     spark_slicer.parallel_process(all_features,
                                   predictions,
                                   f_l2,
                                   sparkContext,
                                   debug=debug,
                                   alpha=alpha,
                                   k=k,
                                   w=w,
                                   loss_type=loss_type,