Python MLUtils示例，pyspark.mllib.util.MLUtils Python示例

示例#1

0

显示文件

文件： spark_lr.py 项目： xiadx/rec-sys

def main():
    if len(sys.argv) != 2:
        print("parameter error")
        sys.exit(1)

    arg = json.loads(sys.argv[1])

    spark = SparkSession.builder \
        .appName("spark_lr") \
        .enableHiveSupport() \
        .getOrCreate()

    scene = arg["scene"]
    version = arg["version"]
    model = arg["model"]
    sample_path = arg["sample-path"]
    num_features = arg["num-features"]
    num_classes = arg["num-classes"]

    train_file = sample_path.replace(
        "{scene}.{model}.{version}.[train|test]",
        "%s.%s.%s.train" % (scene, model, version))
    test_file = sample_path.replace("{scene}.{model}.{version}.[train|test]",
                                    "%s.%s.%s.test" % (scene, model, version))

    train = MLUtils.loadLibSVMFile(spark.sparkContext, train_file,
                                   num_features)
    test = MLUtils.loadLibSVMFile(spark.sparkContext, test_file, num_features)

    print(train.count())
    print(test.count())
    print(train.getNumPartitions())
    print(test.getNumPartitions())

示例#2

0

显示文件

def main(sc, sql_context, is_hive=True):
    lp_train = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.check")

    model = GradientBoostedTrees.trainRegressor(lp_train, {},
                                                numIterations=50,
                                                maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)

    for each in labels_and_preds.take(100):
        print each

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print each

    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print mse
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print mse

示例#3

0

显示文件

文件： functions.py 项目： Sapphirine/Baseball_Pitch_Prediction_Model

def create_vector_file(pitch_outcome, path, folder):
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils

    pitch_o_RDD = pitch_outcome.rdd
    x = pitch_o_RDD.map(lambda data: LabeledPoint(data[13],[data[0],data[1],data[2],data[3],\
            data[4],data[5],data[6],data[7],data[8],data[9],data[10],data[11],data[12],data[14],\
            data[15],data[16],data[17],data[18],data[19],data[20]]))
    MLUtils.saveAsLibSVMFile(x, path + folder)

示例#4

0

显示文件

def cross_validation_task_C(X,
                            estimator,
                            sqlContext,
                            class_type,
                            features_col,
                            sc,
                            k_folds=10):
    kf = KFold(n_splits=k_folds)
    maem = []
    maeni = []

    for train_index, test_index in kf.split(X):
        sparse_data = []
        test_data = []
        cl_cl = []

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]

        train_topic = sqlContext.createDataFrame(X_train)
        test_topic = sqlContext.createDataFrame(X_test)

        # True:  DecisionTree
        # False: NaiveBayes

        if (class_type):
            pred = pd.DataFrame(columns=['class', 'prediction'])
            train_topic = MLUtils.convertVectorColumnsFromML(
                train_topic, features_col)
            test_topic = MLUtils.convertVectorColumnsFromML(
                test_topic, features_col)

            for index, row in train_topic.toPandas().iterrows():
                sparse_data.append(
                    LabeledPoint(float(row['class']), row[features_col]))

            for index, row in test_topic.toPandas().iterrows():
                cl_cl.append(row['class'])
                test_data.append(row[features_col])

            model = DecisionTree.trainClassifier(sc.parallelize(sparse_data),
                                                 5, {})

            pred['class'] = cl_cl
            pred['prediction'] = model.predict(
                sc.parallelize(test_data)).collect()
            maem_aux, maeni_aux = mae_ms(pred)

        else:
            pred = estimator.fit(train_topic).transform(test_topic).select(
                'class', 'prediction').toPandas()
            maem_aux, maeni_aux = mae_ms(pred)

        maem.append(maem_aux)
        maeni.append(maeni_aux)

    return (np.mean(maem), np.mean(maeni))

示例#5

0

显示文件

文件： document_vectorizer.py 项目： akshayjh/sparkgram

    def write_rdd(rdd, path, out_type = 'pickleFile', db_path = None, db_fields = {}) : 
        """
        Write an RDD to disk with a given output type and optionally adding an entry to an RDD metadata database. 
        
        **Input**

        *rdd* : the rdd to write to disk

        *path* : absolute path for the output

        **Optional Keywords**

        *out_type* : which type of Spark output to create 

        *db_path* : if you want to add the entries to a database, add the path here

        *db_fields* : if you specify a *db_path* you must also specify a dictionary of database fields and their values
        """
        
        if out_type == 'pickleFile' : 
            rdd.saveAsPickleFile(path)        
        elif out_type == 'textFile' : 
            rdd.saveAsTextFile(path)
        elif out_type == 'libsvm' : 
            from pyspark.mllib.util import MLUtils
            MLUtils.saveAsLibSVMFile(rdd, path)
            
        else : 
            raise RuntimeError("out_type must be either 'pickleFile' or 'textFile'")

        if db_path is not None : 
            # open the database connection -- if the database doesn't exist it will automatically be created
            import sqlite3
            import time
            conn = sqlite3.connect(db_path)

            with conn :
                c = conn.cursor()

                # create the RDDs table if it doesn't exist
                c.execute('create TABLE if NOT EXISTS RDDs (path text, date_time text, filter text, description text, script text, year_start INTEGER, year_end INTEGER)')

                filter_text = db_fields.get('filter', '')
                description_text = db_fields.get('description', '')
                script_text = db_fields.get('script', '')
                year_start = db_fields.get('year_start', 0)
                year_end = db_fields.get('year_end', 0)

                # form data tuple
                date = time.localtime()
                date_string = '%s-%02d-%02d_%02d:%02d:%02d'%(date.tm_year, int(date.tm_mon), int(date.tm_mday), 
                                                   int(date.tm_hour), int(date.tm_min), int(date.tm_sec))
                data = (path, date_string, filter_text, description_text, script_text, year_start, year_end)
                c.execute('INSERT INTO RDDs VALUES (?,?,?,?,?,?,?)', data)

示例#6

0

显示文件

def random_forest():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    # print("\npyspark version:" + str(sc.version) + "\n")

    data = MLUtils.loadLibSVMFile(sc, './data/sample_libsvm_data.txt')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda v, p: v != p).count() / float(
        testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save and load model
    model.save(sc, ".model/myRandomForestClassificationModel")
    sameModel = RandomForestModel.load(
        sc, "./model/myRandomForestClassificationModel")

示例#7

0

显示文件

文件： spark_prediction_RDD.py 项目： qingpeng/jgi-ViCA

def prediction(model_directory, libsvm_file, outputfile):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")

    model = LogisticRegressionModel.load(sc, model_directory)
    #print "numfeature",model.numFeatures
    #print "aaaaaaaa"
    vectors = MLUtils.loadLibSVMFile(sc,
                                     libsvm_file,
                                     numFeatures=model.numFeatures)
    vectors.cache()
    model.clearThreshold()
    # vector = vectors.collect()
    # for v in vector:
    #
    #     features = v.features
    #     print features
    #     print "bbbb",len(features),model.predict(Vectors.dense(features))
    # exit()
    scores = vectors.map(lambda p: (model.predict(Vectors.dense(p.features))))
    #   lambda p: (p.label, model.predict(p.features)))
    scores_list = scores.collect()
    file_out_obj = open(outputfile, 'w')
    for score in scores_list:
        #print '----->',score
        file_out_obj.write(str(score) + '\n')
    file_out_obj.close()

示例#8

0

显示文件

文件： LearningTest.py 项目： RaidriarB/PythonSparkMachineLearningTest-backend

def train():
    data = MLUtils.loadLibSVMFile(sc, TEST_DATA_PATH)
    print("[INFO] load complete.")
    # 划分训练集
    data = data.randomSplit([0.2, 0.8])[0]
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=NUM_OF_CLASSES,
                                         categoricalFeaturesInfo={},
                                         numTrees=NUM_OF_TREES,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=MAXDEPTH,
                                         maxBins=MAXBINS)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('[INFO] Test Error = ' + str(testErr))
    print('[INFO] Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, TEST_MODEL_PATH)
    sameModel = RandomForestModel.load(sc, TEST_MODEL_PATH)

示例#9

0

显示文件

文件： test_util.py 项目： drewrobb/spark

 def test_append_bias_with_sp_vector(self):
     data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
     expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
     # Returned value must be SparseVector
     ret = MLUtils.appendBias(data)
     self.assertEqual(ret, expected)
     self.assertEqual(type(ret), SparseVector)

示例#10

0

显示文件

文件： analyze_anomaly_full_data.py 项目： bibudhlahiri/healthcare

def split_data():
  try:
    #pat_proc = sc.textFile("hdfs://master:54310/bibudh/healthcare/data/cloudera_challenge/pat_proc_libsvm_format") 
    #sqlContext.createDataFrame(pat_proc.map(lambda x: custom_encode(x)).take(10000)).foreach(check_for_ascending)
    #map(lambda w: check_for_ascending(w), pat_proc.map(lambda x: custom_encode(x)).take(10000000))
    #pat_proc = sqlContext.read.format("libsvm").load(home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*') #This gives a DataFrame
    pat_proc = MLUtils.loadLibSVMFile(sc, home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*').toDF() #Naive Bayes expects
    #data as an RDD of LabeledPoint
    print("pat_proc.count() = " + str(pat_proc.count())) #150,127 rows, the two columns are ['label', 'features']
    
    anom = pat_proc.filter(pat_proc.label == 1) #This can be done since we have called toDF() on output of loadLibSVMFile()
    benign = pat_proc.filter(pat_proc.label == 0)
    n_benign = benign.count()
    
    #Take a random sample of 50K from benign
    frac = 50000/n_benign
    (into_model, for_finding_more) = benign.randomSplit([frac, 1 - frac])
    print("into_model.count() = " + str(into_model.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
    
    for_modeling = anom.unionAll(into_model)
    #for_modeling = for_modeling.rdd #LogisticRegressionWithSGD works on RDD of LabeledPoint objects
    (train, test) = for_modeling.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    ret_obj = {'train': train, 'test': test, 'for_finding_more': for_finding_more}
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return ret_obj

示例#11

0

显示文件

文件： random_forest.py 项目： bangjieliu/SparkService

def Random_Forest(filename, sc):

	filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
	# Load and parse the data file into an RDD of LabeledPoint.
	data = MLUtils.loadLibSVMFile(sc, filename)
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a RandomForest model.
	#  Empty categoricalFeaturesInfo indicates all features are continuous.
	#  Note: Use larger numTrees in practice.
	#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
	model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
	                                     numTrees=3, featureSubsetStrategy="auto",
	                                     impurity='gini', maxDepth=4, maxBins=32)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification forest model:')
	print(model.toDebugString())

	# Save and load model
	#model.save(sc, "target/tmp/myRandomForestClassificationModel")
	#sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")

示例#12

0

显示文件

 def test_append_bias_with_sp_vector(self):
     data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
     expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
     # Returned value must be SparseVector
     ret = MLUtils.appendBias(data)
     self.assertEqual(ret, expected)
     self.assertEqual(type(ret), SparseVector)

示例#13

0

显示文件

def predict():

	testData = MLUtils.loadLibSVMFile(sc,INPUT_DATA_PATH)
	print("[INFO] load complete.")

	model = RandomForestModel.load(sc,TEST_MODEL_PATH)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))

	lst = predictions.collect()
	with open(TEST_PREDICT_PATH+"/"+time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())+".txt",'w') as f:
		for k in lst:
			f.write(str(k)+"\n")

	labelsAndPredictions = testData.map(lambda lp: tobin(lp.label)).zip(predictions.map(lambda lp: tobin(lp)))

	#print(labelsAndPredictions.collect())

	metrics = BinaryClassificationMetrics(labelsAndPredictions)

	# Area under precision-recall curve
	print("Area under PR = %s" % metrics.areaUnderPR)

	# Area under ROC curve
	print("Area under ROC = %s" % metrics.areaUnderROC)
	#print(labelsAndPredictions.collect())

	testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
	print('[INFO] Test Error = ' + str(testErr))

示例#14

0

显示文件

def main():
    options = parse_args()

    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, options.data_file)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, options.output_model)
    sameModel = RandomForestModel.load(sc, options.output_model)

示例#15

0

显示文件

文件： meteos-script-1.6.0.py 项目： ncarkaci/meteos

    def _load_data(self, path):

        dataset_format = self.job_args.get('dataset_format')

        if dataset_format == 'libsvm':
            return MLUtils.loadLibSVMFile(self.context, path)
        else:
            return self.context.textFile(path).cache()

示例#16

0

显示文件

def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)

    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    # https://spark.apache.org/docs/latest/ml-migration-guides.html
    new_df = MLUtils.convertVectorColumnsToML(
        df.withColumn('features', udf(df.features)))

    return new_df

示例#17

0

显示文件

    def saveaslibSVMfile(self):
        """
        保存libsvm格式的文件
        :return:
        """
        sc = SparkContext(master="local[2]",
                          appName="SaveAsLibSVMFile" +
                          os.path.basename(self.__savepath))
        features = sc.textFile(self.__featurespath)
        TOTALFEATUREANDLABEL = sc.accumulator([], ListParamForLabeledPoint())

        def codechange(line):
            """
            根据“_v”切分出类别信息
            :param line:关键帧的特征
            :return: （类别号，特征）
            """
            classname = os.path.basename(line[0]).split("_v")[0]
            classnum = self.__classmap[classname]
            # ResultIterable = list(line[1])
            # features = ResultIterable[0] + ResultIterable[1] + ResultIterable[2]
            # print(len(features))
            return (classnum, list(line[1]))

        def getfeaturesandlabel(line):
            """
            返回LabeledPoint类型的标签和特征组合
            :param line:（类别号，特征）
            :return:返回LabeledPoint类型的标签和特征组合
            """
            #global TOTALFEATUREANDLABEL
            return LabeledPoint(line[0], Vectors.dense(line[1]))
            #TOTALFEATUREANDLABEL += [LabeledPoint(line[0], Vectors.dense(line[1]))]

        featuresandlabel = features.map(lambda x: x.split(" ")).map(
            lambda x: (x[1], x[2:])).map(codechange).map(
                getfeaturesandlabel).repartition(1)
        featuresandlabel.count()
        print(featuresandlabel.count())
        #totalfeatureandlabel = TOTALFEATUREANDLABEL.value
        MLUtils.saveAsLibSVMFile(featuresandlabel, self.__savepath)
        sc.stop()

示例#18

0

显示文件

文件： linear_regression.py 项目： honeycombcmu/SparkService

def LinearRegression(trainFile, testFile, taskid,sc):
	# filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
	# data = sc.textFile(filename)
	# parsedData = data.map(parsePoint)

	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# train the model
	model = LinearRegressionWithSGD.train(trainData)

	# Evaluate the model on training data
	# predictionAndLabels = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
	MSE = predictionAndLabels.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / predictionAndLabels.count()
	print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

	# Save and load model
	#model.save(sc, "myModelPath")
	#sameModel = LinearRegressionModel.load(sc, "myModelPath")

示例#19

0

显示文件

def train_model(filename='final_tip_all.txt',
                test_portion=0.2,
                cat_var=cat_var_dic,
                n_tree=250,
                mode_feature_strat='auto',
                max_deep=5,
                max_bin=32):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose

    sc = SparkContext()

    sqlContext = SQLContext(sc)

    spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData,
     testData) = data.randomSplit([1 - test_portion, test_portion])

    ##### TREAT TEMP AS CONTINUOUS ####
    model = RandomForest.trainRegressor(
        trainingData,
        categoricalFeaturesInfo=cat_var,
        numTrees=n_tree,
        featureSubsetStrategy=mode_feature_strat,
        impurity='variance',
        maxDepth=max_deep,
        maxBins=max_bin)

    ############ prediction !!!! ####
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) *
                                       (v - p)).sum() / float(testData.count())
    testRMSE = math.sqrt(testMSE)

    #predictions.takeSample(withReplacement = False, num = 5)
    # convert the rdd object to dataframe as follows
    df_predictions = predictions.map(lambda x: (x, )).toDF()
    df_predictions.cache()
    #df_predictions.show(5, False)

    #print('Learned regression forest model:')
    #print(model.toDebugString())
    print('Test Root Mean Squared Error on ' + filename + ' = ' +
          str(testRMSE))

示例#20

0

显示文件

文件： binaryclassifier.py 项目： Kapetis/binary_classifiers

def npmat_to_rdd_wreadwrite(sc, X, Y, f_name, delete_file=False):
    """
    Takes a data prepared for scikit model X in numpy matrix format, Y one-dimensional numpy array 
    and writes to file in libsvm format with filename string f_name provided (could delete automatically), 
    then reads from file directly into spark RDD object (for given Sparkcontext sc)

    """
    sklearn.datasets.dump_svmlight_file(X, Y, f_name, zero_based=False)
    read_rdd = MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
        os.remove(f_name)
    return read_rdd

示例#21

0

显示文件

文件： binaryclassifier.py 项目： Kapetis/binary_classifiers

def npmat_to_rdd_wreadwrite(sc,X,Y,f_name,delete_file=False):
    """
    Takes a data prepared for scikit model X in numpy matrix format, Y one-dimensional numpy array 
    and writes to file in libsvm format with filename string f_name provided (could delete automatically), 
    then reads from file directly into spark RDD object (for given Sparkcontext sc)

    """
    sklearn.datasets.dump_svmlight_file(X,Y,f_name,zero_based=False)
    read_rdd= MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
      os.remove(f_name)
    return read_rdd

示例#22

0

显示文件

文件： cold_start.py 项目： jenniferhe/Song_Recommender_System

def eval_logreg(new_df, filename):
    (train, test) = new_df.randomSplit([0.8, 0.2], 24)
    train = train.withColumnRenamed('prediction', 'label')
    test = test.withColumnRenamed('prediction', 'label')
    df = MLUtils.convertVectorColumnsFromML(train, "features")
    parsedData = df.select(col("label"), col("features")).rdd.map(
        lambda row: LabeledPoint(row.label, row.features))
    model = LogisticRegressionWithLBFGS.train(parsedData, numClasses=50)
    model.save(spark.sparkContext, filename)
    # sameModel = LogisticRegressionModel.load(spark.sparkContext, "LogRegLBFGSModel")
    labelsAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("LogReg Small Training Error = " + str(trainErr))
    df = MLUtils.convertVectorColumnsFromML(test, "features")
    parsed_test = df.select(col("label"), col("features")).rdd.map(
        lambda row: LabeledPoint(row.label, row.features))
    testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(
        parsed_test.count())
    print("LogReg Small Test Error = " + str(testErr))

示例#23

0

显示文件

文件： engine.py 项目： qingpeng/jgi-ViCA

    def __init__(self, sc):
        """Init the engine and train the model
        """

        logger.info("Starting up the GeneLearn Engine: ")

        self.sc = sc

        logger.info("Loading training data...")
        dataset_path = "/Users/qingpeng/Dropbox/Development/Bitbucket/jgi-genelearn/scripts/Flask"
        training_file_path = os.path.join(dataset_path, 'training.svmlib')
        training = MLUtils.loadLibSVMFile(sc, training_file_path)
        self.model = LogisticRegressionWithLBFGS.train(training)

示例#24

0

显示文件

文件： spark_evaluating_model_RDD.py 项目： qingpeng/jgi-ViCA

def testing_model(model_directory, libsvm, prediction, report, prc_file):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    model = LogisticRegressionModel.load(sc, model_directory)
    testing_rdd = MLUtils.loadLibSVMFile(sc, libsvm,
                                         numFeatures=model.numFeatures)
    testing_rdd.cache()
    au_prc, precision, recall, thresholds, y_true, y_scores = evaluate_model(
        testing_rdd, model)
    print 'evaluating_model done!\n'
    write_to_report(au_prc, precision, recall, thresholds, report)
    print 'write_to_report done!\n'
    write_to_prediction(y_true, y_scores, prediction)
    print 'write_to_prediction done!\n'
    draw_prc(precision, recall, prc_file, au_prc)
    print 'draw_prc done!\n'

示例#25

0

显示文件

文件： spark_es_log_ml.py 项目： lzbgt/spark_yarn_cfg

def analysis(df):
    """ML in Spark
    """
    htf = MLHashingTF(inputCol="message", outputCol="tf")
    tf = htf.transform(df)
    idf = MLIDF(inputCol="tf", outputCol="idf")
    tfidf = idf.fit(tf).transform(tf)
    #tfidf.show(truncate=True)

    #sum_ = udf(lambda v: float(v.values.sum()), DoubleType())
    #res_df = tfidf.withColumn("idf_sum", sum_("idf"))
    res_df = MLUtils.convertVectorColumnsFromML(tfidf, 'idf')
    ml_dataset = res_df.rdd.map(lambda x: x.idf).collect()
    model = KMeans.train(sc.parallelize(ml_dataset), 5, 50)

    return res_df, model

示例#26

0

显示文件

文件： spark_training_model_RDD.py 项目： qingpeng/jgi-ViCA

def training(model_directory, libsvm, scaler):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        label = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)

        scaler1 = StandardScaler().fit(features)
        data1 = label.zip(scaler1.transform(features))
        # convert into labeled point
        data2 = data1.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(data2)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)

示例#27

0

显示文件

 def test_load_vectors(self):
     import shutil
     data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]
     temp_dir = tempfile.mkdtemp()
     load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
     try:
         self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
         ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
         ret = ret_rdd.collect()
         self.assertEqual(len(ret), 2)
         self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
         self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
     except:
         self.fail()
     finally:
         shutil.rmtree(load_vectors_path)

示例#28

0

显示文件

def predict():
    conf = SparkConf().setMaster("local").setAppName("My App")
    sc = SparkContext(conf=conf)

    testData = MLUtils.loadLibSVMFile(sc, TEST_DATA_PATH)

    model = RandomForestModel.load(sc, TEST_MODEL_PATH)

    predictions = model.predict(testData.map(lambda x: x.features))

    predictlabel_list = predictions.collect()

    rateOFeachSort_dict = analyse_result(predictlabel_list)

    save(predictlabel_list)

    return rateOFeachSort_dict

示例#29

0

显示文件

文件： server.py 项目： wangzaisheng01/np2016

    def do_POST(self):
        response_code = 200
        response = ""
        var_len = int(self.headers.get('Content-Length'))
        content = self.rfile.read(var_len)
        payload = json.loads(content)

        # 如果是训练请求，训练然后保存训练完的神经网络
        if payload.get('train'):
            # 转化数据格式
            TrainData = ""
            for d in payload['trainArray'][0]['y0']:
                TrainData = TrainData + " " + ('%d' % d)
            TrainData = '%d' % (payload['trainArray'][0]['label']
                                ) + "," + TrainData.lstrip() + "\n"
            print(TrainData)
            Addoutput = open('LabeledPointsdata.txt', 'a')
            Addoutput.write(TrainData)
            Addoutput.close()

        # 如果是预测请求，返回预测值
        elif payload.get('predict'):
            try:
                training = MLUtils.loadLabeledPoints(sc,
                                                     "LabeledPointsdata.txt")
                print('Begin NaiveBayes tranning!')
                model = NaiveBayes.train(training, 1.0)
                print('Trainning over!')
                print(payload['image'])
                response = {
                    "type": "test",
                    "result": str(model.predict(payload['image']))
                }
            except:
                response_code = 500
        else:
            response_code = 400

        self.send_response(response_code)
        self.send_header("Content-type", "application/json")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        if response:
            self.wfile.write(json.dumps(response))
        return

示例#30

0

显示文件

文件： feature.py 项目： LuisFalva/ophilea

    def __index_row_matrix_rdd(self, scale_df):
        """

        :param scale_df:
        :return:
        """
        try:
            vector_mllib = MLUtils.convertVectorColumnsFromML(
                scale_df, 'scaled_features').drop('features')
            vector_rdd = vector_mllib.select(
                'scaled_features',
                'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
            self.__logger.info("Build Index Row Matrix RDD")
            return IndexedRowMatrix(vector_rdd)
        except TypeError as te:
            raise OpheliaMLException(
                f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
            )

示例#31

0

显示文件

文件： gradient_boostedtrees.py 项目： bangjieliu/SparkService

def Gradient_BoostedTrees(filename, sc):
	# Load and parse the data file.
	data = MLUtils.loadLibSVMFile(sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a GradientBoostedTrees model.
	#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
	#         (b) Use more iterations in practice.
	model = GradientBoostedTrees.trainClassifier(trainingData,
	                                             categoricalFeaturesInfo={}, numIterations=3)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification GBT model:')
	print(model.toDebugString())

示例#32

0

显示文件

文件： test_util.py 项目： drewrobb/spark

 def test_load_vectors(self):
     import shutil
     data = [
         [1.0, 2.0, 3.0],
         [1.0, 2.0, 3.0]
     ]
     temp_dir = tempfile.mkdtemp()
     load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
     try:
         self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
         ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
         ret = ret_rdd.collect()
         self.assertEqual(len(ret), 2)
         self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
         self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
     except:
         self.fail()
     finally:
         shutil.rmtree(load_vectors_path)

示例#33

0

显示文件

文件： decisionTree.py 项目： garethdavidjones/Election-Contrib

def main(input_file):

    sc = pyspark.SparkContext(appName="DecisionTree")

    data = MLUtils.loadLabeledPoints(sc, input_file)

    trainingData, testData = data.randomSplit([0.70, 0.3])
    # Cache in memory for faster training
    trainingData.cache()

    model = DecisionTree.trainClassifier(trainingData, numClasses=4, impurity='gini',
                 categoricalFeaturesInfo={}, maxDepth=16, maxBins=10)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    # print tree_model.toDebugString()
    print ""
    print ""
    print "Test Erros: {}".format(round(testErr,4))

示例#34

0

显示文件

文件： random_forest.py 项目： tsingfu/Spark-for-Python

def random_forest():
    """
    使用mllib对Spark安装包mllib的测试数据集做随机森林测试
    80%数据作为训练数据  20%数据作为测试数据
    :return:
    """
    data_rdd = MLUtils.loadLibSVMFile(
        sc, '{}/mllib/sample_libsvm_data.txt'.format(current_dir))

    train_data_rdd, test_data_rdd = data_rdd.randomSplit([0.8, 0.2])
    model = RandomForest.trainClassifier(train_data_rdd,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3)

    # 根据测试集的features预测laber值为0还是1
    predict_rdd = model.predict(test_data_rdd.map(lambda x: x.features))

    # 测试集实际的laber值
    labels_rdd = test_data_rdd.map(lambda lp: lp.label).zip(predict_rdd)

    # 测试样本中预测值与实际值不符的百分比(错误率)
    print(labels_rdd.filter(lambda x: x[0] != x[1]).count())
    test_err = labels_rdd.filter(lambda x: x[0] != x[1]).count() / float(
        test_data_rdd.count())
    print("test error rate:{}".format(test_err))

    # 保存 训练好的模型
    model_path = "{}/my_random_forest_model".format(current_dir)
    if not os.path.exists(model_path):
        model.save(sc, model_path)

    trained_model = RandomForestModel.load(
        sc, "{}/my_random_forest_model".format(current_dir))
    print(trained_model.toDebugString())
    return trained_model

示例#35

0

显示文件

文件： random_forest_classification_example.py 项目： 0xqq/spark

"""
from __future__ import print_function

import sys

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())

示例#36

0

显示文件

文件： Content_KMeans.py 项目： ellenkimsy/Big-Data-Homework

    parts = line.strip().split("::")
    return (int(parts[0]),int(parts[1]),float(parts[2]))

if __name__ =="__main__":
    if(len(sys.argv)!=2):
       	print "Usage: /path to spark/bin/spark-submit name.py movieDir"


    # step 1 - create spark context
    conf = SparkConf().setAppName("KMeans-Content")\
       .set("spark.executor.memory","1g")
    sc = SparkContext()


    # step 2 - load in input file
    data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
    labels = data.map(lambda x:x.label)
    features = data.map(lambda x:x.features)

  
    # step 3 - standarize the data with unit values and 0 mean
    scaler = StandardScaler(withMean=False,withStd=True).fit(features)

    data2 = labels.zip(scaler.transform(features))

    numFeatures = len(data2.values().take(10)[0])
    print "Type of data2: ",type(data2) #RDD
    print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
    print "Sample: ",data2.values().take(1)[0]

    # splitting up the data to training, validation and testing models.

示例#37

0

显示文件

文件： decision_tree_runner.py 项目： spacedotworks/apache-spark-decision-tree

    exit(1)


if __name__ == "__main__":
    if len(sys.argv) > 2:
        usage()
    sc = SparkContext(appName="PythonDT")

    # Load data.
    dataPath = 'train_svm'# 'data/mllib/sample_libsvm_data.txt'
    if len(sys.argv) == 2:
        dataPath = sys.argv[1]
    if not os.path.isfile(dataPath):
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Re-index class labels if needed.
    (reindexedData, origToNewLabels) = reindexClassLabels(points)
    numClasses = len(origToNewLabels)
    # Train a classifier.
    categoricalFeaturesInfo = {}  # no categorical features
    #model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
    #                                     categoricalFeaturesInfo=categoricalFeaturesInfo)
    model = RandomForest.trainClassifier(reindexedData, numClasses=numClasses,categoricalFeaturesInfo={},numTrees=30,featureSubsetStrategy='auto', impurity='gini', maxDepth=8, maxBins=40, )
    # Print learned tree and stats.
    print origToNewLabels
    print "Trained DecisionTree for classification:"
#    print "  Model numNodes: %d" % model.numNodes()
#    print "  Model depth: %d" % model.depth()
    print "  Training accuracy: %g" % getAccuracy(model, reindexedData)

示例#38

0

显示文件

文件： logs_reg.py 项目： honeycombcmu/SparkService

def logsreg(loadTrainingFilePath, sc):
	# Load training data in LIBSVM format
	loadTrainingFilePath = '/Users/Jacob/repository/SparkService/data/sample_libsvm_data.txt'
	data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath)
	
	
	# Split data into training (60%) and test (40%)
	traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	Json.generateJson("LogisticRegression", "12345678", traindata, predictionAndLabels);

	print 'Completed.'
	# Instantiate metrics object
	# metrics = MulticlassMetrics(predictionAndLabels)

	# # Overall statistics
	# precision = metrics.precision()
	# recall = metrics.recall()
	# f1Score = metrics.fMeasure()
	# #confusion_matrix = metrics.confusionMatrix().toArray()

	# print("Summary Stats")
	# print("Precision = %s" % precision)
	# print("Recall = %s" % recall)
	# print("F1 Score = %s" % f1Score)


	# # Statistics by class
	# labels = traindata.map(lambda lp: lp.label).distinct().collect()
	# for label in sorted(labels):
	#     print("Class %s precision = %s" % (label, metrics.precision(label)))
	#     print("Class %s recall = %s" % (label, metrics.recall(label)))
	#     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# # Weighted stats
	# print("Weighted recall = %s" % metrics.weightedRecall)
	# print("Weighted precision = %s" % metrics.weightedPrecision)
	# print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	# print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	# print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	# #return model parameters
	# res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
	# 	   ('3','Yes','Precision', metrics.precision(0.0)),
	# 	   ('4','Yes','Recall', metrics.recall(0.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	#        ('3','Yes','Precision', metrics.precision(1.0)),
	# 	   ('4','Yes','Recall', metrics.recall(1.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	#        ('3','Yes','Precision', metrics.precision(2.0)),
	#        ('4','Yes','Recall', metrics.recall(2.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]	

	# #save output file path as JSON and dump into dumpFilePath
	# rdd = sc.parallelize(res)
	# SQLContext.createDataFrame(rdd).collect()
	# df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value'])

	#tempDumpFilePath = dumpFilePath + "/part-00000"
	#if os.path.exists(tempDumpFilePath):
	#	os.remove(tempDumpFilePath)

	#df.toJSON().saveAsTextFile(hdfsFilePath)
	#tmpHdfsFilePath = hdfsFilePath + "/part-00000"
	#subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

	# Save and load model
	#clusters.save(sc, "myModel")
	#sameModel = KMeansModel.load(sc, "myModel")

示例#39

0

显示文件

文件： random_forrest2.py 项目： yiransheng/w251-project

from math import log, exp
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext()
sqlContext = SQLContext(sc)


data = MLUtils.loadLibSVMFile(sc, "hdfs:///hndata/docvecs")
data = data.map(lambda lp: LabeledPoint(exp(lp.label)-1.0, lp.features))

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest rr:')

示例#40

0

显示文件

文件： CorrelationTwo.py 项目： lanimc/BIGDATAProject

from pyspark.mllib.stat import Statistics
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'

    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())

    # Statistics (correlations)
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))

示例#41

0

显示文件

文件： rf_donations.py 项目： garethdavidjones/Election-Contrib

import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/AmountVectors2/')
# Split the data into training and test sets
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()
testData.cache()

model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=20, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=3, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(testData.count())

print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())

示例#42

0

显示文件

文件： __init__.py 项目： MarioPerezEsteso/PySpark-Projects

sys.path.append("/path/to/spark/python")

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.tree import RandomForest, RandomForestModel
    from pyspark.mllib.util import MLUtils
    print ("Successfully imported Spark Modules")
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

if __name__ == "__main__":
    conf = SparkConf().setAppName("RandomForest_Iris")
    sc = SparkContext(conf = conf)
    print "Loading data..."
    data = MLUtils.loadLibSVMFile(sc, '../../data/iris/iris.scale')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    model = RandomForest.trainClassifier(trainingData, numClasses=4,
                                         categoricalFeaturesInfo={},
                                         numTrees=5, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save model
    model.save(sc, "model")

示例#43

0

显示文件

文件： dataset_example.py 项目： ArafathC/spark

def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()

if __name__ == "__main__":
    if len(sys.argv) > 2:
        print >> sys.stderr, "Usage: dataset_example.py <libsvm file>"
        exit(-1)
    sc = SparkContext(appName="DatasetExample")
    sqlContext = SQLContext(sc)
    if len(sys.argv) == 2:
        input = sys.argv[1]
    else:
        input = "data/mllib/sample_libsvm_data.txt"
    points = MLUtils.loadLibSVMFile(sc, input)
    dataset0 = sqlContext.inferSchema(points).setName("dataset0").cache()
    summarize(dataset0)
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print "Save dataset as a Parquet file to %s." % tempdir
    dataset0.saveAsParquetFile(tempdir)
    print "Load it back and summarize it again."
    dataset1 = sqlContext.parquetFile(tempdir).setName("dataset1").cache()
    summarize(dataset1)
    shutil.rmtree(tempdir)

示例#44

0

显示文件

文件： PySpark.py 项目： boweiz/SparkService

#	parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
#	# Build the model (cluster the data)
#	clusters = KMeans.train(parsedData, 3, maxIterations=10, runs=30, initializationMode="random")
#
#	WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
#	print("Within Set Sum of Squared Error = " + str(WSSSE))
#
#	res = [('k_means',dumpFilePath, WSSSE)]
#	rdd = sc.parallelize(res)
#	SQLContext.createDataFrame(rdd).collect()
#	df = SQLContext.createDataFrame(rdd,['model_name','res_path', 'WSSSE'])
#	df.toJSON().saveAsTextFile(dumpFilePath)

if(model_name == "Regression"):
	# Load training data in LIBSVM format
	data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath)
	
	
	# Split data into training (60%) and test (40%)
	traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

示例#45

0

显示文件

文件： RandomForest_GBT.py 项目： ZaphyrRobin/RandomForest_GBT

        accuracy = metrics.accuracy_score(expected, predicted)
        if i==0: print("Random Forest accuracy is {}".format(accuracy))
        else:    print("Gradient Boosting accuracy is {}".format(accuracy))

cal_model_accuracy((RFT, GBT))


# In[6]:

#IV Use MLlib
sc = SparkContext("local", "Ensemble_Tree")


# In[7]:

data = MLUtils.loadLibSVMFile(sc, '/usr/local/spark/data/mllib/sample_libsvm_data.txt')


# In[8]:

#Split the training set and test set
(trainingData, testData) = data.randomSplit([0.7, 0.3])


# In[9]:

#Training model
RF_model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto", 
                                        impurity='gini', maxDepth=5, maxBins=32)

示例#46

0

显示文件

文件： multi_class_metrics_example.py 项目： lhfei/spark-in-action

# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# $example off$

from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="MultiClassMetricsExample")

    # Several of the methods available in scala are currently missing from pyspark
    # $example on$
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")

    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics

示例#47

0

显示文件

文件： libsvm.py 项目： bngonmang/FIND

 def __init__(self,sc, path):
     # Load and parse the data file into an RDD of LabeledPoint.
     self.data = MLUtils.loadLibSVMFile(sc, path)

示例#48

0

显示文件

文件： spark-mongo.py 项目： wangwf/Codes

d2.take(2)


# In[21]:

from pyspark.mllib.util import MLUtils

dataOutput="libsvm_data.txt"
import os.path
import shutil
if os.path.exists(dataOutput):
    shutil.rmtree(dataOutput)#os.rmdir(dataOutput)
    print dataOutput

MLUtils.saveAsLibSVMFile(d2,"libsvm_data.txt")


# In[22]:

for i,x in enumerate(features): print i,x


# In[23]:


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.

示例#49

0

显示文件

文件： pyspark_gradient_boosted_trees.py 项目： bravekjh/Spark

    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)

    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()

示例#50

0

显示文件

文件： DT-Regression.py 项目： jjingrong/Spark-MLlib

# Import DecisionTree / DecisionTreeModel
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext

sc = SparkContext("local", "SVM")

# Loading and parsing data into RDD of LabeledPoint
# Sample data provided by Spark 1.3.1 folder

# To run locally
#data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')

# To run on hadoop server
data = MLUtils.loadLibSVMFile(sc, 'jingrong/sample_libsvm_data.txt')

# Splits data - Approximately 70% training , 30% testing
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train the decision tree model
# Empty categoricalFeaturesInfo indicates that all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances, compute test error
allPredictions = model.predict(testData.map(lambda x: x.features))
predictionsAndLabels = testData.map(lambda pl: pl.label).zip(allPredictions)
testMeanSquaredError = predictionsAndLabels.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())

# Printing results
print "Tested Mean Squared Error: ", testMeanSquaredError

示例#51

0

显示文件

    sc = spark.sparkContext

    #-------------------------------------------------------------------------------
    # Read the training data and build the model
    #-------------------------------------------------------------------------------

    #reading the train dataframes
    trainingDF = spark.read.load("../data/train_small.parquet")

    #convert every row to LabeledPoint
    transformedTrainingRDD = (trainingDF.rdd.map(
        lambda row: LabeledPoint(int(row.label) - 1, row.features)))
    #print transformedTrainingRDD.show()

    #Save the RDD in LibSVM format, as Naive Bayes reads in the same format
    MLUtils.saveAsLibSVMFile(transformedTrainingRDD, "trainingLibsvmfile")
    training = MLUtils.loadLibSVMFile(sc, "trainingLibsvmfile/*")
    print "trainingLibsvmfile created!!"

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(training,
                                         numClasses=10,
                                         categoricalFeaturesInfo={},
                                         numTrees=24,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

示例#52

0

显示文件

文件： isotonic_regression_example.py 项目： 11wzy001/spark

# $example on$
import math
from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonIsotonicRegressionExample")

    # $example on$
    # Load and parse the data
    def parsePoint(labeledData):
        return (labeledData.label, labeledData.features[0], 1.0)

    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt")

    # Create label, feature, weight tuples from input data with weight set to default value 1.0.
    parsedData = data.map(parsePoint)

    # Split data into training (60%) and test (40%) sets.
    training, test = parsedData.randomSplit([0.6, 0.4], 11)

    # Create isotonic regression model from training data.
    # Isotonic parameter defaults to true so it is only shown for demonstration
    model = IsotonicRegression.train(training)

    # Create tuples of predicted and real labels.
    predictionAndLabel = test.map(lambda p: (model.predict(p[1]), p[0]))

    # Calculate mean squared error between predicted and real labels.

示例#53

0

显示文件

文件： dataframe_example.py 项目： xiaonianwen/LenovoUnit

        input = "data/mllib/sample_libsvm_data.txt"

    # Load input data
    print("Loading LIBSVM file with UDT from " + input + ".")
    df = spark.read.format("libsvm").load(input).cache()
    print("Schema from LIBSVM:")
    df.printSchema()
    print("Loaded training data as a DataFrame with " +
          str(df.count()) + " records.")

    # Show statistical summary of labels.
    labelSummary = df.describe("label")
    labelSummary.show()

    # Convert features column to an RDD of vectors.
    features = MLUtils.convertVectorColumnsFromML(df, "features") \
        .select("features").rdd.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("Selected features column with average values:\n" +
          str(summary.mean()))

    # Save the records in a parquet file.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print("Saving to " + tempdir + " as Parquet file.")
    df.write.parquet(tempdir)

    # Load the records back.
    print("Loading Parquet file with UDT from " + tempdir)
    newDF = spark.read.parquet(tempdir)
    print("Schema from Parquet:")
    newDF.printSchema()

示例#54

0

显示文件

文件： decision_tree.py 项目： shashankadidamu/OttoGroupClassification

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")

示例#55

0

显示文件

import shutil

from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))

    # Save and load model

示例#56

0

显示文件

文件： SaveECDAsLibSVM.py 项目： magaram2/DataScienceExperiments


# Try and import the PySpark classes
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.classification import LabeledPoint
    from pyspark.mllib.util import MLUtils

    print("Successfully loaded Spark and MLlib classes...")

except ImportError as e:
    print("Error importing spark modules", e)
    sys.exit(1)


from numpy import array

conf = SparkConf().setAppName("RecessionPredictionModel").setMaster("local")

sc = SparkContext(conf=conf)

data = sc.textFile("/Users/agaram/development/DataScienceExperiments/econometricsPoc/EconometricsDataSlope.csv/Sheet1-Table1.csv")

parsedData = data.map(lambda line: LabeledPoint([float(x) for x in line.split(',')[1:8]][6],
                                                array([float(x) for x in line.split(',')[1:8]])))

MLUtils.saveAsLibSVMFile(parsedData, "/Users/agaram/development/DataScienceExperiments/econometricsPoc/svmDataSlope")

示例#57

0

显示文件

import numpy as np
from pyspark.mllib.linalg import Vectors

# Configure the environment
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = '$SPARK_HOME'

# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(
        sc, "/home/ajit/Desktop/spark_lib/sample_libsvm_data.txt")
    print(type(data))
    print(data)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print(type(predictionAndLabel))

示例#58

0

显示文件

文件： B.py 项目： Inscrutive/spark

# MAGIC Upon completing this lab you should understand how to read from and write to files in Spark, convert between `RDDs` and `DataFrames`, and build a model using both the ML and MLlib APIs.

# COMMAND ----------

# MAGIC %md
# MAGIC #### Loading the data
# MAGIC  
# MAGIC First, we need to load data into Spark.  We'll use a built-in utility to load a [libSVM file](www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html), which is stored in an S3 bucket on AWS.  We'll use `MLUtils.loadLibSVMFile` to load our file.  Here are the [Python](http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils.loadLibSVMFile) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) APIs.

# COMMAND ----------

from pyspark.mllib.util import MLUtils

baseDir = '/mnt/ml-amsterdam/'
irisPath = baseDir + 'iris.scale'
irisRDD = MLUtils.loadLibSVMFile(sc, irisPath, minPartitions=20).cache()

# We get back an RDD of LabeledPoints.  Note that the libSVM format uses SparseVectors.
irisRDD.take(5)

# COMMAND ----------

# MAGIC %md
# MAGIC What if we wanted to see the first few lines of the libSVM file to see what the format looks like?

# COMMAND ----------

sc.textFile(irisPath).take(5)

# COMMAND ----------

示例#59

0

显示文件

from __future__ import print_function

# $example on$
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegxample")
    data = MLUtils.loadLibSVMFile(sc,"file:///home/yl408/yuhao_datasets/phishing")
    #data = spark.read.format("libsvm").load("file:///home/yl408/yuhao_datasets/rcv1_train.binary")
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='variance', maxDepth=4, maxBins=32)
    model.save(sc, "file:///home/yl408/spark-ml/myrandomForestModel")

示例#60

0

显示文件

文件： DT.py 项目： hechemeljed/Decision-Tree-Visualization-Spark

"""
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

import json
from bson import json_util
from bson.json_util import dumps

if __name__ == "__main__":

	sc = SparkContext(appName="DecisionTreeClassification")

	raw_data = MLUtils.loadLibSVMFile(sc, '/home/hechem/spark-campaign-classification/test/data/sample_libsvm_data.txt')
	(trainingDataSet, testDataSet) = raw_data.randomSplit([0.7, 0.3])

	tree = DecisionTree.trainClassifier(trainingDataSet, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=30)

	predictions = tree.predict(testDataSet.map(lambda x: x.features))
	labelsAndPredictions = testDataSet.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testDataSet.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification tree model:')
	print(tree.toDebugString())
	tree_to_json = tree.toDebugString()
	
	# Parser
	def parse(lines):
		block = []