# Recover previously-saved LabeledPoint training data from HDFS and rebuild
# the classification DataFrame + ML pipeline from it.
# NOTE(review): this line arrived with its newlines collapsed (everything
# after the first '#' was commented out); reconstructed to runnable form.


def recover(x):
    """Parse one serialized record back into a LabeledPoint.

    Each record looks like ``(label,[f0,f1,...])`` — the label first,
    then the bracketed feature vector.  Assumes the record is text even
    though it was read via ``binaryFiles`` — confirm against the writer.
    """
    parts = x.strip().strip('()').split(',')
    label = float(parts[0])
    # np.fromstring(..., sep=',') is deprecated in NumPy; parse the
    # comma-separated feature values explicitly instead (same result).
    features = [float(v)
                for v in ','.join(parts[1:]).strip('[]').split(',')]
    return LabeledPoint(label, features)


# Read the serialized records and rebuild the RDD of LabeledPoints.
images = sc.binaryFiles("hdfs:///user/slic/output501")
data = images.values().map(recover)

# RDD -> DataFrame.
df = data.toDF()

# Index the string labels and categorical features
# (standard Spark ML tutorial pattern).
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(df)
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=5).fit(df)

# Hold out 30% of the data for testing.
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Candidate models: a single decision tree and a random forest.
dt = DecisionTreeClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures")
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures")

# Chain indexers and the decision tree in a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
# Build the training set from labeled image files on HDFS, train a random
# forest in a Pipeline, and evaluate it on a held-out split.
# NOTE(review): this line arrived with its newlines collapsed (everything
# after the first '#' was commented out); reconstructed to runnable form.

# Load the labels CSV and derive the list of HDFS image paths to read.
c = getBag('/home/xhan/trainLabels.csv')
filename = getFile(c, "hdfs:///user/hduser/train/")

# Read the raw image bytes and map each file to (name, label, features).
images = sc.binaryFiles(filename)
data = images.map(first)

# RDD -> DataFrame with explicit column names.
df = data.toDF(['name', 'label', 'features'])

# Index the string labels and categorical features
# (standard Spark ML tutorial pattern).
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(df)
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=5).fit(df)

# Hold out 30% of the data for testing.
(trainingData, testData) = df.randomSplit([0.7, 0.3])

rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

# Train the model; fitting the pipeline also runs the indexers.
model = pipeline.fit(trainingData)

# Score the held-out data.
predictions = model.transform(testData)

# metricName="precision" was removed from MulticlassClassificationEvaluator
# in Spark 2.x; "accuracy" is the current overall metric and matches the
# variable name below.  If this targets Spark 1.x, revert to "precision".
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)