dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache() df_valid_pred = lrModel.transform(dfValidIndexed).cache() res=evaluator.evaluate(df_valid_pred) print res tt = time() - t0 print "Done in {} second".format(round(tt,3)) # In[12]: import loadFiles as lf print "Start loading and preprocessing test data " t0 = time() test,names=lf.loadUknown('./data/test') text_name=zip(test,names) dfTest = sc.parallelize(text_name).toDF(['review','label']).cache() dfTestPre=dfTest.map(preProcess).toDF(['words','label']).cache() bigram = NGram(inputCol="words", outputCol="bigrams") dfTestBi = bigram.transform(dfTestPre).cache() finalDfSelect = dfTestBi.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache() tt = time() - t0 print "Done in {} second".format(round(tt,3)) # In[14]: print "Classifying test data"
# Map each vocabulary word to a column index so documents can be encoded as
# (sparse) feature vectors.
# NOTE(review): `dict` is a word collection defined earlier in the file; it
# shadows the builtin but cannot be renamed from this block.
dictionary = {word: i for i, word in enumerate(dict)}
# We need the dictionary to be available AS A WHOLE throughout the cluster.
dict_broad = sc.broadcast(dictionary)

# Build labelled points from the training data.
data_class = zip(data, Y)  # zip([1,2,3], ['a','b','c']) == [(1,'a'), (2,'b'), (3,'c')]
dcRDD = sc.parallelize(data_class, numSlices=16)
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))

# Train Naive Bayes and broadcast the fitted model to the workers.
model = NaiveBayes.train(labeledRDD)
mb = sc.broadcast(model)

# For each test document (name, text): apply the model to the vector
# representation of the text and return the name together with the class.
test, names = lf.loadUknown('./data/test')
name_text = zip(names, test)
predictions = sc.parallelize(name_text).map(
    partial(Predict, dictionary=dict_broad.value, model=mb.value)
).collect()

# Write one "<name>\t<class>" line per document. `open` + `with` replaces the
# deprecated `file()` builtin and guarantees the handle is closed even if a
# write raises.
with open('./classifications.txt', 'w') as output:
    for x in predictions:
        output.write('%s\t%d\n' % x)
# 5) GradientBoost model = GradientBoostedTrees.trainClassifier(tmpLB, categoricalFeaturesInfo={}, numIterations=30, maxDepth=4) ##### TODO # This could be done in parallel to test the different one ##### TODO # Perhaps combined such AdaBoost & GradientBoost print "loading unlabeled data" test,names=lf.loadUknown(testF) #load the unlabelled data . test : text per document. names : the respective file names namesb=sc.broadcast(names) # broadcast the file names as we will need them for predictions md=sc.broadcast(m) # broadcast the fitted model of the vecotrizer datadt=sc.broadcast(test) # broadcast the unlabeled data so that we may call the vectorizer in same manner #apply the vestorization in a random worker print "transforming unlabelled data" test_data=sc.parallelize(ex,numSlices=16).filter(lambda x: x!=0).map(partial(computeTest, model=md,data=datadt)).collect() datadt.unpersist() print "convert data to non-compressed vector and predict the class" #Steps: #distribute the coordinate tf-idf of the transformed unlabelled data #organize the coordinate by row