import csvfuncs as cf import bagOfWords as bow from sklearn.neighbors import KNeighborsClassifier td = cf.readTrainingData(); #trainXRaw = td.cities; trainXBow = bow.getBOWFromFile('training_BOW.csv') trainY1 = td.cityCodes trainY2 = td.countryCodes #wordList = bow.getWordListFromCSV('training_wordList.csv') #validXRaw = cf.readXData('validation.csv') #validXBow = bow.getBOW(validXRaw,wordList,'validation') validXBow = bow.getBOWFromFile('validation_BOW.csv') print "training a KNN classifier with the training data for Y1" clfY1 = KNeighborsClassifier(n_neighbors =3) clfY1.fit(trainXBow,Y1) print "predicting Y1 using the KNN classifier" Y1_hat = clfY1.predict(validXBow) print "training a KNN classifier with the training data for Y2" clfY2 = KNeighborsClassifier(n_neighbors =3) clfY2.fit(trainXBow,Y2) print "predicting Y2 using the KNN classifier" Y2_hat = clfY2.predict(validXBow) print "prediction complete" f = open('validation_result_KNN.csv','w') for i in range(len(Y1_hat)):
import csvfuncs as cf
import bagOfWords as bow
from sklearn.ensemble import RandomForestClassifier

# Training records; the label columns below are read from this object.
td = cf.readTrainingData()

# Training features: load the bag-of-words that was saved to a file,
# because generating it is slow. To build it for the first time instead:
#   trainXRaw = td.cities
#   trainXBow = bow.getBOWTrain(trainXRaw, 'training')
trainXBow = bow.getBOWFromFile('training_BOW.csv')

# Targets: Y1 is the city code, Y2 is the country code.
trainY1 = td.cityCodes
trainY2 = td.countryCodes

# Validation features: load the saved bag-of-words. To build it for the
# first time, the training word list is needed to create the validation
# (and test) bag-of-words:
#   wordList = bow.getWordListFromCSV('training_wordList.csv')
#   validXRaw = cf.readXData('validation.csv')
#   validXBow = bow.getBOW(validXRaw, wordList, 'validation')
validXBow = bow.getBOWFromFile('validation_BOW.csv')

# Train the random forest classifier that predicts the city codes.
print("training a RandomForestClassifier with the training data for Y1")
clfY1 = RandomForestClassifier(n_estimators=10)
clfY1.fit(trainXBow, trainY1)
def test(): td = cf.readTrainingData() rwl=getRawList(td.cities)