def classifierForTopNFeatures(n, sortedFeatures, XFile, yFile, mlAlg):
    """Score a model using only the top-n ranked features.

    Args:
        n: number of top-ranked features to keep.
        sortedFeatures: ranking entries shaped [score, feature_name,
            feature_index], already sorted best-first.
        XFile, yFile: dataset file paths forwarded to chooseFeatures.
        mlAlg: model identifier forwarded to getModelScores.
    """
    # Each ranking entry is [score, name, index]; element 2 is the column index.
    topFeatureIndeces = [sortedFeatures[i][2] for i in range(n)]
    print(topFeatureIndeces)
    allX, allY, features = chooseFeatures(topFeatureIndeces, XFile, yFile)
    scores = getModelScores(mlAlg, allX, allY, 10)  # 10-fold cross-validation
    # BUG FIX: the message hard-coded "6" even when n != 6.
    print('error for top', n, 'features', features, scores.mean())
def enterFeatureIndeces(XFeatures, yFeature, XFile, yFile, mlAlg):
    """Cross-validate mlAlg on an explicitly chosen subset of feature indices.

    Args:
        XFeatures: list of feature column indices to keep.
        yFeature: target selector (unused here; forwarded by callers' convention).
        XFile, yFile: dataset file paths forwarded to chooseFeatures.
        mlAlg: model identifier forwarded to getModelScores.
    """
    allX, allY, features = chooseFeatures(XFeatures, XFile, yFile)
    cvScores = getModelScores(mlAlg, allX, allY, 10)
    # Report the mean score across all cross-validation folds, then the names.
    print(cvScores.mean())
    print(features)
def specifyDataset(XFile, yFile, mlAlg, numFeatures):
    """Rank every individual feature by single-variable model score.

    Runs the per-feature scoring twice — once in parallel (one job per CPU
    core) and once sequentially — printing wall-clock timings for both, then
    re-scores the model on the top 6 ranked features.

    NOTE(review): a second `specifyDataset` defined later in this file shadows
    this one at import time — confirm which definition is intended.

    Args:
        XFile, yFile: dataset file paths forwarded to the helper functions.
        mlAlg: model identifier forwarded to getModelScores.
        numFeatures: forwarded to singVarClassifier.
    """
    f = open("file.txt", "a")
    f.write("newew")  # NOTE(review): looks like leftover debug output — confirm
    f.write('newew')
    # Only the feature count is needed here; the other returns are ignored.
    loopLength, _, _ = readAllFeatures(XFile, yFile)
    emptyList = []

    # Parallel pass: score each feature on its own, one joblib job per core.
    start = time.time()
    num_cores = multiprocessing.cpu_count()
    var = Parallel(n_jobs=num_cores)(
        delayed(singVarClassifier)(i, XFile, yFile, mlAlg, numFeatures)
        for i in range(len(loopLength[0])))
    end = time.time()
    print("time for parallel ", str(end - start))
    print('var is')
    print(var)

    # Sequential pass over the same features, kept for timing comparison.
    start = time.time()
    for i in range(len(loopLength[0])):
        if i != 1 and i != 2:  # skips columns 1 and 2 — TODO confirm why
            allX, allY, features = chooseFeatures([i], XFile, yFile)
            scores = getModelScores(mlAlg, allX, allY, 10)
            emptyList.append([scores.mean(), features[0], i])
    end = time.time()
    print("time for sequential", str(end - start))
    print(emptyList)

    sortedList = sorted(var, reverse=True)  # best (highest) score first
    f.close()  # BUG FIX: was `f.close` — attribute access, never called
    for i in range(len(sortedList)):
        print(sortedList[i])
    # First arg is the number of top-ranked features to re-score together.
    classifierForTopNFeatures(6, sortedList, XFile, yFile, mlAlg)
def specifyDataset(XFile, yFile, mlAlg, numFeatures):
    """Baseline the model on all features, then recursively eliminate down to 6.

    Prints the all-feature cross-validation score, runs recursiveElim to shrink
    the feature set to 6, and prints the score and names of the survivors.

    Args:
        XFile, yFile: dataset file paths forwarded to the helper functions.
        mlAlg: model identifier forwarded to getModelScores / recursiveElim.
        numFeatures: accepted for interface parity; unused in this variant.
    """
    X, y, features = readAllFeatures(XFile, yFile)
    # Start from every feature index, in ascending order.
    startingFeatures = range(len(X[0]))
    baselineScores = getModelScores(mlAlg, X, y, 10)
    print('error for all features', baselineScores.mean())
    # Second argument is the target size of the surviving feature set.
    optimalFeatures = recursiveElim(startingFeatures, 6, XFile, yFile, mlAlg)
    print(optimalFeatures)
    allX, allY, features = chooseFeatures(optimalFeatures, XFile, yFile)
    finalScores = getModelScores(mlAlg, allX, allY, 10)
    print('scores for optimal features', finalScores.mean())
    print(list(features))
def recursiveElim(startingFeatures, optimalSetSize, XFile, yFile, mlAlg):
    """Recursively drop the worst-scoring feature until optimalSetSize remain.

    Each round leaves one feature out at a time, cross-validates the rest,
    ranks the leave-one-out results, eliminates the feature whose removal
    left the highest (worst) mean score, and recurses on the survivors.

    Args:
        startingFeatures: iterable of candidate feature indices.
        optimalSetSize: stop recursing when this many features remain.
        XFile, yFile: dataset file paths forwarded to chooseFeatures.
        mlAlg: model identifier forwarded to getModelScores.

    Returns:
        Sorted list of the surviving feature indices.
    """
    # Base case: the set is already at the requested size.
    if len(startingFeatures) == optimalSetSize:
        return startingFeatures

    featureScores = []
    print('starting features for current round', startingFeatures)
    print('number of features left', len(startingFeatures))
    for index in range(len(startingFeatures)):
        # Candidate set = every feature except the one at `index`.
        tempFeatures = list(startingFeatures[:index]) + list(startingFeatures[index + 1:])
        print(tempFeatures)
        allX, allY, features = chooseFeatures(tempFeatures, XFile, yFile)
        # Lazy way to look up the held-out feature's display name.
        non, non2, nonFeatures = chooseFeatures([startingFeatures[index]], XFile, yFile)
        scores = getModelScores(mlAlg, allX, allY, 10)
        print('error for this set of features', scores.mean())
        featureScores.append([scores.mean(), nonFeatures[0], startingFeatures[index]])

    # Ascending by mean score; the last entry is the worst performer.
    # NOTE(review): direction assumes higher mean = worse — confirm per metric
    # (original carried a "make boolean ifMinimizing" reminder).
    sortedList = sorted(featureScores)
    print('\n\n\neliminate feature', (sortedList[-1][1]), '\n\n')
    sortedList = sortedList[:-1]  # drop the worst performer
    # Collect surviving indices, ascending for consistency across rounds.
    survivors = sorted(el[2] for el in sortedList)
    return recursiveElim(survivors, optimalSetSize, XFile, yFile, mlAlg)
def singVarClassifier(i, XFile, yFile, mlAlg, numFeatures):
    """Cross-validate mlAlg using only feature column i.

    Args:
        i: single feature column index to evaluate.
        XFile, yFile: dataset file paths forwarded to chooseFeatures.
        mlAlg: model identifier forwarded to getModelScores.
        numFeatures: accepted for interface parity; unused here.

    Returns:
        [mean_cv_score, feature_name, i]
    """
    allX, allY, features = chooseFeatures([i], XFile, yFile)
    print(i)
    # Note: this uses 30 folds, unlike the 10-fold runs elsewhere in the file.
    cvScores = getModelScores(mlAlg, allX, allY, 30)
    result = [cvScores.mean(), features[0], i]
    print(result[0], result[1], result[2])
    return result