def classifierForTopNFeatures(n, sortedFeatures, XFile, yFile, mlAlg):
    """Evaluate a model trained on only the top-n ranked features.

    Parameters:
        n: number of top-ranked features to keep.
        sortedFeatures: list of [score, featureName, featureIndex] entries,
            assumed already sorted best-first.
        XFile, yFile: dataset file paths, passed through to chooseFeatures.
        mlAlg: model identifier, passed through to getModelScores.
    """
    # Each ranking entry is [score, name, index]; element 2 is the column index.
    topFeatureIndeces = [sortedFeatures[i][2] for i in range(n)]
    print(topFeatureIndeces)
    allX, allY, features = chooseFeatures(topFeatureIndeces, XFile, yFile)
    # 10-fold cross-validation on the reduced feature set.
    scores = getModelScores(mlAlg, allX, allY, 10)
    # Bug fix: the message previously hard-coded "top 6" regardless of n.
    print('error for top', n, 'features', features, scores.mean())
def enterFeatureIndeces(XFeatures, yFeature, XFile, yFile, mlAlg):
    """Score a model built from an explicit list of feature indices.

    ``yFeature`` is accepted for interface compatibility but is not used
    in this body.
    """
    allX, allY, features = chooseFeatures(XFeatures, XFile, yFile)
    # 10-fold cross-validation; report the mean score across all folds.
    cvScores = getModelScores(mlAlg, allX, allY, 10)
    print(cvScores.mean())
    print(features)
def specifyDataset(
    XFile, yFile, mlAlg, numFeatures
):  #if featuresList is empty, by default start with all features specified in dataset
    """Rank every feature individually — once in parallel and once
    sequentially, for timing comparison — then evaluate a model on the
    top 6 ranked features.

    Parameters:
        XFile, yFile: dataset file paths passed through to the helpers.
        mlAlg: model identifier passed through to getModelScores.
        numFeatures: forwarded to singVarClassifier for each feature job.
    """
    # NOTE(review): these "newew" writes look like leftover debug output —
    # kept byte-for-byte to preserve behavior; confirm whether they are needed.
    f = open("file.txt", "a")
    f.write("newew")

    f.write('newew')

    loopLength, non, non1 = readAllFeatures(XFile,
                                            yFile)  # just to get length of x
    emptyList = []

    # Parallel pass: score each feature on its own, one joblib job per core.
    start = time.time()
    num_cores = multiprocessing.cpu_count()
    var = Parallel(n_jobs=num_cores)(
        delayed(singVarClassifier)(i, XFile, yFile, mlAlg, numFeatures)
        for i in range(len(loopLength[0])))
    end = time.time()
    print("time for parallel ", str(end - start))
    print('var is')
    print(var)

    # Sequential pass over the same features, kept only for timing comparison.
    # NOTE(review): this pass skips indices 1 and 2 and uses 10 folds, while
    # singVarClassifier uses 30 — preserved as-is; confirm intent.
    start = time.time()
    for i in range(len(
            loopLength[0])):  #there are 19 total features in standard X file
        if i != 1 and i != 2:
            allX, allY, features = chooseFeatures([i], XFile, yFile)
            scores = getModelScores(mlAlg, allX, allY, 10)
            emptyList.append([scores.mean(), features[0], i])
    end = time.time()
    print("time for sequential", str(end - start))

    print(emptyList)
    # Entries are [score, name, index]; reverse sort puts the best score first.
    sortedList = sorted(var, reverse=True)
    # Bug fix: the original `f.close` (no parentheses) never closed the file,
    # leaking the handle and potentially leaving writes unflushed.
    f.close()

    for entry in sortedList:
        print(entry)

    classifierForTopNFeatures(
        6, sortedList, XFile, yFile,
        mlAlg)  #first arg is number of top ranked features to run
# ---- Example #4 ----
def specifyDataset(
    XFile, yFile, mlAlg, numFeatures
):  #if featuresList is empty, by default start with all features specified in dataset
    """Baseline the model on the full feature set, then run recursive
    elimination down to 6 features and report the score of the survivors.
    """
    X, y, features = readAllFeatures(XFile, yFile)
    # Candidate set starts as every column index, in ascending order.
    startingFeatures = range(len(X[0]))

    # Baseline: 10-fold CV score using all features.
    baselineScores = getModelScores(mlAlg, X, y, 10)
    print('error for all features', baselineScores.mean())

    optimalFeatures = recursiveElim(
        startingFeatures, 6, XFile, yFile,
        mlAlg)  #second arg is what num of features to stop at
    print(optimalFeatures)

    # Re-score using only the surviving feature subset.
    allX, allY, features = chooseFeatures(optimalFeatures, XFile, yFile)
    finalScores = getModelScores(mlAlg, allX, allY, 10)
    print('scores for optimal features', finalScores.mean())

    print(list(features))
# ---- Example #5 ----
def recursiveElim(startingFeatures, optimalSetSize, XFile, yFile, mlAlg):
    """Recursively eliminate the worst-scoring feature until only
    ``optimalSetSize`` features remain.

    Parameters:
        startingFeatures: sequence of candidate feature indices.
        optimalSetSize: stop when this many features are left.
        XFile, yFile: dataset file paths passed through to chooseFeatures.
        mlAlg: model identifier passed through to getModelScores.

    Returns:
        Sorted list of surviving feature indices.
    """
    # Robustness fix: the original `==` guard never fired when the starting
    # set was already smaller than the target, so the recursion ran the set
    # down to empty and crashed on `sortedList[-1]`. `<=` terminates safely
    # and is identical for all previously-working inputs.
    if len(startingFeatures) <= optimalSetSize:
        return startingFeatures  # end recursion

    featureScores = []
    print('starting features for current round', startingFeatures)
    print('number of features left', len(startingFeatures))
    for index in range(len(startingFeatures)):
        # All current features except the one at `index` (leave-one-out).
        tempFeatures = list(
            chain(startingFeatures[0:index], startingFeatures[index + 1:]))
        print(tempFeatures)

        allX, allY, features = chooseFeatures(tempFeatures, XFile, yFile)
        non, non2, nonFeatures = chooseFeatures(
            [startingFeatures[index]], XFile,
            yFile)  #this is just a lazy way to get the feature name

        scores = getModelScores(mlAlg, allX, allY, 10)
        print('error for this set of features', scores.mean())
        featureScores.append(
            [scores.mean(), nonFeatures[0], startingFeatures[index]])

    # Ascending sort assumes lower score = better (an error metric).
    sortedList = sorted(featureScores,
                        reverse=False)  #make boolean ifMinimizing
    print('\n\n\neliminate feature', (sortedList[-1][1]), '\n\n')
    sortedList = sortedList[:-1]  #remove the worst performing feature

    # Keep only the surviving feature indices, sorted for consistency.
    startingFeatures = sorted(el[2] for el in sortedList)

    return recursiveElim(startingFeatures, optimalSetSize, XFile, yFile,
                         mlAlg)
def singVarClassifier(i, XFile, yFile, mlAlg, numFeatures):
    """Score the model using only the single feature at index ``i``.

    Returns ``[meanScore, featureName, i]``. ``numFeatures`` is unused in
    this body but kept for interface compatibility with the parallel
    dispatcher in specifyDataset.
    """
    allX, allY, features = chooseFeatures([i], XFile, yFile)
    print(i)
    # 30-fold cross-validation for the one-feature model.
    cvScores = getModelScores(mlAlg, allX, allY, 30)
    result = [cvScores.mean(), features[0], i]
    print(result[0], result[1], result[2])
    return result