Example No. 1
def ForwardSelect(data, k, trace=False):
    ''' Step-wise forward selection method.
    Start with an empty set of features. Iteratively add the single feature,
    out of the features not yet chosen, that improves the Silhouette
    coefficient the most.

    The algorithm is considered to have converged when adding any feature no
    longer improves the coefficient, or when no features remain unchosen.
    '''
    selected = np.zeros(0, int)  # idx of selected features, start w/ empty
    baseCoeff = -1 - 1e-9  # -1 is worst possible performance
    dM = pairwiseDist(data)  # pre-calc distance matrix for memoization

    converged, nRound = False, 1
    while not converged:  # loop until convergence
        bestFeat, bestCoeff, means, labels = SelectBestFeature(
            data, selected, k, dM)
        if bestCoeff <= baseCoeff:  # if new feature doesn't improve performance
            converged = True
        else:  # if new feature improves performance
            selected = np.hstack([selected, bestFeat])  # add to selection
            baseCoeff = bestCoeff  # set new coeff as baseline performance
            outs = (means, labels)  # save output vars
            if len(selected) == data.shape[1]:
                converged = True  # algo converged if all features selected
        if trace:  # print iteration info if requested
            tmplate = "[%02d] Best coeff=%f, set:%s"
            print(tmplate % (nRound, bestCoeff, str(selected)))
        nRound += 1
    return (selected, ) + outs  # selected features, means, cluster labels
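All five examples lean on a pairwiseDist helper that is not shown on this page. The call sites (pairwiseDist(data), pairwiseDist(means, data), pairwiseDist(testX, trainX)) suggest a Euclidean distance matrix between two sets of row vectors, defaulting to self-distances; a minimal sketch under that assumption:

import numpy as np


def pairwiseDist(A, B=None):
    '''Euclidean distance matrix between the rows of A and the rows of B;
    with a single argument, the self-distance matrix of A.
    (Hypothetical reconstruction; the repo's actual helper is not shown.)'''
    if B is None:
        B = A
    # ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2; clip rounding-induced negatives
    sq = (A**2).sum(1)[:, None] - 2 * A @ B.T + (B**2).sum(1)[None, :]
    return np.sqrt(np.clip(sq, 0, None))

With such a helper (and SelectBestFeature) in scope, ForwardSelect(data, 3, trace=True) returns the selected feature indices together with the final means and cluster labels.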
Example No. 2
def geneticAlgoSelect(data, k, prm, trace=False):
    '''Main function of genetic-algorithm feature selection.
    Generate a population of feature sets by randomly drawing 0s and 1s with
    the probability specified in the input parameters.

    For this population, evaluate the fitness with the help of a memo: by
    storing computation results in a dictionary, subsequent individuals with
    the same set of features can be skipped and their results retrieved from
    the dict.

    The algorithm is considered to have converged when the Silhouette
    coefficient has not improved for a given number of generations; the
    minimum improvement that counts is specified in the input parameters.
    Every generation applies the usual selection, crossover, and mutation
    operators to the population.
    '''
    pop = np.random.rand(prm['popSize'], data.shape[1]) < prm['onProb']
    pop = minOneFeature(pop)  # at least 1 feature must be selected
    memo = dict()  # dict of result for memoization
    dMat = pairwiseDist(data)  # pre-calc distance matrix for memoization

    baseFit = 0  # worst possible fitness score
    baseIndv = pop[0]  # placeholder until a best individual is recorded
    converged, gen, stagnGens = False, 1, 0  # initialize loop vars
    while not converged:  # loop until GA has converged
        fit, memo = evalFitness(data, k, pop, memo, dMat)  # evaluate fitness
        bestIdx = np.argmax(fit)  # keep track of best individual
        bestFit, bestIndv = fit[bestIdx], pop[bestIdx]  # best fit and features

        stagnated = bestFit - baseFit < prm['minImprove']  # too little gain
        if stagnated and stagnGens > prm['stagnLim']:
            converged = True
            out = baseFit - 1, np.where(baseIndv)[0]  # coeff and feature list
        else:  # not converged: selection + crossover + mutation
            if stagnated:
                stagnGens += 1
            else:
                baseFit, baseIndv = bestFit, bestIndv  # record long-run best
            parentInd = selectParents(fit, pop.shape[0])  # select parents
            pop = crossOver(pop, parentInd)  # cross-over to get next gen
            pop = mutate(pop, prm['mutateProb'])  # mutate

        if trace:
            print('Generation %d: best fitness = %.10f' % (gen, baseFit))
            print('\tBest set: %s' % str(np.where(baseIndv)[0]))
        gen += 1
    return out
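The prm dictionary drives the whole run. A sketch of how it might be populated, with the key names taken from the code above and purely illustrative values (the function also assumes the module's minOneFeature, evalFitness, selectParents, crossOver, and mutate helpers):

import numpy as np

prm = {
    'popSize': 50,       # individuals per generation
    'onProb': 0.5,       # probability a feature starts selected
    'minImprove': 1e-6,  # fitness gain below this counts as stagnation
    'stagnLim': 10,      # stagnant generations tolerated before stopping
    'mutateProb': 0.01,  # per-bit mutation probability
}

data = np.random.rand(200, 8)  # toy dataset: 200 points, 8 features
coeff, featIdx = geneticAlgoSelect(data, 3, prm, trace=True)
print('Silhouette coeff %.4f with features %s' % (coeff, featIdx))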
Example No. 3
def updateMeans(data, means):
    ''' Calculate and update centroids for the K-means algorithm.
    The function has two parts:
    1) Assign each point to the mean to which it is closest
    2) Recompute each mean as the centroid of all points in its group
    Returns the new means and the assignment of data points to these means
    '''
    tmpDist = pairwiseDist(means, data)  # dist between means and all data pts
    minClus = tmpDist.argmin(axis=0)  # find group where distance is smallest

    newMeans = np.zeros([len(means), data.shape[1]])  # new mean points
    for n, x in enumerate(means):  # loop over all clusters
        # include the current mean so an empty cluster keeps a defined centroid
        tmp = np.vstack((data[minClus == n, ], x))
        newMeans[n] = tmp.mean(axis=0)  # new mean = centroid of all pts

    return newMeans, minClus
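updateMeans performs a single assign-and-update step. A full K-means driver (a sketch, not the repo's own; it assumes updateMeans above) would initialize the means from random data points and iterate until they stop moving:

import numpy as np


def kMeans(data, k, tol=1e-8, maxIter=300):
    '''Minimal K-means loop around updateMeans: random init from the data,
    iterate until the means settle or the iteration budget runs out.'''
    rng = np.random.default_rng()
    means = data[rng.choice(data.shape[0], k, replace=False)]
    for _ in range(maxIter):
        newMeans, labels = updateMeans(data, means)  # one assign/update step
        if np.abs(newMeans - means).max() < tol:  # means stable: converged
            return newMeans, labels
        means = newMeans
    return means, labels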
Example No. 4
def KNN(trainX, trainY, testX, K, categorical):
    ''' K-nearest neighbors classifier.
    Arguments are: the training data, training labels, test data, the K
    hyperparameter, and a boolean indicating whether the labels are
    categorical. The function first calculates all pairwise distances
    between the test data points and the training data points, then finds
    the K closest training points for each test point. The labels of these
    K points are then either: 1) averaged, if the labels are not
    categorical, or 2) put to a plurality vote, if they are categorical.
    '''
    dists = pairwiseDist(testX, trainX)  # all pairwise dist of two datasets
    knnIdx, _ = kMinValIdx(dists, K)  # idx of K closest pts in training set
    knnLabels = trainY[knnIdx]  # labels of these closest data points

    if not categorical:  # regression: average the label values
        testY = knnLabels.mean(axis=1)  # mean of the K closest labels
    else:  # classification: take the most common class label
        testY = np.array([mostCommonElem(lab) for lab in knnLabels])
    return testY  # return results
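KNN depends on two helpers not shown here. The call sites suggest kMinValIdx returns the per-row indices of the K smallest entries along with the values, and mostCommonElem a plurality vote; minimal sketches under those assumptions:

import numpy as np
from collections import Counter


def kMinValIdx(dists, K):
    '''Indices and values of the K smallest entries along the last axis.
    (Hypothetical reconstruction of the helper used above.)'''
    idx = np.argpartition(dists, K - 1, axis=-1)[..., :K]  # unordered K smallest
    return idx, np.take_along_axis(dists, idx, axis=-1)


def mostCommonElem(arr):
    '''Plurality vote: the most frequent element of arr.'''
    return Counter(arr.tolist()).most_common(1)[0][0]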
Example No. 5
def consistentSubset(trainX, trainY, K=1):
    ''' Use Hart's algorithm to find a consistent subset.
    Arguments are: training data, training labels, and K. K defaults to one,
    as in the original Hart's algorithm. The algorithm randomly picks a first
    point as the subset, then repeatedly sweeps the remaining points in
    random order, adding every point that the current subset misclassifies;
    it stops once a full sweep adds nothing.
    '''

    dists = pairwiseDist(trainX)  # all pairwise dists within the training set
    idx = np.arange(trainX.shape[0])  # construct index of data rows
    Z, idx = pickAndRemove(idx)  # randomly pick the 1st pt of the subset

    converged = False
    while not converged:
        converged = True  # stop unless a misclassification
        np.random.shuffle(idx)  # shuffle sequence of sample to train randomly

        for x in idx:  # loop over all samples
            nnIdx = kMinValIdx(dists[x, Z], 1)[0]  # position of NN within Z
            nnLabel = trainY[Z[nnIdx]].flatten()  # map through Z for the label
            if nnLabel != trainY[x]:  # if misclassification
                Z = np.hstack([Z, x])  # add to consistent subset
                converged = False  # continue training
        idx = np.setdiff1d(idx, Z)  # remove training set from samples

    return Z, idx
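consistentSubset in turn relies on a pickAndRemove helper. A sketch under the obvious reading of its call site, plus a toy run that condenses the training set before handing the subset to the KNN function above:

import numpy as np


def pickAndRemove(idx):
    '''Randomly pick one entry of idx; return it as a 1-element array
    together with the remaining entries. (Hypothetical reconstruction.)'''
    pick = np.random.randint(len(idx))
    return idx[pick:pick + 1], np.delete(idx, pick)


rng = np.random.default_rng(0)
trainX, trainY = rng.random((100, 4)), rng.integers(0, 3, 100)
testX = rng.random((20, 4))
Z, _ = consistentSubset(trainX, trainY)  # condense the training set
predY = KNN(trainX[Z], trainY[Z], testX, K=1, categorical=True)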