def testWouldTake(predictor):
    """Test a would-take predictor on the unambiguous ratings only.

    Loads the WOULD_TAKE data set and looks at the rating in the first label
    column of every sample:

    * ratings in ``[threshold, 20 - threshold)`` are treated as ambiguous and
      the sample is dropped from both ``X`` and ``Y``;
    * ratings below ``threshold`` are rewritten to 0 ("no");
    * ratings at or above ``20 - threshold`` are rewritten to 1 ("yes").

    The class balance is printed and the filtered data is handed to
    ``testModel``. The filtering works on copies, so the arrays returned by
    the loader are never modified.

    NOTE(review): the constant 20 presumably reflects the upper end of the
    rating scale - confirm against the data definition.

    Args:
        predictor: prediction callback, forwarded unchanged to ``testModel``.
    """
    X, Y, W, headings = data.loadFeaturesAndLabels(data.DV_Type.WOULD_TAKE)

    # remove all where would take ambiguous
    print('removing ambiguous would take ratings')
    threshold = 8
    upper = 20 - threshold

    ratings = Y[:, 0]
    ambiguous = (ratings >= threshold) & (ratings < upper)
    deleted = int(numpy.count_nonzero(ambiguous))

    # boolean indexing yields new arrays, dropping all ambiguous rows at once
    keep = ~ambiguous
    X = X[keep]
    Y = Y[keep]

    # compute both class masks before writing, so the 0/1 labels written
    # below can never be re-classified
    isNo = Y[:, 0] < threshold
    isYes = Y[:, 0] >= upper
    Y[isNo, 0] = 0
    Y[isYes, 0] = 1
    no = int(numpy.count_nonzero(isNo))
    yes = int(numpy.count_nonzero(isYes))

    print('removed - ambiguous:', deleted)

    remaining = X.shape[0]
    if remaining == 0:
        # nothing left to compute ratios from or to train on
        print('remaining: 0 - no unambiguous ratings left, skipping test')
        return

    print('remaining:', remaining, 'yes:', yes / remaining, 'no:',
          no / remaining)

    testModel(X, Y, None, headings, model_wouldtake_svm, predictor, True, None)
def testModelEffort(model):
    """Run ``testModel`` on the EFFORT_TRIMMED data with per-sample weights.

    Each sample weight is ``1 - sd / 30``, where ``sd`` is the matching entry
    of the ``W`` array returned by the loader (a standard deviation, going by
    the original comment). The weights are stored as an ``(n, 1)`` float
    column.

    Args:
        model: passed to ``testModel`` as the model argument; when ``None``
            the loaded ``headings`` are passed instead.
    """
    X, Y, W, headings = data.loadFeaturesAndLabels(data.DV_Type.EFFORT_TRIMMED)

    # calculate weights from standard deviation, one row per sample
    sampleCount = W.shape[0]
    weights = numpy.zeros((sampleCount, 1), dtype=float)
    weights[:, 0] = 1 - (numpy.reshape(W, sampleCount) / 30)

    chosenModel = headings if model is None else model

    testModel(X, Y, weights, headings, chosenModel, predictEffort, False, W)
Exemplo n.º 3
0
def calcFeatureTable(dv_type):
    """Build a table of per-feature scores from several scoring methods.

    The features for ``dv_type`` are loaded, normalized one column at a time
    (written back into the loaded array) and then scored by each method in
    turn. Progress is printed as ``applying <method name>``.

    Args:
        dv_type: dependent-variable selector forwarded to
            ``data.loadFeaturesAndLabels``.

    Returns:
        A list of rows. The first row is ``['Features', <method name>, ...]``;
        every following row is ``[<heading>, <score>, ...]`` with one score
        per method. Scores are rounded down to three decimals and NaN scores
        are stored as 0.
    """
    features, labels, _, headings = data.loadFeaturesAndLabels(dv_type)
    numFeatures = features.shape[1]

    # normalize features, one column at a time, in place
    for i in range(numFeatures):
        features[:, i:i + 1] = preprocessing.normalize(features[:, i:i + 1],
                                                       axis=0)

    # add feature names to table
    table = [['Features']]
    table.extend([heading] for heading in headings)

    # (column title, scoring callable taking (features, labels))
    methods = [
        ('Lin. Cor.', pearson),
        ('Lin. Reg.', lambda X, Y: linearRegression(X, Y, 0)),
        ('Lasso', lambda X, Y: linearRegression(X, Y, 1)),
        ('Ridge', lambda X, Y: linearRegression(X, Y, 2)),
        ('MIC', MIC),
        ('Stability', stability),
        ('Random Forest', randomForest),
    ]

    for title, score in methods:
        print('applying', title)
        coefs = score(features, labels)

        # format coefs: some methods output the scores nested in a single
        # row (shape ``(1, numFeatures)``); results without a 2-D shape are
        # used as they are
        shape = getattr(coefs, 'shape', ())
        if len(shape) > 1 and shape[1] == numFeatures:
            vals = coefs[0]
        else:
            vals = coefs

        # append method to headings
        table[0].append(title)

        # append feature scores to table
        for i, c in enumerate(vals):
            if math.isnan(c):
                c = 0
            table[1 + i].append(math.floor(c * 1000) / 1000)

    return table
Exemplo n.º 4
0
def RFE_effortRandomForest(numFeatures, formated=False):
    """Print a recursive-feature-elimination ranking for EFFORT_TRIMMED.

    The features are normalized column by column, an ``RFE`` selector wrapped
    around a ``RandomForestRegressor`` is fitted, and the headings are printed
    ordered by ``(ranking, heading)``.

    Args:
        numFeatures: not read by this function; the selector always keeps 10
            features. NOTE(review): confirm whether this was meant to drive
            ``n_features_to_select``.
        formated: when true, print each heading as an indented, quoted,
            comma-terminated line, preceded by the running index on every
            tenth entry; otherwise print ``index (ranking, heading)``.
    """
    X, Y, W, headings = data.loadFeaturesAndLabels(data.DV_Type.EFFORT_TRIMMED)
    Y = Y.flatten()

    # scale every feature column separately, writing the result back in place
    for column in range(X.shape[1]):
        columnSlice = X[:, column:column + 1]
        X[:, column:column + 1] = preprocessing.normalize(columnSlice, axis=0)

    # the selector is asked to retain 10 features
    forest = RandomForestRegressor()
    selector = RFE(forest, n_features_to_select=10)
    selector.fit(X, Y)

    rankings = (round(rank, 4) for rank in selector.ranking_)
    ordered = sorted(zip(rankings, headings))

    for position, entry in enumerate(ordered):
        if not formated:
            print(position, entry)
            continue
        if position % 10 == 0:
            print(position)
        print("    '" + entry[1] + "',")
Exemplo n.º 5
0
def _meanSplitError(X_, Y, weights, split, numIterations, fitWeighted):
    """Return (mean absolute error over random splits, last Y2, last pred)."""
    meanMeanError = 0
    Y2 = None
    pred = None
    for _ in range(numIterations):
        # use indicies to split matricies randomly but in order
        X1, X2, indicies = data.splitMatrixRandomly(X_, split)
        Y1, Y2, indicies = data.splitMatrixRandomly(Y, split, indicies)
        w1, w2, indicies = data.splitMatrixRandomly(weights, split, indicies)

        # flatten data
        Y1 = Y1.flatten()
        Y2 = Y2.flatten()
        w1 = w1.flatten()

        rf = RandomForestRegressor()
        if fitWeighted:
            rf.fit(X1, Y1, w1)
        else:
            rf.fit(X1, Y1)
        pred = rf.predict(X2)

        # plain left-to-right sum of the absolute prediction errors
        diff = sum(abs(val - actual) for val, actual in zip(pred, Y2))
        meanMeanError += diff / Y2.shape[0]

    return meanMeanError / numIterations, Y2, pred


def RandomFeatureRanking(use_zweights=False):
    """Rank features for WORKLOAD_EFFORT by sampling random feature subsets.

    Steps:

    1. Load the data, normalize every feature column in place and derive
       per-sample weights ``1 - sd / 30`` from ``W``.
    2. For 1000 trials, pick 3 random features, train unweighted random
       forests on ten 80/20 splits and record the mean absolute error together
       with the chosen feature indices. Progress is printed per trial.
    3. Sort the trials by error and collect the feature indices of the best
       ones until more than 10 indices have been gathered; duplicates are
       dropped and every kept heading is printed.
    4. Train weighted random forests on ten 60/40 splits using only the kept
       features, print the mean absolute error and plot actual (blue) versus
       predicted (red) values of the last split on the current matplotlib
       figure.

    Args:
        use_zweights: not read anywhere in this function.
    """
    dv = data.DV_Type.WORKLOAD_EFFORT
    X, Y, W, headings = data.loadFeaturesAndLabels(dv)

    # normalize features column wise
    for i in range(X.shape[1]):
        X[:, i:i + 1] = preprocessing.normalize(X[:, i:i + 1], axis=0)

    # calculate weights from standard deviation
    weights = numpy.zeros((W.shape[0], 1), dtype=float)
    weights[:, 0] = 1 - (numpy.reshape(W, W.shape[0]) / 30)

    # one row per trial: column 0 holds the error, the rest the feature indices
    randoms = numpy.zeros((1000, 4), dtype=float)
    numTrials = randoms.shape[0]
    subsetSize = randoms.shape[1] - 1

    for r in range(numTrials):
        print("progress:", r / numTrials)
        test = numpy.arange(X.shape[1])
        numpy.random.shuffle(test)
        test = test[:subsetSize]
        selected = [headings[i] for i in test]

        # filter out features based on prediction model
        X_, headings_ = selectFeatures(X, headings, selected)

        meanError, _, _ = _meanSplitError(X_, Y, weights, 0.8, 10, False)
        randoms[r, 0] = meanError
        randoms[r, 1:] = test

    randoms = randoms[numpy.argsort(randoms[:, 0])]

    # gather the feature indices of the best trials until more than 10 are
    # collected (a whole trial is always taken at once)
    test = []
    for row in randoms:
        if len(test) > 10:
            break
        test.extend(row[1:])

    # insert headings + avoid doubles
    selected = []
    for i in test:
        h = headings[int(i)]
        if h not in selected:
            selected.append(h)
            print("'" + h + "',")

    print("used top features #:", len(selected))

    # filter out features based on prediction model
    X_, headings_ = selectFeatures(X, headings, selected)

    meanError, Y2, pred = _meanSplitError(X_, Y, weights, 0.6, 10, True)
    print(meanError)

    plt.xlim(0, max(Y2))
    plt.ylim(0, max(Y2))
    plt.plot(Y2, Y2, 'bo')
    plt.plot(Y2, pred, 'ro')
    plt.ylabel(str(dv))