Example #1
                           max_features="auto",
                           random_state=0,
                           n_jobs=4),
    "AdaBoost":
    AdaBoostClassifier(n_estimators=500, random_state=0),
    "GradientBoost":
    GradientBoostingClassifier(n_estimators=500,
                               learning_rate=1.0,
                               max_depth=None,
                               random_state=0),
    "NaiveBayes":
    GaussianNB(),
    "LDA":
    LDA(),
    "QDA":
    QDA()
}

############################# Main: Run Different Classifiers ################################

data_dir = '/fraud_model/Data/Model_Data_Signal_Tmx_v3wd/'

result_dir = '/fraud_model/Results/Model_Results_Signal_Tmx_v3wd_tmxrc_ind/'
good_downsample_rate = 0.3  # used to scale back the hit rate

for job in joblist:
    print(job)
    result_summary = []
    result_summary.append(
        ['Case', 'KS'] +
        ['HitRate@' + str(i) + '%CatchRate' for i in range(5, 105, 5)] + [
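The snippet above is cut off mid-statement at both ends. For reference, a minimal self-contained sketch of the same pattern (a name-to-estimator dictionary evaluated in a loop) could look like the following; the synthetic data and short names are placeholders, not the original fraud-model inputs:

# Hedged sketch only: synthetic data stands in for the original dataset.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

classifiers = {
    "RF": RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=4),
    "AdaBoost": AdaBoostClassifier(n_estimators=500, random_state=0),
    "NaiveBayes": GaussianNB(),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    print(name, clf.score(X_test, y_test))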
Example #2
    def classifierTrainTest(score, diagn, real_art, cvPartition, classifier,
                            subjIndex, preAccMatrix, preInstOrder):
        x = 0
        iteration = 0
        idx = 0
        PCNo = len(score[0])
        subAccMatrix = 0
        # FIX: what is the Python equivalent of the 'test' method of MATLAB's cvpartition class?
        #idx = numpy.random.rand(cvPartition, iteration)
        #idx_test = numpy.where(idx == 1)
        #idx_train = numpy.where(idx != 1)

        # QUESTION: cvPartition is not a scalar; how does iterating over it work?
        # There must be at least two folds.
        for idx_train, idx_test in cvPartition:
            # convert the test indices into a boolean mask
            idx = numpy.zeros((len(score), 1), dtype=bool)
            for index in idx_test:
                idx[index] = True

            #for testing purposes
            #idx = numpy.zeros((len(score), 1), dtype=bool)
            #idx[47] = True

            # In the MATLAB implementation, is idx the training set instead?
            n_test = int(idx.sum())  # size of the test fold
            cvTEST = numpy.zeros((n_test, PCNo))
            diagnTEST = numpy.zeros((n_test, 1))
            real_artTEST = numpy.zeros((n_test, 1))
            instIndexTEST = numpy.zeros((n_test, 1))

            cvTRAIN = numpy.zeros((len(idx) - n_test, PCNo))
            diagnTRAIN = numpy.zeros((len(idx) - n_test, 1))
            real_artTRAIN = numpy.zeros((len(idx) - n_test, 1))

            k = 0
            m = 0

            for j in range(len(idx)):
                if idx[j] == 1:
                    cvTEST[k, :] = score[j, :]
                    diagnTEST[k] = diagn[j]
                    real_artTEST[k] = real_art[j]
                    instIndexTEST[k] = subjIndex[j]
                    k = k + 1
                else:
                    cvTRAIN[m, :] = score[j, :]
                    diagnTRAIN[m] = diagn[j]
                    real_artTRAIN[m] = real_art[j]
                    m = m + 1

            # FIX: use scikit-learn for classifiers and predictions
            if classifier == "lda":
                #ldaModel = LDA()
                priorsArrays = numpy.array((.5, .5))
                ldaModel = LDA(solver='eigen',
                               priors=priorsArrays,
                               shrinkage=1.00)
                #ldaModel = LDA()
                ldaModel.fit(cvTRAIN, diagnTRAIN)
                label = ldaModel.predict(cvTEST)
            elif classifier == 'qda':
                # training a quadratic discriminant classifier to the data
                qdaModel = QDA()
                priorsArrays = numpy.array((.5, .5))
                #qdaModel = QDA(solver='eigen', priors=priorsArrays, shrinkage=1.00)
                qdaModel.fit(cvTRAIN, diagnTRAIN)
                label = qdaModel.predict(cvTEST)
            elif classifier == 'tree':
                # training a decision tree to the data
                treeModel = DecisionTreeClassifier()  # the sklearn tree module is not callable; use the estimator class
                treeModel.fit(cvTRAIN, diagnTRAIN)
                label = treeModel.predict(cvTEST)
            elif classifier == 'svm':
                # training a support vector machine to the data
                svmModel = SVC()
                svmModel.fit(cvTRAIN, diagnTRAIN)
                label = svmModel.predict(cvTEST)

            trueClassLabel = diagnTEST
            predictedClassLabel = label

            #from former loop

            subAccMatrix = numpy.column_stack(
                (trueClassLabel, predictedClassLabel, real_artTEST))
            preAccMatrix[x:x + len(subAccMatrix[:, 0]), :] = subAccMatrix
            preInstOrder[x:x + len(instIndexTEST[:, 0])] = instIndexTEST

            x = x + len(subAccMatrix[:, 0])

            #for testing purposes
            #break
        # create dictionary for return values
        return {
            'cvTEST': cvTEST,
            'diagnTEST': diagnTEST,
            'real_artTEST': real_artTEST,
            'instIndexTEST': instIndexTEST,
            'cvTRAIN': cvTRAIN,
            'diagnTRAIN': diagnTRAIN,
            'real_artTRAIN': real_artTRAIN,
            'trueClassLabel': trueClassLabel,
            'predictedClassLabel': predictedClassLabel,
            'idx': idx,
            'subAccMatrix': subAccMatrix,
            'preAccMatrix': preAccMatrix,
            'preInstOrder': preInstOrder
        }
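The commented questions above ask how MATLAB's cvpartition maps to Python. One plausible way to build the cvPartition argument (an assumption, not taken from the original code) is scikit-learn's StratifiedKFold, which yields (train_indices, test_indices) pairs exactly as the loop above expects:

# Hedged sketch: StratifiedKFold stands in for MATLAB's cvpartition, and the
# arrays below are random stand-ins for the real PCA scores and labels.
import numpy
from sklearn.model_selection import StratifiedKFold

score = numpy.random.rand(100, 5)        # stand-in for the PCA score matrix
diagn = numpy.random.randint(0, 2, 100)  # stand-in for the diagnosis labels

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cvPartition = list(skf.split(score, diagn))
for idx_train, idx_test in cvPartition:
    print(len(idx_train), len(idx_test))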
    def find_best(X_train, y_train, X_validation, y_validation):
        classifiers = [
            LogisticRegression(),
            KNeighborsClassifier(3),
            KNeighborsClassifier(n_neighbors=7, weights="uniform"),
            KNeighborsClassifier(n_neighbors=10, weights="uniform"),
            KNeighborsClassifier(n_neighbors=3, weights="uniform"),
            KNeighborsClassifier(n_neighbors=7, weights="distance"),
            KNeighborsClassifier(n_neighbors=10, weights="distance"),
            KNeighborsClassifier(n_neighbors=3, weights="uniform"),
            SVC(kernel="linear", C=0.025, probability=True),
            SVC(kernel="rbf", C=10, gamma=0.01, probability=True),
            SVC(kernel="rbf", C=1, gamma=0.01, probability=True),
            SVC(gamma=2, C=1, probability=True),
            DecisionTreeClassifier(max_depth=5),
            DecisionTreeClassifier(max_depth=1, criterion='entropy'),
            DecisionTreeClassifier(max_depth=5, criterion='entropy'),
            DecisionTreeClassifier(max_depth=10, criterion='entropy'),
            DecisionTreeClassifier(max_depth=5, criterion='entropy'),
            DecisionTreeClassifier(max_depth=10, criterion='gini'),
            DecisionTreeClassifier(max_depth=5, criterion='gini'),
            DecisionTreeClassifier(max_depth=1, criterion='gini'),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=10,
                                   max_features=1),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=30,
                                   max_features=5,
                                   criterion='gini'),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=20,
                                   max_features=10,
                                   criterion='entropy'),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=30,
                                   max_features=10,
                                   criterion='gini'),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=20,
                                   max_features=15,
                                   criterion='entropy'),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=20,
                                   max_features=10,
                                   criterion='gini'),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=30,
                                   max_features=15,
                                   criterion='entropy'),
            AdaBoostClassifier(),
            GaussianNB(),
            LDA(),
            QDA(),
            QDA(reg_param=0.001),
            QDA(reg_param=0.1),
            QDA(reg_param=0.01),
            SVC(C=10,
                cache_size=200,
                class_weight=None,
                coef0=0.0,
                degree=3,
                gamma=0.0,
                kernel='rbf',
                max_iter=-1,
                probability=True,
                random_state=None,
                shrinking=True,
                tol=0.001,
                verbose=False)
        ]

        clf_dict = {}
        y_pred_list = []

        for clf in classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_validation)
            y_pred_list.append(y_pred)
            acc = metrics.accuracy_score(y_validation, y_pred)
            # avg_prec = metrics.average_precision_score(y_validation, y_pred)
            # prec = metrics.precision_score(y_validation, y_pred)
            # class_rep = metrics.classification_report(y_validation, y_pred, target_names=['background', 'foreground'])
            # f1 = metrics.f1_score(y_validation, y_pred)

            clf_dict[clf] = acc

        global best_one
        best_one = max(clf_dict, key=clf_dict.get)
        print("{" + "\n".join("{}: {}".format(k, v)
                              for k, v in clf_dict.items()) + "}")
        print("\n\n********THE BEST CLASSIFIER IS********\n")
        print(best_one)
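A minimal driver for find_best could look like the following (a sketch with synthetic data, assuming the classifier and metrics imports used by the snippet are already in scope); best_one is the module-level name the function fills in via global:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=15, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                  random_state=0)
find_best(X_train, y_train, X_val, y_val)
print(best_one)  # populated by `global best_one` inside find_best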
Example #4
from sklearn.qda import QDA

h = .05  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]
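# These parallel lists are conventionally consumed together, e.g. (a sketch,
# not part of the original snippet):
#     for name, clf in zip(names, classifiers):
#         clf.fit(X_train, y_train)
#         print(name, clf.score(X_test, y_test))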
'''
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
			random_state=1, n_clusters_per_class=1)
'''
# X is 1084 x 2
# y is 1084 x 1

x_file = open("X2.csv", "r")
y_file = open("y2.csv", "r")

X = []
y = []
obs = []
for line in y_file:
    y.append(float(line))
testFeatures = features[round(0.6 * Nsamples):, :]
testLabels = labels[round(0.6 * Nsamples):, :]
print(np.shape(trainFeatures), np.shape(testFeatures), np.shape(trainLabels),
      np.shape(testLabels))

K = np.unique(labels).size

plt.clf()
lineStyle = ['ob', '*g', '+c', 'xr', '>y']
for cls in range(K):
    idx = (labels == cls + 1)
    plt.plot(features[np.nonzero(idx)[0], 0], features[np.nonzero(idx)[0], 1],
             lineStyle[cls])

print('Discriminant analysis')
model = QDA()
y_pred = model.fit(trainFeatures, trainLabels[:, 0]).predict(testFeatures)
y_pred = y_pred[:, np.newaxis]
aux = (y_pred != testLabels)
aux = np.sum(aux.astype(float), 0)
misclassificationRate = aux / testLabels.size
print(misclassificationRate)

print('Logistic Regression')
model = LogisticRegression(multi_class='multinomial',
                           solver='newton-cg',
                           C=100)
#create extended features
xtrainFeatures = np.concatenate(
    (trainFeatures, trainFeatures[:, 0:1] * trainFeatures[:, 1:2]), 1)
xtestFeatures = np.concatenate(
Example #6
def main():
    # Load dataset
    ftrain = csv.reader(open(r'../input/train.csv'))
    ftest = csv.reader(open(r'../input/test.csv'))
    fweather = csv.reader(open(r'../input/weather.csv'))
    fspray = csv.reader(open(r'../input/spray.csv'))

    weatherPasstimelist = [
        "Tmax", "Tmin", "Tavg", "DewPoint", "WetBulb", "PrecipTotal", "Depart"
    ]
    weatherPasstimevalue = [2, 3, 4, 6, 7, 16, 5]

    weatherdict = readweather(fweather)
    spraydict = readspray(fspray)

    #generate train and test data
    print "generate train and test data"
    trout = []
    train_y = []
    for trlist in ftrain:
        templine = []
        if ftrain.line_num == 1:
            continue
        date = trlist[0]
        datelist = date.split('-')
        dateformate = datetime.datetime.strptime(date, "%Y-%m-%d").date()
        Latitude = trlist[7]
        Longitude = trlist[8]
        Species = speciesdict[trlist[2]]

        AddressAccuracy = trlist[9]
        NumMosquitos = trlist[10]
        WnvPresent = trlist[11]
        train_y.append(WnvPresent)
        #write weather
        locid = nearloc(Latitude, Longitude)
        weatherlist = weatherdict[date][locid]
        templine.append(float(Species))
        for w in weatherlist[2:]:
            templine.append(float(w))

        # weather from 1, 2, 3, 5, 8 and 12 days earlier (disabled below)
        passstr = ''
        #         for days_ago in [1,2,3,5,8,12]:
        #             day = dateformate - datetime.timedelta(days=days_ago)
        #             weatherlistPasstime = weatherdict[str(day)][locid]
        #             for obs in weatherPasstimevalue:
        #                 try:
        #                     templine.append(float(weatherlistPasstime[obs]))
        #                 except:
        #                     print weatherlistPasstime
        #                     exit(0)

        #         templine.append(float(Latitude))
        #         templine.append(float(Longitude))

        #write spray
        if date not in spraydict:
            sprayvalue = 0
        else:
            if nearspray(spraydict[date], Latitude, Longitude):
                sprayvalue = 1
            else:
                sprayvalue = 0
        templine.append(sprayvalue)
        trout.append(templine)

    teout = []
    test_y = []
    for telist in ftest:
        templine = []
        if ftest.line_num == 1:
            continue
        date = telist[1]
        dateformate = datetime.datetime.strptime(date, "%Y-%m-%d").date()
        datelist = date.split('-')
        Latitude = telist[8]
        Longitude = telist[9]
        Species = speciesdict[telist[3]]
        locid = nearloc(Latitude, Longitude)
        weatherlist = weatherdict[date][locid]
        test_y.append(0)
        templine.append(float(Species))
        for w in weatherlist[2:]:
            templine.append(float(w))

        passstr = ''
        #         for days_ago in [1,2,3,5,8,12]:
        #             day = dateformate - datetime.timedelta(days=days_ago)
        #             weatherlistPasstime = weatherdict[str(day)][locid]
        #             for obs in weatherPasstimevalue:
        #                 try:
        #                     templine.append(float(weatherlistPasstime[obs]))
        #                 except:
        #                     print weatherlistPasstime
        #                     exit(0)

        #         templine.append(float(Latitude))
        #         templine.append(float(Longitude))
        #write spray
        if date not in spraydict:
            sprayvalue = 0
        else:
            if nearspray(spraydict[date], Latitude, Longitude):
                sprayvalue = 1
            else:
                sprayvalue = 0
        templine.append(sprayvalue)
        teout.append(templine)

    # remove features with little variation or low importance
    indices = [i for i in range(len(trout[0]))]
    frqIndex = trimfrq(trout)

    for i in frqIndex:
        indices.remove(i)
    train_x = indexTodata(trout, indices)
    test_x = indexTodata(teout, indices)
    #     #feature selections
    #     ftsel = ExtraTreesClassifier()
    #     ftsel.fit(train_x, train_y)
    #
    #     train_x_new = ftsel.transform(train_x)
    #     test_x_new = ftsel.transform(test_x)
    #modeling
    print "modeling"

    train_x_nor, mean, std = normalize(train_x)
    test_x_nor, mean, std = normalize(test_x, mean, std)
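    # `normalize` is defined elsewhere in this script; judging from its call
    # signature it presumably z-scores each column and returns the mean/std it
    # used, so the test set can be rescaled with the training statistics.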

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]

    clf = OneClassSVM(nu=0.2, kernel="rbf", gamma=65.7933224658)
    clf.fit(train_x_nor, train_y)
    train_pdt = clf.predict(train_x_nor)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print ":"
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = clf.predict(test_x_nor)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(test): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    print
    #predict test data
    print("predict test data")
    test_pdt = clf.predict(test_x_nor)
    fprt = open('sampleSubmissionbyKW.csv', 'w')
    fprt.write("ID,WnvPresent\n")
    id = 1
    for eachy in test_pdt:
        fprt.write("%s,%s\n" % (str(id), str(eachy)))
        id = id + 1
    fprt.close()
    # Use the prior two days of returns as predictor values, with direction as
    # the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42
    )

    # Create the (parametrized) models
    print("Hit Rates/Confusion Matrices:\n")
    models = [("LR", LogisticRegression()),
              ("LDA", LDA()),
              ("QDA", QDA()),
              ("LSVC", LinearSVC()),
              ("RSVM", SVC(C=1000000.0, cach_size=200, class_weight=None,
                           coef=0.0, degree=3, gamma=0.001, kernal='rbf',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False)
                           )
              ("RF", RandomForestClassifier(
                  n_estimators=100, criterion='gini', max_depth=None,
                  min_samples_leaf=1, max_features='auto', bootstrap=True,
                  oob_score=False, n_jobs=1, random_state=None, verbose=0)
              )]
    # Iterate through the models
    for m in models:
        # Train each of the models on the training set
        m[1].fit(X_train, y_train)
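        # The original snippet is truncated here; the lines below are a hedged
        # completion (not recovered from the source) that scores each model.
        pred = m[1].predict(X_test)
        print("%s hit rate: %0.3f" % (m[0], (pred == y_test).mean()))
        print(confusion_matrix(y_test, pred))  # assumes sklearn.metrics.confusion_matrix is imported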