Example #1
def prep_data(data_set):
    if data_set == 1:
        data = pd.read_excel(
            r'C:\Users\cbroe\OneDrive\Skrivebord\Stuff\School\bachelor\Python\Bachelor\output.xlsx'
        )

        data = data.iloc[13750:86190]

        lon, lat = fs.get_feature_selection_rows(data_set, 6)
        X = data.filter(lon)
        y = data.filter(['lon_rad', 'lat_rad'])

        return data, X, y

    elif data_set == 2:
        data = pd.read_excel(
            r'C:\Users\cbroe\OneDrive\Skrivebord\Stuff\School\bachelor\Python\Bachelor\output2.xlsx'
        )

        data = data.iloc[0:3700]

        lon, lat = fs.get_feature_selection_rows(data_set, 5)
        X = data.filter(lon)
        y = data.filter(['Long', 'Lat'])

        return data, X, y
Example #2
def plot_score_log_reg_embedded(X, y):
    # C = 2050, s = 0.963
    # threshold = 0.514, shape = 16
    logistic_regression = log_reg_train(X, y, C=2050, max_iter=27)
    print(logistic_regression.n_iter_)
    print(logistic_regression.coef_)
    # avoid shadowing the built-in range()
    thresholds = np.linspace(0, logistic_regression.coef_.max(), 20)
    scores = []
    scores_embedded = []
    for threshold in thresholds:
        score = cross_val_score(logistic_regression,
                                X,
                                y,
                                cv=5,
                                scoring='accuracy')
        select_model, X_embedded = FeatureSelection.select_by_model(
            logistic_regression, X, y, threshold=threshold)
        score_embedded = cross_val_score(logistic_regression,
                                         X_embedded,
                                         y,
                                         cv=5,
                                         scoring='accuracy')
        scores.append(score.mean())
        scores_embedded.append(score_embedded.mean())
    best_idx = scores_embedded.index(max(scores_embedded))
    print(thresholds[best_idx], max(scores_embedded))
    plt.plot(thresholds, scores)
    plt.plot(thresholds, scores_embedded)
    plt.show()
    select_model, X_embedded = FeatureSelection.select_by_model(
        logistic_regression, X, y, threshold=0.514)
    print(X_embedded.shape)
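The example above relies on a project-local FeatureSelection.select_by_model helper that is not shown. A minimal sketch of what such a helper might look like, assuming it wraps scikit-learn's SelectFromModel around the already-fitted estimator (the y argument is unused and kept only for signature compatibility):

from sklearn.feature_selection import SelectFromModel

def select_by_model(estimator, X, y, threshold=None):
    # prefit=True reuses the coefficients of the already-fitted estimator;
    # features whose importance falls below `threshold` are dropped
    selector = SelectFromModel(estimator, threshold=threshold, prefit=True)
    return selector, selector.transform(X)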
Example #3
    def validation_core(i, x, y, model, feature_count):
        # nested helper: `self` is captured from the enclosing method's scope
        (foldTrainX, foldTrainY, foldValidationX, foldValidationY) = self.__splitDataFold(x, y, i)

        mutualInformationTable = FeatureSelection.byMutualInformation(foldTrainX, foldTrainY)
        words = [word for word, _ in mutualInformationTable[:feature_count]]
        (xNewTrain, xNewValidation) = FeatureSelection.Featurize(foldTrainX, foldValidationX, words)

        model.fit(xNewTrain, foldTrainY)
        return self.__countCorrect(model.predict(xNewValidation), foldValidationY)
Example #4
def main():
    # Define constants
    datadir = 'data/IS11_speaker_state'           # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    # datadir = 'data/IS09_emotion'           # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    n_features = 1500                       # 382 features in total for IS09_emotion; 4354 for IS11_speaker_state
    fs_method = 'none'                       # 'univ', 'rfe', 'lasso', 'l1norm', 'fdr' or 'none'
    C = 0.01                                # Penalty factor for RFE-SVM feature selection
    rfe_step = 1                            # Step for RFE-SVM feature selection
    n_speakers = 10                         # There are 10 speakers in EmoDB

    svm_pred = []
    svm_true = []
    print('Start train and test ...')

    trn_file = datadir + '/' + 'cheavd_trn_fs.mat'
    tst_file = datadir + '/' + 'cheavd_tst_fs.mat'

    # Load trn and tst data
    trn_data = sio.loadmat(trn_file)
    tst_data = sio.loadmat(tst_file)

    # Retrieve training and test data
    x_train = np.array(trn_data['x'], dtype='float32')
    y_train = np.array(trn_data['y'].ravel(), dtype='int32')
    x_test = np.array(tst_data['x'], dtype='float32')
    y_test = np.array(tst_data['y'].ravel(), dtype='int32')

    if fs_method == 'rfe':
        # Select features by RFE-SVM (step controls how many features are removed per iteration)
        x_train, x_test = fs.rfe_select_features(x_train, y_train, x_test, n_fs=n_features,
                                                 penalty=C, step=rfe_step)
    elif fs_method == 'l1norm':
        # Select features by L1-norm
        x_train, x_test = fs.l1norm_select_features(x_train, y_train, x_test, penalty=C)
    elif fs_method == 'lasso':
        # Select features by LASSO
        x_train, x_test = fs.lasso_select_features(x_train, y_train, x_test, alpha=0.001)
    elif fs_method == 'univ':
        x_train, x_test = fs.univ_select_features(x_train, y_train, x_test, n_fs=n_features)
    elif fs_method == 'fdr':
        # Select features by FDR
        x_train, x_test = fs.fdr_select_features(x_train, y_train, x_test, n_fs=n_features)
    elif fs_method == 'none':
        pass
    print('No. of selected features = %d, ' % x_train.shape[1], end='')

    # Train an SVM classifier
    svc = SVC(C=1, gamma='auto', kernel='rbf')
    svc.fit(x_train, y_train)

    # Test the SVM classifier
    svm_pred = svc.predict(x_test)
    n_classes = np.max(y_train) + 1
    print('Overall SVM accuracy for %d classes with %d features/class: %.2f%%' %
          (n_classes, n_features, get_accuracy(svm_pred, y_test)))
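fs.univ_select_features and its siblings are project helpers whose source is not shown here. A plausible sketch of the univariate variant, assuming it is a thin wrapper over scikit-learn's SelectKBest (the helper name and signature follow the calls above):

from sklearn.feature_selection import SelectKBest, f_classif

def univ_select_features(x_train, y_train, x_test, n_fs=100):
    # score features on the training set only, then apply the same mask to test
    selector = SelectKBest(f_classif, k=n_fs)
    x_train_fs = selector.fit_transform(x_train, y_train)
    x_test_fs = selector.transform(x_test)
    return x_train_fs, x_test_fs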
Example #5
    def retrieve_feature_name(self):
        """
        retrieve the names of the features for the nodes
        """
        feature_stat_df = FeatureSelection.FeatureStatus().feature_stat
        feature_name_list = feature_stat_df['feature'].tolist()
        return feature_name_list
Example #6
    def start(self):
        # perform some logging
        self.jlogger.info("Starting job with job id {}".format(self.job_id))
        self.jlogger.debug("Job Config: {}".format(self.config))
        self.jlogger.debug("Job Other Data: {}".format(self.job_data))

        try:
            rud.ReadUserData(self)
            fg.FeatureGeneration(self, is_train=True)
            pp.Preprocessing(self, is_train=True)
            fs.FeatureSelection(self, is_train=True)
            fe.FeatureExtraction(self, is_train=True)
            clf.Classification(self)
            cv.CrossValidation(self)
            tsg.TestSetGeneration(self)
            tspp.TestSetPreprocessing(self)
            tsprd.TestSetPrediction(self)
            job_success_status = True
        except Exception:
            job_success_status = False
            helper.update_running_job_status(self.job_id, "Errored")
            self.jlogger.exception("Exception occurred in ML Job {} ".format(
                self.job_id))

        return job_success_status
Example #7
def main():
    X_train, X_test, y_train, y_test = FeatureSelection.main(
        './kickstarter-projects/ks-projects-201801.csv')
    X_train_d, X_test_d = _create_dummy_features(X_train, X_test)
    X_train, X_test = _create_label_encoding(X_train, X_test)
    dt = decisionTree(X_train_d, X_test_d, y_train, y_test)
    lgr = logisticRegrs(X_train_d, X_test_d, y_train, y_test)
    rf = randomForest(X_train_d, X_test_d, y_train, y_test)
    gb = gradientBooster(X_train_d, X_test_d, y_train, y_test)
    ada = adaBooster(X_train_d, X_test_d, y_train, y_test)
    bagging_ensemble = bagging(X_train_d, X_test_d, y_train, y_test)
    # redefine the estimators that feed the voting ensemble below
    rf = RandomForestClassifier(criterion='entropy', max_depth=20,
                                min_samples_split=25, n_estimators=100,
                                random_state=0, class_weight='balanced_subsample')
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=20,
                                min_samples_split=1000, random_state=0,
                                class_weight='balanced')
    gb = GradientBoostingClassifier(loss='exponential', n_estimators=100,
                                    random_state=0)
    ada = AdaBoostClassifier(base_estimator=dt, n_estimators=100,
                             learning_rate=0.5)
    estimators = [('dt', dt), ('rf', rf), ('ada', ada), ('gb', gb)]
    voting_ensemble = voting(X_train_d, X_test_d, y_train, y_test, estimators)

    """
    def validateByFrequency(self, x, y, model):
        totalCorrect = 0

        for i in range(self.k):
            (foldTrainX, foldTrainY, foldValidationX, foldValidationY) = self.__splitDataFold(x, y, i)

            frequencyTable = FeatureSelection.byFrequency(foldTrainX)
            words = [word for word, _ in frequencyTable[:10]]
            print('For fold %d/%d, choose words:' % (i + 1, self.k))
            print(words)
            (xNewTrain, xNewValidation) = FeatureSelection.Featurize(foldTrainX, foldValidationX, words)

            model.fit(xNewTrain, foldTrainY)
            totalCorrect += self.__countCorrect(model.predict(xNewValidation), foldValidationY)

        accuracy = totalCorrect / len(x)

        return accuracy
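Several examples rank words with FeatureSelection.byMutualInformation, which is not shown. A hedged sketch of such a ranking, assuming the inputs are raw documents that get binarized into a bag-of-words before scoring with scikit-learn's mutual_info_classif:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

def byMutualInformation(xRaw, y):
    # binary bag-of-words: a word either appears in a document or it does not
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform(xRaw)
    scores = mutual_info_classif(X, y, discrete_features=True)
    # return (word, score) pairs, highest mutual information first
    return sorted(zip(vectorizer.get_feature_names_out(), scores),
                  key=lambda pair: pair[1], reverse=True)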
Example #9
def main():
    sf = FS.Select(Sequence=False, Random=True, Cross=False)  # select the search strategy
    sf.ImportDF(prepareData(), label='is_trade')
    sf.ImportLossFunction(modelscore, direction='descend')
    sf.ImportCrossMethod(CrossMethod)
    sf.NonTrainableFeatures = ['used', 'instance_id', 'item_property_list', 'context_id', 'context_timestamp', 'predict_category_property', 'is_trade']
    sf.InitialFeatures(['item_category_list', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'day'])
    sf.clf = lgbm.LGBMClassifier(random_state=1, num_leaves=6, n_estimators=5000, max_depth=3, learning_rate=0.05, n_jobs=8)
    sf.logfile = 'record.log'
    sf.run(validation)
Example #10
def main():
    sf = FS.Select(Sequence=True, Random=True, Cross=True)
    sf.ImportDF(prepareData(), label='Survived')
    sf.ImportLossFunction(modelscore, direction='ascend')
    sf.ImportCrossMethod(CrossMethod)
    sf.NonTrainableFeatures = ['Survived']
    sf.InitialFeatures([])
    sf.PotentialAdd = ['Pclass']
    # sf.clf = lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=5000, max_depth=3, learning_rate = 0.05, n_jobs=1)
    sf.clf = LogisticRegression()
    sf.logfile = 'record.log'
    sf.run(validation)
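FS.Select drives a sequential/random feature search whose internals live in the imported module. For intuition only, a sketch (not the module's actual API) of the greedy forward step such a search plausibly performs: try each candidate feature and keep the one that improves the validation score most.

def forward_step(current, candidates, evaluate):
    # evaluate(features) returns a validation score for that feature set
    best_feature, best_score = None, evaluate(current)
    for feat in candidates:
        score = evaluate(current + [feat])
        if score > best_score:  # 'ascend' direction: higher is better
            best_feature, best_score = feat, score
    return best_feature, best_score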
Example #11
def featureSelect():
    """
    Desc : Feature Selection
    """
    print('\n ********** Feature Selector ***********')
    #fileName='/nobackup/anikgaik/search/features/Train_Features/Train_Features.csv'
    #writeFile='/nobackup/anikgaik/search/features/Train_Features/Final_train_Feature.csv'
    fileName = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Generated_Features.csv'
    writeFile = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Selected_Features.csv'
    oFS = FS.featureSelect()
    oFS.loadfeature(fileName)
    oFS.printAll()
    oFS.selectFeatures()
    oFS.generateNewfeature(fileName, writeFile)
Example #13
def run(is_changing_n=False, number_to_ngram=3, number_of_features=220, fs_method=FEATURE_SELECTION_MOST_COMMON,
        c_method=CLASSIFIER_ONE_CLASS_SVM):

    # accumulate per-user answers for the final score
    all_ans = []

    # Feature extraction
    out_0_file = Path(out_0_path)
    if is_changing_n:
        fe.export_to_csv_all_users(number_to_ngram)
    if not out_0_file.exists():
        fe.export_to_csv_all_users(number_to_ngram)

    for user_number in range(0, 5):
        # Feature selection
        FeatureSelection.select_features(number_of_features, fs_method, user_number)

        # Classifier
        ans = Classifier.classify(number_of_features, c_method)
        all_ans.append(ans)

    print("""
    ** FINAL SCORE : {} **
    """.format(sum(all_ans)/len(all_ans)))
Example #14
def main():
    train_X, train_Y, test_X, test_Y = data.load_ICICI()

    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

    clf = Ridge()
    features = SelectKBest(f_regression)
    train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
        clf, train_X, train_Y, test_X, test_Y)
    k_value = [best_K]

    hyper_parameters = hyper.main(Ridge, 1)
    hyper_parameters['Kbest__k'] = k_value
    pipeline = Pipeline([('Kbest', features), ('model', clf)])
    rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
    rand_search.fit(train_X, train_Y)

    best_accu = 1000
    b_i = 0
    predictions = []
    for i in range(len(test_Y) // 3):  # integer division for Python 3
        b_pred = rand_search.predict(test_X) - (test_Y[i] -
                                                rand_search.predict(test_X)[i])
        RMSE = math.sqrt(mean_squared_error(test_Y, b_pred))
        if RMSE < best_accu:
            best_accu = RMSE
            b_i = i
            predictions = b_pred[1:]
    test_Y = test_Y[:-1]
    print(b_i, best_accu)
    #predictions = predictions - diff              #normalizing

    end_time = time.perf_counter() - start_time
    filename = r"..\Speed\ICICI_Ridge_time.txt"  # raw string keeps backslashes literal
    with open(filename, 'w') as target:
        target.write(str(end_time))

    MSE = mean_squared_error(test_Y, predictions)
    RMSE = math.sqrt(MSE)
    filename = r"..\RMSE\ICICI_Ridge_rmse.txt"
    with open(filename, 'w') as target:
        target.write(str(RMSE))
    print("ICICI RIDGE BEST", RMSE)
    ICICI = ["ICICI"]
    Graphs_plotting.line_graph(5000, test_Y, predictions, "RidgeBEST",
                               ICICI[0])
Example #15
    def get_data_splits(self, cv_method):
        cv_data_splits = []

        x = self.ml_pipeline.data.values
        y = self.ml_pipeline.data_labels.values.ravel()

        i = 1
        for train_index, test_index in cv_method.split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            x_train_pd = pd.DataFrame(x_train)
            x_test_pd = pd.DataFrame(x_test)

            ppp_ml_pipeline = MLPipeline.MLPipeline(self.ml_pipeline.job_id)
            ppp_ml_pipeline.x_train = x_train_pd
            ppp_ml_pipeline.y_train = y_train
            ppp_ml_pipeline.x_test = x_test_pd
            ppp_ml_pipeline.y_test = y_test

            pp = ppp.Preprocessing(ppp_ml_pipeline, is_train=False)
            pp.preprocess_data()

            fs = pfs.FeatureSelection(ppp_ml_pipeline, is_train=False)
            fs.perform_feature_selection()

            fe = pfe.FeatureExtraction(ppp_ml_pipeline, is_train=False)
            fe.perform_feature_extraction()

            self.jlogger.info("Cross validation split number {}".format(i))
            self.jlogger.info("XTrain Shape: {}".format(
                ppp_ml_pipeline.x_train.shape))
            self.jlogger.info("XTest Shape: {}".format(
                ppp_ml_pipeline.x_test.shape))
            self.jlogger.info("YTrain Shape: {}".format(
                ppp_ml_pipeline.y_train.shape))
            self.jlogger.info("YTest Shape: {}".format(
                ppp_ml_pipeline.y_test.shape))

            cv_data_splits.append(
                (ppp_ml_pipeline.x_train, ppp_ml_pipeline.x_test,
                 ppp_ml_pipeline.y_train, ppp_ml_pipeline.y_test))

            i += 1

        return cv_data_splits
Example #16
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator,
                              th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)

    train_feats = [featurized[idx] for idx in range(0, len(trainSet), 1)]
    test_feats = [
        featurized[idx] for idx in range(len(trainSet), len(trainAndBSData), 1)
    ]

    # Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(
        train_feats, y_train, [i for i in range(0, len(trainSet), 1)], es)

    # calculate Inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket,
                               es.weighInterAnnot)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)

    y_pred_prob = model.predict_proba(x_test)
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train)  # update none to confidence vector
Example #17
def do_preprocessing(pos_path,
                     neg_path,
                     selected_DB,
                     is_bigrams,
                     k=None,
                     method=None,
                     features_space=None):
    f2r = FileToReview.FileToReview(pos_path, neg_path, selected_DB)
    pos_reviews, neg_reviews = f2r.buildReviewMatrix()

    # get a new instance for preprocessing
    # The new instance needs to know where positive and negative review directories are, also database no
    prep = Preprocessing(pos_path, neg_path, selected_DB, pos_reviews,
                         neg_reviews, is_bigrams)

    # extract positive and negative vocabularies
    prep.extract_vocabulary()
    # print extracted vocabularies in dictionary (json) format
    vocabs = prep.get_v()

    nb_neg_review = prep.get_nb_neg_review()
    nb_pos_review = prep.get_nb_pos_review()

    # get a new instance
    # The new instance needs to know where positive and negative review directories are, also database no
    tfp = TermFrequencyProcessing.TermFrequencyProcessing(
        pos_path, neg_path, selected_DB)
    tfp.compute_terms_frequency(vocabs)
    # print(tfp.get_overall_terms_frequency())
    # print(tfp.get_reviews_info())
    T = tfp.get_overall_terms_frequency()

    fs = FeatureSelection.FeatureSelection(T, nb_neg_review, nb_pos_review)

    if not features_space:
        features_space = fs.build_features_space(k, method)
        reduced_vocabs = fs.reduce_vocabs(vocabs, features_space)

        return vocabs, reduced_vocabs, fs, features_space

    reduced_vocabs = fs.reduce_vocabs(vocabs, features_space)
    return vocabs, reduced_vocabs, fs
Example #18
def main():

    PotentialAdd = ['min_query_time_gap_after', 'hour', 'shop_score_delivery', 'min_query_time_gap_before_user_item', 'shop_id_smooth_query_rate',
    'min_query_time_gap_before', 'shop_score_description', 'item_sales_level', 'shop_query_count', 'user_star_level', 'user_age_level', 'item_sales_query_rate',
    'item_query_count', 'shop_score_service', 'shop_review_positive_rate', 'item_price_level', 'min_query_time_gap_after_user_item']
    '''
    PotentialAdd = []
    '''
    sf = FS.Select(Sequence=True, Random=True, Cross=False, PotentialAdd=PotentialAdd)  # select the search strategy
    sf.ImportDF(prepareData(), label='is_trade')
    sf.ImportLossFunction(modelscore, direction='descend')
    sf.ImportCrossMethod(CrossMethod)
    sf.NonTrainableFeatures = ['instance_id', 'item_id', 'item_brand_id', 'item_city_id', 'user_id', 'context_id', 'shop_id', 'item_category_0', 'time',
                'context_timestamp', 'item_property_list', 'predict_category_property',
                'item_category_list', 'is_trade', 'day', ]
    sf.InitialFeatures(['item_price_level', 'item_sales_level', 'item_collected_level', 'min_query_time_gap_after', 'min_query_time_gap_before_user_item',
    'min_query_time_gap_after_user_item', 'hour', 'item_category_1', 'shop_score_service', 'user_age_level', 'user_star_level', 'context_page_id', 'min_query_time_gap_before',
    'shop_query_count', 'item_sales_count'])
    #sf.InitialFeatures(['item_price_level','item_sales_level','item_collected_level', 'item_pv_level'])
    sf.clf = lgbm.LGBMClassifier(random_state=1, num_leaves=6, n_estimators=5000, max_depth=3, learning_rate=0.05, n_jobs=8)
    sf.logfile = 'record.log'
    sf.run(validation)
Example #19
def eval_model(folds, models, metric='roc_curve', dummy=False):
    fprs = {}
    tprs = {}
    aucs = {}
    fold = 0
    for train, test in folds:
        X_train, X_test, y_train, y_test = FeatureSelection.main(
            './kickstarter-projects/ks-projects-201801.csv', split='strat_k_fold', train=train, test=test)
        if dummy:
            X_train, X_test = _create_dummy_features(
                X_train, X_test)
        else:
            X_train, X_test = _create_label_encoding(X_train, X_test)

        for name, model in models.items():
            model_ = model.fit(X_train, y_train)
            predictions = model_.predict_proba(X_test)
            fpr, tpr, thresholds = roc_curve(
                y_test, predictions[:, 1], pos_label=1)
            auc_score = auc(fpr, tpr)
            fprs.setdefault(name, {})[fold] = fpr
            tprs.setdefault(name, {})[fold] = tpr
            aucs.setdefault(name, {})[fold] = auc_score  # keep scores for every fold
        fold += 1
    # return all three so callers can unpack fprs, tprs, aucs
    return fprs, tprs, aucs
Example #20
def main():
    splitter = create_strat_k_folds(
        './kickstarter-projects/ks-projects-201801.csv', 3)
    gridsearch_model = joblib.load('./ModelHyperparm/rfParams.pkl')
    rf = RandomForestClassifier(**gridsearch_model.best_params_,
                                n_estimators=100, random_state=0,
                                class_weight='balanced_subsample')
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=20,
                                min_samples_split=1000, random_state=0,
                                class_weight='balanced')
    gb = GradientBoostingClassifier(loss='exponential', n_estimators=100,
                                    random_state=0)
    ada = AdaBoostClassifier(base_estimator=dt, n_estimators=100,
                             learning_rate=0.5)

    estimators = [('dt', dt), ('rf', rf), ('ada', ada), ('gb', gb)]
    voting_ensemble = VotingClassifier(estimators, voting='soft')
    models = {'Random Forest': rf, 'Decision Tree': dt, 'Gradient Booster': gb,
              'AdaBooster': ada, "Voting Ensemble": voting_ensemble}
    fprs, tprs, aucs = eval_model(splitter, models, dummy=True)
    auc_scores = get_auc_score(aucs)

    X_train, X_test, y_train, y_test = FeatureSelection.main(
        "./kickstarter-projects/ks-projects-201801.csv")
    X_train, X_test = _create_dummy_features(X_train, X_test)
    voting_ensemble.fit(X_train, y_train)
    predictions = voting_ensemble.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(
        y_test, predictions[:, 1], pos_label=1)
    auc_score = auc(fpr, tpr)
    get_roc_curve(fpr, tpr, 'Voting Ensemble', auc_score)
    predictions = voting_ensemble.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predictions)
    print(conf_matrix)
    print("F1:", f1_score(y_test, predictions))
    print("Recall:", recall_score(y_test, predictions))
    print("Precision:", precision_score(y_test, predictions))
Example #21
def testDiabeteData():
    # TEST WITH DIABETES DATA FROM UCI
    url = "pima-indians-diabetes.data"
    print(url)
    headers = [
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]
    df = loadCSV(url, headers)

    # df = df.iloc[::2]

    print("First few samples: \n", df.head(5))
    # scale data
    df[[
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]] = ut.minMaxScale(df[[
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]])
    # df = df[['preg', 'plas', 'pres', 'mass', 'pedi', 'age', 'class']]

    array = df.values
    [nrow, ncol] = array.shape
    print("First few samples: \n", df.head(5))
    # print("Dimension: ", array.shape)

    X = array[:, 0:(ncol - 1)]
    Y = array[:, (ncol - 1)]

    # kfold = getKfolds(5)
    # testClassifers(X, Y, kfold)
    # k = 5
    # runClassifiers(X, Y, k)

    # Feature Selection
    m = ncol - 1  # Number of features
    print("\nFeature Selection by RFE (smaller is better): ")
    fs.recursiveFeatureElimination(X, Y, m)

    print("\nFeature Selection by kBest (higher values mean higher dependency): ")
    fs.selectKBest(X, Y, m)

    print("\nFeature Selection by extraTree (higher is better): ")
    fs.extraTrees(X, Y)
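fs.recursiveFeatureElimination is a project helper; a plausible sketch, assuming it wraps scikit-learn's RFE around a linear estimator and prints the resulting ranking (the linear-SVM choice is an assumption):

from sklearn.feature_selection import RFE
from sklearn.svm import SVC

def recursiveFeatureElimination(X, Y, m):
    # rank features by repeatedly dropping the weakest one
    rfe = RFE(SVC(kernel='linear', C=1), n_features_to_select=m, step=1)
    rfe.fit(X, Y)
    print("Selected mask:", rfe.support_)
    print("Feature ranking (1 = selected):", rfe.ranking_)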
Example #22
def main(useAnnotatorWeighing=True):
    '''
    Runs the experiments by training on a train set and testing on a test set. Bootstrapping is also supported (hard-coded in this script).
    Configure your model settings by modifying the ExperimentSettings object in the script.

    The output of these models is a set of annotated files in the output folder, which can be evaluated (in metrics) using testEval.py.
    '''

    # Making folders from config
    # cfg.makeFolders()

    # Here you can specify the feature sets you would like to use. They are arranged as an array of arrays to enable combinations.
    features = [["DSM+1"]]
    #features = [["CONCEPTS"]]#['BOW'],
    #     features = [["CONCEPTS"]]

    # If you want anything set differently from the default, change the corresponding parameter in es (ExperimentSettings)
    es = ExperimentSettings()
    #     es.fs_varianceFilter = True
    #     es.bootstrap = True
    #     es.ss_prototyping = True
    #     es.weighInterAnnot = False
    #     es.ml_algorithm='RF'
    #remove these!
    #     es.removeDeniedConcepts=False
    #     es.splitFamilyConcepts=False
    #     es.splitUncertainConcepts=False

    # Reading the train/test_data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts,
                                        es.splitDeniedConcepts,
                                        es.removeUncertainConcepts,
                                        es.splitUncertainConcepts,
                                        es.removeFamilyConcepts,
                                        es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts,
                                       es.splitDeniedConcepts,
                                       es.removeUncertainConcepts,
                                       es.splitUncertainConcepts,
                                       es.removeFamilyConcepts,
                                       es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    # train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts)
    # test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_data, y_train) = m.get_bootstrapped_trainset(train_data,
                                                                y_train,
                                                                bootstrap_data,
                                                                es,
                                                                estimator,
                                                                th_bs=0.6)

        concatenated_data = []
        concatenated_data.extend(train_data)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_data, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[0:len(train_data)]
        test_feats = featurized[len(train_data):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, [i for i in range(len(train_data))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_data, train_bucket,
                                     es.weighInterAnnot)

        model = m.train(estimator, x_train, y_train, weights_train, model=None)

        y_pred = m.test(x_test, estimator=model)
        #         print(y_pred)
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data,
                        outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
Example #23
def main():
    # Define constants
    optimizer = 'adam'  # Can be 'adam', 'sgd', or 'rmsprop'
    activation = 'relu'  # Can be 'sigmoid', 'tanh', 'softplus', 'softsign', 'relu'
    datadir = 'data/IS09_emotion'  # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    # datadir = 'data/IS11_speaker_state'           # 'data/IS09_emotion' or 'data/IS11_speaker_state'
    n_hiddens = [100, 100]
    n_features = 100  # 382 features in total for IS09_emotion; 4354 for IS11_speaker_state
    fs_method = 'none'  # 'univ', 'rfe', 'lasso', 'l1norm' or 'none'
    C = 0.01
    rfe_step = 1
    n_epochs = 10
    bat_size = 10

    print('Start train and test ...')

    trn_file = datadir + '/' + 'cheavd_trn_fs.mat'
    tst_file = datadir + '/' + 'cheavd_tst_fs.mat'

    # Load trn and tst data
    trn_data = sio.loadmat(trn_file)
    tst_data = sio.loadmat(tst_file)

    # Retrieve training and test data
    x_train = np.array(trn_data['x'], dtype='float32')
    y_train = np.array(trn_data['y'].ravel(), dtype='int32')
    y_train_ohe = np_utils.to_categorical(y_train)
    x_test = np.array(tst_data['x'], dtype='float32')
    y_test = np.array(tst_data['y'].ravel(), dtype='int32')

    if fs_method == 'rfe':
        # Select features by RFE-SVM (step controls how many features are removed per iteration)
        x_train, x_test = fs.rfe_select_features(x_train,
                                                 y_train,
                                                 x_test,
                                                 n_fs=n_features,
                                                 penalty=C,
                                                 step=rfe_step)
    elif fs_method == 'l1norm':
        # Select features by L1-norm
        x_train, x_test = fs.l1norm_select_features(x_train,
                                                    y_train,
                                                    x_test,
                                                    penalty=C)
    elif fs_method == 'lasso':
        # Select features by LASSO
        x_train, x_test = fs.lasso_select_features(x_train,
                                                   y_train,
                                                   x_test,
                                                   alpha=0.001)
    elif fs_method == 'univ':
        x_train, x_test = fs.univ_select_features(x_train,
                                                  y_train,
                                                  x_test,
                                                  n_fs=n_features)
    elif fs_method == 'fdr':
        # Select features by FDR
        x_train, x_test = fs.fdr_select_features(x_train,
                                                 y_train,
                                                 x_test,
                                                 n_fs=n_features)
    elif fs_method == 'none':
        pass
    print('No. of selected features = %d, ' % x_train.shape[1], end='')

    # Train DNN
    model = train_dnn(x_train, y_train_ohe, n_hiddens, optimizer, activation,
                      n_epochs, bat_size)

    # Test DNN
    train_acc, _, _ = test_dnn(x_train, y_train, model)
    print('Training accuracy: %.2f%% ' % (train_acc * 100), end='', flush=True)

    test_acc, _, _ = test_dnn(x_test, y_test, model)
    print('Test accuracy: %.2f%% ' % (test_acc * 100))
Example #24
testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))

print("Train with all 5 features")
model.fit(xTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xTest)
testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))

############################

import FeatureSelection

print('### Get the Frequency Table')

frequencyTable = FeatureSelection.byFrequency(xTrainRaw)
print('Top 10')
for i in range(10):
    print(frequencyTable[i])

#############################

print('### Get the Mutual Information Table')

mutualInformationTable = FeatureSelection.byMutualInformation(
    xTrainRaw, yTrain)
print('Top 10')
for i in range(10):
    print(mutualInformationTable[i])

#############################
""" This script will read all the emails and it will train the classifier """


import os
from Email import *
from FeatureSelection import *
from NaiveBayesClassifier import *

trainPath = "dataset"
trainSet_emails = []

# create an email for every file we read
for f in os.listdir(trainPath):
    fileName = trainPath + '/' + f
    e = Email()
    if "spm" in fileName:
        e.setCategory("SPAM")
    else:
        e.setCategory("HAM")
    e.read(fileName)
    # insert the email we created into a collection of emails
    trainSet_emails.append(e)

# select features from our training set (automatic feature selection)
fs = FeatureSelection(trainSet_emails)
fs.selectFeatures()

# create a naive Bayes classifier and train it
nb = NaiveBayesClassifier()
nb.setEmails(trainSet_emails)
nb.train()
Example #26
    (xRaw, yRaw) = Assignment1Support.LoadRawData(kDataPath)

    # (xTrainRaw, yTrainRaw, xTestRaw, yTestRaw) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
    (xTrainRawOriginal, yTrainRawOriginal, xTestRawOriginal, yTestRawOriginal) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
    (xTrainRaw, yTrainRaw) = AddNoise.MakeProblemHarder(xTrainRawOriginal, yTrainRawOriginal)
    (xTestRaw, yTestRaw) = AddNoise.MakeProblemHarder(xTestRawOriginal, yTestRawOriginal)

    (xTrain, xTest) = Assignment1Support.Featurize(xTrainRaw, xTestRaw)
    yTrain = yTrainRaw
    yTest = yTestRaw

    ### Get the Mutual Information Words as features
    import FeatureSelection

    print('### Get the Mutual Information features')
    mutualInformationTable = FeatureSelection.byMutualInformation(xTrainRaw, yTrain)
    words = [word for word, _ in mutualInformationTable[:295]]
    (xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

    print('### Merge the features')
    xTrain = np.hstack([xTrain, xNewTrain])
    xTest = np.hstack([xTest, xNewTest])

    import RandomForest
    ############################

    print("========== Building one Model and output the accuracy ==========")

    model = RandomForest.RandomForest(num_trees=10, min_to_split=2, use_bagging=True, restrict_features=20)
    print("### Training with Random Forest")
    model.fit(xTrain, yTrain)

Example #27
# ========================= Classifying the metadata =============================== #

from DataPrep import data_return

X_data, class_names = data_return()
X_data = np.nan_to_num(X_data)
clf = SGDClassifier()
model = clf.fit(X_data, DataPrep.train_data[1])

# TODO: This has to be done on test_data
op = model.predict(X_data[None, 0, :])

accuracy = 0
Y_pred = {}
for i in range(len(DataPrep.train_data[1])):
    accuracy += int(model.predict(X_data[None, i, :]) == DataPrep.train_data[1][i])
    Y_pred[i] = model.predict(X_data[None, i, :])[0]
accuracy /= len(DataPrep.train_data[1])

cm = confusion_matrix(DataPrep.train_data[1], list(Y_pred.values()))

plt.figure()
# avoid shadowing sklearn's confusion_matrix with the plot handle
cm_plot = FeatureSelection.plot_confusion_matrix(cm, classes=class_names, normalize=True,
                                                 title='Normalized confusion matrix')
Example #28
def main():
    # ================================ Linear Regression Baseline ================================

    LinearRegression_baseline.main()  # All the datasets are loaded within the function itself

    # =============================================================================================

    models_baseline.main()

    # =============================================================================================
    models = np.array([
        LinearRegression, DecisionTreeRegressor, KNeighborsRegressor, Ridge,
        MLPRegressor, RandomForestRegressor, ElasticNet
    ])
    #models_names = ["LinearRegression", "DecisionTreeRegressor", "KNeighborsRegressor", "Ridge", "MLPRegressor", "RandomForestRegressor", "ElasticNet"]
    models_names = ["LR", "DTR", "KNR", "Ridge", "MLPR", "RFR", "EN"]
    stock = ["ICICI", "TATA", "VEDL", "REDDY"]
    model_count = models.shape[0]
    fig_no = np.zeros([4 * model_count])
    features = SelectKBest(f_regression)
    #train_X1,train_Y1,test_X1,test_Y1 = FeatureSelection.main(train_X,train_Y,test_X,test_Y)
    rmse = []
    #k_value = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12,13]

    #*********************************************ICICI data****************************************

    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_ICICI()  #load the data

        start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[i] = i
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)

        predictions = rand_search.predict(test_X)

        end_time = time.perf_counter() - start_time  # Calculate the speed
        filename = r"..\Speed\ICICI_" + "{}".format(
            models_names[i]) + "_time.txt"  # Write the speed to a file
        with open(filename, 'w') as target:
            target.write(str(end_time))

        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)  # Calculate RMSE
        filename = r"..\RMSE\ICICI_" + "{}".format(
            models_names[i]) + "_rmse.txt"  # Store RMSE to a file
        with open(filename, 'w') as target:
            target.write(str(RMSE))
        print("ICICI ", models_names[i], "  ", RMSE)
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[i], test_Y, predictions,
                                   models_names[i], stock[0])
    Graphs_plotting.bar_chart(101, rmse, models_names, stock[0], model_count)
    rmse = []
    # *********************************************TATA data*****************************************

    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_TATA()

        start_time = time.perf_counter()

        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[model_count + i] = model_count + i
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)

        end_time = time.perf_counter() - start_time
        filename = r"..\Speed\TATA_" + "{}".format(
            models_names[i]) + "_time.txt"
        with open(filename, 'w') as target:
            target.write(str(end_time))

        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)
        filename = r"..\RMSE\TATA_" + "{}".format(models_names[i]) + "_rmse.txt"
        with open(filename, 'w') as target:
            target.write(str(RMSE))
        print("TATA ", models_names[i], "  ", RMSE)
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[model_count + i], test_Y,
                                   predictions, models_names[i], stock[1])
    Graphs_plotting.bar_chart(102, rmse, models_names, stock[1], model_count)
    rmse = []
    # *********************************************VEDL data*****************************************

    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_VEDL()

        start_time = time.perf_counter()

        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[2 * model_count + i] = 2 * model_count + i
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)

        end_time = time.perf_counter() - start_time
        filename = r"..\Speed\VEDL_" + "{}".format(
            models_names[i]) + "_time.txt"
        with open(filename, 'w') as target:
            target.write(str(end_time))

        # if models[i] in [MLPRegressor, KNeighborsRegressor, RandomForestRegressor]:
        #     predictions = predictions - (predictions[0] - test_Y[0])  # normalizing the predicted values
        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)
        filename = r"..\RMSE\VEDL_" + "{}".format(models_names[i]) + "_rmse.txt"
        with open(filename, 'w') as target:
            target.write(str(RMSE))
        print("VEDANTA ", models_names[i], "  ", RMSE)
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[2 * model_count + i], test_Y,
                                   predictions, models_names[i], stock[2])
    Graphs_plotting.bar_chart(103, rmse, models_names, stock[2], model_count)
    rmse = []
    # *********************************************REDDY data*****************************************

    for i in range(0, model_count):
        train_X, train_Y, test_X, test_Y = data.load_REDDY()

        start_time = time.perf_counter()

        clf = models[i]()
        train_X, train_Y, test_X, test_Y, best_K = FeatureSelection.main(
            clf, train_X, train_Y, test_X, test_Y)
        k_value = [best_K]
        fig_no[3 * model_count + i] = 3 * model_count + i
        pipeline = Pipeline([('Kbest', features), ('model', clf)])
        hyper_parameters = hyper.main(models[i], model_count)
        hyper_parameters['Kbest__k'] = k_value
        rand_search = GridSearchCV(pipeline, param_grid=hyper_parameters)
        rand_search.fit(train_X, train_Y)
        predictions = rand_search.predict(test_X)

        end_time = time.perf_counter() - start_time
        filename = r"..\Speed\REDDY_" + "{}".format(
            models_names[i]) + "_time.txt"
        with open(filename, 'w') as target:
            target.write(str(end_time))

        MSE = mean_squared_error(test_Y, predictions)
        RMSE = math.sqrt(MSE)
        filename = r"..\RMSE\REDDY_" + "{}".format(
            models_names[i]) + "_rmse.txt"
        with open(filename, 'w') as target:
            target.write(str(RMSE))
        print("DR REDDY ", models_names[i], "  ", RMSE)
        rmse.append(RMSE)
        Graphs_plotting.line_graph(fig_no[3 * model_count + i], test_Y,
                                   predictions, models_names[i], stock[3])
    Graphs_plotting.bar_chart(103, rmse, models_names, stock[3], model_count)
Example #29
    (xRaw, yRaw) = Assignment1Support.LoadRawData(kDataPath)

    (xTrainRaw, yTrainRaw, xTestRaw,
     yTestRaw) = Assignment1Support.TrainTestSplit(xRaw, yRaw)
    yTrain = yTrainRaw
    yTest = yTestRaw

    print('========== Debug on raw data =========')

    num_trees = 10
    min_to_split = 12
    use_bagging = True
    restrict_features = 70
    print("========== Preprocess the Data ==========")
    (xTrainRawNormalize,
     xTestRawNormalize) = FeatureSelection.preprocess(xTrainRaw, xTestRaw)
    print('========== Merge Features ==========')
    print('Use 5 Hand Craft Words as Features')
    (xTrainHand, xTestHand,
     featuresName) = FeatureSelection.hand_craft_features(
         xTrainRaw, xTestRaw, 2)

    print('Use 70 Mutual Information Words as Features')
    model = RandomForest.RandomForest(num_trees=num_trees,
                                      min_to_split=min_to_split,
                                      use_bagging=use_bagging,
                                      restrict_features=restrict_features)
    mutualInformationTable = FeatureSelection.byMutualInformation(
        xTrainRawNormalize, yTrain)
    words = [word for word, _ in mutualInformationTable[:70]]
    (xTrainMI, xTestMI) = FeatureSelection.Featurize(xTrainRawNormalize,
                                                     xTestRawNormalize, words)
""" This script will read all the emails and it will train the classifier """

import os
from Email import *
from FeatureSelection import *
from NaiveBayesClassifier import *

trainPath = "dataset"
trainSet_emails = []

#create an email for every file we read
for f in os.listdir(trainPath):
    fileName = trainPath + '/' + f
    e = Email()
    if "spm" in fileName:
        e.setCategory("SPAM")
    else:
        e.setCategory("HAM")
    e.read(fileName)
    #insert the email we created to a collection of emails
    trainSet_emails.append(e)

#select features from our training set(automatic feature selection)
fs = FeatureSelection(trainSet_emails)
fs.selectFeatures()

#create a naive bayes classifier and train it
nb = NaiveBayesClassifier()
nb.setEmails(trainSet_emails)
nb.train()
Example #31
def evalCrossval(estimator,
                 data,
                 es=ExperimentSettings(),
                 nFolds=10,
                 printTree=False,
                 verbose=False,
                 random_seed=44):
    '''
    Calculate the average cross-validation score on the split train data to evaluate the performance of trained models
    @param estimator: the machine learning estimator
    @param data: the annotated data to cross-validate on
    @param es: the ExperimentSettings object with feature and sampling options
    @param nFolds: number of folds in k-fold cross validation
    '''

    # scores = cross_validation.cross_val_score(estimator, feats_train, labels_train, scoring='mean_absolute_error', cv=nFolds, verbose=1)
    # print("Average cross validation score (mean absolute error): ", np.average(scores))

    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels,
                                             n_folds=nFolds,
                                             shuffle=True,
                                             random_state=es.random_seed)

    min_max_scalar = MinMaxScaler()

    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    utils.out('Generated primary features!')

    #_, vocab = getFeats(data, ['MED'])
    #print(vocab)

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # making an 'inner data' set, in which we have a copy of the original data (makes sure we do not modify the original data)
        innerData = copy(data)

        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features)
        # These features should be generated within the loop, because some clustering might happen between samples (e.g. to determine which questions are 'regular')
        trainSet = [copy(innerData[idx]) for idx in train_bucket]
        generateDataDrivenFeats(trainSet, innerData, es)

        if verbose:
            utils.out('Generated data-driven features!')

        # Deriving the values for the trainset, also generating the vocabulary
        featurized = featurize(innerData)
        # Get all featurized documents by using the indices in the train and test buckets.
        train_feats = [featurized[idx] for idx in train_bucket]
        test_feats = [featurized[idx] for idx in test_bucket]

        # Do feature selection on train data
        y_train = [labels[idx] for idx in train_bucket]
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, train_bucket, es)

        vectorizer = DictVectorizer()
        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for test data.
        x_test = vectorizer.transform(test_feats)
        y_test = [labels[idx] for idx in test_bucket]

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        # calculate Inter-annotator weighting.
        weights_train = getWeights(data, train_bucket, es.weighInterAnnot)

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator, x_train, y_train, weights_train, model=None)

        #for part in model.estimators_:
        #graph = export_graphviz(part, out_file=None, feature_names=vectorizer.feature_names_)
        #selFeats = utils.find_between(graph, 'label="','gini')

        # output the importance of features
        try:
            indices = np.argsort(model.feature_importances_)[::-1]
            featImportances = [[
                vectorizer.feature_names_[x], model.feature_importances_[x]
            ] for x in indices]
        except Exception:
            featImportances = None

        y_pred = test(x_test, model)
        #print(y_pred)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(
                cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/", model,
                fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportances)

    return save_results(vectorizer, metrics, confMat, es, nFolds)
Example #32
def eval_bootstrapped_crossVal(estimator,
                               data,
                               bootstrap_data,
                               es=ExperimentSettings(),
                               nFolds=10,
                               printTree=False,
                               verbose=False,
                               th_bs=0.6,
                               random_seed=44):
    labels = [x.severity for x in data]
    folds = cross_validation.StratifiedKFold(labels,
                                             n_folds=nFolds,
                                             shuffle=True,
                                             random_state=es.random_seed)

    min_max_scalar = MinMaxScaler()

    metrics = defaultdict(list)
    confMat = None

    generatePrimaryFeats(data, es)
    generatePrimaryFeats(bootstrap_data, es)
    utils.out('Generated primary features!')

    # For each fold
    for fold_idx, fold in enumerate(folds):
        # making an 'inner data' set, in which we have a copy of the original data (makes sure we do not modify the original data)
        trainAndTestData = copy(data)

        train_bucket, test_bucket = fold

        # Generate data-driven features (meta-features)
        # These features should be generated within the loop, because some clustering might happen between samples (e.g. to determine which questions are 'regular')
        trainData = [copy(trainAndTestData[idx]) for idx in train_bucket]
        y_train = [labels[idx] for idx in train_bucket]

        (new_train_data,
         new_y_train) = get_bootstrapped_trainset(trainData, y_train,
                                                  bootstrap_data, es,
                                                  estimator, th_bs)

        testData = [copy(trainAndTestData[idx]) for idx in test_bucket]
        allData = new_train_data + testData
        generateDataDrivenFeats(new_train_data, allData, es)

        if verbose:
            utils.out('Generated data-driven features!')

        # Deriving the values for the trainset, also generating the vocabulary
        featurized = featurize(allData)
        # Get all featurized documents by using the indices in the train and test buckets.

        train_feats = featurized[0:len(new_train_data)]
        test_feats = featurized[len(new_train_data):len(featurized)]

        # Do feature selection on train data

        train_feats = fs.runFeatureSelection(train_feats, new_y_train, es)
        train_feats, new_y_train, new_train_bucket = ss.runSampleSelection(
            train_feats, new_y_train, [i for i in range(len(new_train_data))],
            es)

        vectorizer = DictVectorizer()
        # Fit and transform the train data.
        x_train = vectorizer.fit_transform(train_feats)
        # Same for test data.
        x_test = vectorizer.transform(test_feats)
        y_test = [labels[idx] for idx in test_bucket]

        new_weights_train = getWeights(new_train_data, new_train_bucket,
                                       es.weighInterAnnot)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        if verbose:
            utils.out("Running fold", fold_idx)

        model = train(estimator,
                      x_train,
                      new_y_train,
                      new_weights_train,
                      model=None)
        # output the importance of features
        indices = np.argsort(model.feature_importances_)[::-1]
        featImportance = [[
            vectorizer.feature_names_[x], model.feature_importances_[x]
        ] for x in indices]

        y_pred = test(x_test, model)

        if confMat is None:
            confMat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])
        else:
            confMat += confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

        if verbose:
            utils.out("Actual", y_test)
            utils.out("Predicted", y_pred)

        if printTree:
            save_decision_tree(
                cfg.PATH_DECISION_TREE + '_'.join(es.featTypes) + "/", model,
                fold_idx, vectorizer.get_feature_names())

        calc_and_append_scores(y_test, y_pred, metrics, featImportance)

    return save_results(vectorizer, metrics, confMat, es, nFolds)