def string_selection():
    # vectorizer and chi-squared feature selector
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(train_data['string-data'],
                                                                         train_data['target'], test_size=0.2,
                                                                         random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)
    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # build the model
    model = MultinomialNB().fit(x_train, y_train)

    # validate the model
    predicted = model.predict(x_test)
    print(metrics.accuracy_score(y_test, predicted))
def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
def test_mutual_info_regression():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
Example No. 4
File: ML.py Project: dlrsb/dream
def featureSelection(X, y, selection_method, estimator=None, num_features=None, feature_names=None, features_file=None):

    if selection_method == "kbest":
        sel = SelectKBest(f_regression, k=num_features).fit(X, y)
        return sel.get_support()
    elif selection_method == "from_model":
        sel = SelectFromModel(estimator, threshold=0.8)  # threshold still to be tuned
        sel.fit(X, y)
        return sel.get_support()
def pred_SOC(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    SOC_lassoed_vars = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func = f_regression, k = 200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    chosen2 =  []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x] | pvals2[x]:
            chosen2.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x]:
            lass_only.append(all_vars[x])    
    #randomforest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset.ix[:, chosen])
    gbr = GradientBoostingRegressor(n_estimators = 900,
            learning_rate = .0785, max_depth =1, random_state = 42, 
            verbose = 0, min_samples_leaf=4, subsample = .4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset.ix[:, chosen2])    
    # lasso
    #lass = Lasso(alpha=.00000025, positive=True)
    #lass.fit(train[all_vars], train['SOC'])
    #for dset in data:
    #    dset['SOC_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])
    # SVR
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train.ix[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset.ix[:, chosen])
    # combination
    models= ['SOC_rdg_prds', 'SOC_svr_prds',
              'SOC_gbr_prds', 'SOC_for_prds',  'SOC_svr_prds' ]
    name = 'SOC_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'SOC')
Example No. 6
def use(method):
    if method == 'naive bayes':
        estimators = [("skb", SelectKBest(score_func=f_classif)),('pca', PCA()),
                      ('bayes',GaussianNB())]
        clf = Pipeline(estimators)
        parameters = {"skb__k":[8,9,10,11,12],
                      "pca__n_components":[2,6,4,8]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_params_
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k=features_k)
        SKB_k.fit_transform(features_train_scaled, labels_train)
        print "features score: "
        print SKB_k.scores_
        features_selected = [features_list[1:][i] for i in SKB_k.get_support(indices=True)]
        print features_selected
    elif method == 'svm':
        estimators = [('reduce_dim', PCA()), ('svc', SVC())]
        clf = Pipeline(estimators)
        parameters = {'svc__C': [1,10]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_estimator_
    elif method == 'decision tree':
        estimators = [("skb", SelectKBest(score_func=f_classif)),('pca', PCA()),
                      ('tree', tree.DecisionTreeClassifier())]
        clf = Pipeline(estimators)
        parameters = {"tree__min_samples_split": [2,10],"skb__k":[8,9,10,11,12],
                      "pca__n_components":[2,4,6,8]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_params_
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k=features_k)
        SKB_k.fit_transform(features_train_scaled, labels_train)
        features_selected = [features_list[1:][i] for i in SKB_k.get_support(indices=True)]
        print features_selected
    accuracy = accuracy_score(labels_test, pred)
    print "accuracy score:"
    print accuracy
    calculate_precision_recall(pred, labels_test)
Example No. 7
def kbest_test():
    ''' Select K Best testing '''
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2

    enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))
    feature_list = ["poi", 
                "bonus",
                "deferral_payments",
                "deferred_income",
                "director_fees",
                "exercised_stock_options",
                "expenses",
                "from_messages",
                #"from_this_person_to_poi",
                #"from_poi_to_this_person",
                "loan_advances",
                "long_term_incentive",
                "other",
                "restricted_stock",
                "restricted_stock_deferred",
                "salary",
                "shared_receipt_with_poi",
                "to_messages",
                "total_payments",
                "total_stock_value"
                ]
    data = featureFormat(enron_data, feature_list)
    labels, features = targetFeatureSplit(data)
    
    # rescale features to be in [0..1] range
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)
    
    sk = SelectKBest(chi2, k=6) # f_classif
    data_new = sk.fit_transform(features_scaled, labels)
    #print data_new.shape
    
    feature_list_new = [x for x, y in 
        zip(feature_list, sk.get_support()) if y==True]
    print '--- Selected Features ---\r\n'
    print sk.get_support(True), "\r\n", feature_list_new

    feature_list_scores = zip(feature_list, sk.scores_)
    feature_list_scores = sorted(feature_list_scores, key=lambda k: k[1],
                                 reverse=True)
    print '--- All Features ---'
    for item in feature_list_scores:
        print item[0], ": ", "{0:.4f}".format(item[1])

    return
Example No. 8
def build_report(x_data, y_labels, classifier, cross_val_iterator, tfidf: TfidfVectorizer, features):
    cm = numpy.zeros((3, 3))
    f1 = precision = recall = accuracy = float()
    support = Counter(y_labels)
    filename = 'Bigrams + Unigrams/features.txt'


    # svd = TruncatedSVD(n_components=5000, random_state=42)
    # x_data = svd.fit_transform(x_data, y_labels)
    # try:
    # os.remove(filename)
    #  except FileNotFoundError:
    #      pass  #okay
    for i, (train, test) in enumerate(cross_val_iterator):
        x_train, x_test, y_train, y_test = x_data[train], x_data[test], y_labels[train], y_labels[test]

        selector = SelectKBest(chi2, k=features)
        selector.fit(x_train, y_train)

        x_train = x_train[:, selector.get_support()]
        x_test = x_test[:, selector.get_support()]

        y_pred = classifier.fit(x_train, y_train).predict(x_test)
        confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
        f1_measure = metrics.f1_score(y_test, y_pred, average='weighted')
        precision_sc = metrics.precision_score(y_test, y_pred, average='weighted')
        recall_sc = metrics.recall_score(y_test, y_pred, average='weighted')
        accuracy_sc = metrics.accuracy_score(y_test, y_pred)
        # with open(filename, 'a+') as fea_file:
        #     fea_file.write("***********************************\n")
        #
        #     features = tfidf.get_feature_names()
        #     selected_features = []
        #     selected_indices = selector.get_support()
        #     for i, selected in enumerate(selected_indices):
        #         if selected:
        #             selected_features.append(features[i])
        #     for feature in selected_features:
        #         fea_file.write(feature + ",")
        #     fea_file.write("CM: " + str(confusion_matrix) + " f1: " + str(f1_measure) + " Precision: " + str(
        #         precision_sc) + " Recall: " + str(recall_sc))

        cm += confusion_matrix
        f1 += f1_measure
        precision += precision_sc
        recall += recall_sc
        accuracy += accuracy_sc

    return (cm, f1 / cross_val_iterator.n_folds, precision / cross_val_iterator.n_folds,
            recall / cross_val_iterator.n_folds, support, accuracy / cross_val_iterator.n_folds)
Example No. 9
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        t = int(predictors.shape[1] * 0.40)
        t = 40  # overrides the 40% heuristic above with a fixed feature count
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    if selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
Example No. 10
def selectFeatureSet_anova(data_x, data_y, nFeatures):
    """
    Use cross-validation with nfolds < nsamples in test_x (i.e. nTestPerClass (default 10) * nClasses (eg 12))
    Select best features based on ANOVA for svm.
    """
    #1. Run SVM to get the feature ranking
    anova_filter = SelectKBest(f_regression, k= nFeatures)
    anova_filter.fit(data_x, data_y)
    print 'selected features in boolean: \n', anova_filter.get_support()
    print 'selected features in name: \n', data_x.columns[anova_filter.get_support()]
    
    #2. Select the top nFeatures features
    selectedCols = data_x.columns[anova_filter.get_support()]
    #3. Run SVM (or any other) again on this selected features
    return selectedCols
def Chi2(df, n):
  """Feature selection using Chi2 on the whole dataframe. 
  Chi2 measures the dependence between stochastic variables, this method 
  weeds out features that are most likely to be independent of class"""
  from sklearn.feature_selection import SelectKBest
  from sklearn.feature_selection import chi2

  X_all = df.drop('Class', axis=1).values  
  Y_all = df.loc[:, 'Class'].values

  # Set selection to chi2 with n to keep
  ch2 = SelectKBest(chi2, k=n)
  X_new = ch2.fit_transform(X_all, Y_all)
  index = ch2.get_support(indices=True)

  # Translate keep indices into the indices in the df
  fixed_index = []
  for i in index:
    new_i = i + 1
    fixed_index.append(new_i)
  fixed_index = [0] + fixed_index

  good = [df.columns[i] for i in fixed_index]
  
  print("Features selected using Chi2 feature selection: %s" % str(good))
  df = df.loc[:,good]
  return(df)
Example No. 12
def corr_matrix_of_important_words(term_doc_mat, word_list, scores, n_features_to_keep):
    selector = SelectKBest(k = n_features_to_keep).fit(term_doc_mat, scores)
    informative_words_index = selector.get_support(indices=True)
    labels = [word_list[i] for i in informative_words_index]
    data = pd.DataFrame(term_doc_mat[:,informative_words_index].todense(), columns=labels)
    data['Score'] = reviews.Score
    return data.corr()
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # nearest randomforest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])  
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models= [ 'pH_rdg_prds', 'pH_las_prds', 
              'pH_for_prds', 'pH_for_prds' ] 
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
def feature_select(word,instance_dic,feature_dic, thre_hold=0.01, num_feature=100):
    instances_list = instance_dic[word]
    feature_words=feature_dic[word]
    feature_xs = []
    labels = []

    for instance in instances_list:
        label = ' '.join(instance.senseid)
        feature_x_dic = feature_vector(instance,feature_words)
        feature_vals=[]
        for feature_word in feature_words:  # avoid shadowing the 'word' parameter used below
            feature_vals.append(feature_x_dic[feature_word])
        feature_xs.append(feature_vals)
        labels.append(label)

    # 1st round feature selection by removing low variance features
    sel_lowvr = VarianceThreshold(threshold=(thre_hold))
    feature_xs_selected = sel_lowvr.fit(feature_xs)
    lowvr_index = feature_xs_selected.get_support(indices=True).tolist()
    feature_xs_selected = feature_xs_selected.transform(feature_xs).tolist()



    # 2nd round feature selection using sklearn's SelectKBest()
    if num_feature < len(feature_xs_selected[0]):
        sel_chi2 = SelectKBest(chi2, k= num_feature).fit(feature_xs_selected, labels)
        chi2_index= sel_chi2.get_support(indices=True).tolist()
        #feature_xs_selected = sel_chi2.transform(feature_xs_selected).tolist()# transform from numpy array back to lis
        return lowvr_index, chi2_index
    else:
        print str(word) + ": chi2 selection not executed due to low # of features"
        return lowvr_index, [i for i in range(len(lowvr_index))]
Example No. 15
    def train_and_test(self, train_file, test_file):
        lines = read_text_src(train_file)
        lines = [x for x in lines if len(x) > 1]
        X_train = [line[1] for line in lines]
        y_train = [line[0] for line in lines]

        # lines = read_text_src(test_file)
        # lines = [x for x in lines if len(x) > 1]
        # X_test = [line[1] for line in lines]
        # y_test = [line[0] for line in lines]

        vectorizer = CountVectorizer(tokenizer=zh_tokenize)  # ngram_range=(1,2)

        X_train = vectorizer.fit_transform(X_train)
        print type(X_train)
        # X_test = vectorizer.transform(X_test)
        word = vectorizer.get_feature_names()
        v = len(word)
        get_bn_ratios(X_train,y_train,v)

        N = X_train.shape[1]
        ch2 = SelectKBest(chi2, k=int(N * 0.2))
        X_train = ch2.fit_transform(X_train, y_train)
        feature_names = [word[i] for i
                         in ch2.get_support(indices=True)]
Example No. 16
File: svm.py Project: lkprof/sema
def svm():
    #load data
    x_train,y_train=load_svmlight_file("12trainset")
    x_train.todense()
    x_test,y_test=load_svmlight_file("12testdata")
    x_test.todense()
    sk=SelectKBest(f_classif,9).fit(x_train,y_train)
    x_new=sk.transform(x_train)
    x_newtest=sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())
    #classfier
    clf=SVC(C=2,gamma=2)
    ovrclf=OneVsRestClassifier(clf,-1)
    ovrclf.fit(x_train,y_train)
    y_pred=ovrclf.predict(x_test)
    # write result
    with open("result.txt","w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st)+'\n')
    print(np.array(y_pred).shape)

    target_names=['0','1','2','3']
    #result
    #sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    #print(classification_report(y_test,y_pred,target_names=target_names))
    #print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array) #dbg
    # print "X_train_array[0] length: ", len(X_train_array[0]) #dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names,chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
Example No. 18
def choseFeature(TrainX, TrainY, TestX):
	cF = SelectKBest(chi2, k=100)
	cF.fit(TrainX, TrainY)
	check = cF.get_support()
	newTrainX = cF.transform(TrainX)
	newTestX = cF.transform(TestX)
	return (newTrainX, newTestX)
Example No. 19
def featureSelectionSelectKBest(data, Featurenumber):
    label = data[:,1]
    datanew = data[:,2:]
    for i in range(0,len(datanew)):
        datanew[i] = map(abs, datanew[i])
    size = Featurenumber
    selector = SelectKBest(chi2, k=size).fit(data[:,2:],data[:,1])
    print selector.get_support(True)
    X_new = selector.fit_transform(datanew, label)
    data[:,2:size+2] = X_new
    fd = open('History.txt','a')
    history = 'Feature Selection: SelectKBest' + '\n' + 'Selected Feature: ' + str(selector.get_support(True)) + '\n'
    fd.write(history)
    fd.close()

    return data[:,:size+2]
Example No. 20
def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="k_best", param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
Example No. 22
def discriminatory_features():
	print 'Finding most discriminatory features...'

	NUM_FEATURES = 10

	all_points = class1_song_points + class2_song_points
	true_labels = [0]*len(class1_song_points)+[1]*len(class2_song_points)

	feature_indices = []
	for i in range(NUM_FEATURES):
		selector = SelectKBest(chi2, i+1)
		selector.fit(all_points, true_labels)

		new_indices = selector.get_support(indices=True)
		for index in new_indices:
			if index not in feature_indices:
				feature_indices.append(index)

	feature_descriptions = []

	for index in feature_indices:
		feature = feature_names[index]
		if feature.lower() in wsj_mapping.keys():
			key = wsj_mapping[feature.lower()]
			description = key + ': ' + wsj_to_description[key]
		elif feature in word_vocab:
			description = 'The word: ' + feature
		else:
			description = feature
		feature_descriptions.append(description)
	return jsonify(features=feature_descriptions)
Example No. 23
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
Example No. 24
 def univariate_features_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     selector = SelectKBest(chi2, k=10)
     selector = selector.fit(x, y)
     selected_features = self.features[selector.get_support()]
     print(selected_features)
     x = selector.transform(x)
     return x
    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
        Tk.Frame.__init__(self, master)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.evaluator = evaluator
        self.df = df
        self.console = console

        frame_train = Tk.Frame(self)
        frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)
        plt.figure(figsize=(12, 20))
        plt.subplot(111)
        
        # k best feature's names
        plt.figure(figsize=(12, 8))
        plt.subplot(111)
        selection = SelectKBest(f_classif, k=3)
        selection.fit(self.x_train, self.y_train)
        feature_scores = selection.scores_
        feature_names = df.columns.values
        feature_names = feature_names[feature_names != "NSP"]
        kbest_feature_indexes = selection.get_support()
        kbest_feature_names = feature_names[kbest_feature_indexes]

        # store scores and feature names as a DataFrame
        rec = zip(feature_scores, feature_names)
        data = pd.DataFrame(rec, columns=["Score", "Feature"])

        sns.barplot(x="Feature", y="Score", data=data)
        plt.xticks(rotation=-90)
        plt.title("Cardiotocography Feature Scores Ranking")
        self.attach_figure(plt.gcf(), frame_train)
Example No. 26
def main():
    inp = open('C:/Users/Abhi/workspace/MalwareClassification/ASMTRAINFULLDATA.csv','r')
    trainData = inp.readlines()
    trainData = trainData[2:]
    td=[]
    print len(trainData)
    for line in trainData:
        td.append(line.split(','))
    out = []    
    #print len(td[2])
    for i in range(len(td)):
        out.append(int(td[i][1]))
        td[i] = td[i][2:-1]
        for j in range(len(td[0])):
            td[i][j] = int(td[i][j])
    
    '''for i in range(len(td)):
        nConstant = sum(td[i])
        for j in range(len(td[0])):
            td[i][j] =td[i][j]/nConstant
    '''        
    
    #print td[0]        
            
    #print len(td[0])
    clf = SelectKBest(k=100)
    b = clf.fit_transform(td,out)
    #print b[0]
    j =clf.get_support(indices =True)
    #print len(b), len(b[0])
    #print j
    '''k=0
Example No. 27
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:#opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()
    
    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []

    #for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty,dual=False, tol=1e-3)
    results.append(benchmark(clf))
        
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
    def _calculate(measurements):
        # Initialize classifiers
        classifiers = dict()

        # Create classifier for each model
        for key in measurements:
            # Initialize model
            classifiers[key] = { "models": dict(), "features": [] }
            vec = DictVectorizer()

            # Set vectorizer to use only selected features
            features = vec.fit_transform(measurements[key][0])
            # Init feature selection and use it
            support = SelectKBest(chi2, k=10).fit(features, measurements[key][1])
            vec.restrict(support.get_support()) 

            # Assign used features
            classifiers[key]["features"] = vec.get_feature_names()

            # Get selected features data
            data = vec.transform(measurements[key][0]).toarray()

            # We need to split these data to create learning and testing set
            X_train, X_test, y_train, y_test = train_test_split(data, measurements[key][1])
            
            # Fit all models
            classifiers[key]["models"] = ModelService._createModels(X_train, X_test, y_train, y_test)

        # Return result
        return classifiers
Example No. 29
def get_k_best(x,y, k=300):
    '''
    return the names of the k best features
    '''
    sk = SelectKBest(f_classif, k=k)
    sk.fit_transform(x,y)
    return x.columns[sk.get_support()]
Example No. 30
def predict_mulitple_subgraphs(X_original,y):

    time_start = time.time()
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original,y)
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None, max_features='auto',   min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    #rforest = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,  kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False)

    #rforest = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')

    #rforest = DecisionTreeClassifier( criterion='gini', min_samples_leaf=1, min_samples_split=2, random_state=None, splitter='best')

    skb = SelectKBest(f_classif, k=80).fit(X_original,y)
    X = skb.transform(X_original)  # already fitted above, so a plain transform is enough
    print (skb.get_support(indices=False))

    rforest.fit(X,y)
    #my_get_fp_fn_inter(rforest,X,y)

    #m_secs =(time.time() - time_start)*1000
    #print ("training mi-seconds {}".format(m_secs))

    f_test_new_released = 'apks/'
 
    files = get_filepaths(f_test_new_released)

    subgraph_property(files, rforest, skb)
Example No. 31
print(newdf_test['label'].value_counts())

X_Probe=newdf.drop('label',1)
Y_Probe=newdf.label
X_Probe_test = newdf_test.drop('label',1)
Y_Probe_test = newdf_test.label

colNames=list(X_Probe)

from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import f_classif
np.seterr(divide='ignore', invalid='ignore');
fclass = SelectKBest(f_classif, k = 55)  # k was tuned from 1 to 120; accuracy peaked at k=55
fclass.fit(X_Probe , Y_Probe)
true=fclass.get_support()
fclasscolindex_Probe=[i for i, x in enumerate(true) if x]
fclasscolname_Probe=list(colNames[i] for i in fclasscolindex_Probe)
print('Features selected :',fclasscolname_Probe)

features = newdf[fclasscolname_Probe].astype(float)
features1 = newdf_test[fclasscolname_Probe].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.svm import LinearSVC
clf = LinearSVC(random_state = 0)
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print ("Classifier trained in {} seconds".format(round(tt,3)))
Example No. 32
def class33(X_train, X_test, y_train, y_test, i, X_1k, y_1k):
    ''' This function performs experiment 3.3

    Parameters:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier (from task 3.1)
       X_1k: numPy array, just 1K rows of X_train (from task 3.2)
       y_1k: numPy array, just 1K rows of y_train (from task 3.2)
    '''
    clf_index = {1: SVC(kernel="linear", max_iter=1000), 2: SVC(kernel="rbf", gamma=2, max_iter=1000),
                 3: RandomForestClassifier(max_depth=5, n_estimators=10), 4: MLPClassifier(alpha=0.05),
                 5: AdaBoostClassifier()}

    clf = clf_index[i]
    csv = open("a1_3.3.csv", "w+")
    count = 0
    b1_feat = []
    best_feat = []
    for data in [(X_1k, y_1k), (X_train, y_train)]:
        for k in [5, 10, 20, 30, 40, 50]:
            selector = SelectKBest(f_classif, k)
            selector.fit_transform(data[0], data[1])
            pp = selector.pvalues_
            indexes = selector.get_support()
            best = pp[indexes]

            # top features of 1k training set
            if count == 0:
                if k == 5:
                    print("len", len(indexes.tolist()))
                    print("indexes", indexes.tolist())
                    indexes = indexes.tolist()
                    for index in range(0, len(indexes)):
                        if indexes[index] is True:
                            b1_feat.append(index)

            # top features for 32k training set
            elif count == 1:
                if k == 5:
                    print("len", len(indexes.tolist()))
                    print("indexes", indexes.tolist())
                    indexes = indexes.tolist()
                    for index in range(0, len(indexes)):
                        if indexes[index] is True:
                            best_feat.append(index)
                csv.write(str(k))
                for p in best:
                    csv.write("," + str(p))
                csv.write("\n")
        count += 1
    print("best 5 features", best_feat, b1_feat)
    X_1k_best = np.zeros((1000, 5))
    X_test_best = np.zeros((8000, 5))
    X_train_best = np.zeros((32000, 5))
    for j in range(5):
        for i in range(0, len(X_test)):
            X_test_best[i][j] = X_test[i][best_feat[j]]
        for i in range(0, len(X_train)):
            X_train_best[i][j] = X_train[i][best_feat[j]]
        for i in range(0, len(X_1k)):
            X_1k_best[i][j] = X_1k[i][best_feat[j]]

    clf.fit(X_1k_best, y_1k)
    result = clf.predict(X_test_best)
    print("result len", len(result))
    csv.write(str(accuracy(confusion_matrix(y_test, result))) + ",")
    clf.fit(X_train_best, y_train)
    result = clf.predict(X_test_best)
    csv.write(str(accuracy(confusion_matrix(y_test, result))) + "\n")

    csv.write("liwc_sexual, receptiviti_cautious, receptiviti_type_a are the common best features in both low and high"
              "amounts of data. We can see that the cautious feature may be a good indicator since people in "
              "different political groups may be more wary of some topics, hence are more cautious. Or it can "
              "possibly be an indicator that conspiracy theorists correlate to certain parties.\n")
    csv.write("P values are generally higher given more data. This may be because there is less bias a set of data"
              "can have towards particular features.\n")

    csv.write("liwc_sexual, receptiviti_cautious,receptiviti_type_a, number of commas, number of common nouns are the top 5"
              "features for the 32K training case. This seems to suggest that different parties tends to have "
              "different speech habits since the features are so diverse. This makes sense since different parties "
              "would attract a specific type of demographic, as such they may be more prone to use a similar tone and"
              "sentence structure.")
Example No. 33
# Ref: http://stackoverflow.com/questions/25792012/feature-selection-using-scikit-learn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# 'try' for second exploration on feature selection, with new features
# Here SelectKBest will pick 4 features from the 5 features that were picked
# from the previous analysis.
features_list_try_2 = [
    'poi', 'exercised_stock_options', 'expenses', 'fraction_from_poi',
    'fraction_to_poi', 'restricted_stock'
]
data_try_2 = featureFormat(data_dict, features_list_try_2, sort_keys=True)
labels_try_2, features_try_2 = targetFeatureSplit(data_try_2)
selector = SelectKBest(f_classif, k=4)
features_try_2_selected = selector.fit_transform(features_try_2, labels_try_2)
# Ref: http://stackoverflow.com/questions/21471513/sklearn-selectkbest-which-variables-were-chosen
features_selected_indices = selector.get_support(
    indices=True) + 1  # Since I will retrieve them from
# 'features_list_try_2', which
# contains 'poi' as first entry
print "Features selected by 'SelectKBest':\n", features_list_try_2[
    features_selected_indices[0]]
print features_list_try_2[features_selected_indices[1]]
print features_list_try_2[features_selected_indices[2]]
print features_list_try_2[features_selected_indices[3]]
print

# *******************************************************************
# Now I will explore Principal Component Analysis with a set of features that take
# into account all of the features in the first set, plus the new created features.
# The principal components will not be any of the original features, but a linear
# combination of them.
# I still want to see if I can gain any further insight with the results.
Example No. 34
    remove = []
    for col in X.columns:
        if X[col].std() == 0:
            remove.append(col)

    X.drop(remove, axis=1, inplace=True)
    test.drop(remove, axis=1, inplace=True)

    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif

    selectK = SelectKBest(f_classif, k=220)
    selectK.fit(X, y)
    X_sel = selectK.transform(X)

    features = X.columns[selectK.get_support()]
    print(features)
    sel_test = selectK.transform(test)
    X, y, X_submission = np.array(X_sel), np.array(
        y.astype(int)).ravel(), np.array(sel_test)

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    clfs = [
        RandomForestClassifier(n_estimators=100,
                               n_jobs=-1,
                               criterion='gini',
                               class_weight='balanced'),
print("训练数据集样本数目:%d, 测试数据集样本数目:%d" % (x_train.shape[0], x_test.shape[0]))

ss = MinMaxScaler()

x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)

print("原始数据各个特征属性的调整最小值:", ss.min_)
print("原始数据各个特征属性的缩放数据值:", ss.scale_)

ch2 = SelectKBest(chi2, k=3)

x_train = ch2.fit_transform(x_train, y_train)
x_test = ch2.transform(x_test)

select_name_index = ch2.get_support(indices=True)
print("对类别判断影响最大的三个特征属性分布是:", ch2.get_support(indices=False))

pca = PCA(n_components=2)

x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

model = DecisionTreeClassifier(criterion='entropy')
model.fit(x_train, y_train)
y_test_hat = model.predict(x_test)

from sklearn.externals.six import StringIO
with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(model, out_file=f)
            trainX = X[0:1240, :]
            trainY = Y[0:1240]
        else:
            testX = X[subNo * trialNum:(subNo + 1) * trialNum, :]
            testY = Y[subNo * trialNum:(subNo + 1) * trialNum]
            trainX = np.vstack(
                (X[0:subNo * trialNum, :],
                 X[(subNo + 1) * trialNum:subNum * trialNum, :]))
            trainY = np.concatenate(
                (Y[0:subNo * trialNum],
                 Y[(subNo + 1) * trialNum:subNum * trialNum]))

        # three feature selection method...
        # method 1
        sel_criteria1 = SelectKBest(chi2, k=num_k).fit(trainX, trainY)
        sel_indx1_mask = sel_criteria1.get_support()
        sel_indx1 = np.where(sel_indx1_mask == True)
        sel_indx1 = sel_indx1[0]
        trainX1 = trainX[:, sel_indx1]
        testX1 = testX[:, sel_indx1]
        # svm
        clf1 = svm.SVC(kernel='linear')
        clf1.fit(trainX1, trainY)
        predict_testY1 = clf1.predict(testX1)
        f1_scores[no_k, 0, subNo] = metrics.f1_score(testY, predict_testY1)
        acc_scores[no_k, 0,
                   subNo] = metrics.accuracy_score(testY, predict_testY1)
        print('current sub performance:', acc_scores[no_k, 0, subNo],
              ' kbest:', num_k, ' selection_method:', 1)

        # method 2
Example No. 37
def ANN():
    digits = load_digits()
    data_features = digits.data[:, 0:-1]
    label = digits.data[:, -1]
    ylim = None

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = train_test_split\
        (data_features, label, test_size=0.3, random_state=0,
                     stratify=label)

    feature_columns = pd.DataFrame(data=digits_trainingX).columns

    #clf = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(63,), random_state=1,
                        #solver='adam')
    #clf.fit(digits_trainingX, digits_trainingY)
    #y_pred = clf.predict(digits_testingX)

    kb = SelectKBest(score_func=f_regression, k=45)
    kb.fit(digits_trainingX, digits_trainingY)
    mask = kb.get_support()
    chosen_features = []

    for bool, feature in zip(mask, feature_columns):
        if bool:
            chosen_features.append(feature)

    #indices = np.argsort(kb.scores_)[::-1]
    #selected_features = []
    #for i in range(63):
        #selected_features.append(pd.DataFrame(data=digits_trainingX).columns[indices[i]])

    df = pd.DataFrame(data=digits_trainingX)
    df = df[chosen_features]
    digits_trainingX = df.to_numpy()

    df2 = pd.DataFrame(data=digits_testingX)
    df2 = df2[chosen_features]
    digits_testingX = df2.to_numpy()

    #digits_trainingX = digits_trainingX[chosen_features]
    clf = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(45,), random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)



    train_sizes = np.linspace(.1, 1.0, 5)

    # ======================== CITATION BELOW ==============================================#
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    cv = None
    n_jobs = None
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(clf, digits_trainingX, digits_trainingY, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title('Control Curve')
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    # ======================== CITATION ABOVE ==============================================#

    optimizers = ['lbfgs', 'sgd', 'adam']
    max_iters = [100, 200, 500]
    batch_size = [5, 10, 100]
    seed = 52

    #for i in range(63):
        #selected_features.append(pd.DataFrame(data=digits_trainingX).columns[indices[i]])

    #plt.figure()
    #plt.bar(selected_features, kb.scores_[indices[range(63)]], color='r', align='center')
    #plt.xticks(rotation=45)
    #plt.xlabel('features')
    #plt.ylabel('score')

    param_grid = dict(solver=optimizers, max_iter=max_iters, batch_size=batch_size)

    grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=KFold(random_state=seed), verbose=10,
                        scoring='accuracy')
    grid_results = grid.fit(digits_trainingX, digits_trainingY)
Example No. 38
    headline = fin.readline()

    for line in fin:
        row = line.strip().split('\t')

        X.append([float(x) if x != '' else 0.0 for x in row[1:-1]])
        Y.append(float(row[-1]))

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.20,
                                                        random_state=0)

    ch2 = SelectKBest(mutual_info_regression, k=10)
    ch2.fit(X_train, Y_train)

    selected_features = ch2.get_support(indices=True)

    row = headline.split('\t')
    fout.write(data)

    for feature in range(len(selected_features)):
        fout.write('\t' +
                   row[feature +
                       1])  #+1 because the compound name is the first column

    fout.write('\n')

fout.close()
Example No. 39
def main():

    print("Validating Connected IoT Devices!")
    DM.dm_engine()

    DM.block_all_ips()

    # Importing the dataset
    dataset = pd.read_csv('/home/pi/Software/IoT-HASS/CICIDS2017_Sample.csv')

    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, 78].values

    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)

    ############## Start of Feature Scaling ###################
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Fitting Decision Tree Classification to the Training set
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)

    # Feature Selection
    from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2

    KBestSelector = SelectKBest(k=5)
    KBestSelector = KBestSelector.fit(X_train, y_train)
    X_train_FS = KBestSelector.transform(X_train)

    names = dataset.iloc[:, :-1].columns.values[KBestSelector.get_support()]
    scores = KBestSelector.scores_[KBestSelector.get_support()]
    names_scores = list(zip(names, scores))
    ns_df = pd.DataFrame(data=names_scores, columns=['Feat_Name', 'F_Score'])
    ns_df_sorted = ns_df.sort_values(['F_Score', 'Feat_Name'])
    #print(ns_df_sorted)

    # Fit the model with the new reduced features
    classifier.fit(X_train_FS, y_train)

    # Predicting the Test set results
    X_test_FS = KBestSelector.transform(X_test)
    y_pred = classifier.predict(X_test_FS)

    conn = socket.socket(socket.AF_PACKET, socket.SOCK_RAW, socket.ntohs(3))

    # define array variables to hold time and statistics
    TimeBetBwdPkts = 0
    NumBwdPkts = 0
    NumIdleFlow = 0
    prev_fin_flag = 0
    flow_idle_start_time = datetime.datetime.now()
    flow_idle_end_time = datetime.datetime.now()
    AllTimesBetBwdPkts = []
    AllflowIdleTimes = []
    AllPacketLengths = []

    max_biat = 0
    mean_biat = 0
    std_biat = 0
    pkt_len_varience = 0
    std_idle = 0

    while True:
        raw_data, addr = conn.recvfrom(65535)
        dest_mac, src_mac, eth_proto, data = unpack_ethernet_frame(raw_data)

        # get packet length or size
        packet_length = len(raw_data)
        AllPacketLengths.append(packet_length)

        # IPv4
        if eth_proto == 8:
            (version, header_length, ttl, proto, src, target,
             data) = ipv4_packet_header(data)

            # TCP packet
            if proto == 6:
                (src_port, dest_port, sequence, acknowledgement, flag_urg,
                 flag_ack, flag_psh, flag_rst, flag_syn, flag_fin,
                 data) = unpack_tcp_segment(data)

                # capture packet flow
                # we will identifiy each flow by determining when src and dst ip change

                # first capture the original src and dst IPs
                prev_src_ip = src
                prev_target_ip = target

                if flag_fin == '1' and prev_fin_flag == '0':
                    flow_idle_start_time = datetime.datetime.now()
                    NumIdleFlow = NumIdleFlow + 1
                elif flag_fin == '0' and prev_fin_flag == '1':
                    flow_idle_end_time = datetime.datetime.now()
                else:
                    flow_idle_start_time = datetime.datetime.now()
                    flow_idle_end_time = datetime.datetime.now()

                prev_fin_flag = flag_fin

                flowIdleTime = (flow_idle_end_time -
                                flow_idle_start_time).microseconds

                AllflowIdleTimes.append(flowIdleTime)

                LastTimeBwdPktSeen = datetime.datetime.now()

                if (NumBwdPkts == 1):
                    TimeBetBwdPkts = 0
                elif (NumBwdPkts > 1):
                    TimeBetBwdPkts = (datetime.datetime.now() -
                                      LastTimeBwdPktSeen).microseconds
                else:
                    TimeBetBwdPkts = 0

                NumBwdPkts = NumBwdPkts + 1
                AllTimesBetBwdPkts.append(TimeBetBwdPkts)

            # get statistics values for backwards packets
            if sum(AllTimesBetBwdPkts) == 0:
                mean_biat = 0
                max_biat = 0
                std_biat = 0
            else:
                mean_biat = stats.mean(AllTimesBetBwdPkts)
                max_biat = max(AllTimesBetBwdPkts)
                std_biat = stats.stdev(AllTimesBetBwdPkts)

            if (sum(AllflowIdleTimes) > 0 and len(AllflowIdleTimes) > 1):
                std_idle = stats.stdev(AllflowIdleTimes)

            else:
                std_idle = 0

            if (sum(AllPacketLengths) > 0 and len(AllPacketLengths) > 1):
                pkt_len_varience = stats.variance(AllPacketLengths)
            else:
                pkt_len_varience = 0

            # Invoking iot_hass() function
            iot_hass(mean_biat, std_biat, max_biat, pkt_len_varience, std_idle,
                     src, target, classifier, dest_mac, src_mac, raw_data)
Example No. 40
def accuracy(new_features,features_list):
    #Feature List
    features_list = monta_feature(features_list)
    if new_features == False:
        print features_list
        
    else:
        features_list = nova_feature(data_dict_woo, features_list)
        print ""
        print features_list
        print "Testando novos features adicionados:\n"
        testa_nova_feature(data_dict_woo, "DIETRICH JANET R", features_list[-2:])
        
    # Extracting the features and labels from the dataset
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)    
        
    features_train, features_test, labels_train, labels_test =     train_test_split(features, labels, test_size=0.3, random_state=42)
    print ""
    
    # Criando Min/Max Scaler
    from sklearn import preprocessing
    scaler = preprocessing.MinMaxScaler()
    # Scale Features
    features = scaler.fit_transform(features)
    
    skbest = SelectKBest(k=10)  # try best value to fit
    sk_trans = skbest.fit_transform(features_train, labels_train)
    indices = skbest.get_support(True)
    
    print "="*10,"skbest.scores_","="*10
    print skbest.scores_
    print "="*10, "="*(len("skbest.scores_")-2),"="*10
    print ""
    
    print "="*10,"features - score","="*10
    for index in indices:
        print 'features: %s score: %f' % (features_list[index + 1], skbest.scores_[index])
        
    print "="*10, "="*(len('features: %s score: %f')-2),"="*10
    print ""
    
    #print "GaussianNB"
    # GaussianNB
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print "Accuracy GaussianNB  = {:.5f}".format(accuracy_score(prediction, labels_test))
    
    #print "KNeighborsClassifier"
    # KNeighborsClassifier
    clf = KNeighborsClassifier()
    clf = KNeighborsClassifier(algorithm = 'auto',leaf_size = 20,n_neighbors = 3,weights = 'uniform')
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print "Accuracy KNeighborsClassifier  = {:.5f}".format(accuracy_score(prediction, labels_test))
    
    #print "SVC"
    # SVC
    clf = SVC(kernel = 'linear',max_iter = 10000,random_state = 42)
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print "Accuracy SVC = {:.5f}".format(accuracy_score(prediction, labels_test))
    
    #print "AdaBoostClassifier"
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, min_samples_leaf=2, class_weight='balanced'),
                             n_estimators=50, learning_rate=.8)
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print "Accuracy AdaBoostClassifier = {:.5f}".format(accuracy_score(prediction, labels_test))
Exemplo n.º 41
0
def chi_square(X, y, numOfFeatures = 'all'):
	X_Norm = MinMaxScaler().fit_transform(X)
	selector = SelectKBest(score_func=chi2, k=numOfFeatures).fit(X_Norm, y)
	cols = selector.get_support(indices = True).tolist()
	x_new = selector.transform(X)
	return x_new, cols
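# Hypothetical usage with a small synthetic dataset (illustrative only); chi2
# requires non-negative inputs, which the MinMaxScaler inside chi_square takes
# care of.  The imports below are the ones chi_square itself relies on:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=8, random_state=0)
x_new_demo, kept_cols = chi_square(X_demo, y_demo, numOfFeatures=3)
print(kept_cols)  # indices of the columns kept by the chi-squared test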
Exemplo n.º 42
0
    for ii in i_train:
        features_train.append(features[ii])
        labels_train.append(labels[ii])
    for jj in i_test:
        features_test.append(features[jj])
        labels_test.append(labels[jj])

    # print features_train
    # fit the selector to the training set
    selector = SelectKBest(k=k)
    selector.fit(features_train, labels_train)

    #     print selector.scores_
    #     print selector.get_support(indices = True)

    for i, j in zip(selector.get_support(indices=True), selector.scores_):
        best_features.append(features_list[i])
        best_scores.append(j)
# print best_features

from collections import defaultdict
d = defaultdict(int)

for idx, score in enumerate(best_scores):
    if idx > k - 1:
        idx = idx % k

    d[idx] += score

#     print d
# for i in best_features:
Exemplo n.º 43
0
tree_best_cols = []
for e in zip(X.columns[1:], clf.feature_importances_):
    if e[1] > 0.005:
        print(e)
        tree_best_cols += [e[0]]

X_tree_best = df[tree_best_cols]

X_tree_best.hist(bins=50, figsize=(10, 10))
plt.show()
X_tree_best.boxplot(figsize=(10, 8))
plt.show()

### SelectKBest univariate feature selection (ANOVA F-test)
sel = SelectKBest(f_classif, k=40).fit(X, y)
kbest_cols = X.columns[sel.get_support()]
X_kbest = df[kbest_cols]

X_kbest.hist(bins=50, figsize=(10, 10))
plt.show()
X_kbest.boxplot(figsize=(10, 8))
plt.show()

### PCA dimensionality reduction
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
print(pca.explained_variance_ratio_.sum())  # total variance captured by the 10 components

#################################################
### Train / test set split & performance measures
#################################################
Exemplo n.º 44
0
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
'''
SelectKBest(score_func=<function f_classif>, k=10)
Select features according to the k highest scores.

'''

breastData = load_breast_cancer()
X = breastData.data
y = breastData.target

print('original number of features =', X.shape[1])

FeatureSelectionMethod = SelectKBest(score_func=chi2, k=5)
new_X = FeatureSelectionMethod.fit_transform(X, y)

print('number of features after selection =', new_X.shape[1])

print('selected feature mask:')
print(FeatureSelectionMethod.get_support())
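# The boolean mask above can be mapped back to the original column names, e.g.:
selected_names = [name for name, keep in
                  zip(breastData.feature_names, FeatureSelectionMethod.get_support())
                  if keep]
print('selected feature names:', selected_names)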
Exemplo n.º 45
0
print(newdf_test['label'].value_counts())

X_DOS = newdf.drop('label', axis=1)
Y_DOS = newdf.label
X_DOS_test = newdf_test.drop('label', axis=1)
Y_DOS_test = newdf_test.label

colNames = list(X_DOS)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
chi2f = SelectKBest(
    chi2,
    k=119)  # k was tuned by iterating from 1 to 120; accuracy peaked at k=119 (see the sweep sketch after this snippet)
chi2f.fit(X_DOS, Y_DOS)
true = chi2f.get_support()
chicolindex_DOS = [i for i, x in enumerate(true) if x]
chicolname_DOS = list(colNames[i] for i in chicolindex_DOS)
print('Features selected :', chicolname_DOS)

features = newdf[chicolname_DOS].astype(float)
features1 = newdf_test[chicolname_DOS].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))
Exemplo n.º 46
0
#%% split training and testing dataset
df = df.drop(labels='amount', axis=1)
msk = np.random.rand(len(df)) < 0.8
x_train = df[msk].astype('float64')
y_train = x_train['log_amount'].astype('float64')
x_train = x_train.drop(labels='log_amount', axis=1).astype('float64')
x_test = df[~msk].astype('float64')
y_test = x_test['log_amount'].astype('float64')
x_test = x_test.drop(labels='log_amount', axis=1).astype('float64')
#%%
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest
clf = SelectKBest(f_regression, k=10)
X_new1 = clf.fit_transform(X=np.asarray(x_train.values, dtype="float64"),
                           y=(np.asarray(y_train, dtype="float64")))
mask1 = clf.get_support()
new_features = x_train.columns[mask1]
print(new_features)
#%%
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
clf = RandomForestRegressor(n_estimators=100,
                            max_features='sqrt',
                            n_jobs=-1,
                            random_state=0)
X_new = clf.fit(X=np.asarray(x_train.values, dtype="float64"),
                y=(np.asarray(y_train, dtype="float64")))
importances = clf.feature_importances_
indices = np.argsort(importances)
plY = x_train.columns[indices]
plX = importances[indices]
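# plY / plX are prepared for plotting but the plot itself is not in this
# snippet; a minimal continuation (assuming matplotlib is available) could be:
import matplotlib.pyplot as plt

plt.barh(plY, plX)  # one horizontal bar per feature, sorted by importance
plt.xlabel('feature importance')
plt.title('RandomForestRegressor feature importances')
plt.tight_layout()
plt.show()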
Exemplo n.º 47
0
print("\nCM per Test-set Random Forest: \n", cmRF_test, "\n")

# 3. applicare una features selection per evidenziare i pixel più significativi

feature_importances_ = clfRF.feature_importances_

print(
    "Importanza assegnata alle features dall'algoritmo (indica quanto gli sono servite durante il suo allenamento): \n",
    feature_importances_, "\n")

# 4. proiettare il dataset sulle 30 features più significative

select = SelectKBest(f_classif, k=30)
select.fit(X, y)

# boolean array: True if the feature is important, False otherwise
mask = select.get_support()

# turn mask into a numpy array so it can be used for indexing,
# in particular to extract the most significant features
np_mask = np.array(mask)

# take all dataset columns; the first one is "label" and must be skipped
np_columns = np.array(data.columns[1:])

# select the most significant features
most_significative_features = np_columns[np_mask]

# and their corresponding importance values
most_significative_features_importances = clfRF.feature_importances_[np_mask]
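# Step 4 above asks to project the dataset onto the 30 most significant
# features; a minimal way to do that (assuming data is the pandas DataFrame
# used above, with "label" as its first column) is:
X_top30 = data[most_significative_features]
print(X_top30.shape)  # (n_samples, 30)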
Exemplo n.º 48
0
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


# #############################################################################
# Benchmark classifiers
def benchmark(clf):
Exemplo n.º 49
0
feat = features.split('","')
feat = feat[1:-1]
desiredno = len(feat)

features_test = test[feat]

labels_train = train[label]

features_train = train[feat]

labels_test = test[label]

# feature selection
select = SelectKBest(f_regression, k="all").fit(features_train, labels_train)
ranking = select
cols = select.get_support(indices=True)
rank = select.scores_
mask = select.get_support()
new_features = features_train.columns[mask]
features_train = features_train[new_features]
features_test = features_test[new_features]

# ridge regression
t0 = time()
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

grid_param = {'alpha': [10, 4, 1.0, 0.5, 0.3, 0.08, 0.02]}
rdg_reg = Ridge()
gd_sr = GridSearchCV(
    estimator=rdg_reg,
Exemplo n.º 50
0
def select_k_best(k, data, labels):
    k_best = SelectKBest(k=k)
    data = k_best.fit_transform(data, labels)
    return data, labels, k_best.get_support()
Exemplo n.º 51
0
def select_kbest_freg(X_train, y_train, k):
    f_selector = SelectKBest(f_regression, k=k).fit(X_train, y_train)
    f_support = f_selector.get_support()
    f_feature = X_train.loc[:, f_support].columns.tolist()
    return f_feature
Exemplo n.º 52
0
def k_significant_feat(
        feat, y_class, k=5, score_func='f_classif', scale=None,
        feat_names=None, plot=True, k_to_plot=None, close_after_plotting=False,
        saveto=None, figsize=None, title=None, xlabel=None
        ):
    """
    Finds the k most significant features in the feature matrix, based on
    how well they separate the data in groups defined in y_class. It uses
    univariate statistical tests (the type of test is specified in the variable
    score_func).
    param:
        feat: array-like, shape=(n_samlples, n_features)
            The feature matrix
        y_class: array-like, shape=(n_samples)
            Vector with the class of each samples
        k: integer or 'all'
            Number of fetures to select
        score_func: str or function, optional
            If string 'f_classif', 'chi2', 'mutual_info_classif' then the
            function f_classif, chi2 or mutual_info_classif
            from sklearn.feature_selection will be used.
            Otherwise, the user needs to input a function that takes two
            arrays X and y, and returns a pair of arrays (scores, pvalues)
            or a single array with scores.
            Default is 'f_classif'.
        scale: None, str or function, optional
            If string 'standardize', 'minmax_scale', the
            tierpsytools.preprocessing.scaling_class.scalingClass is used
            to scale the features.
            Otherwise the used can input a function that scales features.
            Default is None (no scaling).
        feat_names: list shape=(n_features)
            The names of the features, when feat is an array and not a dataframe
            (will be used for plotting)

    return:
        support: array of booleans
            True for the selected features, False for the rest
        plot: boolean
            If True, the boxplots of the chosen features will be plotted
        plot
    """
    from sklearn.feature_selection import \
        SelectKBest, chi2,f_classif, mutual_info_classif

    if plot and k_to_plot is None:
        k_to_plot = k

    if isinstance(feat,np.ndarray):
        feat = pd.DataFrame(feat, columns=feat_names)
    feat = feat.loc[:, feat.std()!=0]

    if isinstance(k,str):
        if k=='all':
            k = feat.shape[1]
        else:
            raise Exception('Data type for k not recognized.')

    # Find most significant features
    if isinstance(score_func, str):
        if score_func=='f_classif':
            score_func = f_classif
        elif score_func=='chi2':
            score_func = chi2
        elif score_func=='mutual_info_classif':
            score_func = mutual_info_classif

    if scale is not None:
        if isinstance(scale, str):
            scaler = scalingClass(scaling=scale)
            feat_scaled = scaler.fit_transform(feat)
        else:
            feat_scaled = scale(feat)
    else:
        feat_scaled = feat

    skb = SelectKBest(score_func=score_func, k=k)
    skb.fit(feat_scaled, y_class)

    support = skb.get_support()
    sorted_scores = np.sort(skb.scores_)
    ids_sorted_scores = np.argsort(skb.scores_)
    top_ft_ids = np.flip(ids_sorted_scores[~np.isnan(sorted_scores)])[:k]
    scores = skb.scores_[top_ft_ids]
    if hasattr(skb, 'pvalues_'):
        pvalues = skb.pvalues_[top_ft_ids]
    else:
        pvalues = None

    # Plot a boxplot for each feature, showing its distribution in each class
    if plot:
        plot_feature_boxplots(
            feat.iloc[:, top_ft_ids[:k_to_plot]], y_class, scores,
            pvalues=pvalues, figsize=figsize, saveto=saveto, xlabel=xlabel,
            close_after_plotting=close_after_plotting)

    if pvalues is not None:
        return feat.columns[top_ft_ids].to_list(), (scores, pvalues), support
    else:
        return feat.columns[top_ft_ids].to_list(), scores, support
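# Hypothetical usage with a small synthetic dataset (illustrative only; plot is
# disabled so the tierpsytools plotting helper is not needed):
from sklearn.datasets import make_classification
import pandas as pd

X_demo, y_demo = make_classification(n_samples=60, n_features=12, n_informative=3,
                                     random_state=0)
feat_demo = pd.DataFrame(X_demo, columns=['ft_%d' % i for i in range(12)])
top_feats, (scores, pvalues), support = k_significant_feat(
    feat_demo, y_demo, k=5, score_func='f_classif', scale=None, plot=False)
print(top_feats)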
Exemplo n.º 53
0
print("Accuracy for Knn decision Tree is ", accuracy)
pp.pprint(
    classification_report(y_true=labels_test,
                          y_pred=pred,
                          target_names=target_names))

#### Use SelectKBest to select the best features (k chosen after trying different values).

no_of_selected_feat = 10
from sklearn.feature_selection import SelectKBest, f_classif

kbest = SelectKBest(f_classif, k=no_of_selected_feat)
kbest.fit(features, labels)
features_selected = [
    features_list[i + 1] for i in kbest.get_support(indices=True)
]
features_score = {}
for a, b in zip(features_selected, kbest.scores_):
    features_score[a] = b

#### Print the top 10 features, ordered from highest to lowest score

print('\n')
print("SelectKBest chose " + str(no_of_selected_feat) + " features")
pp.pprint(sorted(features_score.items(), key=itemgetter(1), reverse=True))
print("\n")

### Add the 'poi' feature at the beginning of the features list.

if 'poi' in features_selected:
Exemplo n.º 54
0
    cor_feature = X.iloc[:,
                         np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:, chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(),
                   n_features_to_select=100,
                   step=10,
                   verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:, rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
Exemplo n.º 55
0
#print(all_x_quadr)
transformer_exp = FunctionTransformer(np.exp)
all_x_exp = transformer_exp.transform(all_x_linear)
#print(all_x_exp)
transformer_cos = FunctionTransformer(np.cos)
all_x_cos = transformer_cos.transform(all_x_linear)
#print(all_x_cos)
all_x_ones = np.ones((len(all_y), 1))

all_x = np.concatenate(
    (all_x_linear, all_x_quadr, all_x_exp, all_x_cos, all_x_ones), axis=1)

## feature selection ##
sel = SelectKBest(k=18)
all_x_sel = sel.fit_transform(all_x, all_y)
sel_result = sel.get_support(indices=False)  # boolean mask of selected features
all_index = np.arange(0, 21)
sel_in_index = all_index[sel_result]
sel_out_index = all_index[~sel_result]
print(sel_in_index)
print(sel_out_index)

## ridge regression ##
#alpha_set = np.array([1e-2, 1e-1, 1e0, 1e1, 1e2])
rmse_avg_all = []
alpha = 10
#K folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rmse_total = 0
models = []
rmses = []
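# The snippet sets up the K-fold objects but stops here; a minimal illustrative
# sketch of the cross-validated ridge fit it appears to be building (assuming
# all_y is array-like and Ridge / mean_squared_error come from sklearn):
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

all_y_arr = np.asarray(all_y)
for train_idx, test_idx in kf.split(all_x_sel):
    model = Ridge(alpha=alpha).fit(all_x_sel[train_idx], all_y_arr[train_idx])
    pred = model.predict(all_x_sel[test_idx])
    rmse = mean_squared_error(all_y_arr[test_idx], pred) ** 0.5
    models.append(model)
    rmses.append(rmse)
    rmse_total += rmse
rmse_avg_all.append(rmse_total / kf.get_n_splits())
print(rmse_avg_all)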
Exemplo n.º 56
0
if not a.disable_early_stop:
	xgb_fit_params['early_stopping_rounds'] = 200
	cb_fit_params['early_stopping_rounds'] = 200
	lgb_fit_params['early_stopping_rounds'] = 200

if a.select_k_best is not None:
	print(f"Selecting {a.select_k_best} best features")
	def score_features(X, y, estimator=None):
	    return clone(estimator).fit(X, y).feature_importances_

	xgb_regressor = xgb.XGBRegressor(**xgb_params)

	fs = SelectKBest(score_func=lambda X, y: score_features(X, y, estimator=xgb_regressor), k=a.select_k_best).fit(X_all[:Y.shape[0]], Y)

	X_all = X_all.iloc[:, fs.get_support(indices=True)]

print(X_all.shape)

print('start training...')

folds = a.folds
bootstrap_runs = a.bootstrap_runs

fold_scores = []
fold_predictions = []
oof_fold_predictions = []

n_leak = np.where(leak_Y !=0)[0].shape[0]
print(np.arange(len(Y)).shape, n_leak)
to_train_idx = np.arange(len(Y))
Exemplo n.º 57
0
y_score = clf.predict(x_test)
M = confusion_matrix(y_test, y_score)
P, R, F1 = computeP_R_F1(M)

print("RandomForestClassifier : ")
print(M)
print("P = " + str(P) + "\nR = " + str(R) + "\nF1 = " + str(F1) +
      "\n-------------")

model = SelectFromModel(clf, prefit=True)
X_new = model.transform(x_train)
print(X_new.shape)

selector = SelectKBest(chi2, k=2)
X_new = selector.fit_transform(x_train, y_train)
idxs_selected = selector.get_support(indices=True)
print(idxs_selected)
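# Note: the slice below keeps the first 3 columns rather than the 2 columns
# chosen by SelectKBest above; x_train[:, idxs_selected] would apply the
# chi-squared selection instead.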

x_train_new = x_train[:, 0:3]
x_test_new = x_test[:, 0:3]
clf.fit(x_train_new, y_train)
y_score = clf.predict(x_test_new)
M = confusion_matrix(y_test, y_score)
P, R, F1 = computeP_R_F1(M)

print("RandomForestClassifier FS : ")
print(M)
print("P = " + str(P) + "\nR = " + str(R) + "\nF1 = " + str(F1) +
      "\n-------------")

from sklearn.ensemble import ExtraTreesClassifier
Exemplo n.º 58
0
for i in k:
    select = SelectKBest(f_classif, k=i)
    x_train_new = select.fit_transform(x_train, y_train)
    svm.fit(x_train_new, y_train)
    train_accuracy.append(svm.score(x_train_new, y_train))
    
plt.plot(k, train_accuracy, color = 'red', label = 'Train')
plt.xlabel('k values')
plt.ylabel('Train accuracy')
plt.legend()
plt.show()

select_top = SelectKBest(f_classif, k=5)
x_train_new = select_top.fit_transform(x_train, y_train)
# transform (do not re-fit) the test set so it keeps the same columns as the train set
x_test_new = select_top.transform(x_test)
print('Top train features', x_train.columns.values[select_top.get_support()])
print('Top test features ', x_test.columns.values[select_top.get_support()])

c = [1.0, 0.25, 0.5, 0.75]
kernels = ['linear', 'rbf']
gammas = ['auto', 0.01, 0.001, 1]  # 'auto' corresponds to 1 / n_features

svm = SVC()

grid_svm = GridSearchCV(estimator = svm, param_grid = dict(kernel = kernels, C = c, gamma = gammas), cv = 5)
grid_svm.fit(x_train_new, y_train)
print('The best hyperparamters: ', grid_svm.best_estimator_)

svc_model = SVC(C = 1, gamma='auto', kernel='linear')
svc_model.fit(x_train_new, y_train)
Exemplo n.º 59
0
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


# #############################################################################
# Benchmark classifiers
Exemplo n.º 60
0
target = data['PSS_Stress']
data = data.drop('PSS_Stress', axis=1)

# Missing Data Filtering

print(data.isnull().any(axis=1).sum())  # number of records with at least one 'NaN' value

data = data.fillna(data.median())  # replace NaN with the column median
# data = data.fillna(data.mean())  # replace NaN with the column mean
# data = data.dropna()  # drop records containing NaN

# Feature selection

selector = SelectKBest(f_classif, k=5)
selector.fit(data, target)
cols = selector.get_support(indices=True)
cols_names = list(data.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

data = data[cols_names]

# Compare results between MinMaxScaler and RobustScaler:

scaler = preprocessing.RobustScaler()

values_standardized = scaler.fit_transform(data.values)
data = pd.DataFrame(values_standardized, columns=data.columns)

clf_model = SVC()
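# The source snippet ends here; a minimal illustrative next step (hypothetical;
# it assumes PSS_Stress holds discrete class labels, since it is fed to an SVC
# classifier) would be to evaluate the model with cross-validation on the
# scaled, feature-selected data:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf_model, data, target, cv=5)
print('mean CV accuracy:', scores.mean())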