Example No. 1
def MINE_selector(train_x, train_y, k=10):
    import numpy as np
    from sklearn.feature_selection import SelectKBest
    from minepy import MINE

    def mic(x, y):
        m = MINE()
        m.compute_score(x, y)
        # Return the MIC score plus a dummy p-value, since SelectKBest
        # expects its score function to return (scores, pvalues).
        return (m.mic(), 0.5)

    # Select the K best features and return the data after feature selection.
    selection = SelectKBest(
        lambda X, Y: np.array([mic(x, Y) for x in X.T]).T, k=k)
    selection.fit(train_x, train_y)
    # The score function yields a (2, n_features) array; keep only the
    # first row (the MIC scores) so the support mask is computed correctly.
    selection.scores_ = selection.scores_[0]
    print('----------------------------feature importance -------------------------')
    importance = selection.scores_
    print(selection.scores_)
    print('----------------------------- selected feature -------------------------')
    print(selection.get_support(indices=True))
    return selection, importance
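
A minimal usage sketch on synthetic data (the arrays and the k value below are made up for illustration; minepy must be installed):

import numpy as np

rng = np.random.RandomState(0)
train_x = rng.rand(100, 20)                    # 100 samples, 20 features
train_y = train_x[:, 3] + 0.1 * rng.rand(100)  # target driven by feature 3

selector, importance = MINE_selector(train_x, train_y, k=5)
reduced = selector.transform(train_x)          # keep the 5 highest-MIC features
print(reduced.shape)                           # (100, 5)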
Example No. 2
print "Training Naive Bayes model "

# load index file
y = population[:, 1]
X = plpData[population[:, 0], :]
print "population loaded- %s rows and %s columns" % (np.shape(population)[0],
                                                     np.shape(population)[1])
print "Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])
###########################################################################

print "Applying univariate feature selection to select %s features as naive bayes reqires non-sparse data " % (
    featnum)
kbest = SelectKBest(chi2, k=2000).fit(
    X[population[:, population.shape[1] - 1] > 0, :],
    y[population[:, population.shape[1] - 1] > 0])
kbest.scores_ = np.nan_to_num(kbest.scores_)
print "Test kbest length: %s non-zero: %s" % (kbest.scores_.shape[0],
                                              np.sum(kbest.scores_ != 0))
threshold = -np.sort(-kbest.scores_)[featnum - 1]
print "Threshold varImp set at %s" % (threshold)
X = X[population[:, population.shape[1] - 1] > 0, :]
print "Test X dim: %s , %s" % (X.shape[0], X.shape[1])
X = X[:, kbest.scores_ >= threshold]
print "Test X dim: %s , %s" % (X.shape[0], X.shape[1])
X = X.toarray()
y = y[population[:, population.shape[1] - 1] > 0]

pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0))
print "Calculating prediction for train set of size %s" % (pred_size)
test_pred = np.zeros(
    pred_size)  # zeros length sum(population[:,population.size[1]] ==i)
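
The same select-then-threshold step, as a self-contained sketch on synthetic sparse data (every name and value here is made up; chi2 requires non-negative features, which the uniform random matrix satisfies):

import numpy as np
from scipy.sparse import random as sparse_random
from sklearn.feature_selection import SelectKBest, chi2

rng = np.random.RandomState(0)
X = sparse_random(200, 50, density=0.1, format='csr', random_state=rng)
y = rng.randint(0, 2, size=200)
featnum = 10

kbest = SelectKBest(chi2, k=featnum).fit(X, y)
scores = np.nan_to_num(kbest.scores_)
threshold = -np.sort(-scores)[featnum - 1]     # score of the featnum-th best feature
X_dense = X[:, scores >= threshold].toarray()  # dense output for GaussianNB
print(X_dense.shape)                           # (200, 10), barring score ties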
Example No. 3
        # Create a scores array to get the individual categorical column.
        # Example:
        #  data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
        #         'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
        #  scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        #
        # Returns: [['State-gov']]
        scores = []
        # Build the scores array
        for j in range(len(COLUMNS[:-1])):
            if i == j:  # This column is the categorical column we want to extract.
                scores.append(1)  # Set to 1 to select this column
            else:  # Every other column should be ignored.
                scores.append(0)
        skb = SelectKBest(k=1)
        skb.scores_ = scores
        # Convert the categorical column to a numerical value
        lbn = LabelBinarizer()
        r = skb.transform(train_features)
        lbn.fit(r)
        # Create the pipeline to extract the categorical feature
        categorical_pipelines.append(('categorical-{}'.format(i),
                                      Pipeline([('SKB-{}'.format(i), skb),
                                                ('LBN-{}'.format(i), lbn)])))

# Create pipeline to extract the numerical features
skb = SelectKBest(k=6)
# From COLUMNS use the features that are numerical
skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', skb))
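
These manually fitted sub-pipelines are presumably combined into a single transformer before training; a sketch using sklearn's FeatureUnion (train_features comes from the snippet, while train_labels and the RandomForestClassifier are illustrative assumptions):

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion

# Each sub-pipeline was "fitted" by hand above (scores_ set directly,
# LabelBinarizer fitted per column), so only transform() is called here.
preprocess = FeatureUnion(categorical_pipelines)
processed_features = preprocess.transform(train_features)

# Hypothetical downstream use: fit any estimator on the processed matrix.
classifier = RandomForestClassifier()
classifier.fit(processed_features, train_labels)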
Example No. 4
def train_naive_bayes(population, plpData, modelOutput, variableNumber, quiet):
  import os
  import timeit
  import numpy as np
  import joblib
  from sklearn.feature_selection import SelectKBest, chi2
  from sklearn.naive_bayes import GaussianNB

  print("Training Naive Bayes model")
  y = population[:,1]
  X = plpData[population[:,0].astype(int),:]
  print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1]))
  print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1]))
  ###########################################################################
  featnum = min(variableNumber,X.shape[1])
  print("Applying univariate feature selection to select %s features as naive bayes reqires non-sparse data " %(featnum))
  kbest = SelectKBest(chi2, k=featnum).fit(X[population[:,population.shape[1]-1] > 0,:], y[population[:,population.shape[1]-1] > 0])
  kbest.scores_ = np.nan_to_num(kbest.scores_)
  print("Test kbest length: %s non-zero: %s" %(kbest.scores_.shape[0], np.sum(kbest.scores_!=0)))
  threshold = -np.sort(-kbest.scores_)[featnum-1]
  print("Threshold varImp set at %s" %(threshold))
  X = X[population[:,population.shape[1]-1] > 0, :]
  print("Test X dim: %s , %s" %(X.shape[0], X.shape[1]) )
  X = X[:,kbest.scores_ >=threshold]
  print("Test X dim: %s , %s" %(X.shape[0], X.shape[1]) )
  X = X.toarray()
  y = y[population[:,population.shape[1]-1] > 0]
  pred_size = int(np.sum(population[:,population.shape[1]-1] > 0))
  print("Calculating prediction for train set of size %s" %(pred_size))
  test_pred = np.zeros(pred_size)  # one prediction slot per row with a fold assignment
  for i in range(1, int(np.max(population[:,population.shape[1]-1])+1), 1):
    testInd = population[population[:,population.shape[1]-1] > 0,population.shape[1]-1] ==i
    trainInd = (population[population[:,population.shape[1]-1] > 0,population.shape[1]-1] !=i)
    train_x = X[trainInd,:]
    train_y = y[trainInd]
    test_x = X[testInd,:]	
    print("Fold %s split %s in train set and %s in test set" %(i, train_x.shape[0], test_x.shape[0]))
    print("Train set contains %s outcomes " %(np.sum(train_y)))
    # train on full test data using hyper parameters
    print("Training fold %s" %(i))
    start_time = timeit.default_timer()	
    gnb = GaussianNB()
    gnb = gnb.fit(train_x, train_y)
    end_time = timeit.default_timer()
    print("Training fold took: %.2f s" %(end_time-start_time))
    print("Calculating predictions on left out fold set...")
    ind = (population[:,population.shape[1]-1] > 0)
    ind = population[ind,population.shape[1]-1]==i
    test_pred[ind] = gnb.predict_proba(test_x)[:,1]
    print("Prediction complete: %s rows " %(np.shape(test_pred[ind])[0]))
    print("Mean: %s prediction value" %(np.mean(test_pred[ind])))
  # cv pred
  test_pred.shape = (population[population[:,population.shape[1]-1] > 0,:].shape[0], 1)
  predictioncv = np.append(population[population[:,population.shape[1]-1] > 0,:],test_pred, axis=1)
  # train final:
  print("Training final naive bayes model on all train data...")
  start_time = timeit.default_timer()	
  gnb = GaussianNB()
  gnb = gnb.fit(X, y)
  end_time = timeit.default_timer()
  print("Training final took: %.2f s" %(end_time-start_time))
  # save the model:
  if not os.path.exists(modelOutput):
    os.makedirs(modelOutput)
  joblib.dump(gnb, os.path.join(modelOutput, "model.pkl"))
  print("Model saved to: %s" % (modelOutput))
  # merge pred with indexes[testInd,:]
  test_pred = gnb.predict_proba(X)[:,1]
  test_pred.shape = (population[population[:,population.shape[1]-1] > 0,:].shape[0], 1)
  prediction = np.append(population[population[:,population.shape[1]-1] > 0,:],test_pred, axis=1)
  return prediction, predictioncv, kbest.scores_
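
A hypothetical invocation with synthetic data, showing the shapes the function expects (column 0 of population indexes rows of plpData, column 1 holds the binary outcome, and the last column holds the cross-validation fold; modelOutput is an illustrative path):

import numpy as np
from scipy.sparse import random as sparse_random

rng = np.random.RandomState(0)
n = 500
plpData = sparse_random(n, 300, density=0.05, format='csr', random_state=rng)
population = np.column_stack([
    np.arange(n),           # column 0: row index into plpData
    rng.randint(0, 2, n),   # column 1: binary outcome
    rng.randint(1, 4, n),   # last column: CV fold label (> 0 means included)
])

prediction, predictioncv, scores = train_naive_bayes(
    population, plpData, modelOutput='./nb_model', variableNumber=50, quiet=True)
# Writes model.pkl under ./nb_model and returns per-row predicted probabilities.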
Example No. 5
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer

categorical_pipelines = []
# Categorical pipelines are created by iterating over the COLUMNS and
# checking whether each column is a CATEGORICAL_COLUMN.
for i, col in enumerate(COLUMNS[:-1]):
    if col in CATEGORICAL_COLUMNS:
        # Create a scores array to get the individual categorical column.
        # Example:
        #  data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical', 
        #         'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
        #  scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        #
        # Returns: [['State-gov']]      
        # Build the scores array.
        scores = [0] * len(COLUMNS[:-1]) 
        # This column is the categorical column we want to extract.
        scores[i] = 1  
        skb = SelectKBest(k=1)
        skb.scores_ = scores
        # Convert the categorical column to a numerical value
        lbn = LabelBinarizer()
        r = skb.transform(train_features)
        lbn.fit(r)
        # Create the pipeline to extract the categorical feature
        categorical_pipelines.append(
            ('categorical-{}'.format(i), Pipeline([
                ('SKB-{}'.format(i), skb),
                ('LBN-{}'.format(i), lbn)])))
# [END categorical-feature-conversion]

# [START create-pipeline]
# Create pipeline to extract the numerical features
skb = SelectKBest(k=6)
# From COLUMNS use the features that are numerical
skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', skb))