Exemplo n.º 1
0
 def oversample(self):
     """Replace the stored data with a randomly over-sampled version.

     The current ``_X``/``_y`` are saved in ``_X_original``/``_y_original``
     before being overwritten with the resampled arrays.  The sampler is
     seeded with ``random_state=0``.
     """
     # Keep a handle on the data as it was before resampling.
     self._X_original, self._y_original = self._X, self._y

     sampler = RandomOverSampler(random_state=0)
     self._X, self._y = sampler.fit_sample(self._X, self._y)
Exemplo n.º 2
0
    def _fit_resample(self, X, y):
        """Over-sample ``X``/``y`` using classes derived from the z-score of ``y``.

        ``y`` is standardised, then every sample is given a temporary class:
        0 where the z-score is above ``self.negative_thres``, 1 where it is
        at or below ``self.positive_thres`` (this assignment is applied
        second, so it wins on overlap), and -1 otherwise.  A
        ``RandomOverSampler`` is fitted on these temporary classes and only
        its ``sample_indices_`` are used to index the original ``X`` and
        ``y``.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute
            Samples to resample; ``X.shape[0]`` is the sample count.
        y : array-like supporting ``mean()``/``std()`` and arithmetic
            Continuous target the classes are derived from.

        Returns
        -------
        tuple
            ``(X_resampled, y_resampled)``, followed by ``sample_indices``
            when ``self.return_indices`` is truthy.
        """
        n_samples = X.shape[0]

        # convert y to z_score
        y_z = (y - y.mean()) / y.std()

        index0 = np.arange(n_samples)
        index_negative = index0[y_z > self.negative_thres]
        index_positive = index0[y_z <= self.positive_thres]

        # Whatever belongs to neither group is "unclassified".  A boolean
        # mask yields the same indices as testing every index with
        # ``x not in array`` but in linear rather than quadratic time.
        unclassified_mask = np.ones(n_samples, dtype=bool)
        unclassified_mask[index_negative] = False
        unclassified_mask[index_positive] = False
        index_unclassified = index0[unclassified_mask]

        # Overwrite the z-scores with the temporary class labels.
        y_z[index_negative] = 0
        y_z[index_positive] = 1
        y_z[index_unclassified] = -1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            ratio=self.ratio)
        # Only the chosen indices matter; the resampled arrays are discarded.
        _, _ = ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices),
                    sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
Exemplo n.º 3
0
    def transform(self, X, y=None):
        """Return a frame whose classes are balanced by random over-sampling.

        The column named by ``self.predicted_column`` is used as the target;
        the incoming ``y`` argument is ignored and overwritten.  The result
        is a new DataFrame with the feature columns of ``X`` followed by the
        target column.
        """
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?

        target_name = self.predicted_column

        # Pull the target out of the frame and keep the remaining features.
        y = np.squeeze(X[[target_name]])
        features = X.drop([target_name], axis=1)

        # Fit the over-sampler and resample in one step.
        sampler = RandomOverSampler(random_state=self.random_seed)
        x_over_sampled, y_over_sampled = sampler.fit_sample(features, y)

        # Rebuild a DataFrame carrying the original feature names.
        result = pd.DataFrame(x_over_sampled)
        result.columns = features.columns

        # Put the resampled target back under its original name.
        result[target_name] = pd.Series(y_over_sampled)

        return result
def test_sample_wrong_X():
    """Sampling with an X that differs from the fitted one must raise."""

    # Fit on the module-level fixture data.
    sampler = RandomOverSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data that does not match what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_multiclass_fit_resample():
    """With a third class added, every class is expected to end up with 5 samples."""
    y = Y.copy()
    # Relabel two samples to create a third class.
    for position in (5, 6):
        y[position] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert class_counts[label] == 5
def test_random_over_sampling_heterogeneous_data():
    """Over-sampling must keep an object-dtype, mixed-type ``X`` intact.

    Two samples of class 0 and one of class 1 go in; four rows are expected
    out, still with ``object`` dtype, and the appended row must reuse a
    value from the original first column.
    """
    # ``np.object`` was a plain alias of the builtin ``object``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    ros = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = ros.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 4
    assert y_res.shape[0] == 4
    assert X_res.dtype == object
    assert X_res[-1, 0] in X_hetero[:, 0]
def test_ros_fit_sample():
    """Compare ``fit_sample`` output with the stored ground-truth arrays."""

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground truth lives in the ``data`` directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(data_dir, 'ros_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ros_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 8
0
    def oversample(self):
        """Balance class data based on outcome.

        ``self.X``/``self.y`` are replaced by their over-sampled versions,
        ``self.Xview`` is rebuilt as the first ``self.n_features`` columns
        of a view of ``self.X``, and the class counts are printed before
        and after.
        """
        print('Current outcome sampling {}'.format(Counter(self.y)))

        # No random_state is passed, so the sampling is not seeded.
        # SMOTE() and ADASYN() were tried here as alternatives.
        sampler = RandomOverSampler()

        resampled_X, resampled_y = sampler.fit_sample(self.X, self.y)
        self.X = resampled_X
        self.y = resampled_y

        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_ros_fit():
    """Fitting must record the minority/majority classes and class counts."""

    sampler = RandomOverSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class 0 is expected to be the minority, class 1 the majority.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Expected number of samples per class.
    for label, expected_count in ((0, 3), (1, 7)):
        assert_equal(sampler.stats_c_[label], expected_count)
def resample(X, y, sample_fraction=0.1, test_size=0.3):
    """Split, over-sample and optionally subsample a dataset, printing diagnostics.

    The data are split into train and test parts, each part is balanced
    with the same ``RandomOverSampler`` instance, and, when
    ``sample_fraction`` is below 1.0, each part is reduced to that fraction
    through a further ``train_test_split``.  Both parts are finally wrapped
    in DataFrames carrying the original column labels.

    Args:
        X: Feature frame; its ``columns`` are restored on the outputs.
        y: Target frame; must expose ``columns``, ``values`` and a
            ``converted`` column.
        sample_fraction: Fraction of each over-sampled part to keep.
        test_size: Forwarded to the initial ``train_test_split``.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` as DataFrames.
    """
    feature_names = X.columns
    target_names = y.columns
    width = len(feature_names)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    # Initial train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    for banner, part in (('@@2 - y_train', y_train),
                         ('@@2 -  y_test', y_test)):
        print(banner)
        show_balance(part)
    assert X_train.shape[1] == width and X_test.shape[1] == width

    # Balance both parts with one sampler instance.
    sampler = RandomOverSampler(random_state=42)
    X_train, y_train = sampler.fit_sample(X_train, y_train)
    X_test, y_test = sampler.fit_sample(X_test, y_test)
    for banner, part in (('@@3 - Oversampled y_train', y_train),
                         ('@@3 - Oversampled y_test', y_test)):
        print(banner)
        show_balance(part)
    assert X_train.shape[1] == width and X_test.shape[1] == width

    if sample_fraction < 1.0:
        # Keep only the "test" side of a second split, i.e. sample_fraction
        # of each part.
        _, X_train, _, y_train = train_test_split(
            X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(
            X_test, y_test, test_size=sample_fraction, random_state=44)
        for banner, part in (('@@2 - Downsampled y_train', y_train),
                             ('@@2 - Downsampled y_test', y_test)):
            print(banner)
            show_balance(part)
        shapes = (X_train.shape, X_test.shape)
        assert len(X_train.shape) == 2 and len(X_test.shape) == 2, shapes
        assert X_train.shape[1] == width and X_test.shape[1] == width, shapes

    print('X_columns=%d %s' % (len(feature_names), feature_names))
    print('y_columns=%d %s' % (len(target_names), target_names))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == width and X_test.shape[1] == width

    # Re-attach the column labels; each y frame shares its X frame's index.
    X_train = pd.DataFrame(X_train, columns=feature_names)
    y_train = pd.DataFrame(y_train, columns=target_names, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=feature_names)
    y_test = pd.DataFrame(y_test, columns=target_names, index=X_test.index)
    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)
def test_ros_fit_resample_half():
    """Resample with an explicit per-class target of ``{0: 3, 1: 7}``."""
    target_counts = {0: 3, 1: 7}
    sampler = RandomOverSampler(
        sampling_strategy=target_counts, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected output, one sample per row.
    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
 def oversample(self):
     """Balance class data based on outcome.

     ``self.X``/``self.y`` are replaced by their over-sampled versions,
     ``self.Xview`` is rebuilt as the first ``self.n_features`` columns of
     a view of ``self.X``, and the class counts are printed before and
     after.
     """
     print('Current outcome sampling {}'.format(Counter(self.y)))

     # No random_state is passed, so the sampling is not seeded; use
     # RandomOverSampler(random_state=2) instead to fix the seed.
     sampler = RandomOverSampler()

     resampled_X, resampled_y = sampler.fit_sample(self.X, self.y)
     self.X = resampled_X
     self.y = resampled_y

     self.Xview = self.X.view()[:, :self.n_features]
     print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_random_over_sampling_return_indices():
    """With ``return_indices=True`` a third value, the sample indices, is returned."""
    sampler = RandomOverSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, sample_indices = sampler.fit_resample(X, Y)

    # Expected output, one sample per row; the last four rows are the
    # duplicated class-0 samples.
    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)

    # Every original sample index must appear at least once.
    distinct_indices = np.sort(np.unique(sample_indices))
    assert_array_equal(distinct_indices, np.arange(len(X)))
def test_multiclass_fit_sample():
    """``fit_sample`` on a three-class target should equalise all classes."""

    # Turn the first 1000 labels into a third class.
    y = Y.copy()
    y[:1000] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, y)

    # Each class is expected to reach 3600 samples.
    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert_equal(class_counts[label], 3600)
def test_ros_fit_sample_half():
    """Check ``fit_sample`` output when the sampler is built with ``ratio=0.5``."""

    sampler = RandomOverSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected output, one sample per row: seven of class 1, then three of
    # class 0.
    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.20792588, 1.49407907],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
    ])
    expected_y = np.array([1] * 7 + [0] * 3)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
   "voicemailplan"]] = X[["internationalplan", "voicemailplan"]].replace({
       "yes":
       1,
       "no":
       0
   })

X.head()

sns.set()
plt.figure(figsize=(6, 6))
sns.countplot(y, palette="plasma")
plt.show()

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_sample(X, y)

temp = pd.Series(y_ros)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_ros,
                                                    temp,
                                                    random_state=42,
                                                    test_size=42,
                                                    stratify=temp)

# Fitting the Logistic Model
y_train.unique()

from sklearn.linear_model import LogisticRegression
Exemplo n.º 17
0
                                                    test_size=0.2,
                                                    random_state=43)

# TF-IDF Vectors as features

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(x_train)
x_train_tfidf = tfidf_vect.transform(x_train)
x_test_tfidf = tfidf_vect.transform(x_test)

x_train_tfidf_os_all = []  # os = oversample
y_train_tfidf_os_all = []

for i in range(6):
    sm_tfidf = RandomOverSampler(random_state=40)
    x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:, i])
    x_train_tfidf_os_all.append(x_train_tfidf_os)
    y_train_tfidf_os_all.append(y_train_tfidf_os)


# svm probs

rf_predict_proba_train = []
rf_predict_proba_test = []

for i in range(6):
    # Linear SVM with grid search

    param_grid = {'n_estimators': [500, 750, 1000],
                  'max_features': [2, 4, 6, 8],
Exemplo n.º 18
0
from imblearn.over_sampling import SMOTE

SEED = 0xDEADBEEF

# Target column and the feature columns fed to the model.
y_col = 'add'
X_cols = ['pct_contrib', 'turnover', 'VWAP', 'vol', 'VWMC', 'SPTSXComp']
all_cols = X_cols + [y_col]
X = filtered[X_cols]
y = filtered[y_col]

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# The names used to be unpacked as (X_test, X_train, y_test, y_train), which
# fitted the model on the 20% slice and evaluated it on the 80% slice.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X.values, y.values, test_size=0.2, random_state=SEED)

# Persist the exact model inputs for later inspection.
filtered[all_cols].to_sql('model_inputs', conn, if_exists='replace', index=False)

#oversampler = SMOTE(random_state=SEED)
oversampler = RandomOverSampler(random_state=SEED)

# Only the training split is balanced; the test split is left as it is.
# NOTE: the misspelled name ``y_train_resamle`` is kept on purpose because
# code outside this section may refer to it.
X_train_resample, y_train_resamle = oversampler.fit_resample(X_train, y_train)

print(len(X_train), len(X_test))

#log_clf = LogisticRegression()#
#log_clf = RandomForestClassifier()
#log_clf = xgb.XGBClassifier(max_depth=4, min_child_weight=50, learning_rate=0.01, n_estimators=50, gamma=1)
log_clf = svm.LinearSVC()
 ##LogisticRegression()
log_clf.fit(X_train_resample, y_train_resamle)

# Score on the original (not resampled) training split.
print(log_clf.score(X_train, y_train))

y_pred = log_clf.predict(X_test)
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` has a ``str``
# ...................................
#
# ``sampling_strategy`` can be given as a string which specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
Exemplo n.º 20
0
evaluate_test=[]
prenum_train=[]
prenum_test=[]

skf=StratifiedKFold(n_splits=10)
for train,test in skf.split(dataMat,labelMat):
#==============================================================================
# skf=StratifiedShuffleSplit(n_splits=10)
# for train,test in skf.split(dataMat,labelMat):
#==============================================================================
    print("%s %s" % (train,test))
    train_in=dataMat[train]
    test_in=dataMat[test]
    train_out=labelMat[train]
    test_out=labelMat[test]
    train_in, train_out = RandomOverSampler().fit_sample(train_in, train_out)
    trainWeights=LR.stocGradAscent1(train_in,train_out,500)
    
    len_train=np.shape(train_in)[0]
    len_test=np.shape(test_in)[0]
    test_predict=[]
    proba_test=[]
    for i in range(len_test):
        test_predict_tmp=LR.classifyVector(test_in[i,:], trainWeights)
        test_predict.append(test_predict_tmp)
        proba_test_tmp=LR.classifyProb(test_in[i,:], trainWeights)
        proba_test.append(proba_test_tmp)
     
        
    train_predict=[]
    proba_train=[]
# In[22]:

#Sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

#Under Sampling
us = RandomUnderSampler()
X_dn, y_dn = us.fit_sample(X_train_tr, y_train)
X_test_dn, y_test_dn = us.fit_sample(X_test_tr, y_test)

forest_clf_dn.fit(X_dn, y_dn)
xgb_clf_dn.fit(X_dn, y_dn)

#OverSampling
os = RandomOverSampler()
X_up, y_up = os.fit_sample(X_train_tr, y_train)
X_test_up, y_test_up = os.fit_sample(X_test_tr, y_test)

forest_clf_os.fit(X_up, y_up)
xgb_clf_os.fit(X_up, y_up)

# In[23]:

#Forest Undersampled
print('Forest Undersampled')
evaluate(forest_clf_dn, X_test_dn, y_test_dn)
plot_roc(forest_clf_dn, X_test_dn, y_test_dn)

#Forest Oversampled
print('Forest Oversampled')
Exemplo n.º 22
0
    "MemberType": {
        "open loan - dl only": 0,
        "closed loan - never converted": 1,
        "converted member": 2
    }
}

wmdat = pd.read_csv("dat_targ.csv")
class_names = wmdat["MemberType"].unique()
wmdat.replace(names, inplace=True)
wm_targ = wmdat["MemberType"]
wmdat = wmdat.drop("MemberType", axis=1)

wm, wm_test, wm_targets, wm_test_targets = train_test_split(wmdat, wm_targ)

ros = RandomOverSampler(random_state=0)
wm_resampled, wm_targets_resampled = ros.fit_resample(wm, wm_targets)
#%%
classifier = GaussianNB()
classifier.fit(wm_resampled, wm_targets_resampled)
targets_predicted = classifier.predict(wm_test)
#print(targets_predicted)
acc = sklearn.metrics.accuracy_score(wm_test_targets, targets_predicted)
rec = sklearn.metrics.recall_score(wm_test_targets,
                                   targets_predicted,
                                   average='macro')
prec = sklearn.metrics.precision_score(wm_test_targets,
                                       targets_predicted,
                                       average='macro')
print("Accuracy : {} \n Recall : {} \n Precision : {}".format(acc, rec, prec))
Exemplo n.º 23
0
# balance the class distribution)

# Random Oversampling

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# define dataset
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# summarize class distribution
print(Counter(y))

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

# summarize class distribution
print(Counter(y_over))

#The model is evaluated using repeated 10-fold cross-validation with three
#repeats, and the oversampling is performed on the training dataset within
# each fold separately,

# Evaluating a decision tree on an imbalanced dataset with a 1:100 class
# distribution
# Template to test oversampling with your dataset and learning algorithm
Exemplo n.º 24
0
                           columns=['Class', 'Email'])

# Generate test set
testSet.to_csv(r'data/testSet.csv', index=None, header=True)
testSetDistribution = testSet.groupby('Class').size()
testSetDistribution.sort_values(ascending=False, inplace=True)
print("Test class distribution: \n" + str(testSetDistribution))

# Generate training set
trainingSet.to_csv(r'data/trainingSet.csv', index=None, header=True)
trainingSetDistribution = trainingSet.groupby('Class').size()
trainingSetDistribution.sort_values(ascending=False, inplace=True)
print("Training class distribution: \n" + str(trainingSetDistribution))

# Balanced dataset - randomly over-sample smaller classes until all classes are equally represented
ros = RandomOverSampler(random_state=42)
# The sampler needs a 2-D X, so the training inputs are reshaped into a
# single column.
X_resampled, y_resampled = ros.fit_resample(
    np.array(X_train).reshape(-1, 1), y_train)
# Flatten that column again before building the frame; zipping the 2-D
# result directly would store a one-element array in every 'Email' cell and
# write it to the CSV as "['...']" instead of the plain text.
emails_resampled = np.asarray(X_resampled).ravel()
balancedTrainingSet = pd.DataFrame(list(zip(y_resampled, emails_resampled)),
                                   columns=['Class', 'Email'])
balancedTrainingSet.to_csv(r'data/trainingSet_balanced.csv',
                           index=None,
                           header=True)
# Report how many rows each class has after balancing, largest first.
balancedTrainingSetDistribution = balancedTrainingSet.groupby('Class').size()
balancedTrainingSetDistribution.sort_values(ascending=False, inplace=True)
print("Balanced class distribution: \n" + str(balancedTrainingSetDistribution))

#Augmented dataset
currentPath = os.getcwd()
os.chdir('EDA/code')
os.system(
Exemplo n.º 25
0
    scaler.fit(df)
    df = pd.DataFrame(scaler.transform(df), columns=df.columns)
    df["nb_years"] = v

    return df


train = pd.read_csv('train.csv', header=0, sep=",")
test = pd.read_csv('test.csv', header=0, sep=",")

target = train["status_group"]
piv_train = train.shape[0]

# Resample
# Over-sampling
ros = RandomOverSampler(random_state=0)
train_resampled, target = ros.fit_resample(train.drop('status_group', axis=1),
                                           target)
piv_train = train_resampled.shape[0]

df_all = pd.concat((train_resampled, test), axis=0, ignore_index=True)

features_mask = [
    "amount_tsh", "funder", "gps_height", 'installer', 'num_private', "basin",
    'wpt_name', 'construction_year', "permit", "extraction_type", "payment",
    "quantity", "source_class"
]
df = df_all

## Model 1: 3 classes
df = data_preparation(df)
Exemplo n.º 26
0
        # df2 = df["coordinates"].apply(lambda x: pd.Series(x, dtype=np.float32))
        # df2 = df2.rename(columns= {0: 'lat', 1:'lon'})

        X = X.drop([".geo"], axis=1)
        # X["lat"] = df2['lat'].round(decimals=6 )
        # X['lon'] = df2["lon"].round(decimals=6 )

        if minimun > 25:
            print("normal smote")
            pipeline = SMOTE()
            X, y = pipeline.fit_resample(X, y)

        else:
            print("hibrid smote")
            over = SMOTE()
            random = RandomOverSampler(sampling_strategy='minority')
            steps = [('r', random), ('o', over)]
            pipeline = Pipeline(steps=steps)
            X, y = pipeline.fit_resample(X, y)

        counter = Counter(y)
        print("Frecuencia das classes SMOTE \n", counter)
        # print(X.columns)

        X['class'] = y
        # print(X.columns)

        lsDF.append(X.copy())

    dfROIs = pd.concat(lsDF)
    dfROIs['system:index'] = dfROIs.index
print("Recall = " ,recall_score(Y_test, Y_pred_tomek))
confusion_matrix(Y_test, Y_pred_tomek)


# #### Tomek Undersampling doesn't seem a good fit for data. There is hardly any increase in recall compared to the vanilla model. Undersampling techniques, even if they provide an increase in the metric of choice, are not favoured since you tend to lose some information when you undersample the majority class of the target. Hence in most cases, what we prefer to perform are Oversampling techniques like Random Oversampling and SMOTE

# In[203]:



from imblearn.over_sampling import RandomOverSampler

#Code starts here

#Initialise the random over sampler object
ros = RandomOverSampler(random_state=0)

#Sample the train data using random over sampling method
X_sample_2, y_sample_2 = ros.fit_sample(X_train, Y_train)

# Using a countplot 
sns.countplot(y_sample_2)

#Initialising a logsitic regression model
model_ros = LogisticRegression()

#Fitting the model with train data
model_ros.fit(X_sample_2, y_sample_2)

#Making predictions on the test data
Y_prediction=model_ros.predict(X_test)
Exemplo n.º 28
0
#from sklearn import tree
from imblearn.over_sampling import RandomOverSampler

df = pd.read_csv('datapagitanpa2019.csv', sep=',')
df2 = pd.read_csv('datapagi2019.csv', sep=',')

data_array = df.values
data_array2 = df2.values
X_train = data_array[:, 1:40]
y_train = data_array[:, 40]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0)

X_test = data_array2[:, 1:40]
y_test = data_array2[:, 40]

ros = RandomOverSampler(random_state=42)

ambilnama = df.drop(columns='tanggal')
ambilnama = ambilnama.drop(columns='ww')
feature_names = list(ambilnama.columns)

X_train = X_train.astype(float)
X_test = X_test.astype(float)
y_train = y_train.astype(float)
y_test = y_test.astype(float)

X_res, y_res = ros.fit_sample(X_train, y_train)

X_res = X_res.astype(float)
y_res = y_res.astype(float)
Exemplo n.º 29
0
if __name__ == '__main__':
    train_flag = 1

    train = get_data(params.train_path + '/234' + params.b_train_file_name)
    test = get_data(params.train_path + '/456' + params.b_train_file_name)
    predicted = get_data(params.train_path + '/567' + params.b_train_file_name)
    train = train.drop(cols, axis=1)
    test = test.drop(cols, axis=1)

    # train = util.get_undersample_data(train)
    # print('undersample', train.shape)
    (X_train, y_train), (X_test, y_test) = get_X_y(train), get_X_y(test)

    print(X_train.shape, y_train.shape)
    X_train, y_train = RandomOverSampler('minority').fit_sample(
        X_train, y_train)
    print(X_train.shape, y_train.shape)

    fs_model = feature_selection(X_train, y_train)
    print(X_train.shape)
    X_train = fs_model.transform(X_train)
    X_test = fs_model.transform(X_test)
    X_predict = fs_model.transform(predicted.drop(cols, axis=1))
    print(X_train.shape)

    if train_flag:
        models, names = get_models()
        estimators = train_predict(models, names, X_train, y_train, X_test,
                                   y_test)
        for estimator, name in zip(estimators, names):
            util.save_to_file(predicted[cols], estimator.predict(X_predict),
Exemplo n.º 30
0
        excl_targ = {'TCGA annotation', 'SURV', 'CNV'}
        tmp = excl_targ.remove(t)

        df = df.drop(columns=excl_targ)
        classes = df[t]
        header = df.columns
        df1 = df.copy(deep=True)  # contains target classes
        df = df.drop(columns=t)  # doesn't contain target classes

        data = np.array(df).astype(np.float)
        data = RobustScaler().fit_transform(data)

        new_data, orig_data, new_classes, orig_classes = train_test_split(
            data, classes, test_size=0.3)

        ros = RandomOverSampler()
        data, classes = ros.fit_sample(new_data, new_classes)

        # Random forests (MetOncoFit)
        feat = (data.shape[1] - 10)
        while (feat < data.shape[1] - 1):
            trees = 5
            while (trees <= 500):
                rfc = RandomForestClassifier(n_estimators=trees,
                                             max_features=feat)
                rfc.fit(data, classes)
                trees = trees + 1500
            feat = feat + 20
            rfc_pred = rfc.predict(orig_data)
            mean_acc = rfc.score(orig_data, orig_classes)
Exemplo n.º 31
0
import sys, os, csv
from imblearn.over_sampling import RandomOverSampler
# Script: read a CSV whose last column is the class label and whose other
# columns are integers, randomly over-sample it, and write the balanced rows
# to "<input name>-ro-.csv".  The header row is skipped and not written out.
input_csv_file = sys.argv[1]
input_csv = input_csv_file.split(".csv")[0]
with open(input_csv_file, newline="") as input_file:
    reader = csv.reader(input_file, delimiter=',')
    with open(input_csv + "-ro-.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        X = []
        y = []
        ros = RandomOverSampler()

        # Drop the header row, if there is one.
        next(reader, None)
        for row in reader:
            y.append(row[-1])
            X.append(list(map(int, row[:-1])))

        X_res, y_res = ros.fit_sample(X, y)
        print (len(X_res))
        print (len(y_res))
        for features, label in zip(X_res, y_res):
            # The label is appended as ONE field.  ``list(label)`` used to
            # split a string label into single characters, so any
            # multi-character label was written as several columns.
            writer.writerow(list(features) + [label])
            
Exemplo n.º 32
0
bagging_fraction = 1
verbosity = 20
num_boost_round = 20000
verbose_eval = 1000
early_stopping_rounds = 200
reg_alpha = 2
reg_lambda = 15

reduction_rate = []
for random_state in range(0, 15):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=random_state)
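    # If random_state = None (the default), a seed is chosen at random, so every run produces a different split. Setting random_state to a fixed value lets anyone re-running the code get exactly the same split and reproduce the same process.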
    X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(
        X_base, y_base, test_size=0.30, random_state=random_state)
    ros = RandomOverSampler(random_state=0)
    X_train, y_train = ros.fit_resample(X_train, y_train)
    X_base_train, y_base_train = ros.fit_resample(X_base_train, y_base_train)
    #min_max_scaler = MinMaxScaler()
    #X_train = min_max_scaler.fit_transform(X_train)
    #X_test = min_max_scaler.fit_transform(X_test)
    #sc = StandardScaler()
    #X_train = sc.fit_transform(X_train)
    #X_test = sc.fit_transform(X_test)

    #converting the dataset into proper LGB format
    train_matrix = lgb.Dataset(X_train, label=y_train)
    valid_matrix = lgb.Dataset(X_test, label=y_test)
    train_matrix_base = lgb.Dataset(X_base_train, label=y_base_train)
    valid_matrix_base = lgb.Dataset(X_base_test, label=y_base_test)
Exemplo n.º 33
0
 def __init__(self):
     """Build ``self.clf``: median imputation, random over-sampling, then logistic regression."""
     imputer = Imputer(strategy='median')
     sampler = RandomOverSampler()
     classifier = LogisticRegression(C=0.010826367338740546, penalty="l2")
     self.clf = make_pipeline_imb(imputer, sampler, classifier)
Exemplo n.º 34
0
# Y_indices = [index for index in tag_to_index_map[Y_tags]]

X = np.array(data[0::5])
# X = np.random.shuffle(X)

max_len = get_max_length(X)

prob_test_matrix = prob_test_matrix(data, max_len)

#format the input of the model
X_train_indices = sentences_to_indices(X, word_to_index, max_len)
Y_train = to_categorical(Y_indices)

#balance the training set
ros = RandomOverSampler(
    random_state=0)  #repeat all tags to the same #of the largest tags
# ros = RandomUnderSampler(replacement=True, random_state=0)    #Reduce the size of largest tags

#shuffle
index = [i for i in range(len(X_train_indices))]
random.shuffle(index)
prob_test_matrix = np.array([prob_test_matrix[i] for i in index])
X_train_indices = np.array([X_train_indices[i] for i in index])
Y_train = np.array([Y_train[i] for i in index])

#split into train and test
X_train = X_train_indices[:int(0.8 * len(X_train_indices))]
X_test = X_train_indices[int(0.8 * len(X_train_indices)):]
Y_test = Y_train[int(0.8 * len(X_train_indices)):]
Y_train = Y_train[:int(0.8 * len(X_train_indices))]
prob_test_matrix_train = prob_test_matrix[:int(0.8 * len(X_train_indices))]
Exemplo n.º 35
0
def balance_data(X, y):
    """Return ``(X, y)`` after random over-sampling with ``random_state=0``."""
    sampler = RandomOverSampler(random_state=0)
    balanced_X, balanced_y = sampler.fit_sample(X, y)
    return balanced_X, balanced_y
Exemplo n.º 36
0
X_average_train = X_add_train / 2

X_concat_test = hstack((X_Q_te, X_A_te))

X_add_test = X_Q_te + X_A_te

X_average_test = X_add_test / 2

#print(X_concat_test)

#print(y_tr)

y_add_tr = y_tr.copy()
y_avg_tr = y_tr.copy()
#to get equal zeroes and ones in order for the machine to actually learn well
ros = RandomOverSampler(random_state=42)
#print(X_concat_train.shape, X_add_train.shape)
X_concat_train, y_tr = ros.fit_resample(X_concat_train, y_tr)

X_add_train, y_add_tr = ros.fit_resample(X_add_train, y_add_tr)

X_average_train, y_avg_tr = ros.fit_resample(X_average_train, y_avg_tr)
print("Done some other stuff")
#using lbfgs
train_vals = X_concat_train, X_add_train, X_average_train, y_tr, y_add_tr, y_avg_tr
test_vals = X_concat_test, X_add_test, X_average_test, y_te, y_te, y_te

dump(train_vals, train_values)
dump(test_vals, test_values)

clf = LogisticRegression(random_state=42,
Exemplo n.º 37
0
dataset.dropna(inplace=True)

# summarize the number of rows and columns in the dataset after listwise drop
(sample, vnum) = dataset.shape
print(sample, vnum)

# Get the number of variables
vnum = vnum - 1

# splice into IVs and DV
values = dataset.values
X = values[:, 0:vnum]
y = values[:, vnum]

# Oversampling
ros = RandomOverSampler(random_state=0)
X_R, y_R = ros.fit_sample(X, y)

# create model
model = Sequential()
model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2)

# calculate predictions
predictions = model.predict(X)
# round predictions
scores = []
for feature in list(data.columns):

	# onehot encode the feature
	feature_data = data[[feature]]
	encoded_feature_data = pd.get_dummies(feature_data)

	print '\n'
	print feature
	print feature_data.shape
	print encoded_feature_data.shape
	print y.shape

	# upsample minority class
	from imblearn.over_sampling import RandomOverSampler
	ros = RandomOverSampler(ratio=0.5)
	X_resampled, y_resampled = ros.fit_sample(encoded_feature_data, y)

	print '\n'
	print X_resampled.shape
	print y_resampled.shape

	# create train and test split
	X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0, test_size=0.2)

	print '\n'
	print 'Training data'
	print X_train.shape
	print y_train.shape

	print 'Testing data'
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

df = pd.read_csv('../Data/validation.csv', header=0)
X_valid = df.iloc[:,0:-1].copy()
Y_valid = df.iloc[:, -1].copy()
#scaler = StandardScaler()
#X_valid = scaler.fit_transform(X_valid)

# Build three alternative rebalanced versions of (X, Y). Every sampler is fit
# on the same original data, so the three results are independent of each other.

# Handle the dataset with undersampling strategy
# (sampling_strategy=0.8 is forwarded to RandomUnderSampler unchanged; no
# random_state is given, so this result is not seeded like the two below)
rus = RandomUnderSampler(sampling_strategy=0.8)
X_res, Y_res = rus.fit_resample(X, Y)

# Handle the dataset with oversampling strategy
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X, Y)

# Handle the dataset with SMOTE
# fit_resample is used here as well, matching the two samplers above; the
# older fit_sample alias is not available in every imbalanced-learn release.
SM = SMOTE(random_state=0)
X_smote, Y_smote = SM.fit_resample(X, Y)

score_infor = [[],[],[],[]]
roc_auc_score_infor = [[],[],[],[]]
f1_score_infor = [[],[],[],[]]

#print(pd.value_counts(Y_smote))
for weight_percent in range(1, 100):

    class_weight = {0: weight_percent, 1: (100-weight_percent)}
Exemplo n.º 40
0
def random_over_sampler(X, y):
    """Resample ``(X, y)`` with a ``RandomOverSampler`` seeded with 42.

    Returns the ``(X_res, y_res)`` pair produced by ``fit_resample``.
    """
    features_out, labels_out = RandomOverSampler(
        random_state=42).fit_resample(X, y)
    return features_out, labels_out
Exemplo n.º 41
0
def random_oversampling(feature_data, feature_label, random_state):
    """Resample the given features and labels with a ``RandomOverSampler``.

    ``random_state`` is forwarded to the sampler's constructor. Returns the
    ``(X_resampled, y_resampled)`` pair produced by ``fit_resample``.
    """
    sampler = RandomOverSampler(random_state=random_state)
    oversampled_features, oversampled_labels = sampler.fit_resample(
        feature_data, feature_label)
    return oversampled_features, oversampled_labels
Exemplo n.º 42
0
    count = 0
    accuracy = np.zeros(split_num)
    loss = np.zeros(split_num)
    aauc = np.zeros(split_num)
    skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    for train_pick, test_pick in skf.split(features, label):
        X_all = features
        X_test = features[test_pick, :]
        X_train = features[train_pick, :]

        y_all = label
        y_test = label[test_pick]
        y_train = label[train_pick]

        if IF_SMOTE:
            ros = RandomOverSampler(random_state=0)
            X_train, y_train = ros.fit_sample(X_train, y_train)
            X_test, y_test = ros.fit_sample(X_test, y_test)

        # data pre-processing
        y_train = np_utils.to_categorical(y_train, num_classes=labelNum)
        y_test = np_utils.to_categorical(y_test, num_classes=labelNum)
        y_all = np_utils.to_categorical(y_all, num_classes=labelNum)

        # We add metrics to get more results you want to see
        model.compile(optimizer=optim,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        print('Training ------------')
        # Another way to train the model
Exemplo n.º 43
0
    = train_test_split(X, y, test_size=0.20, random_state=42)

# Split train_val data into training set and validation set
X_train, X_val, y_train, y_val \
    = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# ==========================================================================================

# Over-sampled data

# Generate the new datasets using over-sampling methods
verbose = False
ratio = 'auto'

# 'Random over-sampling'
OS = RandomOverSampler(ratio=ratio, verbose=verbose)
X_train_os, y_train_os = OS.fit_sample(X_train, y_train)

# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
X_train_smo, y_train_smo = smote.fit_sample(X_train, y_train)

# 'SMOTE borderline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
X_train_bs1, y_train_bs1 = bsmote1.fit_sample(X_train, y_train)

# 'SMOTE borderline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
X_train_bs2, y_train_bs2 = bsmote2.fit_sample(X_train, y_train)

# 'SMOTE SVM'
Exemplo n.º 44
0
file = open("CrashSCDect_perforance_results.csv","w")
file.write("classifier,f1,auc\n")

def store_results(method_name: str, f1_resuts, auc_results):
        """Print the mean F1/AUC for one classifier and append its per-fold rows.

        One ``method_name,f1,auc`` line is written to the module-level ``file``
        handle for every index of ``f1_resuts``; ``auc_results`` is read at the
        same indices.
        """
        mean_f1 = np.mean(f1_resuts)
        mean_auc = np.mean(auc_results)
        print("%s performance: f1:%s auc:%s" % (method_name, mean_f1, mean_auc))

        for fold_idx in range(len(f1_resuts)):
                row = ",".join([method_name,
                                str(f1_resuts[fold_idx]),
                                str(auc_results[fold_idx])])
                file.write(row + "\n")


folds =0
for folds in range(10):
        print("Folds: %s"%(folds))

        # 
        sampled = RandomOverSampler(random_state=folds)
        X_sampled,Y_sampled = sampled.fit_resample(pydata.drop(['label'], axis=1), pydata['label'])

        print(X_sampled.shape)
        print(Y_sampled.shape)
        strKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=folds)
        scoring = ['roc_auc', 'f1']



        rf = RandomForestClassifier()
        results_rf = cross_validate(
                rf,
                X=X_sampled,
                y=Y_sampled,
                cv=strKFold,
Exemplo n.º 45
0
from imblearn.under_sampling import RandomUnderSampler, NearMiss

from csgo_cheater_detection.config.config import data_path, seed, random_state

# load data
df = pd.read_csv(f'{data_path}\\csgo_cheater_data_8_30_20_full.csv')

# Split X and y
y = df.pop('label')
X = df

# set seeds
np.random.seed(seed)

# Random Over-Sampling
ros = RandomOverSampler(random_state=random_state)
X_temp, y_temp = ros.fit_resample(X, y)

X_temp['label'] = y_temp
print(X_temp.columns)

# parameters
sampling_methods = {
    'random_over': RandomOverSampler(),
    'SMOTE': SMOTE(),
    'random_under': RandomUnderSampler(),
    'near_miss': NearMiss()
}

# loop
for name, method in sampling_methods.items():
Exemplo n.º 46
0
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'.format(sampling_strategy,
                                             Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'.format(sampling_strategy,
                                             Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` has a ``str``
# ...................................
#
# ``sampling_strategy`` can be given as a string which specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
plt.tight_layout()
plt.show()

# In[21]:

assert Counter(y_rus)[1] == Counter(y_train)[
    1]  #Checking if they have the same number of fraud cases

# <a id='ros'></a>

# ## Random oversampling

# In[22]:

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

# Checking If classes are balanced:

# In[23]:

plt.bar(['Non-Fraud', 'Fraud'],
        [Counter(y_ros)[0], Counter(y_ros)[1]],
        color=['b', 'r'])
plt.xlabel('Class')
plt.ylabel('Number of transactions')
plt.annotate('{}'.format(Counter(y_ros)[0]), (0.20, 0.45),
             xycoords='axes fraction')
plt.annotate('{}'.format(Counter(y_ros)[1]), (0.70, 0.45),
             xycoords='axes fraction')
Exemplo n.º 48
0
	weights=[0.1, 0.9],
    n_informative=3, 
	n_redundant=1, 
	flip_y=0,
	n_features=20, 
	n_clusters_per_class=1,
	n_samples=200, 
	random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
Exemplo n.º 49
0
    def use_parameters(self, X_train, selected_features):
        '''
        Build the hyper-parameter search grid for the XGBoost pipeline.

        The grid combines every candidate scaler and sampler with a fixed set
        of ``model__*`` value lists. When ``X_train`` contains missing values,
        an ``imputer__strategy`` entry is added so that several imputation
        strategies are searched as well.

        Parameters
        ----------
        X_train : object exposing ``isna().sum().sum()`` (e.g. a DataFrame)
            Training features; only inspected for missing values.
        selected_features
            Stored unchanged under the ``'feat__cols'`` grid key.

        Returns
        -------
        list of dict
            A one-element list holding the parameter grid dictionary.
        '''
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]

        ### XGBOOST
        # NOTE(review): 'logloss' looks like an evaluation-metric name rather
        # than an XGBoost objective (e.g. 'binary:logistic') -- confirm.
        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__objective': ['logloss'],
            'model__learning_rate': [0.005, 0.01, 0.05, 0.1,
                                     0.5],  # so called `eta` value
            'model__max_depth': [3, 4, 5],
            'model__min_child_weight': [1, 5, 11, 12, 15],
            'model__silent': [0],
            'model__subsample': [0.6, 0.8, 1.0],
            'model__colsample_bytree': [0.6, 0.8, 1.0],
            'model__n_estimators':
            [5, 50,
             100],  # number of trees, change it to 1000 for better results
            'model__missing': [-999],
            'model__gamma': [0.5, 1, 1.5, 2, 5],
            'model__seed': [1337]
        }]

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # `parameters` is a list wrapping a single grid dict, so the entry
            # must be set on that dict; indexing the list with a string key
            # would raise TypeError.
            parameters[0]['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

            print("Selected Parameters: ", parameters)
        print("Parameters defined in the input: ", parameters)

        ### XGBOOST
        return parameters
Exemplo n.º 50
0
def load_features(task):
    """Load, join, resample and shuffle the features/outcomes for one task.

    Parameters
    ----------
    task : dict
        Task description. Keys read here: ``'pref_id'`` (log file name),
        ``'dataset'`` (``'dataset'`` or ``'field'``),
        ``'outcome_variable_name'``, ``'prediction_task'`` and
        ``'sampling_mode'``. Keys written here: ``'features_df_file_name'``,
        ``'outcomes_df_file_name'`` and ``'id_field'``.

    Returns
    -------
    The result of ``util.unison_shuffle(X, y)`` applied to the (optionally
    resampled) features and the integer-encoded outcomes.

    Raises
    ------
    AssertionError
        If ``task['dataset']`` is neither ``'dataset'`` nor ``'field'``.

    Notes
    -----
    ``task['sampling_mode']`` selects the resampling step: ``'over'`` uses
    ``RandomOverSampler``, ``'under'`` uses ``RandomUnderSampler``, an ``int``
    draws that many samples per outcome with ``resample``, and anything else
    leaves the data unresampled.
    """

    log_file = log_dir + 'loading_task_' + str(task['pref_id']) + '.txt'
    load_logger = logger(log_file, task)

    # Allowed outcome values per outcome variable and prediction task,
    # for chart-level ("dataset") tasks.
    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': list(range(5))
        },
        'num_y_axes': {
            'numeric': list(range(5))
        }
    }

    # Same lookup for field-level tasks.
    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    # Pick the input files, the join key and the outcome lookup for the
    # requested granularity; the choices are recorded back into `task`.
    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = 'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes

    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)

    # Use a context manager so the lookup file is closed deterministically.
    # NOTE(review): pickle.load can execute arbitrary code from the file; this
    # is only safe while the lookup file is a trusted, locally built artefact.
    lookup_path = join(features_directory, feature_set_lookup_file_name)
    with open(lookup_path, 'rb') as lookup_file:
        feature_names_by_type = pickle.load(lookup_file)

    load_logger.log('Initial Features: ' + str(features_df.shape))
    load_logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    if task['dataset'] == 'field':
        # Derive two extra field-level outcome columns from the raw ones.
        def is_x_or_y(is_xsrc, is_ysrc):
            if is_xsrc and pd.isnull(is_ysrc): return 'x'
            if is_ysrc and pd.isnull(is_xsrc): return 'y'
            else:                              return None
        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc']

    outcome_name = task['outcome_variable_name']
    outcomes_df_subset = format_outcomes_df(
        load_logger, outcomes_df,
        outcome_name,
        prediction_task_to_outcomes[outcome_name][task['prediction_task']],
        id_field=task['id_field'])

    final_df = join_features_and_outcomes(features_df, outcomes_df_subset, on=task['id_field'])
    last_index = final_df.columns.get_loc(outcome_name)

    # Everything left of the outcome column is a feature; the outcome column
    # itself becomes y.
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    load_logger.log('Final DF Shape: ' + str(final_df.shape))
    load_logger.log('Last Index: ' + str(last_index))

    load_logger.log('Intermediate Outcomes: ' + str(y.shape))
    load_logger.log('Value counts: \n' + str(y.value_counts()))

    # delete variables to save memory!
    del final_df, outcomes_df

    # NOTE(review): the per-feature-set column indices are computed but not
    # used further down; the loop is kept so the lookup helper is still
    # exercised exactly as before.
    task_types = ['dimensions', 'types', 'values', 'names']
    for task_name in task_types:
        names = get_feature_set_names_by_type(
            feature_names_by_type,
            task_type=task['dataset'],
            feature_set=task_name)
        indices = [X.columns.get_loc(c) for c in names if c in X.columns]

    # Encode outcomes as integer class ids: one-hot encode, then take the
    # position of the hot column.
    y = pd.get_dummies(y).values.argmax(1)

    # NOTE(review): newer imbalanced-learn releases expose `fit_sample` as
    # `fit_resample`; confirm which name the installed version provides. Only
    # the last two branches cast X to float64 explicitly.
    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
    elif isinstance(task['sampling_mode'], int):
        # Draw exactly `sampling_mode` rows for every outcome value.
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask],
                y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE
            )
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)

        X, y = np.concatenate(X_resampled_arrays).astype(
            np.float64), np.concatenate(y_resampled_arrays)
    else:
        X, y = X.values.astype(np.float64), y

    load_logger.log('Final Features:' + str(X.shape))
    load_logger.log('Final Outcomes:' + str(y.shape))
    unique, counts = np.unique(y, return_counts=True)
    load_logger.log('Value counts after sampling:')
    load_logger.log_dict(dict(zip(unique, counts)))
    load_logger.log('\n')

    del load_logger
    return util.unison_shuffle(X, y)
Exemplo n.º 51
0
def runns(resp_var, size_of_test_data,dataset,positive_class,n_estimators,important_features,dealing_with_nulls):
	"""Train an RBF-kernel SVC on over-sampled data and return test metrics as JSON.

	Parameters:
		resp_var: name of the response column in the data.
		size_of_test_data: forwarded as ``test_size`` to ``train_test_split``.
		dataset: currently ignored -- it is overwritten by reading
			'raw_data.csv' on the first line of the body.
		positive_class: forwarded as ``pos_label`` to the precision, recall
			and F1 scorers.
		n_estimators: number of trees for the RandomForestClassifier used to
			compute feature importances.
		important_features: how many of the top-ranked importances are sliced
			out (the slice is not used afterwards).
		dealing_with_nulls: forwarded to ``deal_with_nulls``.

	Returns:
		``jsonify({"Predictions": output})`` where ``output`` is the
		``json.dumps`` string of the metric dictionary.
	"""
	# NOTE(review): this replaces the caller-supplied `dataset` with a fixed CSV
	# file; the original comment marks it as temporary -- confirm before release.
	dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
	#----DATA PREPROCESSING
	#-------dealing with NULL values in the data
	#----------remove the rows in which the response is null
	dataset=dataset.dropna(subset=[resp_var])
	#----------dealing with nulls
	dataset=deal_with_nulls(dealing_with_nulls,dataset)
	#----FEATURE SELECTION
	#-------get predictors important in predicting the response
	#-----------transform categorical predictors to dummy variables
	predictors=dataset.drop(resp_var,axis=1,inplace=False)
	predictors=pd.get_dummies(predictors)
	#-----------balance the classes in the response var
	# NOTE(review): over-sampling happens before the train/test split below, so
	# duplicated rows may land on both sides of the split -- confirm intent.
	ros = RandomOverSampler(random_state=0)
	resp=dataset[resp_var]
	prds, resp = ros.fit_sample(predictors, resp)
	#-----------fit the random forest classifier to give us the important predictors
	rf_clf = RandomForestClassifier(n_estimators=n_estimators)
	rf_clf.fit(prds,resp)
	#-------get the important predictors
	feature_imp = pd.Series(rf_clf.feature_importances_,
                    index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
	#-------names of the important predictors
	# NOTE(review): `important_predictor_names` is not used again; the dataset
	# rebuilt below keeps every predictor column, not only the important ones.
	important_predictor_names = feature_imp.index[0:important_features]
	#-------rebuild the dataset from the resampled response and predictors
	resp=pd.DataFrame(data=resp,columns=[resp_var])
	predictors=pd.DataFrame(prds,columns=list(predictors))
	dataset=pd.concat([resp,predictors],axis=1)
	#---------------------------------------------------------
	#----MODEL TRAINING
	#--------Remove the response variables from the features variables - axis 1 refers to the columns
	m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
	# Response variables are the values we want to predict
	# (from here on `resp_var` holds the label array, no longer the column name)
	resp_var = np.array(dataset[resp_var])

	dataset = pd.get_dummies(m_data)

	# Saving feature names for later use
	feature_list = list(m_data.columns)
	# Convert to numpy array
	dataset = np.array(dataset)

	# Split the data into training and testing sets
	train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = size_of_test_data, random_state = 402)

	# Instantiate an RBF-kernel SVC with probability estimates enabled
	clf = SVC(kernel='rbf',probability=True)

	# Train the model on training data
	clf.fit(train_features, train_labels)
	# evaluation
	predicted = clf.predict(test_features)
	pred_prob = clf.predict_proba(test_features)

	accuracy = accuracy_score(test_labels, predicted)
	#confusion matrix (computed but not included in the returned output)
	cnf = (confusion_matrix(test_labels,predicted))
	#precision score
	precision = precision_score(test_labels,predicted,pos_label=positive_class)
	#avg precision, using the second probability column as the score
	avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
	#recall score
	rec = recall_score(test_labels,predicted,pos_label=positive_class)
	#f1 score
	fscore = f1_score(test_labels,predicted,pos_label=positive_class)
	#fbeta score
	fbeta = fbeta_score(test_labels,predicted,beta=0.5)
	#hamming_loss
	hamming = hamming_loss(test_labels,predicted)
	#jaccard similarity score
	jaccard = jaccard_similarity_score(test_labels,predicted)
	#logloss
	# NOTE(review): log_loss receives the hard predictions here rather than
	# `pred_prob` -- confirm this is intended.
	logloss = log_loss(test_labels,predicted)
	#zero-oneloss
	zero_one = zero_one_loss(test_labels,predicted)
	#auc roc 
	area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
	#cohen_score
	cohen = cohen_kappa_score(test_labels,predicted)
	#mathews corr
	mathews = matthews_corrcoef(test_labels,predicted)
	# Variable importances from the important features selection stage
	# (built but not included in the returned output)
	variable_importance_list = list(zip(prds, feature_imp))
	output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
	# The metrics are serialised to a JSON string first and that string is then
	# wrapped again by jsonify under the "Predictions" key.
	output=json.dumps(output)
	return jsonify({"Predictions": output})
Exemplo n.º 52
0
    clf_name = "knn"
    resultFolder = "/home/sherlock/Internship@iit/exudate-detection/" + clf_name + "_results-exudates/"
    clf = KNeighborsClassifier(n_neighbors=10)
    clf.fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    print("accuracy")
    print(accuracy_score(Y_test, Y_predicted))
    print("confusion matrix")
    print(confusion_matrix(Y_test, Y_predicted))
    writeResults(DestinationFolder, resultFolder, name_array, clf_name,
                 Y_predicted)

    print("DONE_-------------------x----xxxxx-xx-x")

    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_sample(X_train, Y_train)
    print("when balanced classes : ")
    print(sorted(Counter(Y_resampled).items()))

    print("RANDOM FOREST")
    clf_name = "rf"
    resultFolder = "/home/sherlock/Internship@iit/exudate-detection/" + clf_name + "_BAL_results-exudates/"
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X_resampled, Y_resampled)
    Y_predicted = clf.predict(X_test)
    print("accuracy")
    print(accuracy_score(Y_test, Y_predicted))
    print("confusion matrix")
    print(confusion_matrix(Y_test, Y_predicted))
    writeResults(DestinationFolder, resultFolder, name_array, clf_name,
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
Exemplo n.º 54
0
# Encode the raw target labels as integers.
encoder = preprocessing.LabelEncoder()
encoder.fit(target)
encoded_Y = encoder.transform(target)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# separate data into training and (validation + testing) datasets in a 70/30 (20/10) proportion:
# the first split holds out 30%, the second split sends 33% of that hold-out to the test set.
X_train, X_partial, y_train, y_partial = train_test_split(
    features, dummy_y, test_size=0.3, random_state=rand_state)
X_val, X_test, y_val, y_test = train_test_split(X_partial,
                                                y_partial,
                                                test_size=0.33,
                                                random_state=rand_state)

# Resample the training data only. Both samplers are constructed, but just the
# under-sampler is applied; the over-sampling call is left commented out.
ros = RandomOverSampler(sampling_strategy='minority', random_state=12)
rus = RandomUnderSampler(random_state=12, replacement=True)
# X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Reobtain the correct training, validation and testing datasets by keeping
# only the columns named in `features_list`; the targets are passed through.
# NOTE(review): `.loc` requires X_train_res to still be a DataFrame after
# resampling -- confirm for the installed imbalanced-learn version.
X_train_reduced = X_train_res.loc[:, features_list]
y_train_reduced = y_train_res  #X_train_res.loc[:, targets_list] # Yes, X_train_res is correct

X_val_reduced = X_val.loc[:, features_list]
y_val_reduced = y_val  # X_val.loc[:, targets_list]

X_test_reduced = X_test.loc[:, features_list]
y_test_reduced = y_test  #X_test.loc[:, targets_list]

# Samples no_zeros giving it the same number of values for all vb_slice ranges