Пример #1
0
    def train_test(self, data_type=None, test_size=0.20, random_state=None):
        """Split the data selected by ``data_type`` into train and test sets.

        When ``test_size`` is positive, the data is split with a stratified
        ``train_test_split`` and the training part is then balanced with
        ``RandomUnderSampler``. Otherwise every row goes to the training set
        unchanged and the test set is empty.

        Args:
            data_type: Passed through to ``self._get_data``.
            test_size: Forwarded to ``train_test_split``; a value ``<= 0``
                disables both the split and the under-sampling.
            random_state: Seed forwarded to the split and to the sampler.

        Returns:
            A ``(train, test)`` tuple; each element is the result of
            ``self._get_data3`` applied to a ``DataXyz`` built from the
            corresponding features and target.
        """
        data_xy = self._get_data(data_type)

        if test_size > 0:
            x_train, x_test, y_train, y_test = train_test_split(
                data_xy.x,
                data_xy.y,
                stratify=data_xy.y,
                random_state=random_state,
                test_size=test_size)

            # fit_resample fits the sampler itself, so no separate fit() call
            # is needed beforehand.
            sampler = RandomUnderSampler(random_state=random_state)
            x_balanced, y_balanced = sampler.fit_resample(x_train, y_train)

            # Restore the original column names / series name on the output.
            x_train = pd.DataFrame(x_balanced, columns=x_train.columns)
            y_train = pd.Series(y_balanced, name=y_train.name)
        else:
            x_train = data_xy.x
            y_train = data_xy.y
            # Empty placeholders that keep the schema of the real data.
            x_test = pd.DataFrame(data=None, columns=x_train.columns)
            y_test = pd.Series(data=None,
                               name=y_train.name,
                               dtype=y_train.dtype)

        train = self._get_data3(DataXyz(data_xy.code, x_train, y_train))
        test = self._get_data3(DataXyz(data_xy.code, x_test, y_test))

        return (train, test)
Пример #2
0
class ImbalancedClassResampler():
    """Select and apply an imbalanced-learn resampler by name.

    ``method`` is one of the class constants ``SMOTE`` or
    ``RANDOM_UNDERSAMPLE``, or ``None`` to make every operation a no-op
    that hands the data back unchanged.
    """

    SMOTE = "SMOTE"
    RANDOM_UNDERSAMPLE = "RANDOM_UNDERSAMPLE"

    def __init__(self, method=None, n_process=1):
        """Store the configuration; the resampler itself is built in ``fit``.

        Args:
            method: Resampling strategy name, or ``None`` for no resampling.
            n_process: Passed as ``n_jobs`` to ``SMOTE``.
        """
        self.method = method
        self.n_process = n_process
        self.resampler = None

    def fit(self, x, y):
        """Build the resampler for ``self.method`` and fit it on ``x``, ``y``.

        Returns:
            ``self``, for every method (so calls can be chained).

        Raises:
            ValueError: If ``self.method`` is not a known strategy.
        """
        if self.method is None:
            return self
        if self.method == ImbalancedClassResampler.SMOTE:
            self.resampler = SMOTE(n_jobs=self.n_process)
        elif self.method == ImbalancedClassResampler.RANDOM_UNDERSAMPLE:
            self.resampler = RandomUnderSampler()
        else:
            raise ValueError(
                "Unknown resampling method: {!r}".format(self.method))
        self.resampler.fit(x, y)
        return self

    def get_params(self, deep=True):
        """Return the tunable parameters as a dict (only ``method``)."""
        return {"method": self.method}

    def set_params(self, method):
        """Set the resampling method and return ``self``."""
        self.method = method
        return self

    def resample(self, x, y):
        """Return the resampled ``(x, y)``; unchanged when ``method`` is None.

        Raises:
            RuntimeError: If a method is configured but ``fit`` has not been
                called yet.
        """
        if self.method is None:
            return x, y
        if self.resampler is None:
            raise RuntimeError("fit() must be called before resample()")
        return self.resampler.fit_resample(x, y)

    def fit_resample(self, x, y):
        """Fit on ``x``, ``y`` and return the resampled ``(x, y)``."""
        self.fit(x, y)
        return self.resample(x, y)
Пример #3
0
def balance_classes(X_train, y_train):
    """Balance the classes of a training set by random under-sampling.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` as produced by
        ``RandomUnderSampler`` seeded with ``random_state=42``.
    """
    print('UnderSample Data - Balance Classes')
    rus = RandomUnderSampler(random_state=42)

    # Fit and resample in one step. Newer imbalanced-learn releases expose
    # ``fit_resample``; older ones only provide the equivalent ``fit_sample``.
    fit_and_resample = getattr(rus, 'fit_resample', None) or rus.fit_sample
    X_balanced, y_balanced = fit_and_resample(X_train, y_train)

    print('After Balancing the new size is {0}'.format(len(X_balanced)))
    return X_balanced, y_balanced
Пример #4
0
def test_rus_sample_wrong_X():
    """Check that an error is raised when the X passed to ``sample``
    differs from the X used for fitting."""

    # Fit the sampler on the reference data
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Freshly generated data must be rejected by sample()
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_rus_sample_wrong_X():
    """Sampling with an X other than the fitted one must raise a
    RuntimeError."""

    # Fit on the reference data first
    under_sampler = RandomUnderSampler(random_state=RND_SEED)
    under_sampler.fit(X, Y)

    # Random features plus a 50/50 label vector, unrelated to the fit data
    unseen_features = np.random.random((100, 40))
    unseen_labels = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, under_sampler.sample,
                  unseen_features, unseen_labels)
Пример #6
0
def test_rus_fit():
    """Check the statistics recorded on the sampler by ``fit``."""

    # Build the sampler and fit it on the reference data
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class labels stored by fit
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-class counts stored by fit
    class_counts = sampler.stats_c_
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 4500)
def test_rus_fit():
    """Fitting must populate ``min_c_``, ``maj_c_`` and ``stats_c_``."""

    # Create and fit the under-sampler
    under_sampler = RandomUnderSampler(random_state=RND_SEED)
    under_sampler.fit(X, Y)

    # Verify the class labels computed during fit
    assert_equal(under_sampler.min_c_, 0)
    assert_equal(under_sampler.maj_c_, 1)

    # Verify the per-class counts computed during fit
    counts_by_class = under_sampler.stats_c_
    assert_equal(counts_by_class[0], 3)
    assert_equal(counts_by_class[1], 7)
Пример #8
0
class RatioRandomUnderSampler(RandomUnderSampler):
    """Random under-sampler driven by a target share of positive samples.

    ``fit`` counts the samples labelled ``1`` and builds an inner
    ``RandomUnderSampler`` whose ``ratio`` keeps that many samples of
    class ``1`` and ``int(pos * (1 - pos_ratio) / pos_ratio)`` samples of
    class ``0``. ``sample`` delegates to that inner sampler.
    """

    def __init__(self, pos_ratio, random_state=0):
        """Store ``pos_ratio``; the inner sampler is created by ``fit``."""
        super(RatioRandomUnderSampler, self).__init__(
            random_state=random_state)
        self.pos_ratio = pos_ratio
        self.ratio_sampler = None

    def fit(self, X, y):
        """Derive the per-class target counts from ``y`` and fit the
        inner sampler. Returns ``self``."""
        n_pos = len(y[y == 1])
        neg_per_pos = (1 - self.pos_ratio) / self.pos_ratio
        n_neg = int(n_pos * neg_per_pos)

        target_counts = {0: n_neg, 1: n_pos}
        self.ratio_sampler = RandomUnderSampler(
            random_state=self.random_state, ratio=target_counts)
        self.ratio_sampler.fit(X, y)
        return self

    def sample(self, X, y):
        """Resample ``X``, ``y`` with the inner sampler built by ``fit``."""
        return self.ratio_sampler.sample(X, y)
Пример #9
0
def main():
    """Write ten randomly under-sampled copies of each project data set.

    For each of the seven ``<k>_original.csv`` files in
    ``<directory>/original``, class ``0`` is under-sampled to 10% of the
    number of non-minority rows. This is repeated ten times with unseeded
    random draws, and every result is written to
    ``<directory>/random_sampled/files/<k>_random_sampled_<a..j>.csv`` with
    the label appended as the last column.
    """
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')
    out_dir = directory + '/random_sampled/files'

    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)

        # Read files and get the data
        data, target, minor = read_file(filename)

        # Get the new undersampled size for the majority class
        ratio = round((len(data) - minor) * 0.1)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))
        print('ratio: ' + str(ratio))

        ### Important to finish significance test for this part! ###
        # Repeat this 10 times ('a'..'j'), get 10 random datasets
        for c in ascii_lowercase[:10]:
            print(c)
            # Fit the training data for the undersampled size and get the
            # new resampled training set
            rus = RandomUnderSampler(random_state=None, ratio={0: ratio})
            rus.fit(data, target)
            X_resampled, y_resampled = rus.sample(data, target)
            print('resampled total: ' + str(len(X_resampled)))

            # Print number of buggy and number of non buggy
            bug = sum(1 for y in y_resampled if y == 1)
            not_bug = len(y_resampled) - bug
            print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))

            # Store the new dataset in a new file. newline='' lets the csv
            # module control line endings (avoids blank rows on Windows).
            new_filename = os.path.join(
                out_dir, str(k) + '_random_sampled_' + c + '.csv')
            with open(new_filename, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerows(
                    np.append(d, t)
                    for d, t in zip(X_resampled, y_resampled))
Пример #10
0
def main():
    """Write a down-sampled copy of each project data set.

    For each of the seven ``<k>_original.csv`` files in
    ``<directory>/original``, class ``0`` is under-sampled to ``minor``
    rows (the count returned by ``read_file``). The result is written to
    ``<directory>/down_sampled/<k>_down_sampled.csv`` with the label
    appended as the last column.
    """
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')
    out_dir = directory + '/down_sampled'

    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)

        # Read files and get the data
        data, target, minor = read_file(filename)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))

        # Under-sample class 0 down to the minority count and get the new
        # resampled training set
        rus = RandomUnderSampler(random_state=RANDOM_STATE, ratio={0: minor})
        rus.fit(data, target)
        X_resampled, y_resampled = rus.sample(data, target)
        print('resampled total: ' + str(len(X_resampled)))

        # Print number of buggy and number of non buggy
        bug = sum(1 for y in y_resampled if y == 1)
        not_bug = len(y_resampled) - bug

        print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))

        # Store the new dataset in a new file. newline='' lets the csv
        # module control line endings (avoids blank rows on Windows).
        new_filename = os.path.join(out_dir, str(k) + '_down_sampled.csv')
        with open(new_filename, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(
                np.append(d, t) for d, t in zip(X_resampled, y_resampled))
Пример #11
0
            fmt='d',
            xticklabels=category_id_df.category.values,
            yticklabels=category_id_df.category.values)
# Label and display the figure built by the preceding plotting call.
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Per-class precision/recall/F1 report for the current predictions.
print(
    metrics.classification_report(y_test,
                                  y_pred,
                                  target_names=data['category'].unique()))

# Resampling/Undersampling: rebalance the full feature/label set, then
# re-split, refit and re-evaluate the same model.

rus = RandomUnderSampler(ratio=.8, random_state=0)
rus.fit(features, labels)
X_resampled, y_resampled = rus.sample(features, labels)

# NOTE(review): the split happens after under-sampling, so the 10% test set
# is drawn from the rebalanced data rather than the original distribution.
X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled,
                                                    test_size=0.1,
                                                    random_state=0)

# `model` is fitted in place here; `clf` holds whatever model.fit() returns.
clf = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

# Confusion matrix of the refitted model on the resampled test split.
conf_mat = confusion_matrix(y_test, y_pred)
# Share of each class among all counted rows.
# NOTE(review): the printed messages talk about "normal"/"fraud"
# transactions although the variables count promotions — confirm wording.
Percentage_of_no_promotion = Count_no_promotion / (Count_no_promotion +
                                                   Count_promotion)
print("percentage of normal transacation is", Percentage_of_no_promotion * 100)
Percentage_of_promotion = Count_promotion / (Count_no_promotion +
                                             Count_promotion)
print("percentage of fraud transacation", Percentage_of_promotion * 100)
# In[15]:
# Data slice: columns at positions 1..12 are the features, everything from
# position 13 onward is the target.
X = data.iloc[:, 1:13]
Y = data.iloc[:, 13:]
# In[12]:
# Resampling - under-sampling
from imblearn.under_sampling import RandomUnderSampler
# In[16]:
rus = RandomUnderSampler(random_state=0)
rus.fit(X, Y)
X_resampled, y_resampled = rus.sample(X, Y)
# Wrap the resampled output in DataFrames again, with explicit column names.
Y = pd.DataFrame(y_resampled, columns=['is_promoted'])
X = pd.DataFrame(X_resampled,
                 columns=[
                     'department', 'region', 'education', 'gender',
                     'recruitment_channel', 'no_of_trainings', 'age',
                     'previous_year_rating', 'length_of_service',
                     'KPIs_met >80%', 'awards_won?', 'avg_training_score'
                 ])
# In[18]:
from sklearn.preprocessing import LabelEncoder
# One encoder object is refitted for every column, so after these lines it
# only retains the mapping of the last column encoded (education).
label_encoder = LabelEncoder()
X.department = label_encoder.fit_transform(X.department)
X.region = label_encoder.fit_transform(X.region)
X.education = label_encoder.fit_transform(X.education)
Пример #13
0
# Accumulators for per-fold results and for the sampled majority-class rows.
all_auc_with_clustered_trees = []
all_auc_with_one_tree = []
# NOTE(review): 1294 is presumably the number of feature columns of X — confirm.
X_train_major = np.zeros((0, 1294))
y_train_major = []

avg_roc = 0
avg_aupr = 0

for train_index, test_index in skf.split(X, y):
    X_train = X[train_index]
    X_test = X[test_index]

    y_train = y[train_index]
    y_test = y[test_index]

    # Fit the sampler once and reuse its per-class counts to find the
    # most frequent class of this fold.
    class_counts = sampler.fit(X_train, y_train).stats_c_
    major_class = max(class_counts, key=class_counts.get)

    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []

    # Partition the fold's training rows into majority / non-majority.
    for row, label in zip(X_train, y_train):
        if label == major_class:
            major_class_X_train.append(row)
            major_class_y_train.append(label)
        else:
            minor_class_X_train.append(row)
            minor_class_y_train.append(label)
def create_model(dataset,
                 data_dir='/home/farshid/Desktop/',
                 number_of_clusters=23):
    """Plot ROC curves for random vs. cluster-based under-sampling.

    The CSV file is read without a header and its last column is used as
    the class label. Features are normalised and the first fold of a
    shuffled stratified 5-fold split is used as the train/test split.
    Two AdaBoost models are then trained and their class-1 ROC curves
    plotted:

    * a baseline trained on randomly under-sampled training data;
    * a model trained on training data where the majority class was
      clustered with KMeans and half of each cluster was drawn at random.

    Args:
        dataset: File name of the CSV, appended to ``data_dir``.
        data_dir: Directory prefix of the CSV (must end with a separator).
        number_of_clusters: Number of KMeans clusters for the majority class.

    Returns:
        None. The function prints progress and shows a matplotlib figure.
    """
    print("dataset : ", dataset)
    df = pd.read_csv(data_dir + dataset, header=None)

    print('reading', dataset)
    # Copy the last CSV column to 'label', drop the original, encode labels.
    last_column = df.shape[1] - 1
    df['label'] = df[last_column]
    df.drop([last_column], axis=1, inplace=True)
    df['label'] = LabelEncoder().fit_transform(df['label'])

    X = Normalizer().fit_transform(np.array(df.drop(['label'], axis=1)))
    y = np.array(df['label'])
    n_classes = 2

    # Only the first fold of the stratified split is evaluated.
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    train_index, test_index = next(iter(skf.split(X, y)))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    print('training', dataset)

    # ---- Baseline: random under-sampling --------------------------------
    # Resample into separate variables so the untouched training fold is
    # still available for the cluster-based approach below.
    sampler = RandomUnderSampler()
    X_rus, y_rus = sampler.fit_sample(X_train, y_train)

    top_roc = 0
    fpr = dict()
    tpr = dict()
    # NOTE(review): with a step of 20 each range yields only the value 3, so
    # exactly one (depth, split) combination is tried — confirm intent.
    for depth in range(3, 20, 20):
        for split in range(3, 9, 20):
            classifier = AdaBoostClassifier(DecisionTreeClassifier(
                max_depth=depth, min_samples_split=split),
                                            n_estimators=100,
                                            learning_rate=1,
                                            algorithm='SAMME')
            classifier.fit(X_rus, y_rus)

            predictions = classifier.predict_proba(X_test)
            score = roc_auc_score(y_test, predictions[:, 1])

            # Keep the ROC curves of the best-scoring configuration.
            if top_roc < score:
                top_roc = score
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test, predictions[:, i])

    # ---- Cluster-based under-sampling of the majority class --------------
    classes, counts = np.unique(y_train, return_counts=True)
    major_class = classes[np.argmax(counts)]
    is_major = y_train == major_class
    major_X, major_y = X_train[is_major], y_train[is_major]
    minor_X, minor_y = X_train[~is_major], y_train[~is_major]

    # optimize for number of clusters here
    # NOTE(review): recent scikit-learn releases no longer accept n_jobs for
    # KMeans — drop the argument when upgrading.
    kmeans = KMeans(max_iter=200, n_jobs=4, n_clusters=number_of_clusters)
    kmeans.fit(major_X)

    # From every cluster draw half as many rows as it has members. The
    # indices are positions within major_X (the data KMeans was fitted on)
    # and are drawn with replacement, so a row may be picked more than once.
    picked = []
    for cluster in range(number_of_clusters):
        members = np.where(kmeans.labels_ == cluster)[0]
        size = len(members)
        if size == 0:
            continue
        random_indexes = np.random.randint(low=0,
                                           high=size,
                                           size=int(size / 2))
        picked.append(members[random_indexes])
    picked = np.concatenate(picked) if picked else np.array([], dtype=int)

    final_train_x = np.concatenate((major_X[picked], minor_X), axis=0)
    final_train_y = np.concatenate((major_y[picked], minor_y), axis=0)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=150))
    classifier.fit(final_train_x, final_train_y)

    # ROC curves of the cluster-sampled model, from its own predictions.
    predicted = classifier.predict_proba(X_test)
    tpr_c = dict()
    fpr_c = dict()
    for i in range(n_classes):
        fpr_c[i], tpr_c[i], _ = roc_curve(y_test, predicted[:, i])

    print('ploting', dataset)
    plt.plot(fpr_c[1],
             tpr_c[1],
             lw=2,
             color='red',
             label='Roc curve: Clustered sampling')

    plt.plot(fpr[1],
             tpr[1],
             lw=2,
             color='navy',
             label='Roc curve: random under sampling')
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Area under ROC curve')
    plt.legend(loc="lower right")
    plt.show()
Пример #15
0
# Show the histogram of the SMOTE-resampled iPhone data built earlier.
plot(hist_iphone_smote)

# SMOTE over-sampling of the galaxy data: the first 45 columns are the
# features, 'galaxysentiment' is the target.
galaxy_smote, gsent_smote = smote.fit_sample(galaxy_cor_3v.iloc[:, 0:45],
                                             galaxy_cor_3v['galaxysentiment'])
# Rebuild a DataFrame with the original feature names and re-attach the target.
galaxy_smote_complete = pd.DataFrame(galaxy_smote,
                                     columns=list(
                                         galaxy_cor_3v.iloc[:, 0:45].columns))
galaxy_smote_complete['galaxysentiment'] = gsent_smote
# The result of unique() is not stored; it is only visible in an
# interactive session.
galaxy_smote_complete['galaxysentiment'].unique()
hist_galaxy_smote = px.histogram(galaxy_smote_complete, x='galaxysentiment')
plot(hist_galaxy_smote)

### Under sampling
# Random under sampler
rus = RandomUnderSampler(random_state=0)  #, ratio={0: 30, 1: 20, 2: 60}
# iPhone data: first 46 columns as features, 'iphonesentiment' as target.
rus.fit(iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_under, isent_resampled_under = rus.sample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
# No column names are passed here, unlike the SMOTE frame built above.
iphone_resampled_complete_under = pd.DataFrame(iphone_resampled_under)
iphone_resampled_complete_under['iphonesentiment'] = isent_resampled_under
hist_iphone_resampled_under = px.histogram(iphone_resampled_complete_under,
                                           x='iphonesentiment')
plot(hist_iphone_resampled_under)

# Galaxy data: the same sampler object is refitted on the first 45 columns.
# NOTE(review): this section uses galaxy_corr while the SMOTE section above
# uses galaxy_cor_3v — confirm that the difference is intended.
rus.fit(galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_under, gsent_resampled_under = rus.sample(
    galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_complete_under = pd.DataFrame(galaxy_resampled_under)
galaxy_resampled_complete_under['galaxysentiment'] = gsent_resampled_under
hist_galaxy_resampled_under = px.histogram(galaxy_resampled_complete_under,
                                           x='galaxysentiment')