def cv5_smote_revc_psednc(fold_path, filename, k):
    """Build RevcKmer+PseDNC feature files for 5 folds, oversampling train positives.

    For every fold ``i`` in ``0..4`` the four files ``test_neg_<i>``,
    ``test_pos_<i>``, ``train_neg_<i>`` and ``train_pos_<i>`` (all prefixed
    with ``fold_path``) are read.  Each sample row is the RevcKmer vector
    column-stacked with the last ``lamada`` columns of the PseDNC vector.
    The training positives are extended with synthetic rows from
    ``smote.smote`` (one call with ``N=100`` and one with ``N=50``), and the
    resulting test/train sets are written with ``write_libsvm`` using label
    ``1`` for positives and ``-1`` for negatives.

    :param fold_path: string prefix (directory, including trailing separator)
        that is concatenated directly with the fold file names.
    :param filename: stem used for the output file names.
    :param k: ``k`` passed to ``RevcKmer``.
    :return: nothing, the results are written to disk.
    """
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)
    # Tag embedded in the output file names, e.g. "6_0.8"; identical for
    # every fold, so it is computed once.
    n_lamada = "_".join([str(lamada), str(w)])

    def load_vecs(stem, fold):
        # Combine the RevcKmer and PseDNC features of one fold file.
        path = fold_path + stem + str(fold)
        # The file is opened once per vectorizer, as each call is handed a
        # fresh handle.
        with open(path) as fp:
            revc_kmer_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
        with open(path) as fp:
            psednc_vecs = np.array(psednc.make_psednc_vec(fp))
        # Only the trailing `lamada` PseDNC columns are appended.
        return np.column_stack((revc_kmer_vecs, psednc_vecs[:, -lamada:]))

    for i in range(5):
        # Generate RevcKmer_PseDNC vecs.
        test_neg_vecs = load_vecs("test_neg_", i)
        test_pos_vecs = load_vecs("test_pos_", i)
        train_neg_vecs = load_vecs("train_neg_", i)
        train_pos_vecs = load_vecs("train_pos_", i)

        # Generate synthetic vecs from pos_vecs.
        # np.vstack replaces the deprecated alias np.row_stack.
        synthetic = np.vstack((
            smote.smote(train_pos_vecs, N=100, k=5),
            smote.smote(train_pos_vecs, N=50, k=5),
        ))

        # Write test file.
        write_file = fold_path + filename + '_' + n_lamada + "_test_" + str(i) + ".txt"
        test_vecs = test_pos_vecs.tolist() + test_neg_vecs.tolist()
        test_vecs_labels = [1] * len(test_pos_vecs) + [-1] * len(test_neg_vecs)
        write_libsvm(test_vecs, test_vecs_labels, write_file)

        # Write train file.
        write_file = fold_path + filename + '_' + n_lamada + "_train_" + str(i) + ".txt"
        train_pos_all = train_pos_vecs.tolist() + synthetic.tolist()
        train_vecs = train_pos_all + train_neg_vecs.tolist()
        train_vecs_labels = [1] * len(train_pos_all) + [-1] * len(train_neg_vecs)
        write_libsvm(train_vecs, train_vecs_labels, write_file)
def _oversample(self, class_name, rate):
    """
    Oversample examples of a class

    The images of every row whose ``Class`` equals ``class_name`` are read
    with ``self._read_image``, flattened, and handed to ``smote`` to obtain
    ``ceil(n_examples * rate)`` synthetic images.  Each synthetic image is
    reshaped to ``(self._image_height, self._image_width)`` and stored with
    an id of the form ``s_<class_name>_<i>`` and the class-column values of
    the first example of that class.

    :param class_name: string, class name
    :param rate: float, rate of oversampling, 1 corresponds to 100%
    :return: nothing, generated examples are added to self._df_synthetic
    """
    # Row mask for the requested class, computed once and reused.
    in_class = self._df['Class'].isin([class_name])
    n_examples = self._df['Id'][in_class].count()
    # Class-column values of the first matching row; reused for every
    # synthetic example.
    labels = self._df[self._class_columns][in_class].values[0]

    images = np.zeros((n_examples, self._image_height * self._image_width))
    # Series.items() replaces iteritems(), which newer pandas no longer has.
    for row, (_, f) in enumerate(self._df.Image[in_class].items()):
        images[row] = self._read_image(f).flatten()

    n = int(math.ceil(n_examples * rate))
    #if self.verbose:
    #    synthetic_examples, parent_ids = smote(images, n, n_neighbours=5, return_parent_ids=True)
    #    self._save_synthetic_examples(synthetic_examples, images, parent_ids, class_name)
    #else:
    synthetic_examples = smote(images, n, n_neighbours=5)

    df = pd.DataFrame(index=np.arange(0, n), columns=self._df_synthetic.columns.values)
    for i, img in enumerate(synthetic_examples):
        # Write through .at/.loc on the frame itself; assigning on the
        # temporary row returned by `df.loc[i]` may modify a copy.
        df.at[i, 'Id'] = 's_{}_{}'.format(class_name, i)
        df.at[i, 'Image'] = img.reshape((self._image_height, self._image_width))
        df.loc[i, self._class_columns] = labels
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    self._df_synthetic = pd.concat([self._df_synthetic, df], ignore_index=True)
def cv5_smote_revc_kmer(fold_path, filename, k):
    """Write RevcKmer libsvm files for 5 folds with SMOTE-extended positives.

    For each fold ``i`` in ``0..4`` the files ``test_neg_<i>``,
    ``test_pos_<i>``, ``train_neg_<i>`` and ``train_pos_<i>`` under
    ``fold_path`` are vectorized with ``RevcKmer``.  The training positives
    are extended with ``smote.smote(..., N=200, k=5)`` output, and both sets
    are written through ``write_libsvm`` (label ``1`` for positives, ``-1``
    for negatives).

    :param fold_path: string prefix concatenated directly with file names.
    :param filename: stem of the output files.
    :param k: ``k`` passed to ``RevcKmer``.
    :return: nothing, the results are written to disk.
    """
    vectorizer = RevcKmer(k=k, normalize=True, upto=True)

    def read_vecs(stem, fold):
        # Vectorize one fold file with the shared RevcKmer instance.
        with open(fold_path + stem + str(fold)) as handle:
            return vectorizer.make_revckmer_vec(handle)

    for fold in range(5):
        # Generate RevcKmer vecs (same read order as the files are listed).
        neg_test = read_vecs("test_neg_", fold)
        pos_test = read_vecs("test_pos_", fold)
        neg_train = read_vecs("train_neg_", fold)
        pos_train = np.array(read_vecs("train_pos_", fold))

        # Generate SMOTE synthetic vecs from the training positives.
        oversampled = smote.smote(pos_train, N=200, k=5)

        # Write test file.
        test_out = fold_path + filename + "_test_" + str(fold) + ".txt"
        test_rows = pos_test + neg_test
        test_labels = [1] * len(pos_test) + [-1] * len(neg_test)
        write_libsvm(test_rows, test_labels, test_out)

        # Write train file: real positives, synthetic positives, negatives.
        train_out = fold_path + filename + "_train_" + str(fold) + ".txt"
        pos_train_all = pos_train.tolist() + oversampled.tolist()
        train_rows = pos_train_all + neg_train
        train_labels = [1] * len(pos_train_all) + [-1] * len(neg_train)
        write_libsvm(train_rows, train_labels, train_out)
def readSmoteDataset(file, properties):
    """Read ``tera/<file>.csv``, run it through ``smote`` and return the result.

    :param file: dataset name without directory or extension.
    :param properties: unused here; kept so the signature stays compatible
        with existing callers.
    :return: tuple ``(np.array(dataread[0]), dataread[-1])`` where
        ``dataread`` is whatever ``smote`` returns for the CSV reader.
    """
    prefix = "tera/"
    suffix = ".csv"
    # NOTE(review): binary mode is kept from the original (Python 2 csv
    # convention); Python 3's csv module expects text mode - confirm the
    # target interpreter before changing it.
    # The context manager closes the handle, which the original leaked.
    with open(prefix + file + suffix, 'rb') as finput:
        reader = csv.reader(finput, delimiter=',')
        dataread = smote(reader)
        # Results are extracted while the file is still open, in case
        # `smote` reads from the reader lazily.
        return np.array(dataread[0]), dataread[-1]  # keeping the same format as Joe's code
def contents(file, sep= The.reader.sep, bad= The.reader.bad, k=3):
    """Load a delimited data file as a list of ``(row_number, cells)`` entries.

    Row 0 is the header and keeps its string cells; every later row is
    converted to numbers.  The first ``k`` columns of each line are dropped.

    When the module-level flag ``SMOTE`` is ``True`` the file is parsed as
    comma-separated CSV, passed through ``smote`` and the rows of
    ``dataread[0]`` are returned as ``[n, cells]`` lists.  Otherwise the
    file is read line by line: characters matching ``bad`` are removed, the
    line is split on ``sep`` and data rows are returned as ``(n, cells)``
    tuples (the header as an ``[n, cells]`` list).

    :param file: path of the file to read.
    :param sep: column separator used on the non-SMOTE path.
    :param bad: regular expression of characters to strip from each line on
        the non-SMOTE path.
    :param k: number of leading columns to skip.
    :return: list of row entries as described above.
    """
    import csv
    if SMOTE is True:
        # A single managed handle replaces the three the original opened
        # (one of them only to compute an unused line count).
        with open(file, "r") as f:
            reader = csv.reader(f, delimiter=',')
            dataread = smote(reader)
            # Rewind to recover the header line that is put back in front
            # of the resampled rows.
            f.seek(0)
            header = f.readline().split(',')[k:]
        dataread[0].insert(0, header)
        ret = []
        for n, line in enumerate(dataread[0]):
            if n != 0:
                line = [float(l) for l in line]
            ret.append([n, line])
        return ret
    else:
        ret = []
        with open(file) as f:
            for n, line in enumerate(f):
                line = re.sub(bad, "", line)  # kill white space
                cells = line.split(sep)[k:]
                if n != 0:
                    try:
                        ret.append((n, [float(x) for x in cells]))
                    except ValueError:
                        # Non-numeric row: encode 'Y' as 1 and anything
                        # else as 0.
                        ret.append((n, [1 if x == 'Y' else 0 for x in cells]))
                else:
                    ret.append([n, cells])
        return ret
def doSmote(self):
    """Replace the training data with the output of ``smote.smote(...).fit_transform()``.

    The current ``train_X`` and ``train_y`` are joined column-wise and
    passed to ``smote.smote`` together with the constant ``5``.  The frame
    it returns becomes ``self.data``; its columns are renamed to the
    previous ``self.data`` column names followed by ``self.class_label``.
    The label column is then split off into ``self.train_y`` and removed in
    place, and ``self.train_X`` is bound to the same frame as ``self.data``.
    """
    labelled_train = pd.concat([self.train_X, self.train_y], axis=1)
    headers = self.data.columns.values.tolist() + [self.class_label]

    sampler = smote.smote(labelled_train, 5)
    self.data = resampled = sampler.fit_transform()
    resampled.columns = headers

    # Split the label column off, then drop it from the shared frame.
    self.train_y = resampled[self.class_label]
    resampled.drop([self.class_label], axis=1, inplace=True)
    self.train_X = resampled
def smotify(model=MODEL(), rows=None, k=5, factor = 100):
    """Generate SMOTE clones for every class of ``model``.

    ``klazzify`` is called first to group ``rows`` into ``model.classes``.
    Each class is then oversampled with ``smote``; classes smaller than the
    midpoint between the largest and smallest class size get a
    proportionally larger ``N``.

    :param model: model whose ``_rows`` / ``classes`` are used.
    :param rows: rows to classify; defaults to ``model._rows``.
    :param k: ``k`` forwarded to ``smote``.
    :param factor: base value for ``N`` passed to ``smote``.
    :return: list with the clones of all classes concatenated.
    """
    # Identity test: `== None` misbehaves for array-like rows.
    if rows is None:
        rows = model._rows
    klazzify(model, rows)
    class_sizes = [len(model.classes[key]) for key in model.classes]
    if not class_sizes:
        # Nothing to oversample; max()/min() below would raise on an
        # empty sequence.
        return []
    maxLen, minLen = max(class_sizes), min(class_sizes)
    midpoint = (maxLen + minLen) / 2
    clones = []
    for key in model.classes:
        size = len(model.classes[key])
        f = factor
        if size < midpoint:
            # Boost small classes: scale the factor by midpoint / size.
            f = (factor * (maxLen + minLen) / 2) / size
        clones += smote(model, model.classes[key], k=k, N=int(ceil(f)))
    return clones
def smote_sample(x, y, N):
    """Return ``x`` extended with SMOTE samples of its 'True' rows, shuffled.

    ``x`` is split into 'True' and 'False' rows with ``true_false_split``.
    Synthetic rows are generated from the 'True' rows via
    ``smote(x_true, 5, N)``, their column 2 is forced to 1, and the
    'False' rows, 'True' rows and synthetic rows are stacked and shuffled
    in place with ``np.random.shuffle``.

    :param x: data rows.
    :param y: labels used for the split.
    :param N: third argument forwarded to ``smote``.
    :return: the shuffled, stacked array.
    """
    # These counts are only referenced by the disabled formula below.
    nt = count_true(y)
    nf = np.size(y) - nt
    # N = int(np.floor((nf/nt*frac/(1-frac)-1)*100))
    positives = true_false_split(x, y, 'True')
    negatives = true_false_split(x, y, 'False')

    # generate synthetic true values
    synthetic = smote(positives, 5, N)
    # fix column of lowest sym_order to 1 for these, so they are classed
    # as having a sym
    synthetic[:, 2] = np.ones(synthetic.shape[0])

    combined = np.vstack((negatives, positives, synthetic))
    np.random.shuffle(combined)
    return combined
def tune_SMOTE(train_pd):
    """Tune the SMOTE parameters on a 70/30 split of ``train_pd``.

    70% of the rows (sampled without replacement by index label) form the
    new training frame and the remaining rows the tuning frame.  The search
    space passed to ``DE_Tune_SMOTE`` is ``k`` in ``[2, 20]`` and, for each
    of ``label_num - 1`` entries, an ``up_to_num`` range of 0.5x to 1.5x
    the majority count reported by ``smote(new_train)``.

    Relies on ``learner``, ``smote``, ``target_class`` and ``goal`` from the
    enclosing/module scope.  The former ``"_TunedSmote" in isWhat`` guard is
    gone: the only assignments it is known to have covered were unused
    locals, and if it covered more, ``params`` was left unbound when the
    flag was absent.

    :param train_pd: pandas DataFrame of training rows.
    :return: tuple ``(params, new_train)`` - the result of ``tuner.Tune()``
        and the 70% training frame.
    """
    train_len = len(train_pd)
    # random.sample needs a real sequence on newer Pythons, so the index is
    # materialized as a list.
    new_train_index = random.sample(list(train_pd.index), int(train_len * 0.7))
    # .loc is the label-based replacement for .ix (gone from newer pandas).
    new_train = train_pd.loc[new_train_index]
    new_tune = train_pd.drop(new_train_index)
    A_smote = smote(new_train)
    majority_num = A_smote.get_majority_num()
    num_range = [[int(majority_num * 0.5), int(majority_num * 1.5)]] * (
        A_smote.label_num - 1)
    params_to_tune = {"k": [2, 20], "up_to_num": num_range}
    tuner = DE_Tune_SMOTE(learner, smote, params_to_tune, new_train,
                          new_tune, target_class, goal)
    params = tuner.Tune()
    return params, new_train
fpr, tpr = cross_validation(X, y[:, 0], clf, thresh) expB[j, i + 1, :] = [fpr, tpr] # Gaussian Naive Bayes classifier on Plain Under-sampled data clf = GaussianNB() fpr, tpr = cross_validation(X, y[:, 0], clf, NBthresh) print 'Gauss NB Class Priors [-ve class, +ve class]: ', clf.class_prior_ priors.append(clf.class_prior_) expC[j, i + 1, :] = [fpr, tpr] # k-NN classifier on Plain Under-sampled data clf = KNeighborsClassifier(n_neighbors=KNN) fpr, tpr = cross_validation(X, y[:, 0], clf, thresh) expE[j, i + 1, :] = [fpr, tpr] dataC = smote.smote(data, minority_overSample_percent[j], 5) X = dataC[:, [1, 0]] y = dataC[:, [data.shape[1] - 1]] unique, counts = np.unique(dataC[:, [data.shape[1] - 1]], return_counts=True) freq = dict(zip(unique, counts)) nPositive = freq[1.0] nNegative = freq[0.0] print 'SMOTED by ' + str(minority_overSample_percent[j]) + ' %' print '+ve Class: ', nPositive, ' -ve Class: ', nNegative dataC = smote.underSMOTE(dataC, majority_underSample_percent[i]) unique, counts = np.unique(dataC[:, [data.shape[1] - 1]],
def cross_val(pd_data, learner, target_class, goal, isWhat="", fold=5, repeats=2):
    """
    Run ``repeats`` rounds of stratified ``fold``-fold cross-validation.

    The last column of ``pd_data`` is the class label, all other columns
    are features.  ``isWhat`` selects optional steps by substring:

    * ``"Smote"``         - resample each training fold with ``smote``
      (``k=5``, empty ``up_to_num``).
    * ``"_TunedSmote"``   - first tune ``k`` / ``up_to_num`` on a 70/30
      split of the training fold; the 70% part becomes the training data.
    * ``"_TunedLearner"`` - tune the learner parameters before learning.

    :param pd_data: pandas DataFrame, label in the last column.
    :param learner: learner class; called as
        ``learner(train_X, train_Y, test_X, test_Y, goal)``.
    :param target_class: forwarded to the tuners.
    :param goal: forwarded to the learner and the tuners.
    :param isWhat: string of feature flags, see above.
    :param fold: number of folds per repeat.
    :param repeats: number of times the whole cross-validation is repeated.
    :return: tuple ``(avg_evaluation, F)`` - the summed tuner evaluations
        divided by ``repeats * fold`` and the accumulated result of
        ``learner(...).learn``.
    """

    def tune_learner(train_X):
        # Tune the learner on a random split of the current training fold.
        # Reads `train_Y` from the enclosing scope.
        train_len = len(train_X)
        # The sample size must be an integer.
        # NOTE(review): np.random.choice samples with replacement by
        # default, so the training part can contain duplicates - confirm
        # this is intended.
        new_train_index = np.random.choice(range(train_len), int(train_len * 0.7))
        new_tune_index = list(set(range(train_len)) - set(new_train_index))
        new_train_X = train_X[new_train_index]
        new_train_Y = train_Y[new_train_index]
        new_tune_X = train_X[new_tune_index]
        new_tune_Y = train_Y[new_tune_index]
        clf = learner(new_train_X, new_train_Y, new_tune_X, new_tune_Y, goal)
        tuner = DE_Tune_ML(clf, clf.get_param(), target_class, goal)
        return tuner.Tune()

    def tune_SMOTE(train_pd):
        # Tune the SMOTE parameters on a 70/30 split of the training fold.
        # Only called when "_TunedSmote" is in isWhat, so the flag is not
        # re-checked here.
        train_len = len(train_pd)
        # random.sample needs a real sequence on newer Pythons.
        new_train_index = random.sample(list(train_pd.index), int(train_len * 0.7))
        new_train = train_pd.loc[new_train_index]
        new_tune = train_pd.drop(new_train_index)
        A_smote = smote(new_train)
        majority_num = A_smote.get_majority_num()
        num_range = [[int(majority_num * 0.5), int(majority_num * 1.5)]] * (
            A_smote.label_num - 1)
        params_to_tune = {"k": [2, 20], "up_to_num": num_range}
        tuner = DE_Tune_SMOTE(learner, smote, params_to_tune, new_train,
                              new_tune, target_class, goal)
        params = tuner.Tune()
        return params, new_train

    F = {}
    total_evaluation = 0
    labels = pd_data.iloc[:, -1].values
    for _ in range(repeats):
        # NOTE(review): this is the constructor form that takes the labels
        # and is iterated directly; newer scikit-learn uses
        # StratifiedKFold(n_splits=...).split(X, y).  Left unchanged
        # because the import lives outside this function.
        kf = StratifiedKFold(labels, fold, shuffle=True)
        for train_index, test_index in kf:
            # Fold indices are positions, hence .iloc rather than the
            # label-first .ix (which newer pandas no longer has).
            train_pd = pd_data.iloc[train_index]
            test_pd = pd_data.iloc[test_index]
            if "Smote" in isWhat:
                k = 5
                up_to_num = []
                if "_TunedSmote" in isWhat:
                    # use new training data not original, because some
                    # are used as tuning
                    params, train_pd = tune_SMOTE(train_pd)
                    k = params["k"]
                    up_to_num = params["up_to_num"]
                train_pd = smote(train_pd, k, up_to_num).run()
            train_X = train_pd.iloc[:, :-1].values
            train_Y = train_pd.iloc[:, -1].values
            test_X = test_pd.iloc[:, :-1].values
            test_Y = test_pd.iloc[:, -1].values
            if "_TunedLearner" in isWhat:
                params, evaluation = tune_learner(train_X)
            else:
                params, evaluation = {}, 0
            F = learner(train_X, train_Y, test_X, test_Y, goal).learn(F, **params)
            total_evaluation += evaluation
    avg_evaluation = total_evaluation / (repeats * fold)
    return avg_evaluation, F
# minoritySamples, majoritySamples = getSeparatedSamples('Input/diabetes.csv') print ("Number of Miniority Samples:" + str(len(minoritySamples))) print ("Number of Majority Samples:" + str(len(majoritySamples))) # print minoritySamples[0] minorityCounter = len(minoritySamples) majorityCounter = len(majoritySamples) underSampledMajoritySamples = underSample(minorityCounter, 100, majoritySamples, majorityCounter) underSampleOnlyHelper(minoritySamples) smoteHelper(underSampledMajoritySamples) smote(minorityCounter, 100, 5, minoritySamples) # plotROC(majoritySamples, minoritySamples) # treeClassifierLogisticRegression(majoritySamples, minoritySamples) # treeClassifier(majoritySamples, minoritySamples) treeClassifier2('Output/diabetes_Smote.csv') treeClassifier2('Output/diabetes_Under.csv') # plotConvexHull() naiveBayes(majoritySamples, minoritySamples) # csvfile.close except Exception as error:
num_pos = sum([1 for sample_id in _Y if _Y[sample_id] == 1]) num_neg = sum([1 for sample_id in _Y if not _Y[sample_id] == -1]) return _X, _Y, num_pos, num_neg if __name__ == '__main__': exp_name = sys.argv[1] neighbours_smote = int(sys.argv[2]) neighbours_enn = int(sys.argv[3]) path = './' + exp_name os.makedirs(path) problems = ['1'] for p in problems: X, Y, num_pos, num_neg = read_data(p) X, Y, num_pos, num_neg = smote(X, Y, num_pos, num_neg, neighbours_smote) X, Y, num_pos, num_neg = enn(X, Y, neighbours_enn) path_file = path + '/%s_X.tsv' % p save_data(path_file, X, Y)