def get_ad_dataset(noise=0.3):
    """Load the ad dataset and pass it to ``get_noisy_folds``.

    The comma-separated file ``ad-dataset/ad.data`` is read as a string
    array. Every cell equal to ``'ad.'`` is replaced by 1 and every cell
    equal to ``'nonad.'`` by 0. Only the columns returned by
    ``get_ads_features(201239480, 302629605)`` plus the last column of the
    file are kept, and those are converted to ``int``.

    :param noise: forwarded unchanged as the second argument of
        ``get_noisy_folds``.
    :return: the result of ``get_noisy_folds`` for the selected rows
        (a list of lists of ints) and ``noise``.
    """
    raw = np.genfromtxt('ad-dataset/ad.data', delimiter=',', dtype=str)

    # Same order as before: 'ad.' is rewritten first, then 'nonad.'.
    for class_name, code in (('ad.', 1), ('nonad.', 0)):
        raw[raw == class_name] = code

    columns = get_ads_features(201239480, 302629605)
    # Extend in place with the index of the last column
    # (presumably the class label -- confirm against the data file).
    columns += [raw.shape[1] - 1]

    rows = raw[:, columns].astype(int).tolist()
    return get_noisy_folds(rows, noise)
def get_har_dataset(noise=0.3):
    """Load the HAR training set and pass it to ``get_noisy_folds``.

    Features are read as floats from ``UCI HAR Dataset/train/X_train.txt``
    and labels as ints from ``UCI HAR Dataset/train/y_train.txt``. Labels
    of 3 or less become 1 ("moving") and labels of 4 or more become 0
    ("resting"). The binary label is appended to each feature row as the
    last column.

    :param noise: forwarded unchanged as the second argument of
        ``get_noisy_folds``.
    :return: the result of ``get_noisy_folds`` for the combined rows
        (a list of lists) and ``noise``.
    """
    features = np.genfromtxt('UCI HAR Dataset/train/X_train.txt', dtype=float)
    labels = np.genfromtxt('UCI HAR Dataset/train/y_train.txt', dtype=int)

    # Integer labels are either <= 3 or >= 4, so one selection covers both
    # the "moving" (1) and the "resting" (0) replacement.
    binary_labels = np.where(labels <= 3, 1, 0)

    # Turn the labels into a single column so they can be joined on axis 1.
    label_column = binary_labels.reshape(-1, 1)
    rows = np.concatenate((features, label_column), axis=1).tolist()
    return get_noisy_folds(rows, noise)
def check():
    """Score a decision tree on the ad dataset using 10 folds.

    Steps:

    1. Read ``ad.data`` line by line. For every line keep the fields whose
       indices are returned by ``get_ads_features(313542516, 208346379)``
       and append the label ``"0"`` if the last field contains ``"nonad."``,
       otherwise ``"1"``.
    2. Split the rows with ``get_noisy_folds`` and pickle the two results to
       ``folds.pkl`` and ``noisyfolds.pkl``.
    3. For each of the 10 folds, fit a ``DecisionTreeClassifier`` on the
       other nine *noisy* folds and score it on the corresponding clean
       fold.

    :return: the sum of the 10 ``tree.score`` values divided by 10.
    """
    fold_count = 10
    feature_indices = get_ads_features(313542516, 208346379)

    data = []
    # ``with`` guarantees the handle is released even if a row is malformed.
    with open("ad.data", 'r') as data_file:
        for line in data_file:
            out = line.split(",")
            item = [out[index] for index in feature_indices]
            item.append(str(0 if "nonad." in out[-1] else 1))
            data.append(item)

    noisyfolds, folds = get_noisy_folds(data)

    # Both files are opened before anything is dumped, as before, but they
    # are now closed even when pickling fails.
    with open("folds.pkl", 'wb') as output, open("noisyfolds.pkl", 'wb') as output1:
        pickle.dump(folds, output)
        pickle.dump(noisyfolds, output1)

    sumacc = 0
    for i in range(fold_count):
        # Training data: every noisy fold except the one held out.
        train = []
        for j in range(fold_count):
            if i != j:
                train.extend(noisyfolds[j])
        train_samples = [sample[:-1] for sample in train]
        train_labels = [sample[-1] for sample in train]

        tree = DecisionTreeClassifier(criterion="entropy", splitter="best", min_samples_split=4)
        tree = tree.fit(train_samples, train_labels)

        # Evaluation data: the clean version of the held-out fold.
        predict_samples = [sample[:-1] for sample in folds[i]]
        predict_labels = [sample[-1] for sample in folds[i]]
        sumacc += tree.score(predict_samples, predict_labels)
    return sumacc / fold_count
current_accuracy /= float(len(fold_semi[k])) accuracy += current_accuracy accuracy /= float(len(noisy_fold_semi)) print('committee semi-random: subset:{} | size: {} | acc: {}'.format( 'examples' if is_subset_of_examples else 'features', committee_size, accuracy)) def all_semi_random_sub_examples(noisy_fold_semi, fold_semi, features): for size in sizes: calculate_semi_random_committee(noisy_fold_semi, fold_semi, features, size, True) def all_semi_random_sub_features(noisy_fold_semi, fold_semi, features): for size in sizes: calculate_semi_random_committee(noisy_fold_semi, fold_semi, features, size, False) if __name__ == '__main__': '''this part should be done once''' x, y = extract_data_from_ads() x_temp = copy.deepcopy(x) for i in range(0, len(x_temp)): x_temp[i].append(y[i]) noisy_folds, folds = noise.get_noisy_folds(x_temp) '''end of part''' '''Arye''' all_semi_random_sub_features(noisy_folds, folds, [i for i in range(0, len(x[0]) - 1)]) '''Max''' all_semi_random_sub_examples(noisy_folds, folds, [i for i in range(0, len(x[0]) - 1)])