def get_ad_dataset(noise=0.3, ad_dataset_file='ad-dataset/ad.data'):
    """Load the Internet-Ads dataset and return noisy cross-validation folds.

    Args:
        noise: noise level forwarded to get_noisy_folds (default 0.3).
        ad_dataset_file: path to the raw ad.data CSV file. Parameterized
            (previously hard-coded) so alternative locations can be used;
            the default preserves the original behavior.

    Returns:
        Whatever get_noisy_folds produces for the selected feature columns
        plus the label column (presumably noisy/clean folds — defined elsewhere).
    """
    # Read everything as strings first so the textual labels can be remapped.
    ad_dataset = np.genfromtxt(ad_dataset_file, delimiter=',', dtype=str)
    # Map the textual class labels to 1/0 (numpy stores them as '1'/'0'
    # strings here; the astype(int) below converts the whole selection).
    ad_dataset[ad_dataset == 'ad.'] = 1
    ad_dataset[ad_dataset == 'nonad.'] = 0
    # Project-specific feature selection keyed by student IDs (helper defined elsewhere).
    ads_features = get_ads_features(201239480, 302629605)
    # Append the last column index: that is the (now numeric) label column.
    ads_features += [np.shape(ad_dataset)[1] - 1]
    ad_dataset = ad_dataset[:, ads_features].astype(int)
    ad_dataset = ad_dataset.tolist()
    return get_noisy_folds(ad_dataset, noise)
def extract_data_from_ads():
    """Parse ads/ad.data into feature rows and binary labels.

    Returns:
        (x_val, y_val): x_val is a list of rows, each the raw string values
        at the project-selected feature indices; y_val holds 1 where
        current_sample_labeled_ad judged the line an ad, else 0.
    """
    # Project-specific feature selection keyed by student IDs (helper defined elsewhere).
    indices = get_features.get_ads_features(317390805, 317390789)
    x_val = []
    y_val = []
    # Fix: the original opened the file without ever closing it; the context
    # manager guarantees the handle is released even on error.
    with open('ads/ad.data') as data_file:
        for line in data_file:
            list_line = line.split(',')
            x_val.append([list_line[i] for i in indices])
            y_val.append(1 if current_sample_labeled_ad(list_line) else 0)
    return x_val, y_val
def check():
    """Run 10-fold cross-validation of a decision tree on the ad dataset.

    Trains on the noisy folds and evaluates on the corresponding clean fold,
    pickling both fold sets to folds.pkl / noisyfolds.pkl as a side effect.

    Returns:
        Mean accuracy over the 10 folds.
    """
    # Project-specific feature selection keyed by student IDs (helper defined elsewhere).
    featureList = get_ads_features(313542516, 208346379)
    data = []
    # Fix: the original only closed this handle on the happy path; 'with'
    # releases it even if parsing raises.
    with open("ad.data", 'r') as data_file:
        for line in data_file:
            out = line.split(",")
            item = [out[index] for index in featureList]
            # Last column carries the textual class; encode as "1"/"0".
            item.append(str(0 if "nonad." in out[-1] else 1))
            data.append(item)
    noisyfolds, folds = get_noisy_folds(data)
    # Persist both fold sets; context managers make the writes exception-safe.
    with open("folds.pkl", 'wb') as output:
        pickle.dump(folds, output)
    with open("noisyfolds.pkl", 'wb') as output1:
        pickle.dump(noisyfolds, output1)
    sumacc = 0
    for i in range(10):
        # Train on the nine noisy folds other than fold i.
        train = []
        for j in range(10):
            if i != j:
                train.extend(noisyfolds[j])
        trainFinal = [sample[:-1] for sample in train]
        results = [sample[-1] for sample in train]
        tree = DecisionTreeClassifier(criterion="entropy", splitter="best",
                                      min_samples_split=4)
        tree = tree.fit(trainFinal, results)
        # Evaluate on the clean (non-noisy) fold i.
        predictSamples = [row[:-1] for row in folds[i]]
        predictResults = [row[-1] for row in folds[i]]
        sumacc += tree.score(predictSamples, predictResults)
    return sumacc / 10