# Convert every train/test split from its pandas container to a plain NumPy
# array before feeding the stream learner.
X_train, X_test, y_train, y_test = map(
    lambda split: split.to_numpy(),
    (X_train, X_test, y_train, y_test),
)
print(type(X_train))

# --- Train the classifier (model) ---
clasificador = HoeffdingTree()
print(clasificador.get_info())

print("start training")
clasificador.fit(X_train, y_train, classes=None, sample_weight=None)
print("end training")

# --- Paper figures: load the parameter-selection dataset ---
dataset = pd.read_csv('dataset_seleccion_parametros.csv')
# Bare expressions below only display in a notebook; they are no-ops as a script.
dataset.shape

dataset.head()
# Indexação é pré-requisito para Decision Trees stringIndexer = StringIndexer(inputCol="rotulo", outputCol="indexed") si_model = stringIndexer.fit(scaledData) obj_final = si_model.transform(scaledData) X = np.array(obj_final.select("scaledFeatures").collect()) y = np.array(obj_final.select("indexed").collect()) #mudar a dimensão da matriz de atributos para 2d nsamples, nx, ny = X.shape d2_X = X.reshape((nsamples, nx * ny)) # Criando o modelo vfdtClassifer = HoeffdingTree() modelo = vfdtClassifer.fit(d2_X, y) def output_rdd(rdd): X_inc = [] y_inc = [] if not rdd.isEmpty(): rdd2 = rdd.map(transformToNumeric2) DF = spSession.createDataFrame(rdd2) rdd3 = DF.rdd.map(transformaVar) DF = spSession.createDataFrame(rdd3, ["rotulo", "atributos"]) scaler_Model = scaler.fit(DF) scaled_Data = scaler_Model.transform(DF) X = np.array(scaled_Data.select("scaledFeatures").collect())
# NOTE(review): the head of this list was lost in extraction; the variable
# name `names` is inferred from `enumerate(names)` below — TODO confirm
# against the original file (earlier entries of the list may also be missing:
# `n_chunks` has 5 entries but only 3 dataset names are visible here).
names = [
    "CTU-IoT-Malware-Capture-43-1-p_0",
    "CTU-IoT-Malware-Capture-43-1-p_3",
    "33-1-2-43-1-3",
]
n_chunks = [4000, 850, 3300, 1450, 2300]

# Evaluate two stream classifiers chunk-by-chunk over each ARFF stream:
# an SEA ensemble over GaussianNB and a Hoeffding tree with the Hellinger
# split criterion.
for n, name in enumerate(names):
    filepath = "arff/" + name + ".arff"
    stream = ARFFParser(filepath, chunk_size=250, n_chunks=n_chunks[n])

    clf = SEA(base_estimator=GaussianNB())
    clf2 = HoeffdingTree(split_criterion='hellinger')

    # Prime both models on the first chunk.
    X, y = stream.get_chunk()
    clf.fit(X, y)
    clf2.fit(X, y)

    probas = []
    probas2 = []
    ys = []

    # Test-then-train over the remaining chunks.
    for chunk in tqdm(range(n_chunks[n] - 1)):
        X, y = stream.get_chunk()
        proba = clf.predict_proba(X)[:, 1]
        # FIX: bare `except:` narrowed to `except Exception:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            proba2 = clf2.predict_proba(X)[:, 1]
        except Exception:
            # Best-effort fallback: reuse/flatten the previous chunk's
            # scores when the Hellinger tree cannot produce probabilities.
            try:
                proba2 = np.concatenate(proba2)
            except Exception:
                pass
        # NOTE(review): the loop body is truncated in this extract —
        # `probas`, `probas2` and `ys` are never appended to here; the
        # accumulation/metric code presumably follows in the original.
# Compare a Hoeffding tree fit once on the full training set against one
# trained incrementally (one sample at a time), tracking the streaming
# model's test score until it (almost) reaches the batch-fit score.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from skmultiflow.trees import HoeffdingTree
import matplotlib.pyplot as plt
import numpy as np  # FIX: `np` is used below but was never imported

res = []

# Create a dataset.
X, y = make_classification(10000, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Define a tree for fitting the complete dataset and one for streaming.
ht_complete = HoeffdingTree()
ht_partial = HoeffdingTree()

# Fit the complete dataset.
ht_complete.fit(X_train, y_train)
ht_complete_score = ht_complete.score(X_test, y_test)
print(f'Score when fitting at once: {ht_complete_score}')

# Streaming samples one after another.
timer = False
j = 0
for i in range(len(X_train)):
    ht_partial.partial_fit(X_train[i].reshape(1, -1), np.array([y_train[i]]))
    res.append(ht_partial.score(X_test, y_test))
    print(f'Score when streaming after {i} samples: {res[-1]}')
    if res[-1] >= ht_complete_score - 0.01:
        print(f'(Almost) full score reached! Continue for another {20 - j} samples.')
        timer = True
    if timer:
        j += 1
    # NOTE(review): this extract ends here; the countdown print above
    # suggests a `if j >= 20: break` followed in the original — TODO confirm.