# Example 1
# NOTE(review): fragment of a larger script — X_train/X_test/y_train/y_test,
# `pd` and `HoeffdingTree` are defined/imported earlier, outside this excerpt.
# Convert the pandas objects to plain numpy arrays before training.
X_train, X_test, y_train, y_test = X_train.to_numpy(), X_test.to_numpy(
), y_train.to_numpy(), y_test.to_numpy()

print(type(X_train))

# ### Train the classifier (model)

# In[3]:

clasificador = HoeffdingTree()
print(clasificador.get_info())

# Batch-fit the Hoeffding tree on the full training set
# (classes/sample_weight left at their defaults explicitly).
print("start training")
clasificador.fit(X_train, y_train, classes=None, sample_weight=None)
print("end training")

# # Paper figures

# In[4]:

# Load the parameter-selection dataset used for the paper's figures.
dataset = pd.read_csv('dataset_seleccion_parametros.csv')

dataset.shape

# In[5]:

dataset.head()

# In[6]:
# In[6]:
# Example 2
# NOTE(review): fragment — StringIndexer, scaledData, np and HoeffdingTree
# come from code outside this excerpt.
# Label indexing is a prerequisite for decision trees: map the string
# label column "rotulo" to a numeric "indexed" column.
stringIndexer = StringIndexer(inputCol="rotulo", outputCol="indexed")
si_model = stringIndexer.fit(scaledData)
obj_final = si_model.transform(scaledData)

# Collect features and labels out of Spark into numpy arrays.
X = np.array(obj_final.select("scaledFeatures").collect())
y = np.array(obj_final.select("indexed").collect())

# Reshape the feature matrix from 3-d (nsamples, nx, ny) to the 2-d
# (nsamples, nx*ny) layout expected by the sklearn-style fit() below.
nsamples, nx, ny = X.shape
d2_X = X.reshape((nsamples, nx * ny))

# Create and fit the VFDT (Hoeffding tree) model.
vfdtClassifer = HoeffdingTree()
modelo = vfdtClassifer.fit(d2_X, y)


def output_rdd(rdd):
    """Process one streaming micro-batch RDD: transform, scale, extract features.

    NOTE(review): this function is truncated in this excerpt — the visible
    body ends mid-function. Only comments were added; code is untouched.
    `transformToNumeric2`, `spSession`, `transformaVar`, `scaler` and `np`
    are defined outside this excerpt.
    """

    X_inc = []
    y_inc = []

    # Skip empty micro-batches (common while the stream warms up).
    if not rdd.isEmpty():
        rdd2 = rdd.map(transformToNumeric2)
        DF = spSession.createDataFrame(rdd2)
        rdd3 = DF.rdd.map(transformaVar)
        DF = spSession.createDataFrame(rdd3, ["rotulo", "atributos"])
        # NOTE(review): the scaler is re-fit on every micro-batch, so the
        # scaling drifts with each batch — confirm this is intended.
        scaler_Model = scaler.fit(DF)
        scaled_Data = scaler_Model.transform(DF)
        X = np.array(scaled_Data.select("scaledFeatures").collect())
# Example 3
    "CTU-IoT-Malware-Capture-43-1-p_0", "CTU-IoT-Malware-Capture-43-1-p_3",
    "33-1-2-43-1-3"
]
n_chunks = [4000, 850, 3300, 1450, 2300]

# metrics = (balanced_accuracy_score, geometric_mean_score_1, f1_score, precision, recall, specificity)

for n, name in enumerate(names):
    filepath = ("arff/" + name + ".arff")
    stream = ARFFParser(filepath, chunk_size=250, n_chunks=n_chunks[n])
    clf = SEA(base_estimator=GaussianNB())
    clf2 = HoeffdingTree(split_criterion='hellinger')

    X, y = stream.get_chunk()
    clf.fit(X, y)
    clf2.fit(X, y)

    probas = []
    probas2 = []
    ys = []
    # limit = 150
    for chunk in tqdm(range(n_chunks[n] - 1)):
        X, y = stream.get_chunk()
        proba = clf.predict_proba(X)[:, 1]
        try:
            proba2 = clf2.predict_proba(X)[:, 1]
        except:
            try:
                proba2 = np.concatenate(proba2)
            except:
                pass
# Example 4
# Compare a Hoeffding tree fit on the whole training set at once against
# one trained incrementally, one sample at a time.
import numpy as np  # FIX: `np` is used below (np.array) but was never imported
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from skmultiflow.trees import HoeffdingTree
import matplotlib.pyplot as plt

res = []
# Create a dataset.
X, y = make_classification(10000, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Define a tree for fitting the complete dataset and one for streaming.
ht_complete = HoeffdingTree()
ht_partial = HoeffdingTree()

# Fit the complete dataset.
ht_complete.fit(X_train, y_train)
ht_complete_score = ht_complete.score(X_test, y_test)
print(f'Score when fitting at once: {ht_complete_score}')

# Streaming samples one after another. Once the streamed tree comes within
# 0.01 of the batch score, `timer` starts counting extra samples in `j`.
# NOTE(review): this excerpt appears truncated — presumably a
# `if j >= 20: break` (and the plotting of `res`) follows below.
timer = False
j = 0
for i in range(len(X_train)):
    ht_partial.partial_fit(X_train[i].reshape(1, -1), np.array([y_train[i]]))
    res.append(ht_partial.score(X_test, y_test))
    print(f'Score when streaming after {i} samples: {res[-1]}')
    if res[-1] >= ht_complete_score - 0.01:
        print(f'(Almost) full score reached! Continue for another {20 - j} samples.')
        timer = True
    if timer:
        j += 1