Example #1
    def test_calc_bins(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        breastCancer = dataRetriever.retrieveData("breastCancer")
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        bd = BinDiscretizer(breastCancer["clumpThickness"], bins=8)
        fitted_data = bd.train_fit()

        numpy_bins = np.histogram_bin_edges(breastCancer["clumpThickness"],
                                            bins=8)

        self.assertEqual(np.allclose(bd.bin_edges, numpy_bins[1:]), True,
                         "Should produce the same bins as np.histogram")

        # np.digitize has a quirk where values equal to array.max() are placed in the next-highest bin:
        # https://stackoverflow.com/questions/4355132/numpy-digitize-returns-values-out-of-range
        # The lines below remap those max values into the right-most bin
        numpy_digitize = np.digitize(breastCancer["clumpThickness"],
                                     numpy_bins)
        max_vals = np.asarray(
            np.where(numpy_digitize == numpy_digitize.max())).flatten()
        numpy_digitize[max_vals] = numpy_digitize.max() - 1
        print("Shape", breastCancer.shape)
        self.assertEqual(np.allclose(numpy_digitize, fitted_data), True,
                         "Should produce the same results as np.digitize")
Example #2
    def test_data_retrieval(self):
        dataRetriever = DataRetriever("../Datasets/metadata.json")

        # This assertion is commented out because the comparison itself doesn't work, not because retrieveData is broken
        # self.assertEqual(dataRetriever.retrieveData("breastCancer"), pd.DataFrame() , "Should return a dataframe")
        self.assertEqual(dataRetriever.retrieveData("dogDiseases"), None,
                         "Should return null since no data exist")
Example #3
    def test_train_test_sizes(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        for test, train in KFolds(breastCancer, 10):
            self.assertEqual(len(test) + len(train), len(breastCancer))
Example #4
    def test_train_test_independence(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        for test, train in KFolds(breastCancer, 10):
            #https://stackoverflow.com/questions/3170055/test-if-lists-share-any-items-in-python
            self.assertFalse(bool(set(test.index) & set(train.index)))
Example #5
    def test_proper_number_of_folds(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        iterations = 0
        for test, train in KFolds(breastCancer, 10):
            iterations += 1

        self.assertEqual(iterations, 10)
Example #6
    def test_creation(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        breastCancer = dataRetriever.retrieveData("breastCancer")
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        with self.assertRaises(Exception) as context:
            BinDiscretizer()

        self.assertTrue('positional' in str(context.exception),
                        "Should raise an error if no data is passed in")

        with self.assertRaises(TypeError):
            BinDiscretizer(breastCancer["clumpThickness"], bins=None)
Example #7
    def test_stratisfied(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        iterations = 0
        for test, train in KFolds(breastCancer, 10, stratisfied=True):
            print("TestLen 2", len(test[test['class'] == 2]), "TestLen 4",
                  len(test[test['class'] == 4]))
            print("TrainLen 2", len(train[train['class'] == 2]), "TrainLen 4",
                  len(train[train['class'] == 4]))
            iterations += 1
Example #8
    def test_range_normalizer_bounds(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        breastCancer = dataRetriever.retrieveData("breastCancer")
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        rn = RangeNormalizer(breastCancer[continousAttributes])

        fitted = rn.train_fit()

        #Check if the mins/maxes of all the fitted columns are 0/1, respectively
        self.assertEqual(np.allclose(np.ones(fitted.shape[1]), fitted.max()),
                         True)
        self.assertEqual(np.allclose(np.zeros(fitted.shape[1]), fitted.min()),
                         True)
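Range normalization rescales each column to [0, 1] via (x - min) / (max - min), which is why the fitted columns above should bottom out at 0 and top out at 1. A minimal pandas sketch of that transform (an assumption about what RangeNormalizer computes, not its actual implementation):

import pandas as pd

df = pd.DataFrame({"clumpThickness": [1, 5, 10], "mitoses": [1, 2, 3]})
fitted = (df - df.min()) / (df.max() - df.min())  # every column now spans exactly [0, 1]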
Example #9
    def testKMeans(self):
        data = DataRetriever("../Datasets/metadata.json")
        data.retrieveData("computerHardware")

        kValue = 15
        t = Timer()
        t.start()
        mediods = KMediods(data.getDataSet(), data.getDataClass(),
                           data.getDescreteAttributes(),
                           data.getContinuousAttributes(),
                           data.getPredictionType(), kValue, 100)

        t.stop()
        print(f"Time: {t}")
        print(mediods)
        mediods.to_csv('kmedoids.csv', index=False)
Example #10
    def test_test_set_coverage(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        tested_vals = []

        #Add a dummy index to the dataset so we can see which rows are selected in each fold
        breastCancer["dummyIndex"] = np.arange(len(breastCancer)) + 1

        for test, train in KFolds(breastCancer, 10):
            tested_vals.extend(test["dummyIndex"])

        self.assertTrue(set(tested_vals) == set(breastCancer["dummyIndex"]))
Example #11
    def test_untrained(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        breastCancer = dataRetriever.retrieveData("breastCancer")
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        rn = RangeNormalizer(breastCancer[continousAttributes])
        sn = StandardNormalizer(breastCancer[continousAttributes])

        #Test range normalizer
        with self.assertRaises(UntrainedUtilityError):
            rn.fit(breastCancer[continousAttributes])

        #Test standard normalizer
        with self.assertRaises(UntrainedUtilityError):
            sn.fit(breastCancer[continousAttributes])
Example #12
    def testOneHotEncoder(self):
        dataRetriver = DataRetriever("../Datasets/metadata.json")
        glassData = dataRetriver.retrieveData("breastCancer")
        data = glassData.getDataSet()
        unknown = glassData.getDataClass()
        train = data.sample(n=6, random_state=69)
        test = data.sample(n=6, random_state=420)

        ohe = OneHotEncoder()
        encodedDataFrame = ohe.train_fit(train,
                                         glassData.getDescreteAttributes())
        encodedDict = ohe.encodedDict

        encodedTest = ohe.fit(test)

        # print(encodedDataFrame)
        # print(encodedDict)
        print("=============Train============")
        print(encodedDataFrame[unknown])
        print(train[unknown])
        print("=============Test=============")
        print(encodedTest[unknown])
        print(test[unknown])
Example #13
    def test_q_calculation(self):
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")

        naiveBayes = NaiveBayes(dataRetriever.getDataSet(),
                                dataRetriever.getDataClass())

        #seperatedByClass = naiveBayes.calculateQ()

        #print(seperatedByClass)

        self.assertEqual(
            dataRetriever.getDataMenu(),
            ["breastCancer", "glass", "iris", "soybeanSmall", "vote"],
            "should return list of data sets")
Example #14
def run_driver(current_data_set,
               mutation_rate=0.5,
               maxIter=1000,
               batch_size=0.6,
               population_size=110,
               network_architecture=[15],
               pb_actor=None):
    cost_func = {
        "breastCancer": "bin_cross",
        "glass": "log_cosh",
        "soybeanSmall": "log_cosh",
        "abalone": "log_cosh",
        "forestFires": "log_cosh",
        "computerHardware": "log_cosh"
    }

    title_text = r""" 
       ______                    __   _          ___     __                     _  __   __                    
      / ____/___   ____   ___   / /_ (_)_____   /   /   / /____ _ ____   _____ (_)/ /_ / /_   ____ ___   _____
     / / __ / _ \ / __ \ / _ \ / __// // ___/  / /| /  / // __ `// __ \ / ___// // __// __ \ / __ `__ \ / ___/
    / /_/ //  __// / / //  __// /_ / // /__   / ___ / / // /_/ // /_/ // /   / // /_ / / / // / / / / /(__  ) 
    \____/ \___//_/ /_/ \___/ \__//_/ \___/  /_/  |_//_/ \__, / \____//_/   /_/ \__//_/ /_//_/ /_/ /_//____/  
                                                        /____/                                                
    """

    output_json = {}

    # ====================== Adjustable Variables ==============================
    # current_data_set = "abalone"
    # mutation_rate = .5
    # maxIter = 10
    # batch_size = .6
    # population_size = 110

    # network_architecture = []
    # ===========================================================================

    output_json["parameters"] = {
        "mutation_rate": mutation_rate,
        "population_size": population_size,
        "network_architecture": network_architecture,
        "cost_func": cost_func[current_data_set],
        "maxIter": maxIter,
        "batch_size": batch_size
    }

    # ================ Data pre-processing =================================================
    dataRetriever = DataRetriever("../../Datasets/metadata.json")
    dataRetriever.retrieveData(current_data_set)
    dataset = dataRetriever.getDataSet().dropna()

    discrete_attr = dataRetriever.getDescreteAttributes()
    cont_attributes = dataRetriever.getContinuousAttributes()
    # This block is specific to the Forest Fires data set: cyclical month/day encoding plus a log-transformed target
    if current_data_set == "forestFires":
        discrete_attr.remove('month')
        discrete_attr.remove('day')
        dataset['month'] = (pd.to_datetime(dataset.month,
                                           format='%b').dt.month) - 1
        dataset["day"] = dataset['day'].apply(
            lambda x: list(calendar.day_abbr).index(x.capitalize()))
        dataset["month_sin"] = np.sin(dataset['month'])
        dataset["month_cos"] = np.sin(dataset['month'])

        dataset["day_sin"] = np.sin(dataset['day'])
        dataset["day_cos"] = np.sin(dataset['day'])
        dataset = dataset.drop('day', axis=1)
        dataset = dataset.drop('month', axis=1)
        cont_attributes.append('month_sin')
        cont_attributes.append('month_cos')
        cont_attributes.append('day_sin')
        cont_attributes.append('day_cos')

        dataset[dataRetriever.getDataClass()] = np.log(
            dataset[dataRetriever.getDataClass()] + 0.000001)
    elif current_data_set == "computerHardware":
        discrete_attr.remove('venderName')
        discrete_attr.remove('modelName')
        dataset = dataset.drop('venderName', axis=1)
        dataset = dataset.drop('modelName', axis=1)

    dataset = dataset.reset_index(drop=True)

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # ======================= Train Neural Network ================
    print(title_text)
    fold = 0
    metrics = []

    for test_set, train_set in KFolds(dataset, 10):
        fold += 1
        fitness_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_fitness.csv"
        output_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_output.csv"

        metrics.append(
            multiprocess_func.remote(test_set,
                                     train_set,
                                     fold,
                                     fitness_file,
                                     output_file,
                                     dataRetriever,
                                     cost_func[current_data_set],
                                     current_data_set,
                                     mutation_rate,
                                     maxIter,
                                     batch_size,
                                     population_size,
                                     network_architecture,
                                     pb_actor=None))

    metrics = ray.get(metrics)
    print(metrics)
    print("Average Performance: ", np.asarray(metrics).mean())
    output_json["Metrics"] = metrics
    output_json["Average"] = np.asarray(metrics, dtype=np.float64).mean()
    output_json["Std"] = np.asarray(metrics, dtype=np.float64).std()

    with open(
            f"../DataDump/GA_{current_data_set}_layer{len(network_architecture)}.json",
            'w') as f:
        json.dump(output_json, f, indent=4)
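The month/day handling in the Forest Fires branch above is a cyclical (sin/cos) encoding. The usual convention scales the integer value by 2*pi over the period so that the last value wraps around next to the first; a minimal sketch of that convention (illustrative only, not what run_driver does verbatim):

import numpy as np
import pandas as pd

df = pd.DataFrame({"month": [0, 3, 6, 11]})             # months encoded 0-11, as in run_driver
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)  # December (11) lands next to January (0)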
Example #15
from MLAlgorithms.KNN.KNearest import KNearestNeighbor, EditedKNN, CondensedKNN
from MLAlgorithms.Utils.KFolds import KFolds
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer

import numpy as np
import json

dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("vote")
data = dataRetriever.getDataSet()
data = data.dropna()
data = data.sample(frac=1.0, random_state=93)
data = data.reset_index(drop=True)
# data = data.drop('idNumber', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0

for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col):

    # KFolds can't return a validation set directly,
    # so K is set to half the desired k and half of each test fold is used as the validation set
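    # A hypothetical sketch of that split (the real loop body is truncated in this snippet):
    # half of each test fold becomes the validation set, the other half stays as the test fold.
    validate = test.sample(frac=0.5, random_state=93)
    test = test.drop(validate.index)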
Example #16
 def test_existence(self):
     dataRetriever = DataRetriever("../Datasets/metadata.json")
     self.assertEqual(dataRetriever.hasData("breastCancer"), True,
                      "Should have breast cancer data")
     self.assertEqual(dataRetriever.hasData("dogDiseases"), False,
                      "Shouldn't have dog disease data")
Example #17
 def test_menu(self):
     dataRetriever = DataRetriever("../Datasets/metadata.json")
     self.assertEqual(
         dataRetriever.getDataMenu(),
         ["breastCancer", "glass", "iris", "soybeanSmall", "vote"],
         "should return list of data sets")
Example #18
def network_tuner(*nodes_per_hidden_layer):
    """
    This function is used to calcuate the optimal network architecture
    The user should input the dataset they would like to operate with and change the performance metric in accordance to the data set type IE regression or classification 

    """
    
    MSEs = []

    bestNetwork = {}
    learning_rate = 0.0001
    maxItter = 500
    batch_size = .5

    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("glass")
    dataset = dataRetriever.getDataSet().dropna()


    dataset = dataset.reset_index(drop=True)

    # This line is used to normalize the data for Forest Fires
    # dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)

    dataset[dataRetriever.getContinuousAttributes()] = (dataset[dataRetriever.getContinuousAttributes()]-dataset[dataRetriever.getContinuousAttributes()].mean())/dataset[dataRetriever.getContinuousAttributes()].std()

    test_set = dataset.sample(frac=0.1, random_state=69)
    train_set = dataset.drop(test_set.index)
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)

    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    datasetEncoded = ohe.train_fit(train_set, dataRetriever.getDescreteAttributes())
    testEncoded = ohe.fit(test_set)


    output = None
    nn = NeuralNetwork(datasetEncoded, 0, [], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    for i in range(maxItter):
        # We don't call an initial feedforward because backpropagate starts with a feedforward call
        # batch_size is the fraction of the training data used in each batch
        output = nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)


    final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
    output = nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
    actual = testEncoded[dataRetriever.getDataClass()]


    ## ===================== Classification =================
    correct = 0
    acc = 0
    for i, row in enumerate(final):
        if row == actual.iloc[i]: correct += 1
    acc = correct / len(test_set)


    # final = final.reshape(final.shape[0])

    # MSE = ((actual-final)**2).mean()
    # MSEs.append(MSE)
    bestNetwork['network'] = nn
    bestNetwork['acc'] = acc
    bestNetwork['arc'] = [0]
    # # ============================================

    # # ============ Compare Acc to Most Common Class

    values = test_set[dataRetriever.getDataClass()].value_counts()


    # USED FOR CLASSIFICATION
    # print(f'Accuracy: {acc}')
    # print(f'Max Class Prior: {values.max()/values.sum()}')
    # print(f"Class Distribution:\n{values}")
    # print("Final: ", final)
    # print("Actual: ", list(actual))
    # print()



    numOfLayer = len(nodes_per_hidden_layer)
    print("Number of Hidden Layers: ", numOfLayer)
    for layer in range(numOfLayer):
        print(f"Layer Number: {layer + 1}")
        combinations = list(itertools.product(*nodes_per_hidden_layer[:layer+1]))

        for combo in combinations:

            output = None
            print("Node Combination: ",list(combo))
            print(combo)

            nn = NeuralNetwork(datasetEncoded, layer, list(combo), dataRetriever.getPredictionType(), dataRetriever.getDataClass())
            for i in range(maxItter):
                # We don't call an initial feedforward because backpropagate starts with a feedforward call
                # batch_size is the fraction of the training data used in each batch
                output = nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)

            final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
            output = nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
            actual = testEncoded[dataRetriever.getDataClass()]

            ## ===================== Classification =================
            correct = 0
            acc = 0
            for i, row in enumerate(final):
                if row == actual.iloc[i]: correct += 1

            acc = correct/len(test_set)
            # # # ============================================

            # # # ============ Compare Acc to Most Common Class

            values = test_set[dataRetriever.getDataClass()].value_counts()

            # USED FOR CLASSIFICATION
            # print(f'Accuracy: {acc}')
            # print(f'Max Class Prior: {values.max()/values.sum()}')
            # # print(f"Class Distribution:\n{values}")
            # print("Final: ", final)
            # print("Actual: ", list(actual))
            # print()

            if acc > bestNetwork['acc']:
                bestNetwork['network'] = nn
                bestNetwork['acc'] = acc
                bestNetwork['arc'] = combo

            # final = final.reshape(final.shape[0])

            # MSE = ((actual-final)**2).mean()
            # MSEs.append(MSE)
            # if MSE < bestNetwork['acc']:
            #     bestNetwork['network'] = nn
            #     bestNetwork['acc'] = MSE
            #     bestNetwork['arc'] = combo

            



    return bestNetwork#, MSEs
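A hypothetical call, to show how the variadic nodes_per_hidden_layer argument feeds the itertools.product sweep (the candidate node counts below are made up for illustration):

if __name__ == "__main__":
    # try 1- and 2-hidden-layer architectures drawn from these candidate node counts
    best = network_tuner([2, 4, 8], [2, 4])
    print("Best architecture:", best['arc'], "accuracy:", best['acc'])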
Example #19
import pandas as pd
import numpy as np

from scipy.stats import norm  # Used for P score
import matplotlib.pyplot as plt

from tqdm import tqdm

from MLAlgorithms.NeuralNetwork.NeuralNetwork import NeuralNetwork
from MLAlgorithms.Utils.KFolds import KFolds
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer
from MLAlgorithms.Utils.OneHotEncoder import OneHotEncoder

dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("breastCancer")
dataset = dataRetriever.getDataSet().dropna()
dataset = dataset.reset_index(drop=True)

# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)
maxIter = 1
learning_rate = 1e-3
batch_size = 0.01

metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
Example #20
 / / __ / _ \ / __ \ / _ \ / __// // ___/  / /| /  / // __ `// __ \ / ___// // __// __ \ / __ `__ \ / ___/
/ /_/ //  __// / / //  __// /_ / // /__   / ___ / / // /_/ // /_/ // /   / // /_ / / / // / / / / /(__  ) 
\____/ \___//_/ /_/ \___/ \__//_/ \___/  /_/  |_//_/ \__, / \____//_/   /_/ \__//_/ /_//_/ /_/ /_//____/  
                                                    /____/                                                
"""

# ====================== Adjustable Variables ==============================
current_data_set = "soybeanSmall"
mutation_rate = .5
maxItter = 1000
batch_size = .6
population_size = 110
# ===========================================================================

# ================ Data pre-processing =================================================
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData(current_data_set)
dataset = dataRetriever.getDataSet().dropna()

discrete_attr = dataRetriever.getDescreteAttributes()
cont_attributes = dataRetriever.getContinuousAttributes()
# This block is specific to the Forest Fires data set
if current_data_set == "forestFires":
    zeros = dataset[dataset[dataRetriever.getDataClass()] < 1].index
    print(len(zeros) / len(dataset))
    dataset = dataset.drop(zeros)
    discrete_attr.remove('month')
    discrete_attr.remove('day')

    dataset['month'] = (pd.to_datetime(dataset.month,
                                       format='%b').dt.month) - 1
Example #21
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)


happiness = r"""
    ____             __  _      __        _____                                 ____        __  _           _             __  _           
   / __ \____ ______/ /_(_)____/ /__     / ___/      ______ __________ ___     / __ \____  / /_(_)___ ___  (_)___  ____ _/ /_(_)___  ____ 
  / /_/ / __ `/ ___/ __/ / ___/ / _ \    \__ \ | /| / / __ `/ ___/ __ `__ \   / / / / __ \/ __/ / __ `__ \/ /_  / / __ `/ __/ / __ \/ __ \
 / ____/ /_/ / /  / /_/ / /__/ /  __/   ___/ / |/ |/ / /_/ / /  / / / / / /  / /_/ / /_/ / /_/ / / / / / / / / /_/ /_/ / /_/ / /_/ / / / /
/_/    \__,_/_/   \__/_/\___/_/\___/   /____/|__/|__/\__,_/_/  /_/ /_/ /_/   \____/ .___/\__/_/_/ /_/ /_/_/ /___/\__,_/\__/_/\____/_/ /_/ 
                                                                                 /_/
"""

print(happiness)
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("abalone")
dataset = dataRetriever.getDataSet().dropna()
dataset = dataset.reset_index(drop=True)

test_set = dataset.sample(frac=0.1, random_state=69)
train_set = dataset.drop(test_set.index)
test_set = test_set.reset_index(drop=True)
train_set = train_set.reset_index(drop=True)

# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)
maxIter = 1
learning_rate = 1e-3
batch_size = 0.01
Example #22
            test_cols = x_points.columns

        for i, x_row in enumerate(x_points.to_numpy()):
            for j, y_row in enumerate(y_points.to_numpy()):
                for k, col in enumerate(test_cols):
                    x_val = "unseen"
                    y_val = "unseen"

                    if x_row[k] in self.probMatrix[col]: x_val = x_row[k]
                    if y_row[k] in self.probMatrix[col]: y_val = y_row[k]

                    self.distances[i, j] += self.probMatrix[col][x_val][y_val]


if __name__ == "__main__":
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("computerHardware")
    data = dataRetriever.getDataSet()
    data = data.dropna()
    data = data.reset_index(drop=True)[[
        "venderName", "modelName", "myct", "mmin", "mmax", "cach", "chmin",
        "chmax", "prp", "erp"
    ]]

    test = data.sample(frac=0.2)
    train = data.drop(test.index)

    test = test.reset_index(drop=True)
    train = train.reset_index(drop=True)

    VDM = ValueDifferenceMetric(
Example #23
        temp_train_with_unknown = self.train_data.copy(deep=True)
        temp_train_with_unknown["unknown_col"] = self.unknown_col.values
        self.distance_matrix = DistanceMatrix(self.test_data,
                                              temp_train_with_unknown,
                                              self.contAttr, self.discAttr,
                                              len(self.contAttr),
                                              len(self.discAttr),
                                              self.predictionType,
                                              "unknown_col")

        self.neighbors = self.distance_matrix.distanceMatrix
        print(curr_iter)


if __name__ == "__main__":
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("imageSegmentation")
    data = dataRetriever.getDataSet()
    data = data.dropna()
    data = data.reset_index(drop=True)
    # data = data.drop('idNumber', axis=1)

    class_col = dataRetriever.getDataClass()
    # data[class_col] = np.log(data[class_col] + 0.001)
    contAttr = dataRetriever.getContinuousAttributes()
    discAttr = dataRetriever.getDescreteAttributes()

    test = data.sample(frac=0.2, random_state=17)
    train = data.drop(test.index)

    # sn = RangeNormalizer(train[contAttr])
Example #24
    """
    t = 0
    f = 0
    for i in range(len(answers)):
        if predictions[i] == answers[i]:
            t += 1
        else:
            f += 1

    print("The Percent of Correct Predictions is {t}%".format(
        t=round((t * 100 / len(answers)), 1)))
    print("The Percent of Incorrect Predictions is {f}%\n".format(
        f=round((f * 100 / len(answers)), 1)))


dataRetriever = DataRetriever("../Datasets/metadata.json")

################################################ Un-Shuffled Data ################################################

# This first for loop performs the NaiveBayes algorithm for un-shuffled data
jsonResults1 = {}
for dataSet in dataRetriever.getDataMenu():
    dataRetriever.retrieveData(dataSet)
    dataClass = dataRetriever.getDataClass()
    retrievedData = dataRetriever.getDataSet()

    numOfClassValues = len(
        retrievedData[dataRetriever.getDataClass()].unique())
    method = "macro"
    foldNum = 1
Example #25
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.KNN.KMediods import KMediods
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer
"""
This is the driver we used to create the medoids CSVs. We went with the archaic route and manually
changed the data set for each generation. This was the method chosen since some data sets took longer to calculate

There are a couple of lines that are data set specific 
"""

data = DataRetriever("../Datasets/metadata.json")

dataSetName = "computerHardware"

print(f"Creating CSV for {dataSetName}")
data.retrieveData(dataSetName)

maxItter = 100
kValue = 78

# These are only used for image segmentation and abalone
# frac = .25
# random_state = 69
# kValue = m.floor(frac * kValue)

dataSetUnNormalized = data.getDataSet()
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)  // This is for Forest Fires

sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()
Example #26
import pandas as pd
import numpy as np
import matplotlib
print(matplotlib.rcsetup.interactive_bk)
matplotlib.use('WebAgg')
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.Utils.KFolds import KFolds
import numba

dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("breastCancer")
data = dataRetriever.getDataSet()
data = data.dropna()
data = data.reset_index(drop=True)

times = []
rows_list = []
distances = np.zeros((len(data), len(data)), dtype=np.float64)
for index, row in tqdm(data.iterrows(), total=len(data)):
    for index2, row2 in data.iterrows():
        distances[index, index2] = ((row2 - row)**2).sum()


@numba.njit
def calc_distances(numpy_array):
    distances = np.zeros((numpy_array.shape[0], numpy_array.shape[0]),
                         dtype=np.float64)
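    # A hedged sketch of how the body presumably continues (the original is truncated here):
    # the same pairwise squared-distance fill as the pandas loop above, but over a plain
    # ndarray so numba can compile it in nopython mode.
    for i in range(numpy_array.shape[0]):
        for j in range(numpy_array.shape[0]):
            distances[i, j] = ((numpy_array[i] - numpy_array[j]) ** 2).sum()
    return distances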