Exemplo n.º 1
0
    def test_dataObj_merge(self):
        """Check that merge rejects a non-data input and carries transforms along.

        NOTE(review): the starting contents of ``self.data`` come from a
        fixture that is not visible here (presumably ``setUp``) -- confirm.
        """
        frame = pd.read_csv('data_test.csv')
        x_values = [0, 5, 30]
        other = oe.data(frame, x_values)

        # A list whose element is a plain list (not a data object) must be
        # rejected with ValueError.
        with self.assertRaises(ValueError):
            self.data.merge([x_values])

        # Add transforms to both objects so the merge has more than the
        # parent matrix to pull along.
        self.data.transform('parent', 'log', 'log10', base=10)
        self.assertEqual(2, len(self.data.D.keys()))
        other.transform('parent', 'log', 'log10', base=10)
        other.transform('parent', 'log', 'log2', base=2)

        # After merging, five entries are expected in D.
        self.data.merge([other])
        self.assertEqual(5, len(self.data.D.keys()))
Exemplo n.º 2
0
    def slice(self, names):
        """
        Return a new data object holding only the requested named entries.

        A fresh object is built from this object's dataframe and x-values, and the
        entries stored under each requested name in D, x and params are copied into it.
        'parent' cannot be removed this way, as it is the default matrix established
        when a data object is created. To replace parent, instead instantiate a new
        object on a dataframe created from the transformation of interest.

        Parameters
        ----------
        names: list
            A list of strings matching the names to keep in the new slice

        Returns
        --------
        d: an openensembles data object
            A oe.data object into which the entries for the given names were copied

        Examples
        --------
        Remove 'zscore' from the list, keeping everything else

        >>> names = list(d.D.keys()) #get all the keys as a list
        >>> names.remove('zscore') #list.remove works in place and returns None
        >>> dNew = d.slice(names)

        Raises
        ------
        ValueError
            If a name in the list of names does not exist in data object

        """
        sliced = oe.data(self.df, self.x)
        known = list(self.D.keys())
        for key in names:
            if key in known:
                # Copy the matrix, its x-values and its parameters, in that order.
                for attr in ('D', 'x', 'params'):
                    getattr(sliced, attr)[key] = getattr(self, attr)[key]
            else:
                raise ValueError(
                    "ERROR: the source you requested for slicing does not exist in data object %s"
                    % (key))

        return sliced
Exemplo n.º 3
0
def run_mv_oe(X, y=None):
    """Deprecated. Cluster ``X`` with a k-means majority-vote ensemble.

    Thirty k-means solutions (K=15, random init, a single init each) are
    added to one openensembles cluster object. A majority-vote solution is
    computed after each addition, and the last one is used as the result.
    If that result contains only a single cluster, the whole procedure is
    retried.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Feature matrix; ``X.shape[1]`` is taken as the number of features.
    y : optional
        Passed through unchanged to the caller.

    Returns
    -------
    tuple
        ``(X, final_labels, y)``, where ``final_labels`` are the
        majority-vote labels shifted down by one.
    """
    print("a")
    n_features = X.shape[1]
    columns = [f"x{i}" for i in range(n_features)]

    df = pd.DataFrame(X, columns=columns)
    dataObj = oe.data(df, list(range(n_features)))

    c = oe.cluster(dataObj)
    majority_vote = None

    for i in range(30):
        name = f'kmeans_{i}'
        c.cluster('parent', 'kmeans', name, K=15, init='random', n_init=1)
        # Recomputed after every added solution, as before; only the last
        # majority vote is used below.
        majority_vote = c.finish_majority_vote(threshold=0.5)

    # Shift labels down by one (presumably to make them zero-based -- confirm).
    final_labels = majority_vote.labels['majority_vote'] - 1
    n_clusters = len(np.unique(final_labels))
    print(n_clusters)

    # The original one-line ``return a, b, c if cond else retry`` applied the
    # conditional to the third tuple element only, so a degenerate run
    # returned the old labels with a nested tuple in place of ``y``.
    if n_clusters > 1:
        return X, final_labels, y
    return run_mv_oe(X, y)
Exemplo n.º 4
0
#%matplotlib inline

# Re-seeding with the same fixed number before a NumPy random call makes that
# call return the same result every time. Seeding only once, as done here,
# gives a repeatable sequence whose individual draws still differ.
np.random.seed(0)  # helps to establish the same dataset and functionality, but is not required

# Open a csv file and convert to dataframe object

#df = pd.read_csv('Data/DataGranulatedGCSNoPT.csv') -----

# Convert the dataframe to an oe data object.
# The second argument is a list with one entry per dataframe column (1..n here).
d = oe.data(df, list(range(1, len(df.columns) + 1)))
'''
WHAT NEEDS TO BE ADDED FOR FUTURE USAGE FROM OTHERS:
    a) After loading data
        Ensure everything is either normalized or handle in this cell before creating the ensemble
        
    b) Ensure categorical features are encoded
    
    Code cell accidently deleted for encoding but pandas has useful tool for easy encoding
    https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

'''

# Pass the oe data object to the cluster class; the instance gives access to
# all available algorithms.
c = oe.cluster(d)
Exemplo n.º 5
0
 def test_setup_floats(self):
     """Float x-values are expected to be stored unchanged under 'parent'."""
     frame = pd.read_csv('data_test.csv')
     expected = [0.0, 5.0, 30.0]
     self.data = oe.data(frame, expected)
     self.assertListEqual(expected, self.data.x['parent'])
Exemplo n.º 6
0
 def test_setup_stringX(self):
     """x-values containing a string are expected to be replaced by [0, 1, 2]."""
     frame = pd.read_csv('data_test.csv')
     self.data = oe.data(frame, ['something', 5, 30])
     self.assertListEqual([0, 1, 2], self.data.x['parent'])
Exemplo n.º 7
0
 def test_incorrect_setup(self):
     """Building a data object with four x-values must raise ValueError.

     NOTE(review): presumably because data_test.csv has three columns --
     confirm against the fixture file.
     """
     frame = pd.read_csv('data_test.csv')
     with self.assertRaises(ValueError):
         oe.data(frame, [0, 5, 10, 30])
Exemplo n.º 8
0
 def test_remove_metaData(self):
     """Build a data object from the metadata fixture file.

     NOTE(review): this test makes no assertion; it only passes when the
     construction does not raise.
     """
     frame = pd.read_csv('data_test_meta.csv')
     self.data = oe.data(frame, [0, 5, 30])
Exemplo n.º 9
0
 def setUp(self):
     """Load the CSV fixture and store a fresh data object on ``self.data``."""
     frame = pd.read_csv('data_test.csv')
     self.data = oe.data(frame, [0, 5, 30])
Exemplo n.º 10
0
 def setUp(self):
     """Load the CSV fixture and store a fresh data object on ``self.data``."""
     fileName = 'data_test.csv'
     # DataFrame.from_csv was deprecated and later removed from pandas.
     # read_csv with index_col=0 and parse_dates=True is its documented
     # equivalent (first column as index, index parsed as dates).
     df = pd.read_csv(fileName, index_col=0, parse_dates=True)
     x = [0, 5, 30]
     self.data = oe.data(df, x)
# --- SECTION 1 ---
# Libraries and data loading
import openensembles as oe
import numpy as np
import pandas as pd
import sklearn.metrics

from sklearn.datasets import load_breast_cancer
from sklearn.manifold import TSNE

bc = load_breast_cancer()
t = TSNE()
# --- SECTION 2 ---
# Create the data object
# Seed NumPy's global generator BEFORE the t-SNE embedding. TSNE() is built
# without a random_state, so scikit-learn draws from the global generator.
# Seeding only afterwards left the embedding, and every score computed from
# it, different on each run.
np.random.seed(123456)
cluster_data = oe.data(pd.DataFrame(t.fit_transform(bc.data)), [0, 1])

# --- SECTION 3 ---
# Create the ensembles and calculate the homogeneity score
for K in [2, 3, 4, 5, 6, 7]:
    for ensemble_size in [3, 4, 5]:
        # A fresh ensemble per (K, size) pair, filled with `ensemble_size`
        # k-means solutions on the 2-D embedding.
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        # Combine the solutions by majority vote and score them against the
        # true targets.
        preds = ensemble.finish_majority_vote(threshold=0.5)
        print(f'K: {K}, size {ensemble_size}:', end=' ')
        print('%.2f' % sklearn.metrics.homogeneity_score(
            bc.target, preds.labels['majority_vote']))
# Drop columns in which every value is missing, then fill the remaining gaps
# with the column medians.
# NOTE(review): `recents` is defined outside this fragment -- presumably a
# pandas DataFrame; confirm.
recents = recents.dropna(axis=1, how="all")
recents = recents.fillna(recents.median())

# Use only these specific features
columns = [
    'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth',
    'Freedom to make life choices', 'Generosity', 'Perceptions of corruption',
    'Positive affect', 'Negative affect', 'Confidence in national government',
    'Democratic Quality', 'Delivery Quality'
]

# Transform the data with TSNE
# NOTE(review): `t_sne` is imported outside this fragment; verify that the
# import still resolves with the installed scikit-learn.
tsne = t_sne.TSNE()
transformed = pd.DataFrame(tsne.fit_transform(recents[columns]))
# Create the data object; the x-values [0, 1] presumably label the two
# embedding columns -- confirm.
cluster_data = oe.data(transformed, [0, 1])

# Create the ensemble: 20 k-means solutions, each stored under its own name
# and each called with 10 as the final argument.
ensemble = oe.cluster(cluster_data)
for i in range(20):
    name = f'kmeans({i}-tsne'
    ensemble.cluster('parent', 'kmeans', name, 10)

# Create the cluster labels by co-occurrence linkage with threshold 0.5
preds = ensemble.finish_co_occ_linkage(threshold=0.5)

# Add Life Ladder to columns
columns = [
    'Life Ladder', 'Log GDP per capita', 'Social support',
    'Healthy life expectancy at birth', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption', 'Positive affect',
Exemplo n.º 13
0
# --- SECTION 1 ---
# Libraries and data loading
import openensembles as oe
import numpy as np
import pandas as pd
import sklearn.metrics

from sklearn.datasets import load_breast_cancer

bc = load_breast_cancer()

# --- SECTION 2 ---
# Wrap the raw feature matrix in an oe data object, passing the feature
# names as the x-values.
cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names)

np.random.seed(123456)
# --- SECTION 3 ---
# For every (K, ensemble size) pair: build an ensemble of k-means solutions,
# combine them by graph closure and print the homogeneity score.
for K in range(2, 8):
    for ensemble_size in (3, 4, 5):
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = 'kmeans_{}_{}'.format(ensemble_size, i)
            ensemble.cluster('parent', 'kmeans', name, K)

        preds = ensemble.finish_graph_closure(threshold=0.5)
        # Print the prefix first, then the score on the same line.
        print(f'K: {K}, size {ensemble_size}:', end=' ')
        score = sklearn.metrics.homogeneity_score(
            bc.target, preds.labels['graph_closure'])
        print(f'{score:.2f}')
Exemplo n.º 14
0
# Use only these specific features
columns = ['Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption','Positive affect', 'Negative affect',
       'Confidence in national government', 'Democratic Quality',
       'Delivery Quality']

# Normalize the features by subtracting the mean
# and dividing by the standard deviation
normalized = recents[columns]
normalized = normalized - normalized.mean()
normalized = normalized / normalized.std()
# Create the data object from the NORMALIZED features. The original passed
# the raw recents[columns], which left the normalization above unused.
cluster_data = oe.data(normalized, columns)


# Seed NumPy's global generator before the sweep starts.
np.random.seed(123456)
# Collector for the sweep results.
# NOTE(review): nothing in the visible part of the loop appends to `results`;
# the snippet looks truncated -- confirm against the full script.
results = {'K':[], 'size':[], 'silhouette': []}
# Test different ensemble setups
Ks = [2, 4, 6, 8, 10, 12, 14]
sizes = [5, 10, 20, 50]
for K in Ks:
    for ensemble_size in sizes:
        # Fresh ensemble for every (K, size) pair, filled with
        # `ensemble_size` k-means solutions stored under unique names.
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        # Combine the solutions by co-occurrence linkage with threshold 0.5.
        preds = ensemble.finish_co_occ_linkage(threshold=0.5)
Exemplo n.º 15
0
import pandas as pd
from sklearn import datasets
import openensembles as oe
import matplotlib.pyplot as plt
import seaborn as sns
# Build a two-moons toy dataset and hold it in a pandas DataFrame.
x, y = datasets.make_moons(
    n_samples=200, shuffle=True, noise=0.02, random_state=None)
df = pd.DataFrame(x)
# Wrap the frame in an oe data object, then create the clustering object.
dataObj = oe.data(df, [1, 2])
c = oe.cluster(dataObj)
c_MV_arr = []
val_arr = []
for i in range(39):
    # Add one more k-means solution under a unique name.
    name = f'kmeans_{i}'
    c.cluster('parent', 'kmeans', name, K=16, init='random', n_init=1)
    # Recompute the majority vote now that `c` holds one more solution.
    c_MV_arr.append(c.finish_majority_vote(threshold=0.5))
    # Score the newest majority-vote solution with the determinant ratio metric.
    v = oe.validation(dataObj, c_MV_arr[-1])
    val_name = v.calculate('det_ratio', 'majority_vote', 'parent')
    val_arr.append(v.validation[val_name])

# Compute and plot the co-occurrence matrix over all solutions.
coMat = c.co_occurrence_matrix()
coMat.plot(labels=False)