예제 #1
0
    def transform(self, X, y=None):
        """Label-encode the selected columns of ``X`` and, optionally, ``y``.

        Each column is encoded with its own freshly fitted ``LabelEncoder``.
        ``self.columns`` names the columns to encode; when it is ``None``
        every column of ``X`` is encoded. ``self.keep_nan`` is treated as a
        boolean flag:

        * falsy  -- columns (and ``y``) are passed to the encoder as they are;
        * truthy -- values are encoded through their string form and every
          position that was null in the input is restored afterwards.

        Args:
            X: pandas DataFrame to encode. It is copied, never modified.
            y: optional target. In ``keep_nan`` mode it must be a pandas
                Series, because ``isnull``, ``index`` and ``name`` are used.

        Returns:
            The encoded copy of ``X`` when ``y`` is ``None``, otherwise the
            tuple ``(encoded_X, encoded_y)``.
        """

        def encode_keeping_nan(series):
            # The encoded Series must carry the input's own index: `where`
            # aligns on labels, so building it on a default RangeIndex would
            # misplace or drop values whenever the input is not indexed 0..n-1.
            null_mask = series.isnull()
            codes = LabelEncoder().fit_transform(series.astype('str'))
            encoded = pd.Series(
                codes.astype('int'), index=series.index, name=series.name)
            return encoded.where(~null_mask, series)

        output = X.copy()

        # Iterate over column labels instead of `DataFrame.iteritems`, which
        # was removed in pandas 2.0.
        if self.columns is not None:
            columns = self.columns
        else:
            columns = list(output.columns)

        if not self.keep_nan:
            # No NaN bookkeeping requested: encode each column directly.
            for col in columns:
                output[col] = LabelEncoder().fit_transform(output[col])
            if y is None:
                return output
            return output, LabelEncoder().fit_transform(y)

        # Mask the nulls, encode, then put the nulls back.
        for col in columns:
            output[col] = encode_keeping_nan(output[col])

        if y is None:
            return output
        return output, encode_keeping_nan(y)
class CSVDataset(Dataset):
    """Dataset over a header-less CSV file.

    The last column is used as the label and all preceding columns as the
    input features.
    """

    def __init__(self, path):
        """Read ``path`` and store float32 features in ``X`` and labels in ``y``."""
        table = pd.read_csv(path, header=None).values
        features, labels = table[:, :-1], table[:, -1]

        self.X = features.astype('float32')

        # Labels are label-encoded, cast to float32 and kept as a column vector.
        codes = LabelEncoder().fit_transform(labels).astype('float32')
        self.y = codes.reshape(len(codes), 1)

    def __len__(self):
        """Return the number of rows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, label]`` for the row at ``idx``."""
        features, label = self.X[idx], self.y[idx]
        return [features, label]

    def get_splits(self, n_test=0.33):
        """Randomly split the dataset into train and test subsets.

        ``n_test`` is the fraction of rows assigned to the test subset; the
        train subset receives the remaining rows.
        """
        n_rows = len(self.X)
        n_held_out = round(n_test * n_rows)
        return random_split(self, [n_rows - n_held_out, n_held_out])
예제 #3
0
class CSVDataset(Dataset):
    """Dataset over a header-less CSV: last column is the label, the rest inputs."""

    def __init__(self, path):
        """Load ``path``, report the raw shapes and prepare ``X`` / ``y``."""
        raw = read_csv(path, header=None).values
        inputs = raw[:, :-1]
        labels = raw[:, -1]

        # Shapes are printed before any dtype conversion takes place.
        print("Input data shape:", np.shape(inputs))
        print("Input label shape:", np.shape(labels))

        # Inputs as float32.
        self.X = inputs.astype('float32')

        # Labels: label-encoded, cast to float32, reshaped to one column.
        encoded = LabelEncoder().fit_transform(labels).astype('float32')
        self.y = encoded.reshape(len(encoded), 1)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as ``[inputs, label]``."""
        row_inputs = self.X[idx]
        row_label = self.y[idx]
        return [row_inputs, row_label]

    def get_splits(self, n_train=0.7):
        """Randomly split into train and test subsets.

        ``n_train`` is the fraction of rows placed in the train subset; the
        test subset receives the rest.
        """
        total = len(self.X)
        n_train_rows = round(n_train * total)
        return random_split(self, [n_train_rows, total - n_train_rows])
예제 #4
0
class CSVDataset(Dataset):
    """Header-less CSV wrapped as a dataset of ``[inputs, label]`` rows."""

    def __init__(self, path):
        """Load the CSV at ``path``; the final column is taken as the target."""
        values = read_csv(path, header=None).values

        # every column but the last is an input, stored as float32
        self.X = values[:, :-1].astype('float32')

        # the last column is label-encoded, then stored as a float32 column
        target = LabelEncoder().fit_transform(values[:, -1])
        target = target.astype('float32')
        self.y = target.reshape((len(target), 1))

    def __len__(self):
        """Return how many rows the dataset holds."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[inputs, label]`` for row ``idx``."""
        return [array[idx] for array in (self.X, self.y)]

    def get_splits(self, n_test=0.33):
        """Randomly split into train/test subsets, ``n_test`` being the test fraction."""
        row_count = len(self.X)
        test_rows = round(n_test * row_count)
        train_rows = row_count - test_rows
        sizes = [train_rows, test_rows]
        return random_split(self, sizes)
 def transform(self, X, y=None):
     """One-hot encode ``self.columns`` of ``X`` and label-encode ``y``.

     Each column in ``self.columns`` is expanded with ``pd.get_dummies``.
     A row whose dummies are all zero had no category (its source value was
     null), so that row is set to NaN in the encoded columns. Encoded
     columns whose names collide with columns already in the output get a
     ``_1`` suffix. The source columns are dropped at the end.

     Args:
         X: pandas DataFrame to encode. It is copied, never modified.
         y: optional pandas Series target. It is label-encoded through its
             string form and its null positions are restored afterwards.

     Returns:
         The encoded frame when ``y`` is ``None`` (previously this path
         raised ``UnboundLocalError``), otherwise ``(encoded_X, encoded_y)``.
     """
     output = X.copy()

     for col in self.columns:
         df = pd.get_dummies(X[col])

         # Restore NaNs in one vectorised step. The cast to float happens
         # only when needed so NaN-free encodings keep their dummy dtype.
         nan_rows = ~df.any(axis=1)
         if not df.empty and nan_rows.any():
             df = df.astype(float)
             df.loc[nan_rows, :] = np.nan

         # Rename once per column (not once per row) on a name collision.
         if set(df.columns).intersection(output.columns):
             df.columns = [f'{name}_1' for name in df.columns]

         output = pd.concat([output, df], axis=1)

     output = output.drop(self.columns, axis=1)

     if y is None:
         return output

     null_mask = y.isnull()
     codes = LabelEncoder().fit_transform(y.astype('str'))
     # Keep y's own index so that `where` lines up with the null mask.
     target = pd.Series(codes.astype('int'), index=y.index, name=y.name)
     target = target.where(~null_mask, y)

     return output, target
예제 #6
0
class CSVDataset(Dataset):
    """Dataset built from a header-less CSV file (features first, target last)."""

    def __init__(self, path):
        """Read ``path`` and populate ``self.X`` (inputs) and ``self.y`` (targets)."""
        frame = read_csv(path, header=None)
        matrix = frame.values
        raw_inputs, raw_targets = matrix[:, :-1], matrix[:, -1]

        # Inputs are stored as float32.
        self.X = raw_inputs.astype('float32')

        # Targets are label-encoded and stored as an (n_rows, 1) float32 array.
        encoded_targets = LabelEncoder().fit_transform(raw_targets)
        self.y = encoded_targets.astype('float32').reshape((len(encoded_targets), 1))

    def __len__(self):
        """Return the row count."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the pair ``[inputs, target]`` stored at ``index``."""
        sample = [self.X[index]]
        sample.append(self.y[index])
        return sample

    def get_splits(self, n_test=0.33):
        """Return random train/test subsets; ``n_test`` is the test fraction."""
        n_total = len(self.X)
        n_test_rows = round(n_test * n_total)
        return random_split(self, [n_total - n_test_rows, n_test_rows])
예제 #7
0
def get_task_split(task, seed, split):
    """Load an OpenML task, preprocess it and split it into data partitions.

    Labels with ``category``/``object`` dtype are label-encoded and boolean
    labels are cast to int. Missing feature values are filled with the
    column mode, and categorical features are one-hot encoded.

    Args:
        task: identifier passed to ``openml.tasks.get_task``.
        seed: seed forwarded to ``split_data``.
        split: sequence of 2 floats ``(train, test)`` or 3 floats
            ``(algo, selector, test)``.

    Returns:
        For 2 splits: ``{'baseline_train', 'baseline_test'}``.
        For 3 splits: ``{'algo_train', 'selector_train', 'test'}``.
        The values are the corresponding ``split_data`` partitions.

    Raises:
        ValueError: if ``split`` does not contain 2 or 3 entries.
    """
    if len(split) not in (2, 3):
        raise ValueError(f'Splits must be either 2 or 3 floats\n{split=}')

    task = openml.tasks.get_task(task)
    X, y, categorical_mask, _ = task.get_dataset().get_data(task.target_name)

    # Process labels
    if y is not None:
        if y.dtype == 'category' or y.dtype == object:
            y = LabelEncoder().fit_transform(y.values)
        elif y.dtype == bool:
            y = y.astype('int')

    if isinstance(y, pandas.Series):
        y = y.to_numpy()

    # Process features: fill missing values with the column mode.
    # Assign the result back instead of `X[col].fillna(..., inplace=True)`:
    # that form acts on a temporary and has no effect under pandas
    # Copy-on-Write.
    for col in X.columns:
        modes = X[col].mode()
        # An all-NaN column has no mode; leave it untouched rather than crash.
        if not modes.empty:
            X[col] = X[col].fillna(modes.iloc[0])

    # One-hot encode the categorical columns and append them after the rest.
    categorical_cols = list(X.columns[categorical_mask])
    encoding_frames = [
        pandas.get_dummies(X[col], prefix=col, prefix_sep='_')
        for col in categorical_cols
    ]
    X = X.drop(columns=categorical_cols)
    X = pandas.concat([X, *encoding_frames], axis=1)
    X = X.to_numpy()

    # Create split
    if len(split) == 2:
        test_split = split[1]
        splits = split_data(X, y, test_split, seed)
        return {
            'baseline_train': splits['split_1'],
            'baseline_test': splits['split_2']
        }

    algo_split, selector_split, test_split = split

    # Split data between testing and training
    train_test_split = split_data(X, y, test_split, seed)

    # Further divide the train part between the algorithm and the selector.
    # The selector share is rescaled to be relative to the train part only.
    selector_relative_split = selector_split / (algo_split + selector_split)

    X_train, y_train = train_test_split['split_1']
    train_splits = split_data(
        X_train, y_train, selector_relative_split, seed)
    return {
        'algo_train': train_splits['split_1'],
        'selector_train': train_splits['split_2'],
        'test': train_test_split['split_2']
    }
예제 #8
0
 def __init__(self,fileName):
     """Load a header-less CSV into tensors.

     All columns but the last become the float32 input tensor ``X``; the
     last column is label-encoded and stored as a float32 tensor ``y``
     reshaped to one column. ``numInFeatures`` is the length of the first
     input row and ``numData`` is the number of rows.
     """
     table = pd.read_csv(fileName, header=None).values
     inputs = table[:,:-1].astype('float32')
     self.X = tensor(inputs)

     labels = LabelEncoder().fit_transform(table[:,-1])
     label_tensor = tensor(labels.astype('float32'))
     self.y = label_tensor.reshape((len(label_tensor),1))

     first_row = self.X[0]
     self.numInFeatures = len(first_row)
     self.numData = len(self.X)
예제 #9
0
def load_dataset():
    """Fetch the wine CSV and return ``(X, y)``.

    ``X`` holds every column but the last, cast to float. ``y`` is the last
    column, label-encoded from its string representation.
    """
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv'
    values = read_csv(url, header=None).values
    # inputs: all leading columns as floats
    inputs = values[:, :-1].astype('float')
    # target: last column, encoded via its string form
    target = LabelEncoder().fit_transform(values[:, -1].astype('str'))
    return inputs, target
def get_dataset():
    """Fetch the sonar CSV and return ``(X, y)``.

    ``X`` holds every column but the last as float32. ``y`` is the last
    column, label-encoded from its string representation.
    """
    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
    table = read_csv(url, header=None).values
    # the last column is the output, everything before it is input
    raw_inputs = table[:, :-1]
    raw_output = table[:, -1]
    features = raw_inputs.astype('float32')
    labels = LabelEncoder().fit_transform(raw_output.astype('str'))
    return features, labels
예제 #11
0
    def transform(self, X, y=None):
        """One-hot encode ``self.columns`` of ``X`` and, optionally, encode ``y``.

        Each column in ``self.columns`` is expanded with ``pd.get_dummies``
        and appended to the output. Encoded columns whose names collide with
        columns already in the output get a ``_1`` suffix. The source
        columns are dropped at the end.

        ``self.keep_nan`` is treated as a boolean flag. When it is truthy,
        rows whose dummies are all zero (no category, i.e. a null source
        value) are set to NaN, and null positions of ``y`` are restored
        after label encoding.

        Args:
            X: pandas DataFrame to encode. It is copied, never modified.
            y: optional target; must be a pandas Series when ``keep_nan``
                is set, because ``isnull``, ``index`` and ``name`` are used.

        Returns:
            The encoded frame when ``y`` is ``None``, otherwise
            ``(encoded_X, encoded_y)``.
        """
        output = X.copy()

        for col in self.columns:
            df = pd.get_dummies(X[col])  # Encoding

            if self.keep_nan:
                # Restore NaNs in one vectorised step. The cast to float
                # happens only when needed so NaN-free encodings keep their
                # dummy dtype.
                nan_rows = ~df.any(axis=1)
                if not df.empty and nan_rows.any():
                    df = df.astype(float)
                    df.loc[nan_rows, :] = np.nan

            # Rename the encoded columns if any of them clashes with the output.
            if set(df.columns).intersection(output.columns):
                df.columns = [f'{name}_1' for name in df.columns]

            # Concatenate inside the loop so every encoded column is kept.
            # Doing it after the loop kept only the last column's encoding
            # and failed when `self.columns` was empty.
            output = pd.concat([output, df], axis=1)

        # Now we drop the old columns
        output = output.drop(self.columns, axis=1)

        if y is None:
            return output

        if self.keep_nan:
            null_mask = y.isnull()
            codes = LabelEncoder().fit_transform(y.astype('str'))
            # Keep y's own index so that `where` lines up with the null mask.
            target = pd.Series(codes.astype('int'), index=y.index, name=y.name)
            return output, target.where(~null_mask, y)

        return output, LabelEncoder().fit_transform(y)
예제 #12
0
    def __init__(self, csv_file='telescope.dat', path='data/'):
        """
        Load a comma-separated data file and preprocess it for use as a
        torch Dataset.

        The file is read without a header and the eleven column names are
        assigned explicitly. All columns but the last are standardised with
        ``StandardScaler`` and stored in ``self.x`` as float32. The
        ``Class`` column is label-encoded and stored in ``self.y`` as int64.

        Args:
            csv_file: name of the data file.
            path: directory that contains ``csv_file``.
        """
        import os

        # os.path.join also works when `path` has no trailing separator.
        self.dataset = pd.read_table(os.path.join(path, csv_file),
                                     header=None,
                                     delimiter=',')
        self.dataset.columns = [
            'FLength', 'FWidth', 'FSize', 'FConc', 'FConc1', 'FAsym',
            'FM3Long', 'FM3Trans', 'FAlpha', 'FDist', 'Class'
        ]

        scaler = StandardScaler()
        data = scaler.fit_transform(self.dataset.iloc[:, :-1])
        target = LabelEncoder().fit_transform(self.dataset.Class)
        self.x = data.astype(np.float32)
        # `np.long` was removed in NumPy 1.24 and is a platform-dependent
        # C long in NumPy 2.x; int64 is explicit and portable.
        self.y = target.astype(np.int64)
예제 #13
0
파일: 3lp.py 프로젝트: KorfLab/genDL
class CSVDataset(Dataset):
    """Dataset backed by a header-less CSV whose last column is the target."""

    def __init__(self, path):
        """Load ``path`` into float32 inputs ``X`` and encoded targets ``y``."""
        contents = read_csv(path, header=None).values
        inputs, outputs = contents[:, :-1], contents[:, -1]

        # inputs are stored as float32
        self.X = inputs.astype('float32')

        # targets: label-encode, cast to float32, reshape to a single column
        labelled = LabelEncoder().fit_transform(outputs)
        as_float = labelled.astype('float32')
        self.y = as_float.reshape((len(as_float), 1))

    def __len__(self):
        """Return the number of rows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[inputs, target]`` for the row at ``idx``."""
        x_row = self.X[idx]
        y_row = self.y[idx]
        return [x_row, y_row]

    def get_splits(self, n_test):
        """Randomly split into train/test subsets; ``n_test`` is the test fraction."""
        n_samples = len(self.X)
        n_test_samples = round(n_test * n_samples)
        lengths = (n_samples - n_test_samples, n_test_samples)
        return random_split(self, lengths)
예제 #14
0
def process_openml_task(task: OpenMLTask) -> Tuple[pandas.DataFrame, ndarray]:
    """
    Process an openml task in a generic way.

    * Labels with ``category``/``object`` dtype are label-encoded; boolean
      labels are cast to int. A Series label is converted to a numpy array.
    * Missing feature values are filled with the column mode.
    * Categorical features are one-hot encoded and appended after the
      remaining columns.

    Args:
        task: the OpenML task whose dataset is loaded.

    Returns:
        ``(X, y)``: ``X`` is the processed feature DataFrame and ``y`` the
        processed labels (left unchanged if the dataset returned ``None``).
    """
    X, y, categorical_mask, _ = task.get_dataset().get_data(task.target_name)

    # Process labels
    if y is not None:
        if y.dtype == 'category' or y.dtype == object:
            y = LabelEncoder().fit_transform(y.values)
        elif y.dtype == bool:
            y = y.astype('int')

    if isinstance(y, pandas.Series):
        y = y.to_numpy()

    # Process NA's.
    # Assign the result back instead of `X[col].fillna(..., inplace=True)`:
    # that form acts on a temporary and has no effect under pandas
    # Copy-on-Write.
    for col in X.columns:
        modes = X[col].mode()
        # An all-NaN column has no mode; leave it untouched rather than crash.
        if not modes.empty:
            X[col] = X[col].fillna(modes.iloc[0])

    # Process Categorical features
    categorical_cols = list(X.columns[categorical_mask])
    encoding_frames = [
        pandas.get_dummies(X[col_name], prefix=col_name, prefix_sep='_')
        for col_name in categorical_cols
    ]
    X = X.drop(columns=categorical_cols)

    X = pandas.concat([X, *encoding_frames], axis=1)

    return X, y
예제 #15
0
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# fetch the dataset (the CSV has no header row)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values
# inputs: every column but the last, as floats
X = data[:, :-1].astype('float')
# output: the last column, label-encoded through its string form
y = LabelEncoder().fit_transform(data[:, -1].astype('str'))
# model under evaluation
model = LogisticRegression(solver='liblinear')
# evaluation procedure: 10-fold stratified CV repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# accuracy of the model on every fold
m_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report mean and standard deviation of the fold accuracies
accuracy_summary = (mean(m_scores), std(m_scores))
print('Accuracy: %.3f (%.3f)' % accuracy_summary)
예제 #16
0
# columns of X kept as training features
feature_names = [
    'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
]
X_train = X[feature_names].copy()

# label-encode every feature column (the encoder is refit per column)
le = preprocessing.LabelEncoder()
X_2 = X_train.apply(le.fit_transform)

# one-hot encode the integer codes and materialise a dense array
enc = preprocessing.OneHotEncoder()
onehotlabels = enc.fit(X_2).transform(X_2).toarray()
print(onehotlabels.shape)

# labels: encoded, cast to float32, reshaped to a single column
encoded_label = LabelEncoder().fit_transform(label)
label_2 = encoded_label.astype('float32').reshape((len(encoded_label), 1))
print(label_2.shape)


class MLP(nn.Module):
    """Skeleton of a network made of linear layers.

    Only the constructor is visible here; no ``forward`` method is defined
    in this snippet.
    """

    def __init__(self):
        """Create the layers (see the review notes below; sizes are missing)."""
        super().__init__()
        # NOTE(review): if `nn` is torch.nn, `nn.Linear` requires
        # `in_features` and `out_features`, so these calls raise TypeError
        # as written -- the layer sizes still need to be supplied.
        self.fc1 = nn.Linear()
        self.fc2 = nn.Linear()
        # NOTE(review): this rebinds `fc2` and discards the layer created on
        # the previous line; presumably `fc3` was intended -- confirm.
        self.fc2 = nn.Linear()
예제 #17
0
    def predict(self, X):
        """Return the element-wise sign of ``self.project(X)``."""
        projection = self.project(X)
        return np.sign(projection)


if __name__ == "__main__":
    import pandas as pd
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    # Only the first 100 rows of the CSV are used.
    df = pd.read_csv('Iris.csv')
    head = df.iloc[:100]

    # Features: columns 2 and 3; label: the last column, label-encoded.
    X = head.iloc[:, 2:4].to_numpy()
    y = LabelEncoder().fit_transform(head.iloc[:, -1])

    # Map the encoded label 0 to -1 so the targets are signed floats.
    yi = np.where(y == 0, -1.0, y.astype(float))
    yi = np.ones(len(X)) * yi

    X_train, X_test, y_train, y_test = train_test_split(X, yi, random_state=0)

    # Fit the SVM on the train part and predict the held-out part.
    svc = SVM()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)

    print('Confusion matrix')
    print(confusion_matrix(y_test, y_pred))
예제 #18
0
        f'14-ada-': ada1,
        f'15-gpc': gpc1,
        f'16-GBclass': GBclass1,
        f'17-histgclas': histgclass,
        f'18-bagclas': bagclass,
        f'19-ridge': ridge1,
        f'20-SVC2': SVC2,
        f'21-linear SVC': linear1,
    }

    for model_name, model in classifier_mapping.items():

        train_test_model(model_name, model, x, y, train_size_pct)


from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/breast-cancer-wisconsin.data.txt')
# '?' presumably marks missing values in this file; it is replaced by a
# numeric sentinel so the features can be cast to float32 below.
df.replace('?', -99999, inplace=True)
# Name the axis through `columns=`: passing it positionally
# (`df.drop([...], 1)`) is no longer accepted since pandas 2.0.
df.drop(columns=['id'], inplace=True)
# Features: every remaining column except the target.
X = np.array(df.drop(columns=['class']))
X = X.astype('float32')
# Target: the 'class' column, label-encoded through its string form.
y = np.array(df['class'])
y = LabelEncoder().fit_transform(y.astype(str))
# Look at the dataset again
print(X.shape, y.shape)
print(df.head())

print(f'[*] Beginning evaluations: All Features')
evaluateIndividualClassifiers(X, y, TRAIN_PCT)