def main(config):
    with open('ECG_TRAIN.arff') as f:
        train = a2p.load(f)
    with open('ECG_TEST.arff') as f:
        test = a2p.load(f)

    # Merge the datasets
    df = train.append(test)
    # Shuffle the data frame by sampling with frac=1
    df = df.sample(frac=1.0)

    CLASS_NORMAL = 1
    class_names = ['Normal', 'R on T', 'PVC', 'SP', 'UB']

    normal_df = df[df.target == str(CLASS_NORMAL)].drop(labels='target', axis=1)
    # We'll merge all other classes and mark them as anomalies:
    anomaly_df = df[df.target != str(CLASS_NORMAL)].drop(labels='target', axis=1)

    # We'll split the normal examples into train, validation and test sets:
    train_df, val_df = train_test_split(
        normal_df,
        test_size=0.15,
        random_state=101
    )
    val_df, test_df = train_test_split(
        val_df,
        test_size=0.33,
        random_state=101
    )

    test_normal_dataset, seq_len, _ = create_dataset(test_df)
    test_anomaly_dataset, _, _ = create_dataset(anomaly_df)
def create_dataframes(self):
    # Load the test and train ARFF files into DataFrames
    with open(self.test_db_file) as f:
        self.test_df = a2p.load(f)
        # print(self.test_df)
    with open(self.train_db_file) as f:
        self.train_df = a2p.load(f)
def main():
    with open('ECG_TRAIN.arff') as f:
        train = a2p.load(f)
    with open('ECG_TEST.arff') as f:
        test = a2p.load(f)

    # Merge the datasets
    df = train.append(test)
    # Shuffle the data frame by sampling with frac=1
    df = df.sample(frac=1.0)

    CLASS_NORMAL = 1
    class_names = ['Normal', 'R on T', 'PVC', 'SP', 'UB']

    normal_df = df[df.target == str(CLASS_NORMAL)].drop(labels='target', axis=1)
    # We'll merge all other classes and mark them as anomalies:
    anomaly_df = df[df.target != str(CLASS_NORMAL)].drop(labels='target', axis=1)

    # We'll split the normal examples into train, validation and test sets:
    train_df, val_df = train_test_split(normal_df, test_size=0.15, random_state=101)
    val_df, test_df = train_test_split(val_df, test_size=0.33, random_state=101)

    train_dataset, seq_len, n_features = create_dataset(train_df)
    val_dataset, _, _ = create_dataset(val_df)
    test_normal_dataset, _, _ = create_dataset(test_df)
    test_anomaly_dataset, _, _ = create_dataset(anomaly_df)

    # Build the model before moving it to the GPU
    model = RecurrentAutoencoder(seq_len, n_features, 128)
    if torch.cuda.device_count() >= 1:
        print('Model pushed to {} GPU(s), type {}.'.format(
            torch.cuda.device_count(), torch.cuda.get_device_name(0)))
        model = model.cuda()
    else:
        raise ValueError('CPU training is not supported')
    model = model.to(device)

    model, history = train_model(model, train_dataset, val_dataset, n_epochs=150)
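# Note: the two pipelines above call a create_dataset helper that is not shown here.
# A minimal sketch of what it likely does, assuming one heartbeat sequence per DataFrame
# row and a single feature per time step (only the call signature is taken from the code
# above; the body is an assumption):
import numpy as np
import torch

def create_dataset(df):
    sequences = df.astype(np.float32).to_numpy().tolist()
    # each sequence becomes a (seq_len, 1) tensor
    dataset = [torch.tensor(s).unsqueeze(1).float() for s in sequences]
    n_seq, seq_len, n_features = torch.stack(dataset).shape
    return dataset, seq_len, n_features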
def parse_arff_file(filename, normalize):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    classes = []
    with open(filename) as file:
        df = a2p.load(file)
    try:
        # If the last column is already numeric, keep it as-is
        df.iloc[:, -1] = df.iloc[:, -1].apply(int)
        classes = None
    except (ValueError, TypeError):
        # Otherwise label-encode the class column
        le = preprocessing.LabelEncoder()
        le.fit(df.iloc[:, -1])
        classes = list(le.classes_)
        df.iloc[:, -1] = le.transform(df.iloc[:, -1])
    df = df.select_dtypes(include=numerics)
    original_dataframe = copy.deepcopy(df)
    if normalize:
        headers = df.columns
        scaler = preprocessing.Normalizer()
        scaled_df = scaler.fit_transform(df)
        df = pd.DataFrame(scaled_df)
        df.columns = headers
    return df, classes, original_dataframe
def _read_arff_dataset(infile_path):
    """Loads an ARFF file into a pandas dataframe and drops the meta-info on column type."""
    with open(infile_path) as fh:
        df = a2p.load(fh)
    # Default column names follow ARFF, e.g. petalwidth@REAL, class@{a,b,c}
    df.columns = [col.split('@')[0] for col in df.columns]
    return df
def dataset_header_features_only(path):
    with open('../dataset/' + path) as f:
        df = a2p.load(f)
    # Keep only the header-based features (first 20 columns) plus the class column
    df1 = pd.concat([df.iloc[:, 0:20], df.iloc[:, len(df.columns) - 1]], axis=1)
    return df1
def read_data_arff(filename):
    """
    Read ARFF data and prepare X and y.
    :param filename: path to the ARFF file
    :return: data, X, y
    """
    with open(filename) as f:
        df = a2p.load(f)
    X = pd.DataFrame(df.iloc[:, :-1])
    y = pd.DataFrame(df.iloc[:, -1])

    # Label-encode object-typed feature columns and the target
    le = LabelEncoder()
    for col in X.columns.values:
        if X[col].dtypes == 'object':
            le.fit(X[col])
            X[col] = le.transform(X[col])
    y = pd.DataFrame(le.fit_transform(y.values.ravel()))
    y = y.astype('int32')

    # Scale features to [0, 1]
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X))

    data = pd.concat([X, y], axis=1)
    data = data.values
    return data, X, y
def get_data(file_path):
    with open(file_path) as f:
        df = a2p.load(f)
    # Fill missing values by interpolation
    df = df.interpolate()
    input_features = df.drop(["defects@{false,true}"], axis=1)
    output_class = np.where(df["defects@{false,true}"] == 'true', 1, 0)
    return np.array(input_features), np.array(output_class)
def __init__(self, mode):
    assert mode in ['normal', 'anomaly']
    trainset_file = '/opt/data_and_extra/ECG5000/ECG5000_TRAIN.arff'
    testset_file = '/opt/data_and_extra/ECG5000/ECG5000_TEST.arff'
    with open(trainset_file) as f:
        train = a2p.load(f)
    with open(testset_file) as f:
        test = a2p.load(f)
    df = train.append(test)

    # Split into normal and anomaly data, then drop the label
    CLASS_NORMAL = 1
    new_columns = list(df.columns)
    new_columns[-1] = 'target'
    df.columns = new_columns
    if mode == 'normal':
        df = df[df.target == str(CLASS_NORMAL)].drop(labels='target', axis=1)
    else:
        df = df[df.target != str(CLASS_NORMAL)].drop(labels='target', axis=1)
    print(df.shape)

    # train_df, val_df = train_test_split(
    #     normal_df,
    #     test_size=0.15,
    #     random_state=random_seed
    # )
    #
    # val_df, test_df = train_test_split(
    #     val_df,
    #     test_size=0.33,
    #     random_state=random_seed
    # )

    self.X = df.astype(np.float32).to_numpy()
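# The __init__ above only builds self.X; a torch Dataset also needs __len__ and __getitem__.
# A minimal sketch of how the rest of such a class could look (the class name and method
# bodies below are assumptions, not the original code):
import torch
from torch.utils.data import Dataset

class ECG5000Dataset(Dataset):
    def __init__(self, X):
        # X stands in for the float32 array built in the __init__ shown above
        self.X = X

    def __len__(self):
        # one sample per heartbeat sequence
        return self.X.shape[0]

    def __getitem__(self, idx):
        # a single sequence as a float tensor
        return torch.from_numpy(self.X[idx])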
def convert_arff_to_dataframe(data, fold):
    """
    Converts ARFF files to pandas DataFrames.
    :param data: configuration dictionary
    :param fold: current fold
    :return: train and test DataFrames
    """
    path_to_instances = data["path_to_instances"]
    dataset_folder = data["dataset_folder"]
    feature_folder = data["feature_folder"]
    train_file = "train.arff"
    test_file = "test.arff"
    path_to_test_file = join(path_to_instances, dataset_folder, feature_folder, fold, test_file)
    path_to_train_file = join(path_to_instances, dataset_folder, feature_folder, fold, train_file)

    with open(path_to_test_file) as f:
        test_df = a2p.load(f)
        test_df = test_df.sample(frac=1)
    with open(path_to_train_file) as f:
        train_df = a2p.load(f)
        train_df = train_df.sample(frac=1)
    return train_df, test_df
def __init__(self, train_db_file, test_db_file):
    self.train_df = None
    self.test_df = None
    self.train_df_x = None
    self.train_df_y = None
    self.test_df_x = None
    self.test_df_y = None
    self.train_df_xs = None
    self.train_df_ys = None
    self.test_df_xs = None
    self.test_df_ys = None
    self.xscale_object = None
    self.yscale_object = None

    with open(test_db_file) as f:
        self.test_df = a2p.load(f)
        self.test_df_x = self.test_df.drop(labels='target@{-1,1}', axis=1)
        self.test_df_y = self.test_df[self.test_df.columns[-1]]
        # print(self.test_df)
    with open(train_db_file) as f:
        self.train_df = a2p.load(f)
        self.train_df_x = self.train_df.drop(labels='target@{-1,1}', axis=1)
        self.train_df_y = self.train_df[self.train_df.columns[-1]]
# Importing the libraries
import pandas as pd
from arff2pandas import a2p
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt

# Open the output ARFF file containing the features extracted from the actors' audio files
with open('actors.arff') as f:
    df_actors_features = a2p.load(f)
print(df_actors_features)
df_actors_features.set_index('name@STRING', inplace=True, drop=True)
df_actors_features = df_actors_features.iloc[:, :-1]

# Open the annotation CSV file
df_actors_emotions = pd.read_csv('annotation.csv', sep=';')
df_actors_emotions.set_index('filenames', inplace=True, drop=True)

# Join the emotions (dependent variable) with the features
df_actors_all = df_actors_emotions.join(df_actors_features, how='inner')

# Open the output ARFF file containing the features extracted from the interview audio files
with open('interviews.arff') as f:
    df_interview_features = a2p.load(f)
print(df_interview_features)
import matplotlib.pyplot as plt

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#8F00FF"]
# sns.set_palette(sns.color_palette(PALETTE))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using {device}")

RANDOM_SEED = 86
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Load the data
with open("data/ECG5000_TRAIN.arff") as f:
    train = a2p.load(f)
with open("data/ECG5000_TEST.arff") as f:
    test = a2p.load(f)
# print(train.head(), test.head())

# Combine the splits since we're not doing classification and just want maximal data
df = train.append(test)
df = df.sample(frac=1.0)  # shuffle the rows so the merged splits are mixed

# 5 classes:
# 1 Normal (N) (everything else is anomalous) <-- this is the training set
# 2 R-on-T Premature Ventricular Contraction (R-on-T PVC)
# 3 Premature Ventricular Contraction (PVC)
# 4 Supra-ventricular Premature Ectopic Beat (SP or EB)
# 5 Unclassified Beat (UB)
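# Assumed follow-up (not part of the original snippet): rename the last ARFF column to
# 'target' and check how the five classes listed above are distributed, reusing df, sns
# and plt defined above.
new_columns = list(df.columns)
new_columns[-1] = 'target'
df.columns = new_columns

print(df.target.value_counts())   # counts per class label ('1'..'5')
sns.countplot(x=df.target)        # quick visual check of the class imbalance
plt.show()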
from arff2pandas import a2p

if __name__ == '__main__':
    # Round-trip: load an ARFF file into a DataFrame, then dump it back out
    with open('sample.arff', 'r') as f:
        df = a2p.load(f)
    print(df)
    with open('sample.arff', 'w') as f:
        a2p.dump(df, f)
def arff_to_pandas(filename):
    with open(filename) as f:
        df = a2p.load(f)
    # Strip the ARFF type suffix (e.g. '@NUMERIC', '@{N,Y}') from the column names
    df.columns = [re.sub(r'@(NUMERIC|{N,Y})', '', col) for col in df.columns]
    return df
""" from arff2pandas import a2p from keras.models import Sequential from keras.layers import Dense, Dropout from sklearn.preprocessing import LabelEncoder from keras.utils import np_utils import matplotlib.pyplot as plt import numpy as np from sklearn.metrics import confusion_matrix np.set_printoptions(linewidth=260) f = "drebin_reduced.arff" df = a2p.load(open(f, 'r')) print(list(df)) print(df.shape) df['app_label'] = df.iloc[:, 1121] print(df['app_label'].head(10)) df.drop(df.columns[[1121]], axis=1, inplace=True) df.drop(['@NUMERIC', 'APP_NAME@STRING'], axis=1, inplace=True) y = df['app_label'].as_matrix() df.drop(['app_label'], axis=1, inplace=True) X = df.as_matrix()
def load_from_arff2(self, filename):
    with open(str(filename), 'r') as f:
        self.df = a2p.load(f)
# Importing the libraries
import pandas as pd
from arff2pandas import a2p
import numpy as np
import matplotlib.pyplot as plt

# Open the output ARFF file containing the features extracted from the audio files
with open('actors.arff') as f:
    df_features = a2p.load(f)
print(df_features)
df_features.set_index('name@STRING', inplace=True, drop=True)

# Open the annotation CSV file
df_emotions = pd.read_csv('annotation.csv', sep=';')
df_emotions.set_index('filenames', inplace=True, drop=True)

# Join the emotions (dependent variable) with the features
df_all = df_emotions.join(df_features, how='inner')

# Set independent variables (X) and dependent variable (y)
X = df_all.iloc[:, 1:-1].values
y = df_emotions.iloc[:, 0].values

# Encoding categorical data
# Encoding the dependent variable
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
                emotion = emotion_count.columns[j]
            elif emotion_count.iloc[i, j] == max_val:
                emotion = emotion + ' or ' + emotion_count.columns[j]
        strongest_emotion.append(emotion)
        strongness.append(max_val / total)
        folders.append(emotion_count.index[i])
    del total, i, j, max_val, emotion
    return folders, strongest_emotion, strongness


# Open the actors training ARFF file containing the features from the audio files
with open('actors.arff') as f:
    df_features = a2p.load(f)
df_features.set_index('name@STRING', inplace=True, drop=True)

# Open the annotation CSV file
df_emotions = pd.read_csv('annotation.csv', sep=';')
df_emotions.set_index('filenames', inplace=True, drop=True)

# Join the emotions (dependent variable) with the features
df_all = df_emotions.join(df_features, how='inner')

# Set independent variables (X) and dependent variable (y)
X = df_all.iloc[:, 1:-1].values
y = df_emotions.iloc[:, 0].values
def get_fair_data1(dataset_key=None):
    map_dataset = {}
    map_dataset['31'] = 'foreign_worker@{yes,no}'
    map_dataset['802'] = 'sex@{female,male}'
    map_dataset['1590'] = 'sex@{Female,Male}'
    map_dataset['1461'] = 'AGE@{True,False}'
    map_dataset['42193'] = 'race_Caucasian@{0,1}'
    map_dataset['1480'] = 'V2@{Female,Male}'
    # map_dataset['804'] = 'Gender@{0,1}'
    map_dataset['42178'] = 'gender@STRING'
    map_dataset['981'] = 'Gender@{Female,Male}'
    map_dataset['40536'] = 'samerace@{0,1}'
    map_dataset['40945'] = 'sex@{female,male}'
    map_dataset['451'] = 'Sex@{female,male}'
    # map_dataset['945'] = 'sex@{female,male}'
    map_dataset['446'] = 'sex@{Female,Male}'
    map_dataset['1017'] = 'sex@{0,1}'
    map_dataset['957'] = 'Sex@{0,1,4}'
    map_dataset['41430'] = 'SEX@{True,False}'
    map_dataset['1240'] = 'sex@{Female,Male}'
    map_dataset['1018'] = 'sex@{Female,Male}'
    # map_dataset['55'] = 'SEX@{male,female}'
    map_dataset['38'] = 'sex@{F,M}'
    map_dataset['1003'] = 'sex@{male,female}'
    map_dataset['934'] = 'race@{black,white}'

    number_instances = []
    number_attributes = []
    number_features = []

    def get_class_attribute_name(df):
        for i in range(len(df.columns)):
            if str(df.columns[i]).startswith('class@'):
                return str(df.columns[i])

    def get_sensitive_attribute_id(df, sensitive_attribute_name):
        for i in range(len(df.columns)):
            if str(df.columns[i]) == sensitive_attribute_name:
                return i

    key = dataset_key
    if type(dataset_key) == type(None):
        key = list(map_dataset.keys())[random.randint(0, len(map_dataset) - 1)]
    value = map_dataset[key]

    with open(Config.get('data_path') + "/downloaded_arff/" + str(key) + ".arff") as f:
        df = a2p.load(f)
    print("dataset: " + str(key))

    number_instances.append(df.shape[0])
    number_attributes.append(df.shape[1])

    y = copy.deepcopy(df[get_class_attribute_name(df)])
    X = df.drop(columns=[get_class_attribute_name(df)])

    categorical_features = []
    continuous_columns = []
    for type_i in range(len(X.columns)):
        if X.dtypes[type_i] == object:
            categorical_features.append(type_i)
        else:
            continuous_columns.append(type_i)

    sensitive_attribute_id = get_sensitive_attribute_id(X, value)
    print(sensitive_attribute_id)

    X_datat = X.values
    for x_i in range(X_datat.shape[0]):
        for y_i in range(X_datat.shape[1]):
            if type(X_datat[x_i][y_i]) == type(None):
                if X.dtypes[y_i] == object:
                    X_datat[x_i][y_i] = 'missing'
                else:
                    X_datat[x_i][y_i] = np.nan

    X_train, X_test, y_train, y_test = train_test_split(X_datat, y.values.astype('str'),
                                                        test_size=0.5,
                                                        random_state=42,
                                                        stratify=y.values.astype('str'))
    '''
    X_train, X_test, y_train, y_test = train_test_split(X_datat[0:200,:], y.values[0:200].astype('str'),
                                                        test_size=0.5,
                                                        random_state=42,
                                                        stratify=y.values[0:200].astype('str'))
    '''

    cat_sensitive_attribute_id = -1
    for c_i in range(len(categorical_features)):
        if categorical_features[c_i] == sensitive_attribute_id:
            cat_sensitive_attribute_id = c_i
            break

    my_transformers = []
    if len(categorical_features) > 0:
        ct = ColumnTransformer(
            [("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)])
        my_transformers.append(("o", ct))
    if len(continuous_columns) > 0:
        scale = ColumnTransformer([("scale", Pipeline(
            [('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
             ('scale', MinMaxScaler())]), continuous_columns)])
        my_transformers.append(("s", scale))

    pipeline = FeatureUnion(my_transformers)
    pipeline.fit(X_train)
    X_train = pipeline.transform(X_train)
    X_test = pipeline.transform(X_test)

    number_features.append(X_train.shape[1])

    all_columns = []
    for ci in range(len(X.columns)):
        all_columns.append(str(X.columns[ci]).split('@')[0])
    X.columns = all_columns

    names = ct.get_feature_names()
    for c in continuous_columns:
        names.append(str(X.columns[c]))

    for n_i in range(len(names)):
        if names[n_i].startswith('onehot__x'):
            tokens = names[n_i].split('_')
            category = ''
            for ti in range(3, len(tokens)):
                category += '_' + tokens[ti]
            cat_id = int(names[n_i].split('_')[2].split('x')[1])
            names[n_i] = str(X.columns[categorical_features[cat_id]]) + category
    print(names)

    sensitive_ids = []
    all_names = ct.get_feature_names()
    for fname_i in range(len(all_names)):
        if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
            sensitive_ids.append(fname_i)

    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test, names, sensitive_ids, key, sensitive_attribute_id
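# Hedged usage example for get_fair_data1, assuming Config.get('data_path') points at a
# directory that contains downloaded_arff/31.arff (key '31' appears in map_dataset above):
X_train, X_test, y_train, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1('31')
print(key, sensitive_attribute_id, len(sensitive_ids))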
def load_data(filename):
    with open(filename) as f:
        train = a2p.load(f)
    return train
def get_data(self, dataset='Adult', random_number=42):
    if isinstance(dataset, str):
        dataset_key = self.map_name2id[dataset]
    else:
        dataset_key = str(dataset)

    number_instances = []
    number_attributes = []
    number_features = []

    def get_class_attribute_name(df):
        for i in range(len(df.columns)):
            if str(df.columns[i]).startswith('class@'):
                return str(df.columns[i])

    def get_sensitive_attribute_id(df, sensitive_attribute_name):
        for i in range(len(df.columns)):
            if str(df.columns[i]) == sensitive_attribute_name:
                return i

    key = dataset_key
    if type(dataset_key) == type(None):
        key = list(self.map_dataset.keys())[random.randint(0, len(self.map_dataset) - 1)]

    data_path = './google_drive_data'
    if not os.path.isdir(data_path):
        print("Downloading Datasets ...")
        download_file_from_google_drive("19Qj3T9Yt_hQ4bM0Ac9D2MS7x507sTJRU", 'DFS_datasets.zip')
        with zipfile.ZipFile('DFS_datasets.zip') as zf:
            zf.extractall('google_drive_data')
        os.remove('DFS_datasets.zip')

        print("Downloading Query Optimizer Models ...")
        download_file_from_google_drive("1lxbcs9vS6U8t-5II2qpx0OIv08EON7NL", 'DFS_models.zip')
        with zipfile.ZipFile('DFS_models.zip') as zf:
            zf.extractall('google_drive_models')
        os.remove('DFS_models.zip')

    value = self.map_dataset[key]

    with open(data_path + "/dfs_datasets/" + str(key) + ".arff") as f:
        df = a2p.load(f)

    number_instances.append(df.shape[0])
    number_attributes.append(df.shape[1])

    y = copy.deepcopy(df[get_class_attribute_name(df)])
    X = df.drop(columns=[get_class_attribute_name(df)])

    categorical_features = []
    continuous_columns = []
    for type_i in range(len(X.columns)):
        if X.dtypes[type_i] == object:
            categorical_features.append(type_i)
        else:
            continuous_columns.append(type_i)

    sensitive_attribute_id = get_sensitive_attribute_id(X, value)
    # print(sensitive_attribute_id)

    X_datat = X.values
    for x_i in range(X_datat.shape[0]):
        for y_i in range(X_datat.shape[1]):
            if type(X_datat[x_i][y_i]) == type(None):
                if X.dtypes[y_i] == object:
                    X_datat[x_i][y_i] = 'missing'
                else:
                    X_datat[x_i][y_i] = np.nan

    X_temp, X_test, y_temp, y_test = train_test_split(X_datat, y.values.astype('str'),
                                                      test_size=0.2,
                                                      random_state=random_number,
                                                      stratify=y.values.astype('str'))
    X_train, X_validation, y_train, y_validation = train_test_split(X_temp, y_temp,
                                                                    test_size=0.25,
                                                                    random_state=random_number,
                                                                    stratify=y_temp)

    cat_sensitive_attribute_id = -1
    for c_i in range(len(categorical_features)):
        if categorical_features[c_i] == sensitive_attribute_id:
            cat_sensitive_attribute_id = c_i
            break

    my_transformers = []
    if len(categorical_features) > 0:
        ct = ColumnTransformer(
            [("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)])
        my_transformers.append(("o", ct))
    if len(continuous_columns) > 0:
        scale = ColumnTransformer([("scale", Pipeline(
            [('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
             ('scale', MinMaxScaler())]), continuous_columns)])
        my_transformers.append(("s", scale))

    pipeline = FeatureUnion(my_transformers)
    pipeline.fit(X_train)
    X_train = pipeline.transform(X_train)
    X_validation = pipeline.transform(X_validation)
    X_test = pipeline.transform(X_test)

    number_features.append(X_train.shape[1])

    all_columns = []
    for ci in range(len(X.columns)):
        all_columns.append(str(X.columns[ci]).split('@')[0])
    X.columns = all_columns

    names = ct.get_feature_names()
    for c in continuous_columns:
        names.append(str(X.columns[c]))

    for n_i in range(len(names)):
        if names[n_i].startswith('onehot__x'):
            tokens = names[n_i].split('_')
            category = ''
            for ti in range(3, len(tokens)):
                category += '_' + tokens[ti]
            cat_id = int(names[n_i].split('_')[2].split('x')[1])
            names[n_i] = str(X.columns[categorical_features[cat_id]]) + category

    sensitive_ids = []
    all_names = ct.get_feature_names()
    for fname_i in range(len(all_names)):
        if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
            sensitive_ids.append(fname_i)

    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.fit_transform(y_train)
    y_validation = le.transform(y_validation)
    y_test = le.transform(y_test)

    return X_train, X_validation, X_test, y_train, y_validation, y_test, names, sensitive_ids
def _load_fields_from_file(self, filename, target_col_name, target_col_value, missing_values_strategy_str):
    """Initializes all the fields according to the info read from the given file."""
    self._attributes_dictionary = {}
    self._col_names = []

    with open(filename) as f:
        df = a2p.load(f)

    # Extract the column names and the possible values for each column
    for col in df.columns:
        pos_at = col.find('@')
        col_name = col[:pos_at]
        values = col[pos_at + 1:]
        values = values.replace("{", "")
        values = values.replace("}", "")
        values = values.split(",")
        df[col_name] = df[col]
        df = df.drop(col, axis=1)
        self._attributes_dictionary[col_name] = values
        self._col_names.append(col_name)

    if target_col_name is None:
        # Assume the target column is the last one
        target_index = len(self._col_names) - 1
        self._target_col_name = self._col_names[target_index]
    else:
        self._target_col_name = target_col_name
        found_target_col = False
        for i, col_name in enumerate(self._col_names):
            if col_name == target_col_name:
                target_index = i
                found_target_col = True
                break
        if not found_target_col:
            raise NotFoundTargetCol

    # Create the pos and neg dataframes
    self._create_dataframes(df, target_index, target_col_value, missing_values_strategy_str)

    if self._verbose:
        print()
        print("Positive examples of the train set :")
        print(self._df_train_pos)
        print()
        print("Negative examples of the train set :")
        print(self._df_train_neg)
        if self.prediction_mode:
            print()
            print("Positive examples of the test set :")
            print(self._df_test_pos)
            print()
            print("Negative examples of the test set :")
            print(self._df_test_neg)
        print()
        print("Attributes dictionary :")
        print(self._attributes_dictionary)
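# For reference: arff2pandas encodes each ARFF attribute declaration in the column name,
# which is what the parsing loop above relies on. A tiny illustration with a hypothetical
# nominal column (the name and values below are made up for the example):
col = 'outlook@{sunny,overcast,rainy}'
pos_at = col.find('@')
col_name = col[:pos_at]                                                  # 'outlook'
values = col[pos_at + 1:].replace("{", "").replace("}", "").split(",")  # ['sunny', 'overcast', 'rainy']
# these would be stored as self._attributes_dictionary['outlook'] = ['sunny', 'overcast', 'rainy']
print(col_name, values)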