def load_dataset():
    """Return wine features and labels loaded from the Hive `wine` table.

    Pulls the raw rows through a transient Spark session, drops incomplete
    rows, cleans the frame, attaches a random human-readable name per wine,
    and splits the result into categorized feature columns and the label.

    Returns:
        tuple: ``(features, labels)`` as produced by ``utils.splitdf``.
    """
    # NOTE(review): the original also built a ``col_Names`` list here, but it
    # was never used in this Spark variant (the Hive table supplies its own
    # schema), so the dead code has been removed. Unused pyspark type imports
    # (Row, StructField, ...) were dropped for the same reason.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .appName("Import Wine Table")\
        .config("spark.yarn.access.hadoopFileSystems", s3_bucket)\
        .config("spark.hadoop.fs.s3a.s3guard.ddb.region", s3_bucket_region)\
        .getOrCreate()
    try:
        # Fix: always release the Spark session, even when the query or the
        # pandas conversion raises — the original leaked the session on error.
        df = spark.sql("SELECT * FROM `default`.`wine`").toPandas()
    finally:
        spark.stop()

    df = drop_missing(df).reset_index()
    df.index.name = 'id'
    clean(df)

    # Add a (random) wine label name in order to have an identifier.
    # The index value itself is irrelevant to the generated name.
    df['ranName'] = df.index.to_series().map(lambda _: namegenerator.gen())

    features, labels = utils.splitdf(df, labelcol)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)
    #labels = pd.Categorical(labels)
    return features, labels
def load_dataset():
    """Return IBM customers and labels.

    Reads the customer spreadsheet, drops incomplete rows, and splits it
    into categorized feature columns and a boolean churn label.

    Returns:
        tuple: ``(features, labels)`` where ``labels`` is True for 'Yes'.
    """
    customers = pd.read_excel(ibmxlsxpath)
    customers = drop_missing(customers).reset_index()
    customers.index.name = 'id'

    features, labels = utils.splitdf(customers, labelcol)
    features = booleanize_senior_citizen(features)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)

    # Binary target: True exactly where the raw label column reads 'Yes'.
    return features, labels == 'Yes'
def load_dataset():
    """Return wine features and labels parsed from the local CSV file.

    Builds the column header from ``cols`` plus the label column, reads the
    semicolon-delimited file, cleans it, attaches a random per-row name, and
    splits the frame into categorized features and labels.

    Returns:
        tuple: ``(features, labels)`` as produced by ``utils.splitdf``.
    """
    header = [spec[0] for spec in cols]
    header.append(labelcol)

    wines = pd.read_csv(csvpath, sep=";", header=None,
                        names=header, index_col=None)
    wines = drop_missing(wines).reset_index()
    wines.index.name = 'id'
    clean(wines)

    # Add a (random) wine label name in order to have an identifier.
    wines['ranName'] = wines.index.to_series().map(lambda x: namegenerator.gen())

    features, labels = utils.splitdf(wines, labelcol)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)
    #labels = pd.Categorical(labels)
    return features, labels
def load_dataset():
    """Return the loans dataset split into (features, labels).

    Loads the cached processed frame when one exists; otherwise rebuilds it
    from the raw data — applying each cleaning/feature step in a fixed
    order — and caches the result for subsequent calls.
    """
    try:
        loans = utils.load_processed_dataset('loans')
    except IOError:
        print('Not found. Regenerating...')
        loans = read_raw_data().set_index(idcol)

        # Cleaning and feature-engineering steps; order matters.
        pipeline = (
            remove_incomplete,
            remove_missing_revol_util,
            add_frac_repaid,
            remove_unfully_paid,
            remove_overpaid,
            add_not_repaid,
            parse_term,
            parse_percent,
        )
        for step in pipeline:
            loans = step(loans)

        loans = utils.categorize(loans, cols)
        loans = utils.drop_non_features(loans, cols)
        utils.save_processed_dataset(loans, 'loans')
    return utils.splitdf(loans, labelcol)