예제 #1
0
def load_dataset():
    '''Return wine features and labels read from the `default`.`wine` table.

    A Spark session is opened (or reused via ``getOrCreate``), the whole
    table is pulled into a pandas DataFrame, and the session is stopped
    again. The frame is then passed through ``drop_missing``, re-indexed
    with an index named ``id``, passed to ``clean``, given a ``ranName``
    column of generated names, and split on ``labelcol``.

    Returns:
        tuple: ``(features, labels)`` as produced by ``utils.splitdf``, with
        ``features`` further reduced by ``utils.drop_non_features`` and
        converted by ``utils.categorize``.
    '''
    # Imported lazily so the module can be loaded without pyspark installed.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .appName("Import Wine Table")\
        .config("spark.yarn.access.hadoopFileSystems", s3_bucket)\
        .config("spark.hadoop.fs.s3a.s3guard.ddb.region", s3_bucket_region)\
        .getOrCreate()

    try:
        df = spark.sql("SELECT * FROM `default`.`wine`").toPandas()
    finally:
        # Release the session even when the query or the conversion fails.
        spark.stop()

    df = drop_missing(df).reset_index()
    df.index.name = 'id'
    # Return value is discarded; presumably clean() mutates df in place.
    clean(df)
    # Add a (random) wine label name in order to have an identifier
    df['ranName'] = df.index.to_series().map(lambda _: namegenerator.gen())
    features, labels = utils.splitdf(df, labelcol)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)
    # Labels are returned as produced by splitdf (no pd.Categorical wrap).
    return features, labels
예제 #2
0
def load_dataset():
    '''Return IBM customer features and boolean labels.

    The spreadsheet at ``ibmxlsxpath`` is read, passed through
    ``drop_missing`` and re-indexed with an index named ``id``. The frame is
    split on ``labelcol``; the label values are compared against the string
    ``'Yes'`` to produce the returned labels.
    '''
    customers = drop_missing(pd.read_excel(ibmxlsxpath)).reset_index()
    customers.index.name = 'id'

    raw_features, raw_labels = utils.splitdf(customers, labelcol)

    # Feature pipeline: senior-citizen flag first, then column selection,
    # then categorisation.
    prepared = booleanize_senior_citizen(raw_features)
    prepared = utils.drop_non_features(prepared, cols)
    prepared = utils.categorize(prepared, cols)

    is_yes = raw_labels == 'Yes'
    return prepared, is_yes
예제 #3
0
def load_dataset():
    '''Return wine features and labels read from the CSV at ``csvpath``.

    The file is parsed as a header-less, semicolon-separated table whose
    column names are the first element of each entry in ``cols`` followed
    by ``labelcol``.
    '''
    header = [spec[0] for spec in cols] + [labelcol]
    wines = pd.read_csv(csvpath, sep=";", header=None, names=header,
                        index_col=None)

    wines = drop_missing(wines).reset_index()
    wines.index.name = 'id'
    # Return value is discarded; presumably clean() mutates the frame in place.
    clean(wines)

    def _random_name(_row_id):
        # The row id is ignored; every row simply gets a generated name.
        return namegenerator.gen()

    # Give each wine a (random) label name so it has an identifier.
    wines['ranName'] = wines.index.to_series().map(_random_name)

    raw_features, labels = utils.splitdf(wines, labelcol)
    selected = utils.drop_non_features(raw_features, cols)
    features = utils.categorize(selected, cols)
    # Labels are returned as produced by splitdf (no pd.Categorical wrap).
    return features, labels
예제 #4
0
def load_dataset():
    '''Return the loans dataset split on ``labelcol``.

    The processed dataset named ``'loans'`` is loaded when available. If
    loading raises ``IOError``, the dataset is rebuilt from the raw data,
    saved under the same name, and then used.

    Returns:
        Whatever ``utils.splitdf(loans, labelcol)`` returns.
    '''
    try:
        loans = utils.load_processed_dataset('loans')
    except IOError:
        print('Not found. Regenerating...')
        loans = read_raw_data().set_index(idcol)

        # Single-argument transforms, applied strictly in this order.
        pipeline = (
            remove_incomplete,
            remove_missing_revol_util,
            add_frac_repaid,
            remove_unfully_paid,
            remove_overpaid,
            add_not_repaid,
            parse_term,
            parse_percent,
        )
        for transform in pipeline:
            loans = transform(loans)

        # Categorise first, then drop the non-feature columns.
        loans = utils.drop_non_features(utils.categorize(loans, cols), cols)
        utils.save_processed_dataset(loans, 'loans')

    return utils.splitdf(loans, labelcol)