Пример #1
0
def get_iris_data(file_in):
    """
    Load and preprocess the UCI data set on physical characteristics of
    Iris species.

    The file is read as comma-separated text with no header row. Every
    non-blank line becomes one record with four measurements followed by
    the species label ('target').

    Args:
        file_in: Path of the raw Iris data file.

    Returns:
        Whatever ``helpers.normalize`` returns for the one-hot encoded
        frame (presumably a ``pandas.DataFrame`` -- confirm in ``helpers``).

    Raises:
        OSError: If ``file_in`` cannot be opened.
        ValueError: If a measurement column holds a non-numeric value.
    """

    # Use a context manager so the file handle is closed deterministically.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines instead of assuming a fixed number of trailing ones;
    # a hard-coded slice silently drops real rows when the file ends
    # differently than expected.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'
    ]

    # Columns that stay as strings; everything else is cast to float.
    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    # NOTE(review): presumably fills missing entries with the column mode --
    # confirm against helpers.replace_missing_mode.
    output = helpers.replace_missing_mode(output)

    # Values were parsed from text, so numeric columns are still strings.
    for col in output:
        if col not in cat_variables:
            output[col] = output[col].astype(float)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Пример #2
0
def get_glass_data(file_in):
    """
    Load and preprocess the UCI data set on chemical characteristics of
    glass.

    The file is read as comma-separated text with no header row. Every
    non-blank line becomes one record: a row id, nine numeric measurements
    and the glass type ('target'). The 'id' column is removed before
    encoding.

    Args:
        file_in: Path of the raw glass data file.

    Returns:
        Whatever ``helpers.normalize`` returns for the one-hot encoded
        frame (presumably a ``pandas.DataFrame`` -- confirm in ``helpers``).

    Raises:
        OSError: If ``file_in`` cannot be opened.
        ValueError: If a non-target column holds a non-numeric value.
    """

    # Use a context manager so the file handle is closed deterministically.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines instead of assuming exactly one trailing blank line.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'target'
    ]

    # Columns that stay as strings; everything else is cast to float.
    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    # NOTE(review): presumably fills missing entries with the column mode --
    # confirm against helpers.replace_missing_mode.
    output = helpers.replace_missing_mode(output)

    # Values were parsed from text, so numeric columns are still strings.
    for col in output:
        if col not in cat_variables:
            output[col] = output[col].astype(float)

    # The row identifier carries no predictive information.
    output = output.drop(['id'], axis=1)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Пример #3
0
def get_cancer_data(file_in):
    """
    Load and preprocess the UCI data set on breast cancer characteristics.

    The file is read as comma-separated text with no header row. Every
    non-blank line becomes one record: a sample id, nine cell attributes
    and the class label ('target'). The 'id' column is removed before any
    further processing.

    Args:
        file_in: Path of the raw breast cancer data file.

    Returns:
        Whatever ``helpers.normalize`` returns for the one-hot encoded
        frame (presumably a ``pandas.DataFrame`` -- confirm in ``helpers``).

    Raises:
        OSError: If ``file_in`` cannot be opened.
        ValueError: If a non-target column still holds a non-numeric value
            after missing-value replacement.
    """

    # Use a context manager so the file handle is closed deterministically.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines instead of assuming exactly one trailing blank line.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    # NOTE(review): 'bland_chrmatin' looks like a misspelling of
    # 'bland_chromatin'; kept unchanged because callers may reference the
    # column by this exact name.
    x_headers = [
        'id', 'clump_thickness', 'unif_cell_size', 'unif_cell_shape',
        'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei',
        'bland_chrmatin', 'normal_nucleoli', 'mitoses', 'target'
    ]

    # Columns that stay as strings; everything else is cast to float.
    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    # The sample identifier carries no predictive information.
    output = output.drop(['id'], axis=1)

    # NOTE(review): presumably fills missing entries with the column mode --
    # confirm against helpers.replace_missing_mode. This must run before
    # the float cast, which fails on any remaining non-numeric marker.
    output = helpers.replace_missing_mode(output)

    # Values were parsed from text, so numeric columns are still strings.
    for col in output:
        if col not in cat_variables:
            output[col] = output[col].astype(float)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Пример #4
0
def get_vote_data(file_in):
    """
    Load and clean the UCI data set on US Representative vote records.

    The file is read as comma-separated text with no header row. Every
    non-blank line becomes one record: the 'target' label followed by
    sixteen vote columns. No column is cast to a numeric type here; all
    values are passed to the helpers as strings.

    Args:
        file_in: Path of the raw vote data file.

    Returns:
        Whatever ``helpers.normalize`` returns for the one-hot encoded
        frame (presumably a ``pandas.DataFrame`` -- confirm in ``helpers``).

    Raises:
        OSError: If ``file_in`` cannot be opened.
    """

    # Use a context manager so the file handle is closed deterministically.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines instead of assuming exactly one trailing blank line.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'target', 'handicapped-infants', 'water-project-cost-sharing',
        'adoption-of-the-budget-resolution', 'physician-fee-freeze',
        'el-salvador-aid', 'religious-groups-in-schools',
        'anti-satellite-test-ban', 'aid-to-nicaraguan-contras', 'mx-missile',
        'immigration', 'synfuels-corporation-cutback', 'education-spending',
        'superfund-right-to-sue', 'crime', 'duty-free-exports',
        'export-administration-act-south-africa'
    ]

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    # NOTE(review): presumably fills missing entries with the column mode --
    # confirm against helpers.replace_missing_mode.
    output = helpers.replace_missing_mode(output)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Пример #5
0
def get_soy_data(file_in):
    """
    Load and preprocess the UCI data set on diseases of soybean samples.

    The file is read as comma-separated text with no header row. Every
    non-blank line becomes one record: thirty-five attribute columns
    followed by the 'target' label. No column is cast to a numeric type
    here; all values are passed to the helpers as strings.

    Args:
        file_in: Path of the raw soybean data file.

    Returns:
        Whatever ``helpers.normalize`` returns for the one-hot encoded
        frame (presumably a ``pandas.DataFrame`` -- confirm in ``helpers``).

    Raises:
        OSError: If ``file_in`` cannot be opened.
    """

    # Use a context manager so the file handle is closed deterministically.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines instead of assuming exactly one trailing blank line.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'date', 'plant-stand', 'precip', 'temp', 'hail', 'crop-hist',
        'area-damaged', 'severity', 'seed-tmt', 'germination', 'plant-growth',
        'leaves', 'leafspots-halo', 'leafspots-marg', 'leafspot-size',
        'leaf-shread', 'leaf-malf', 'leaf-mild', 'stem', 'lodging',
        'stem-cankers', 'canker-lesion', 'fruiting-bodies', 'external decay',
        'mycelium', 'int-discolor', 'sclerotia', 'fruit-pods', 'fruit spots',
        'seed', 'mold-growth', 'seed-discolor', 'seed-size', 'shriveling',
        'roots', 'target'
    ]

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    # NOTE(review): presumably fills missing entries with the column mode --
    # confirm against helpers.replace_missing_mode.
    output = helpers.replace_missing_mode(output)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output