示例#1
0
def marginalise_demo2():
    """Open the marginalisation GUI on a small normalised joint distribution.

    The module-level ``cancerdat`` CSV text is read into a ``CompactFactor``,
    the Smoker/Cancer/Bronchitis variables are selected from it, and the
    normalised result is handed to ``marginalise_gui``.
    """
    from StringIO import StringIO
    from IO import read_csv
    from Data import CompactFactor
    from Demos import marginalise_gui

    records = read_csv(StringIO(cancerdat))
    compact = CompactFactor(records)
    # Indexing by variable names creates a normal factor over those variables.
    joint = compact['Smoker', 'Cancer', 'Bronchitis']
    distribution = joint.normalised()
    marginalise_gui(distribution)
示例#2
0
def florida_demo():
    """Print the Florida death penalty table, its total and normalised form.

    The module-level ``floridadat`` CSV text is read into a ``CompactFactor``
    and the Murderer/Sentence/Victim variables are selected from it.  The
    resulting table is printed, followed by the value of ``table.z()``
    reported as the number of observations, followed by the normalised table.
    """
    from StringIO import StringIO
    from IO import read_csv
    from Data import CompactFactor
    import Parameters

    compact = CompactFactor(read_csv(StringIO(floridadat)))
    # Indexing by variable names creates a normal factor over those variables.
    counts = compact['Murderer', 'Sentence', 'Victim']
    print(counts)
    total = counts.z()
    print('Number of observations is %d' % total)
    # Set before printing the normalised table; presumably controls how many
    # digits are shown -- confirm against the Parameters module.
    Parameters.precision = 6
    print(counts.normalised())
示例#3
0
def cancer_table():
    """Print a small contingency table, its total and its normalised version.

    The module-level ``cancerdat`` CSV text is read into a ``CompactFactor``
    and the Smoker/Cancer/Bronchitis variables are selected from it.  The
    resulting table is printed, followed by the value of ``table.z()``
    reported as the number of observations, followed by the normalised table.
    """
    from StringIO import StringIO
    from IO import read_csv
    from Data import CompactFactor
    import Parameters

    source = StringIO(cancerdat)
    compact = CompactFactor(read_csv(source))
    # Indexing by variable names creates a normal factor over those variables.
    contingency = compact['Smoker', 'Cancer', 'Bronchitis']
    print(contingency)
    observations = contingency.z()
    print('Number of observations is %d' % observations)
    # Set before printing the normalised table; presumably controls how many
    # digits are shown -- confirm against the Parameters module.
    Parameters.precision = 6
    print(contingency.normalised())
from random import sample

def random_undersample(dataset, sample_size=-1, sample_percent=-1):
    """Randomly undersample a dataset without replacement.

    The number of items drawn is chosen as follows:

    * if ``0 <= sample_size <= len(dataset)``, ``int(sample_size)`` items;
    * otherwise, if ``0 <= sample_percent <= 100``, that percentage of
      ``len(dataset)``, truncated to an integer;
    * otherwise zero items (this is what the default arguments select).

    Args:
        dataset: sized population to draw from; it is passed unchanged to
            ``random.sample``.
        sample_size: absolute number of items to draw.  Takes precedence
            over ``sample_percent`` whenever it is within range.
        sample_percent: percentage (0-100) of the dataset to draw.

    Returns:
        A new list with the drawn items, or ``[]`` when the sample size
        cannot be computed because an argument has an unusable type or
        value.
    """
    # Imported locally so the function keeps working even if a module-level
    # name ``sample`` is rebound to something other than ``random.sample``.
    import random

    # determine sample size
    try:
        if 0 <= sample_size <= len(dataset):
            draw_count = int(sample_size)
        elif 0 <= sample_percent <= 100:
            draw_count = int(len(dataset) * sample_percent / 100)
        else:
            draw_count = 0
    except (TypeError, ValueError):
        # Only argument problems are treated as "nothing to sample";
        # KeyboardInterrupt, SystemExit and unrelated errors propagate.
        return []
    # return sample
    return random.sample(dataset, draw_count)

if __name__ == '__main__':
    # Script entry point: read the training CSV, keep a random 5% of its
    # rows and write them, together with the original header, to a new file.
    from IO import read_csv, write_csv, read_header

    file_path = '../data/train.csv'
    out_file_path = '../data_preprocessed/train_undersampled.csv'
    # read dataset
    dataset = read_csv(file_path, has_header=True)
    header = read_header(file_path)
    # undersample
    # The result is deliberately not named ``sample``: at module level that
    # would rebind the ``sample`` imported from ``random``.
    undersampled = random_undersample(dataset, sample_percent=5)
    # write to file
    write_csv(out_file_path, undersampled, header=header)