def construct_net(self, df):
    """Build the anxiety ... car_accident network with all nodes observed.

    Every node takes its observed values from the attribute of ``df`` that
    carries the node's name. Returns a ``pymc.Model`` holding the eleven
    nodes in the order they are created.
    """
    # (node name, parent names) in creation order. An empty tuple marks a
    # parentless node, which is created with make_bernoulli; every other
    # node is created with cartesian_child.
    structure = [
        ('anxiety', ()),
        ('peer_pressure', ()),
        ('smoking', ('anxiety', 'peer_pressure')),
        ('yellow_fingers', ('smoking',)),
        ('genetics', ()),
        ('lung_cancer', ('smoking', 'genetics')),
        ('allergy', ()),
        ('coughing', ('allergy', 'lung_cancer')),
        ('fatigue', ('lung_cancer', 'coughing')),
        ('attention_disorder', ('genetics',)),
        ('car_accident', ('fatigue', 'attention_disorder')),
    ]

    by_name = {}
    ordered = []
    for node_name, parent_names in structure:
        if parent_names:
            node = cartesian_child(
                node_name,
                parents=[by_name[p] for p in parent_names],
                value=getattr(df, node_name))
        else:
            node = make_bernoulli(node_name, value=getattr(df, node_name))
        by_name[node_name] = node
        ordered.append(node)

    return pymc.Model(ordered)
def test_cartesian_bernoulli_child_of_categorical_parent():
    """Round-trip test: sample data from known coefficients, then re-learn them.

    1. Build day_of_week (categorical) -> staying_home <- feeling_sick
       (bernoulli) with coefficients fixed to the values in ``coeffs`` and
       draw 500 samples.
    2. Rebuild the same network with the sampled values passed as ``value``
       and with the coefficient nodes added to the model.
    3. Assert that the posterior mean of one coefficient is within 0.3 of
       the value used to generate the data, then print all of them.
    """
    # NOTE(review): 'CF: p(day_of_week)' lists six values for a 7-level
    # variable; presumably the last level's probability is implied --
    # confirm against make_categorical.
    coeffs = {
        'CF: p(feeling_sick)': 0.55,
        'CF: p(day_of_week)': [0.013, 0.626, 0.039, 0.108, 0.134, 0.019],
        'CF: p(staying_home | day_of_week=5 feeling_sick=1)': 0.240,
        'CF: p(staying_home | day_of_week=3 feeling_sick=1)': 0.467,
        'CF: p(staying_home | day_of_week=4 feeling_sick=1)': 0.603,
        'CF: p(staying_home | day_of_week=0 feeling_sick=0)': 0.974,
        'CF: p(staying_home | day_of_week=6 feeling_sick=1)': 0.331,
        'CF: p(staying_home | day_of_week=6 feeling_sick=0)': 0.009,
        'CF: p(staying_home | day_of_week=2 feeling_sick=1)': 0.317,
        'CF: p(staying_home | day_of_week=0 feeling_sick=1)': 0.900,
        'CF: p(staying_home | day_of_week=2 feeling_sick=0)': 0.651,
        'CF: p(staying_home | day_of_week=1 feeling_sick=0)': 0.603,
        'CF: p(staying_home | day_of_week=1 feeling_sick=1)': 0.954,
        'CF: p(staying_home | day_of_week=3 feeling_sick=0)': 0.856,
        'CF: p(staying_home | day_of_week=5 feeling_sick=0)': 0.606,
        'CF: p(staying_home | day_of_week=4 feeling_sick=0)': 0.828
    }

    # Seed NumPy's global random state. This must be a *call*: assigning
    # ``np.random.seed = 1`` seeds nothing and replaces the function with an
    # int, which breaks every later ``np.random.seed(...)`` in the process.
    np.random.seed(1)

    # define the model with fixed *coefficients*
    day_of_week = make_categorical('day_of_week', levels=7, N=1, fixed=coeffs)
    feeling_sick = make_bernoulli('feeling_sick', N=1, fixed=coeffs)
    staying_home = cartesian_bernoulli_child(
        'staying_home', [day_of_week, feeling_sick], N=1, fixed=coeffs)
    model = pymc.Model([day_of_week, feeling_sick, staying_home])

    # sample data from the model
    sampler = pymc.MCMC(model)
    sampler.sample(iter=500)
    day_of_week_val = sampler.trace('day_of_week')[:]
    feeling_sick_val = sampler.trace('feeling_sick')[:]
    staying_home_val = sampler.trace('staying_home')[:]

    # define the model again, this time with fixed *data*; the coefficient
    # nodes (c1, c2, c3) are added to the model so their traces are recorded
    day_of_week, c1 = make_categorical(
        'day_of_week', levels=7, value=day_of_week_val, return_coeffs=True)
    feeling_sick, c2 = make_bernoulli(
        'feeling_sick', value=feeling_sick_val, return_coeffs=True)
    staying_home, c3 = cartesian_bernoulli_child(
        'staying_home', [day_of_week, feeling_sick],
        value=staying_home_val, return_coeffs=True)
    model = pymc.Model([day_of_week, feeling_sick, staying_home] + c1 + c2 + c3)
    sampler = pymc.MCMC(model)
    sampler.sample(iter=500, burn=300)

    cname = 'CF: p(staying_home | day_of_week=1 feeling_sick=0)'
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(cname)
    assert np.isclose(coeffs[cname], sampler.trace(cname)[:].mean(), atol=0.3)

    # Show generating value vs. posterior mean for every coefficient.
    for c in coeffs:
        print("%s %s %s" % (c, coeffs[c], sampler.trace(c)[:].mean()))
def construct_net(self, df):
    """Build the rain / sprinkler -> wet_sidewalk network from observed data.

    ``df.rain``, ``df.sprinkler`` and ``df.wet_sidewalk`` supply the observed
    values of the three nodes. Returns the resulting ``pymc.Model``.
    """
    # The two parentless nodes come first, in the same order as the model list.
    causes = [
        make_bernoulli('rain', value=df.rain),
        make_bernoulli('sprinkler', value=df.sprinkler),
    ]
    effect = cartesian_child('wet_sidewalk', parents=list(causes),
                             value=df.wet_sidewalk)
    return pymc.Model(causes + [effect])
def test_cartesian_categorical_child_creates_correctly_named_coefficients():
    """Check the names of the coefficients returned for a 4-level child of two parents."""
    parent_nodes = [make_bernoulli('mum', N=1), make_bernoulli('dad', N=1)]
    _, returned = cartesian_categorical_child(
        'child', parent_nodes, levels=4, N=1, return_coeffs=True)

    expected_names = {
        'CF: p(child | mum=0 dad=0)',
        'CF: p(child | mum=1 dad=1)',
        'CF: p(child | mum=1 dad=0)',
        'CF: p(child | mum=0 dad=1)',
        'p(child)'
    }
    actual_names = set(str(item) for item in returned)
    assert actual_names == expected_names
def construct_net(self, df):
    """Build the rain / sprinkler -> grass_wet network for learning from ``df``.

    Same network as the one used to generate the data, set up differently:
      * coefficient values are not fixed, because learning them is the goal;
      * ``value`` is supplied, because these variables are observed;
      * ``N`` is omitted, because it is taken to be ``len(value)``.
    """
    rain_node = make_bernoulli('rain', value=df.rain)
    sprinkler_node = make_bernoulli('sprinkler', value=df.sprinkler)
    wet_node = cartesian_child(
        'grass_wet',
        parents=[rain_node, sprinkler_node],
        levels=2,
        value=df.grass_wet)
    return pymc.Model([rain_node, sprinkler_node, wet_node])
def test_make_bernoulli_returns_variable_with_beta_parent():
    """An observed variable from make_bernoulli has an unobserved Beta node as its 'p' parent."""
    name = 'rosebud'
    variable = make_bernoulli(name, value=[0, 1, 1, 0])
    prior = variable.parents['p']

    # The variable itself carries data; its probability parameter does not.
    assert variable.observed
    assert not prior.observed

    # The prior is named with the coefficient prefix and is Beta-distributed.
    expected_label = COEFFS_PREFIX + 'p(%s)' % name
    assert str(prior) == expected_label
    assert isinstance(prior, pymc.distributions.Beta)
def test_cartesian_bernoulli_child_creates_correctly_named_coefficients():
    """Check returned coefficient names for a Bernoulli child with two parents, then one."""
    # Case 1: two parents.
    two_parents = [make_bernoulli('mum', N=1), make_bernoulli('dad', N=1)]
    _, returned = cartesian_bernoulli_child(
        'child', two_parents, N=1, return_coeffs=True)
    names = set(str(item) for item in returned)
    assert names == {
        'CF: p(child | mum=0 dad=0)',
        'CF: p(child | mum=1 dad=1)',
        'CF: p(child | mum=1 dad=0)',
        'CF: p(child | mum=0 dad=1)',
        'p(child)'
    }

    # Case 2: a single parent.
    one_parent = [make_bernoulli('single_parent', N=1)]
    _, returned = cartesian_bernoulli_child(
        'child', one_parent, N=1, return_coeffs=True)
    names = set(str(item) for item in returned)
    assert names == {
        'p(child)',
        'CF: p(child | single_parent=0)',
        'CF: p(child | single_parent=1)'
    }
def get_rsw_data_random_coeffs(size):
    """Generate data from the 'rain - sprinkler - wet grass' Bayesian network.

    The network is built once without fixed coefficients so that
    ``sample_coeffs`` can draw a coefficient set from it, then rebuilt with
    those coefficients passed as ``fixed``. The second model is sampled for
    ``size + 1000`` iterations with the first 1000 discarded as burn-in.

    Returns a pandas DataFrame with one column per stochastic node whose
    name does not start with 'CF: '.
    """
    def build_model(**extra):
        # ``extra`` is empty on the first pass and carries ``fixed`` on the second.
        rain_node = make_bernoulli('rain', N=1, **extra)
        sprinkler_node = cartesian_child(
            'sprinkler', parents=[rain_node], levels=2, N=1, **extra)
        wet_node = cartesian_child(
            'grass_wet', parents=[sprinkler_node, rain_node], levels=2, N=1,
            **extra)
        return pymc.Model([rain_node, sprinkler_node, wet_node])

    # First pass: free coefficients, used only to draw a coefficient set.
    coefficients = sample_coeffs(build_model())

    # Second pass: the same network with those coefficients fixed.
    sampler = pymc.MCMC(build_model(fixed=coefficients))
    sampler.sample(iter=size + 1000, burn=1000)

    columns = {}
    for node in sampler.stochastics:
        label = str(node)
        if label.startswith('CF: '):
            continue
        # ``+ 0`` is applied to the flattened trace exactly as before.
        columns[label] = sampler.trace(label)[:].ravel() + 0
    return pd.DataFrame(columns)
def test_cartesian_bernoulli_child():
    """Round-trip test: draw coefficients, generate data with them, re-learn them.

    The posterior mean of every coefficient must be close (rtol=0.1,
    atol=0.1) to the value that was used to generate the data.
    """
    # Step 1: model without data, only used to draw coefficient values.
    has_garden = make_bernoulli('has_garden', N=1)
    is_big = make_bernoulli('is_big', N=1)
    is_green = cartesian_bernoulli_child('is_green', [is_big, has_garden], N=1)
    prior_model = pymc.Model([has_garden, is_big, is_green])
    coeff_values = sample_coeffs(prior_model)

    # Step 2: identical model with the coefficient values fixed.
    has_garden = make_bernoulli('has_garden', N=1, fixed=coeff_values)
    is_big = make_bernoulli('is_big', N=1, fixed=coeff_values)
    is_green = cartesian_bernoulli_child(
        'is_green', [is_big, has_garden], N=1, fixed=coeff_values)
    fx_model = pymc.Model([has_garden, is_big, is_green])

    # Step 3: sample data from the fixed-coefficient model.
    fx_sampler = pymc.MCMC(fx_model)
    fx_sampler.sample(iter=2000)
    garden_data = fx_sampler.trace('has_garden')[:]
    big_data = fx_sampler.trace('is_big')[:]
    green_data = fx_sampler.trace('is_green')[:]

    # Step 4: identical model again, this time with the sampled *data*
    # passed as ``value``; only the returned coefficients go into the model.
    has_garden, cfs1 = make_bernoulli(
        'has_garden', value=garden_data, return_coeffs=True)
    is_big, cfs2 = make_bernoulli('is_big', value=big_data, return_coeffs=True)
    is_green, cfs3 = cartesian_bernoulli_child(
        'is_green', [is_big, has_garden], value=green_data,
        return_coeffs=True)
    sampler = pymc.MCMC(pymc.Model(cfs1 + cfs2 + cfs3))
    sampler.sample(iter=2000, burn=1000)

    # Step 5: compare each posterior mean with its generating value.
    for node in sampler.stochastics:
        name = str(node)
        mean_posterior = sampler.trace(name)[:].mean()
        actual = coeff_values[name]
        report = "%s %.3f %.3f" % (pad(name, 30), mean_posterior, actual)
        print(report)
        assert np.isclose(mean_posterior, actual, rtol=0.1, atol=0.1)
def get_rsw_data(size):
    """Generate data from the 'rain - sprinkler - wet grass' Bayesian network.

    The network is built with the hard-coded coefficients below passed as
    ``fixed`` and sampled for ``size`` iterations with no burn-in.

    Returns a pandas DataFrame with one column per stochastic node of the
    sampler.
    """
    coefficients = {
        'CF: p(rain)': 0.2,
        'CF: p(grass_wet | sprinkler=0 rain=0)': 0.,
        'CF: p(grass_wet | sprinkler=0 rain=1)': 0.8,
        'CF: p(grass_wet | sprinkler=1 rain=0)': 0.9,
        'CF: p(grass_wet | sprinkler=1 rain=1)': 0.99,
        'CF: p(sprinkler | rain=0)': 0.4,
        'CF: p(sprinkler | rain=1)': 0.01
    }

    # Network definition: rain -> sprinkler, and (sprinkler, rain) -> grass_wet.
    rain_node = make_bernoulli('rain', N=1, fixed=coefficients)
    sprinkler_node = cartesian_child(
        'sprinkler', parents=[rain_node], levels=2, N=1, fixed=coefficients)
    wet_node = cartesian_child(
        'grass_wet', parents=[sprinkler_node, rain_node], levels=2, N=1,
        fixed=coefficients)

    # Draw ``size`` samples from the fixed-coefficient model.
    sampler = pymc.MCMC(pymc.Model([rain_node, sprinkler_node, wet_node]))
    sampler.sample(iter=size, burn=0)

    columns = {}
    for node in sampler.stochastics:
        label = str(node)
        # ``+ 0`` is applied to the flattened trace exactly as before.
        columns[label] = sampler.trace(label)[:].ravel() + 0
    return pd.DataFrame(columns)
def construct_net(self, df):
    """Build the 27-node mileage ... property_cost network with all nodes observed.

    Every node takes its observed values from the attribute of ``df`` that
    carries the node's name. Returns a ``pymc.Model`` of all the nodes.
    """
    # (node name, parent names) in creation order. An empty tuple marks a
    # parentless node, which is created with make_bernoulli; every other
    # node is created with cartesian_child.
    structure = [
        ('mileage', ()),
        ('age', ()),
        ('socio_econ', ('age',)),
        ('risk_aversion', ('age', 'socio_econ')),
        ('senior_train', ('age', 'risk_aversion')),
        ('good_student', ('age', 'socio_econ')),
        ('extra_car', ('socio_econ',)),
        ('driving_skill', ('age', 'senior_train')),
        ('driving_hist', ('driving_skill', 'risk_aversion')),
        ('driving_quality', ('driving_skill', 'risk_aversion')),
        ('make_model', ('risk_aversion', 'socio_econ')),
        ('vehicle_year', ('socio_econ',)),
        ('antilock', ('make_model', 'vehicle_year')),
        ('ruggedness', ('make_model', 'antilock')),
        ('accident', ('driving_quality', 'antilock', 'mileage')),
        ('airbag', ('make_model', 'vehicle_year')),
        ('car_value', ('make_model', 'mileage', 'vehicle_year')),
        ('home_base', ('risk_aversion', 'socio_econ')),
        ('anti_theft', ('risk_aversion', 'socio_econ')),
        ('theft', ('car_value', 'home_base', 'anti_theft')),
        ('own_damage', ('ruggedness', 'accident')),
        ('own_car_cost', ('own_damage', 'car_value', 'theft')),
        ('cushioning', ('ruggedness', 'accident')),
        ('medical_cost', ('age', 'cushioning', 'accident')),
        ('liability_cost', ('accident',)),
        ('other_car_cost', ('ruggedness', 'accident')),
        ('property_cost', ('other_car_cost', 'own_car_cost')),
    ]
    # Order in which the nodes are handed to pymc.Model; it differs from the
    # creation order above and is kept exactly as it was.
    model_order = [
        'mileage', 'age', 'socio_econ', 'good_student', 'extra_car',
        'driving_skill', 'risk_aversion', 'senior_train', 'driving_hist',
        'driving_quality', 'make_model', 'vehicle_year', 'antilock',
        'ruggedness', 'accident', 'airbag', 'car_value', 'home_base',
        'anti_theft', 'theft', 'own_damage', 'own_car_cost', 'cushioning',
        'medical_cost', 'liability_cost', 'other_car_cost', 'property_cost',
    ]

    by_name = {}
    for node_name, parent_names in structure:
        if parent_names:
            by_name[node_name] = cartesian_child(
                node_name,
                parents=[by_name[p] for p in parent_names],
                value=getattr(df, node_name))
        else:
            by_name[node_name] = make_bernoulli(
                node_name, value=getattr(df, node_name))

    return pymc.Model([by_name[node_name] for node_name in model_order])
def get_car_data_random_coeffs(size):
    """Generate ``size`` samples from the 27-node car network with random coefficients.

    The network is built once without fixed coefficients so that
    ``sample_coeffs`` can draw a coefficient set from it, then rebuilt with
    those coefficients passed as ``fixed`` to *every* node and sampled for
    ``size`` iterations with no burn-in.

    Both passes are built from the same structure table. The previous
    hand-written second pass omitted ``fixed=coefficients`` for
    ``driving_skill``, so that node's coefficients stayed random while the
    data was being sampled.

    Returns a pandas DataFrame with one column per stochastic node whose
    name does not start with 'CF: '.
    """
    # (node name, parent names) in creation order. An empty tuple marks a
    # parentless node, which is created with make_bernoulli; every other
    # node is created with cartesian_child.
    structure = [
        ('mileage', ()),
        ('age', ()),
        ('socio_econ', ('age',)),
        ('risk_aversion', ('age', 'socio_econ')),
        ('senior_train', ('age', 'risk_aversion')),
        ('good_student', ('age', 'socio_econ')),
        ('extra_car', ('socio_econ',)),
        ('driving_skill', ('age', 'senior_train')),
        ('driving_hist', ('driving_skill', 'risk_aversion')),
        ('driving_quality', ('driving_skill', 'risk_aversion')),
        ('make_model', ('risk_aversion', 'socio_econ')),
        ('vehicle_year', ('socio_econ',)),
        ('antilock', ('make_model', 'vehicle_year')),
        ('ruggedness', ('make_model', 'antilock')),
        ('accident', ('driving_quality', 'antilock', 'mileage')),
        ('airbag', ('make_model', 'vehicle_year')),
        ('car_value', ('make_model', 'mileage', 'vehicle_year')),
        ('home_base', ('risk_aversion', 'socio_econ')),
        ('anti_theft', ('risk_aversion', 'socio_econ')),
        ('theft', ('car_value', 'home_base', 'anti_theft')),
        ('own_damage', ('ruggedness', 'accident')),
        ('own_car_cost', ('own_damage', 'car_value', 'theft')),
        ('cushioning', ('ruggedness', 'accident')),
        ('medical_cost', ('age', 'cushioning', 'accident')),
        ('liability_cost', ('accident',)),
        ('other_car_cost', ('ruggedness', 'accident')),
        ('property_cost', ('other_car_cost', 'own_car_cost')),
    ]
    # Order in which the nodes are handed to pymc.Model (unchanged).
    model_order = [
        'mileage', 'age', 'socio_econ', 'good_student', 'extra_car',
        'driving_skill', 'risk_aversion', 'senior_train', 'driving_hist',
        'driving_quality', 'make_model', 'vehicle_year', 'antilock',
        'ruggedness', 'accident', 'airbag', 'car_value', 'home_base',
        'anti_theft', 'theft', 'own_damage', 'own_car_cost', 'cushioning',
        'medical_cost', 'liability_cost', 'other_car_cost', 'property_cost',
    ]

    def build_model(**node_kwargs):
        # ``node_kwargs`` is empty on the first pass and carries ``fixed`` on
        # the second; it is forwarded to every node so none can be missed.
        by_name = {}
        for node_name, parent_names in structure:
            if parent_names:
                by_name[node_name] = cartesian_child(
                    node_name,
                    parents=[by_name[p] for p in parent_names],
                    N=1, **node_kwargs)
            else:
                by_name[node_name] = make_bernoulli(
                    node_name, N=1, **node_kwargs)
        return pymc.Model([by_name[node_name] for node_name in model_order])

    # First pass: free coefficients, used only to draw a coefficient set.
    coefficients = sample_coeffs(build_model())

    # Second pass: the same network with all coefficients fixed.
    sampler = pymc.MCMC(build_model(fixed=coefficients))
    sampler.sample(iter=size, burn=0)

    data = pd.DataFrame({
        str(node): sampler.trace(str(node))[:].ravel() + 0
        for node in sampler.stochastics
        if not str(node).startswith('CF: ')
    })
    return data
def get_cancer_data_random_coeffs(size):
    """Generate ``size`` samples from the 11-node anxiety ... car_accident network.

    The network is built once without fixed coefficients so that
    ``sample_coeffs`` can draw a coefficient set from it, then rebuilt with
    those coefficients passed as ``fixed`` and sampled for ``size``
    iterations with no burn-in.

    Returns a pandas DataFrame with one column per stochastic node of the
    sampler.
    """
    # (node name, parent names) in creation order, which is also the order
    # of the model list. An empty tuple marks a parentless node, which is
    # created with make_bernoulli; every other node uses cartesian_child.
    structure = [
        ('anxiety', ()),
        ('peer_pressure', ()),
        ('smoking', ('anxiety', 'peer_pressure')),
        ('yellow_fingers', ('smoking',)),
        ('genetics', ()),
        ('lung_cancer', ('smoking', 'genetics')),
        ('allergy', ()),
        ('coughing', ('allergy', 'lung_cancer')),
        ('fatigue', ('lung_cancer', 'coughing')),
        ('attention_disorder', ('genetics',)),
        ('car_accident', ('fatigue', 'attention_disorder')),
    ]

    def build_model(**node_kwargs):
        # ``node_kwargs`` is empty on the first pass and carries ``fixed`` on
        # the second.
        by_name = {}
        ordered = []
        for node_name, parent_names in structure:
            if parent_names:
                node = cartesian_child(
                    node_name,
                    parents=[by_name[p] for p in parent_names],
                    N=1, **node_kwargs)
            else:
                node = make_bernoulli(node_name, N=1, **node_kwargs)
            by_name[node_name] = node
            ordered.append(node)
        return pymc.Model(ordered)

    # First pass: free coefficients, used only to draw a coefficient set.
    coefficients = sample_coeffs(build_model())

    # Second pass: the same network with those coefficients fixed.
    sampler = pymc.MCMC(build_model(fixed=coefficients))
    sampler.sample(iter=size, burn=0)

    columns = {}
    for node in sampler.stochastics:
        label = str(node)
        # ``+ 0`` is applied to the flattened trace exactly as before.
        columns[label] = sampler.trace(label)[:].ravel() + 0
    return pd.DataFrame(columns)