def test_row_similarity_wrt():
    """Check ``Engine.row_similarity`` when restricted with ``wrt``.

    A 30-row frame is built with three columns: ``c0`` (standard normal
    draws), ``c1`` (``c0`` shifted by 1.0) and ``c2`` (independent uniform
    draws).  The column and row assignments of all four models are then
    overwritten by hand so the expected similarities are known:

    * ``c0`` and ``c1`` get column assignment 0, ``c2`` gets 1.
    * In the first row partition, rows 0 and 1 carry different labels in
      models 0-1 and the same label in models 2-3.
    * In the second row partition every row carries label 0.

    Presumably the column assignment selects which row partition applies to
    a column, which is why the similarity of rows 0 and 1 is expected to be
    0.5 with respect to ``c0`` and 1.0 with respect to ``c2``.
    """
    n_rows = 30
    n_models = 4

    base = np.random.randn(n_rows)
    frame = pd.concat(
        [pd.Series(base),
         pd.Series(base + 1.0),
         pd.Series(np.random.rand(n_rows))],
        axis=1)
    frame.columns = ['c0', 'c1', 'c2']

    engine = Engine(frame, n_models=n_models, use_mp=False)
    engine.init_models()

    # Same column layout in every model.
    for idx in range(n_models):
        engine._models[idx]['col_assignment'] = [0, 0, 1]

    # A fresh list is built for each model so no two models share state.
    for idx in range(n_models):
        if idx < 2:
            # Row 0 is alone; row 1 sits with everything else.
            first_partition = [0] + [1] * (n_rows - 1)
        else:
            # The last row is alone; rows 0 and 1 sit together.
            first_partition = [1] * (n_rows - 1) + [0]
        engine._models[idx]['row_assignments'] = [first_partition,
                                                  [0] * n_rows]

    assert engine.row_similarity(0, 1, wrt=['c0']) == 0.5
    assert engine.row_similarity(0, 1, wrt=['c2']) == 1.0
def test_row_similarity_wrt():
    """Verify that ``row_similarity`` respects the ``wrt`` column list.

    The data itself is random; what matters is the hand-written model state.
    All four models put ``c0``/``c1`` under column assignment 0 and ``c2``
    under column assignment 1.  The first row partition separates rows 0 and
    1 in two models and groups them in the other two, while the second row
    partition groups all rows.  The test expects a similarity of 0.5 for
    rows 0 and 1 with respect to ``c0`` and 1.0 with respect to ``c2``.
    """
    x = np.random.randn(30)
    noise = np.random.rand(30)
    columns = [pd.Series(x), pd.Series(x + 1.0), pd.Series(noise)]
    df = pd.concat(columns, axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()

    # One entry per model: the labels used for the first row partition.
    first_partitions = [
        [0] + [1] * 29,
        [0] + [1] * 29,
        [1] * 29 + [0],
        [1] * 29 + [0],
    ]

    for model_idx in range(len(first_partitions)):
        engine._models[model_idx]['col_assignment'] = [0, 0, 1]

    for model_idx, labels in enumerate(first_partitions):
        # The second partition places every row under label 0.
        engine._models[model_idx]['row_assignments'] = [labels, [0] * 30]

    sim_c0 = engine.row_similarity(0, 1, wrt=['c0'])
    assert sim_c0 == .5

    sim_c2 = engine.row_similarity(0, 1, wrt=['c2'])
    assert sim_c2 == 1.
# Plot the pairwise row-similarity heatmap on a 10x10 figure.
engine.heatmap('row_similarity', plot_kwargs={'figsize': (10, 10)})
plt.show()

# The paint job is an important part of what makes a pine wood derby car fast,
# but does it matter for animals? We'll use the linfoot information to
# determine how predictive variables are of whether an animal is fast. Linfoot
# is basically the information-theoretic counterpart to correlation.
# NOTE(review): both calls pass linfoot=False although the text above and the
# printed labels say "Linfoot" -- confirm which quantity is intended.
linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=False)
linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=False)

print('Linfoot(fast, lean) = %f' % (linfoot_lean,))
print('Linfoot(fast, stripes) = %f' % (linfoot_stripes,))

# We can also figure out which animals are more similar. Is a chihuahua more
# similar to a wolf or to a rat?
sim_wolves = engine.row_similarity('chihuahua', 'wolf')
sim_rats = engine.row_similarity('chihuahua', 'rat')

print('Similarity between Chihuahuas and wolves is %f' % (sim_wolves,))
print('Similarity between Chihuahuas and rats is %f' % (sim_rats,))

# Which animals are outliers with respect to their being fast? We can find out
# by calculating the surprisal (self information).
s = engine.surprisal('fast')
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Most surprising rows come first.
s.sort_values(by=['surprisal'], ascending=False, inplace=True)
print(s.head(10))

# Let's say we're out in the woods and we see a lean, spotted animal with a
# tail. What is the probability that it is fierce and fast?
# Note that for continuous variables, Engine.probability returns the log PDF
)}) plt.show() # The paint job is an important part of what makes a pine wood derby car fast, # but does it matter for animals? We'll use the linfoot information to # determine how predictive variables are of whether an animal is fast. Linfoot # if basically the information-theoretic counterpart to correlation. linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=False) linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=False) print('Linfoot(fast, lean) = %f' % (linfoot_lean, )) print('Linfoot(fast, stripes) = %f' % (linfoot_stripes, )) # We can also figure out which animals are more similar. Is a wolf more # similar to a dalmatian or a rat. sim_wolves = engine.row_similarity('chihuahua', 'wolf') sim_rats = engine.row_similarity('chihuahua', 'rat') print('Similarity between Chihuahuas and wolves is %f' % (sim_wolves, )) print('Similarity between Chihuahuas and rats is %f' % (sim_rats, )) # Which animals are outliers with respect to their being fast. We can find out # by calculating the surprisal (self infotmation). s = engine.surprisal('fast') s.sort(['surprisal'], ascending=False, inplace=True) print(s.head(10)) # Lets say we're out in the woods and we see a lean, spotted animal with a # tail. What is the probability that it is fierce and fast? # Note that for continuous variables, Engine.probability returns the log PDF # of an event given observations.