示例#1
0
def test_row_similarity_wrt():
    """Check ``Engine.row_similarity`` when restricted with ``wrt``.

    Every model is forced to the column assignment ``[0, 0, 1]`` for the
    columns c0, c1, c2.  In the first row-assignment list, rows 0 and 1
    carry different labels in models 0-1 and the same label in models 2-3;
    in the second list all rows carry label 0 in every model.  The expected
    similarity of rows 0 and 1 is therefore 0.5 with respect to c0 and 1.0
    with respect to c2.
    """
    base = np.random.randn(30)
    unrelated = np.random.rand(30)

    columns = [pd.Series(base), pd.Series(base + 1.0), pd.Series(unrelated)]
    df = pd.concat(columns, axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()

    for idx in range(4):
        model = engine._models[idx]
        model['col_assignment'] = [0, 0, 1]
        if idx < 2:
            # Row 0 sits alone; row 1 is grouped with the rest.
            first_rows = [0] + [1]*29
        else:
            # Rows 0 and 1 share a label; the last row sits alone.
            first_rows = [1]*29 + [0]
        model['row_assignments'] = [first_rows, [0]*30]

    sim_c0 = engine.row_similarity(0, 1, wrt=['c0'])
    sim_c2 = engine.row_similarity(0, 1, wrt=['c2'])

    assert sim_c0 == .5
    assert sim_c2 == 1.
示例#2
0
def test_row_similarity_wrt():
    """Verify row similarity of rows 0 and 1 under a ``wrt`` restriction.

    All four models get the column assignment ``[0, 0, 1]``.  The first
    row-assignment list separates rows 0 and 1 in two models and groups
    them in the other two; the second list groups every row in all models.
    Expected result: 0.5 with respect to 'c0', 1.0 with respect to 'c2'.
    """
    shared = np.random.randn(30)
    independent = np.random.rand(30)

    df = pd.concat(
        [pd.Series(shared), pd.Series(shared + 1.0), pd.Series(independent)],
        axis=1,
    )
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()

    # One first-list row pattern per model, in model order.
    first_list_patterns = [
        [0] + [1] * 29,
        [0] + [1] * 29,
        [1] * 29 + [0],
        [1] * 29 + [0],
    ]
    for model_idx, pattern in enumerate(first_list_patterns):
        engine._models[model_idx]['col_assignment'] = [0, 0, 1]
        engine._models[model_idx]['row_assignments'] = [pattern, [0] * 30]

    assert engine.row_similarity(0, 1, wrt=['c0']) == .5
    assert engine.row_similarity(0, 1, wrt=['c2']) == 1.
示例#3
0
engine.heatmap('row_similarity', plot_kwargs={'figsize': (10, 10,)})
plt.show()

# The paint job is an important part of what makes a pine wood derby car fast,
# but does it matter for animals? We'll use the Linfoot information to
# determine how predictive variables are of whether an animal is fast. Linfoot
# is basically the information-theoretic counterpart to correlation.
# NOTE(review): the text and labels say "Linfoot", but the calls pass
# linfoot=False -- confirm which quantity is intended before changing either.
linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=False)
linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=False)

print('Linfoot(fast, lean) = %f' % (linfoot_lean,))
print('Linfoot(fast, stripes) = %f' % (linfoot_stripes,))

# We can also figure out which animals are more similar. Is a chihuahua more
# similar to a wolf or a rat?
sim_wolves = engine.row_similarity('chihuahua', 'wolf')
sim_rats = engine.row_similarity('chihuahua', 'rat')

print('Similarity between Chihuahuas and wolves is %f' % (sim_wolves,))
print('Similarity between Chihuahuas and rats is %f' % (sim_rats,))


# Which animals are outliers with respect to their being fast? We can find out
# by calculating the surprisal (self-information).
s = engine.surprisal('fast')
# DataFrame.sort was removed from pandas; sort_values is its replacement.
s.sort_values(by=['surprisal'], ascending=False, inplace=True)
print(s.head(10))

# Let's say we're out in the woods and we see a lean, spotted animal with a
# tail. What is the probability that it is fierce and fast?
# Note that for continuous variables, Engine.probability returns the log PDF
示例#4
0
)})
plt.show()

# The paint job is an important part of what makes a pine wood derby car fast,
# but does it matter for animals? We'll use the Linfoot information to
# determine how predictive variables are of whether an animal is fast. Linfoot
# is basically the information-theoretic counterpart to correlation.
# NOTE(review): the text and labels say "Linfoot", but the calls pass
# linfoot=False -- confirm which quantity is intended before changing either.
linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=False)
linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=False)

print('Linfoot(fast, lean) = %f' % (linfoot_lean, ))
print('Linfoot(fast, stripes) = %f' % (linfoot_stripes, ))

# We can also figure out which animals are more similar. Is a chihuahua more
# similar to a wolf or a rat?
sim_wolves = engine.row_similarity('chihuahua', 'wolf')
sim_rats = engine.row_similarity('chihuahua', 'rat')

print('Similarity between Chihuahuas and wolves is %f' % (sim_wolves, ))
print('Similarity between Chihuahuas and rats is %f' % (sim_rats, ))

# Which animals are outliers with respect to their being fast? We can find out
# by calculating the surprisal (self-information).
s = engine.surprisal('fast')
# DataFrame.sort was removed from pandas; sort_values is its replacement.
s.sort_values(by=['surprisal'], ascending=False, inplace=True)
print(s.head(10))

# Let's say we're out in the woods and we see a lean, spotted animal with a
# tail. What is the probability that it is fierce and fast?
# Note that for continuous variables, Engine.probability returns the log PDF
# of an event given observations.