def test_list_of_dicts_large_predict_proba():
    """Posteriors on the large Monty network for two partial dict observations."""
    # Observe friend/guest/prize/monty; indices 4 and 5 come back as distributions.
    observation = [{'large_monty_friend': True,
                    'large_monty_guest': 'A',
                    'large_monty_prize': 'A',
                    'large_monty': 'C'}]
    expected_dist_4 = DiscreteDistribution({0: 0.0472, 1: 0.781, 2: 0.17167})
    expected_dist_5 = DiscreteDistribution({True: 0.8562, False: 0.143776})
    posterior = large_monty_network.predict_proba(observation)
    assert_equal(posterior[0][0], True)
    assert_equal(posterior[0][1], 'A')
    assert_equal(posterior[0][2], 'A')
    assert_equal(posterior[0][3], 'C')
    assert_discrete_equal(posterior[0][4], expected_dist_4, 3)
    assert_discrete_equal(posterior[0][5], expected_dist_5, 3)

    # Second case: guest unobserved, 'large_monty_remaining' observed instead.
    observation = [{'large_monty_friend': True,
                    'large_monty_prize': 'A',
                    'large_monty': 'C',
                    'large_monty_remaining': 2}]
    expected_dist_1 = DiscreteDistribution({'A': 0.5, 'B': 0.5, 'C': 0.0})
    expected_dist_5 = DiscreteDistribution({True: 0.75, False: 0.25})
    posterior = large_monty_network.predict_proba(observation)
    assert_equal(posterior[0][0], True)
    assert_equal(posterior[0][2], 'A')
    assert_equal(posterior[0][3], 'C')
    assert_equal(posterior[0][4], 2)
    assert_discrete_equal(posterior[0][1], expected_dist_1)
    assert_discrete_equal(posterior[0][5], expected_dist_5)
def get_bayesnet(self):
    """Build and bake the 'User_pref' Bayesian network.

    Structure: door_lock -> light <- clock_alarm -> coffee_maker.
    """
    # Root priors.
    door_lock = DiscreteDistribution({'d1': 0.7, 'd2': 0.3})
    clock_alarm = DiscreteDistribution({'a1': 0.8, 'a2': 0.2})
    # The light depends on both the door lock and the alarm.
    light = ConditionalProbabilityTable(
        [['d1', 'a1', 'l1', 0.96],
         ['d1', 'a1', 'l2', 0.04],
         ['d1', 'a2', 'l1', 0.89],
         ['d1', 'a2', 'l2', 0.11],
         ['d2', 'a1', 'l1', 0.96],
         ['d2', 'a1', 'l2', 0.04],
         ['d2', 'a2', 'l1', 0.89],
         ['d2', 'a2', 'l2', 0.11]],
        [door_lock, clock_alarm])
    # The coffee maker depends only on the alarm.
    coffee_maker = ConditionalProbabilityTable(
        [['a1', 'c1', 0.92],
         ['a1', 'c2', 0.08],
         ['a2', 'c1', 0.03],
         ['a2', 'c2', 0.97]],
        [clock_alarm])

    lock_node = State(door_lock, name="door_lock")
    alarm_node = State(clock_alarm, name="clock_alarm")
    light_node = State(light, name="light")
    coffee_node = State(coffee_maker, name="coffee_maker")

    network = BayesianNetwork("User_pref")
    network.add_nodes(lock_node, alarm_node, light_node, coffee_node)
    network.add_edge(lock_node, light_node)
    network.add_edge(alarm_node, coffee_node)
    network.add_edge(alarm_node, light_node)
    network.bake()
    return network
def build_an_hmm_example():
    """Build, bake and inspect a small 3-state DNA HMM.

    Each DiscreteDistribution is the emission distribution of one state:
    the probability of observing each nucleotide while the model is in
    that state.  Prints the state names, plots the model, and prints the
    forward matrix for the sequence 'ACG'.
    """
    d1 = DiscreteDistribution({'A': 0.35, 'C': 0.20, 'G': 0.05, 'T': 0.40})
    d2 = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})
    d3 = DiscreteDistribution({'A': 0.10, 'C': 0.40, 'G': 0.40, 'T': 0.10})

    s1 = State(d1, name="s1")
    s2 = State(d2, name="s2")
    s3 = State(d3, name="s3")

    model = HiddenMarkovModel('example')
    model.add_states([s1, s2, s3])
    model.add_transition(model.start, s1, 0.90)
    model.add_transition(model.start, s2, 0.10)
    model.add_transition(s1, s1, 0.80)
    model.add_transition(s1, s2, 0.20)
    model.add_transition(s2, s2, 0.90)
    model.add_transition(s2, s3, 0.10)
    model.add_transition(s3, s3, 0.70)
    model.add_transition(s3, model.end, 0.30)
    model.bake()

    # Iterate the states directly instead of indexing by position.
    for state in model.states:
        print(state.name)
    model.plot()
    print("forward:", model.forward(list('ACG')))
def test_check_input_list():
    """_check_input accepts well-formed observation vectors and rejects bad ones."""
    # Missing values may be encoded as None or NaN, in lists or arrays.
    for valid in (['A', None, None],
                  ['A', numpy.nan, numpy.nan],
                  numpy.array(['A', None, None]),
                  numpy.array(['A', numpy.nan, numpy.nan]),
                  numpy.array(['A', 'B', 'C'])):
        _check_input(valid, monty_network)

    # The literal string 'NaN' is not a valid symbol.
    assert_raises(ValueError, _check_input,
                  numpy.array(['NaN', numpy.nan, numpy.nan]), monty_network)
    # 'D' is not a known symbol for any variable.
    assert_raises(ValueError, _check_input,
                  numpy.array(['A', 'B', 'D']), monty_network)
    # Wrong number of variables (too few / too many).
    assert_raises(ValueError, _check_input, ['A'], monty_network)
    assert_raises(ValueError, _check_input, ['A', 'C', 'E', 'F'], monty_network)

    # A distribution over known symbols is accepted as soft evidence...
    soft = DiscreteDistribution({'A': 0.25, 'B': 0.25, 'C': 0.25})
    _check_input([soft, None, None], monty_network)
    # ...but not one mentioning an unknown symbol.
    soft = DiscreteDistribution({'A': 0.25, 'B': 0.25, 'D': 0.25})
    assert_raises(ValueError, _check_input, [soft, None, None], monty_network)
def test_monty():
    """Posterior after observing only Monty's door.

    Fix: the original computed discrete_equality(...) but discarded the
    boolean result, so the checks could never fail (compare test_conditional,
    which asserts on it).  The comparisons are now asserted.
    """
    a = monty_network.predict_proba({'monty': 'A'})
    # Monty's marginal collapses onto the observed value.
    assert discrete_equality(a[monty_index], DiscreteDistribution(
        {'A': 1.0, 'B': 0.0, 'C': 0.0}))
    # Guest and prize posteriors are identical given only Monty's door.
    assert discrete_equality(a[guest_index], a[prize_index])
    assert discrete_equality(a[guest_index], DiscreteDistribution(
        {'A': 0.0, 'B': 1. / 2, 'C': 1. / 2}))
def hmmer2pom(hmm):
    """Convert a HMMER-format profile HMM (given as text) into a pomegranate
    HiddenMarkovModel and return it serialized as JSON.

    Parameters:
        hmm: full text of a HMMER model file.

    Raises:
        AssertionError: if the profile contains no match states.
    """
    # set up environment
    from math import exp
    from pomegranate import DiscreteDistribution,HiddenMarkovModel,State
    # tags: header key/value metadata; hmmlines: the emission/transition table.
    tags = dict(); header = 0; alphabet = None; hmmlines = list()
    # parse HMMER file
    for line in hmm.splitlines():
        l = line.strip()
        if len(l) == 0 or l[0] == '#':
            continue
        elif header == 0:
            if l.startswith('HMM') and l[3] != 'E': # beginning of actual HMM
                header = 1; alphabet = l.split()[1:]
            else:
                # Header tag line: repeated tags accumulate into a list.
                parts = l.strip().split()
                if parts[0] in tags:
                    if not isinstance(tags[parts[0]], list):
                        tags[parts[0]] = [tags[parts[0]]]
                    tags[parts[0]].append(' '.join(parts[1:]))
                else:
                    tags[parts[0]] = ' '.join(parts[1:])
        elif header == 1:
            # Skip the column-label line that follows the 'HMM' line.
            header = 2
        else:
            if l.startswith('COMPO'):
                parts = l.strip().split(); tags[parts[0]] = ' '.join(parts[1:])
            else:
                hmmlines.append(l)
    # create all states.  HMMER stores negative log probabilities, hence
    # exp(-1 * value) throughout.  Three table lines per match position:
    # match emissions, insert emissions, state transitions.
    model = HiddenMarkovModel(tags['NAME']); tmpstates = list(); K = 0
    i_emit = hmmlines[0].split(); tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I0")) # insertion state
    for l in range(2,len(hmmlines),3):
        m_emit,i_emit,state_trans = [hmmlines[l+i].split() for i in range(0,3)]; K = int(m_emit[0])
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(m_emit[i+1])) for i in range(len(alphabet))}), name="M%d" % K)) # match state
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I%d" % K)) # insertion state
        tmpstates.append(State(None, name="D%d" % K)) # deletion state (silent)
    assert K != 0, "No match states in profile HMM"
    # Alias M0 to the model start and M{K+1} to the model end so the
    # transition loop below can treat them uniformly.
    model.add_states(tmpstates); name2state = {state.name:state for state in tmpstates}; name2state["M0"] = model.start; name2state["M%d"%(K+1)] = model.end
    # create all transitions ('*' in HMMER means probability 0 / absent edge)
    for l in range(1,len(hmmlines),3):
        k = int(l/3); parts = hmmlines[l].split()
        model.add_transition(name2state["M%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[0]))) # 0: M_k -> M_k+1
        model.add_transition(name2state["M%d"%k], name2state["I%d"%k], exp(-1*float(parts[1]))) # 1: M_k -> I_k
        if parts[2] != '*': # no D_k+1 in last row
            model.add_transition(name2state["M%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[2]))) # 2: M_k -> D_k+1
        model.add_transition(name2state["I%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[3]))) # 3: I_k -> M_k+1
        model.add_transition(name2state["I%d"%k], name2state["I%d"%k], exp(-1*float(parts[4]))) # 4: I_k -> I_k
        if k != 0: # no D0 state
            model.add_transition(name2state["D%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[5]))) # 5: D_k -> M_k+1
            if parts[6] != '*': # no D0 state and no D_k+1 in last row
                model.add_transition(name2state["D%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[6]))) # 6: D_k -> D_k+1
    model.bake()
    return model.to_json()
def setup_monty():
    """Build the Monty Hall network and record each state's column index."""
    global monty_network, monty_index, prize_index, guest_index
    random.seed(0)
    # The guest picks a door uniformly at random.
    guest = DiscreteDistribution({'A': 1. / 3, 'B': 1. / 3, 'C': 1. / 3})
    # The prize location is independent of the guest's choice.
    prize = DiscreteDistribution({'A': 1. / 3, 'B': 1. / 3, 'C': 1. / 3})
    # Monty's door depends on both the guest's door and the prize door:
    # rows where his door matches either one have probability 0.
    monty = ConditionalProbabilityTable(
        [['A', 'A', 'A', 0.0], ['A', 'A', 'B', 0.5], ['A', 'A', 'C', 0.5],
         ['A', 'B', 'A', 0.0], ['A', 'B', 'B', 0.0], ['A', 'B', 'C', 1.0],
         ['A', 'C', 'A', 0.0], ['A', 'C', 'B', 1.0], ['A', 'C', 'C', 0.0],
         ['B', 'A', 'A', 0.0], ['B', 'A', 'B', 0.0], ['B', 'A', 'C', 1.0],
         ['B', 'B', 'A', 0.5], ['B', 'B', 'B', 0.0], ['B', 'B', 'C', 0.5],
         ['B', 'C', 'A', 1.0], ['B', 'C', 'B', 0.0], ['B', 'C', 'C', 0.0],
         ['C', 'A', 'A', 0.0], ['C', 'A', 'B', 1.0], ['C', 'A', 'C', 0.0],
         ['C', 'B', 'A', 1.0], ['C', 'B', 'B', 0.0], ['C', 'B', 'C', 0.0],
         ['C', 'C', 'A', 0.5], ['C', 'C', 'B', 0.5], ['C', 'C', 'C', 0.0]],
        [guest, prize])

    guest_state = State(guest, name="guest")
    prize_state = State(prize, name="prize")
    monty_state = State(monty, name="monty")

    # Build the network; Monty is conditionally dependent on guest and prize.
    monty_network = BayesianNetwork("test")
    monty_network.add_nodes(guest_state, prize_state, monty_state)
    monty_network.add_edge(guest_state, monty_state)
    monty_network.add_edge(prize_state, monty_state)
    monty_network.bake()

    monty_index = monty_network.states.index(monty_state)
    prize_index = monty_network.states.index(prize_state)
    guest_index = monty_network.states.index(guest_state)
def test_conditional():
    """Marginalising a CPT over its parent yields the expected mixture."""
    prior = DiscreteDistribution({True: 0.01, False: 0.99})
    table = ConditionalProbabilityTable(
        [[True, True, 0.95],
         [True, False, 0.05],
         [False, True, 0.05],
         [False, False, 0.95]], [prior])
    # P(True) = 0.01 * 0.95 + 0.99 * 0.05 = 0.059
    expected = DiscreteDistribution({False: 0.941, True: 0.059})
    assert discrete_equality(table.marginal(), expected)
def __init__(self):
    """Construct the classic 'lung cancer' Bayesian network and its metadata.

    Pollution and Smoker are root variables; Cancer depends on both;
    XRay and Dyspnoea each depend on Cancer.  `self.meta` describes each
    node as a binary categorical with states 'T'/'F'.

    Fix: removed stray debug `print(Smoker)` / `print(Cancer)` calls that
    wrote to stdout on every construction.
    """
    Pollution = DiscreteDistribution({'F': 0.9, 'T': 0.1})
    Smoker = DiscreteDistribution({'T': 0.3, 'F': 0.7})
    Cancer = ConditionalProbabilityTable([
        ['T', 'T', 'T', 0.05],
        ['T', 'T', 'F', 0.95],
        ['T', 'F', 'T', 0.02],
        ['T', 'F', 'F', 0.98],
        ['F', 'T', 'T', 0.03],
        ['F', 'T', 'F', 0.97],
        ['F', 'F', 'T', 0.001],
        ['F', 'F', 'F', 0.999],
    ], [Pollution, Smoker])
    XRay = ConditionalProbabilityTable([
        ['T', 'T', 0.9],
        ['T', 'F', 0.1],
        ['F', 'T', 0.2],
        ['F', 'F', 0.8],
    ], [Cancer])
    Dyspnoea = ConditionalProbabilityTable([
        ['T', 'T', 0.65],
        ['T', 'F', 0.35],
        ['F', 'T', 0.3],
        ['F', 'F', 0.7],
    ], [Cancer])

    s1 = Node(Pollution, name="Pollution")
    s2 = Node(Smoker, name="Smoker")
    s3 = Node(Cancer, name="Cancer")
    s4 = Node(XRay, name="XRay")
    s5 = Node(Dyspnoea, name="Dyspnoea")

    model = BayesianNetwork("Lung Cancer")
    model.add_states(s1, s2, s3, s4, s5)
    model.add_edge(s1, s3)
    model.add_edge(s2, s3)
    model.add_edge(s3, s4)
    model.add_edge(s3, s5)
    model.bake()
    self.model = model

    # One metadata entry per node, in network order.
    name_mapper = ["Pollution", "Smoker", "Cancer", "XRay", "Dyspnoea"]
    self.meta = [{
        "name": name_mapper[i],
        "type": "categorical",
        "size": 2,
        "i2s": ['T', 'F']
    } for i in range(self.model.node_count())]
def test_guest_with_monty():
    """Posterior over the prize door when both guest and Monty are observed."""
    cases = (
        ('B', DiscreteDistribution({'A': 1. / 3, 'B': 0.0, 'C': 2. / 3})),
        ('C', DiscreteDistribution({'A': 1. / 3, 'B': 2. / 3, 'C': 0.0})),
    )
    for monty_door, expected_prize in cases:
        result = monty_network.predict_proba({'guest': 'A', 'monty': monty_door})
        # Observed variables pass through unchanged.
        assert_equal(result[guest_index], 'A')
        assert_equal(result[monty_index], monty_door)
        # Switching doubles the win probability for the remaining door.
        assert_discrete_equal(result[prize_index], expected_prize)
def test_io_fit():
    """Fitting from a raw array + weights and from a data generator must agree."""
    # First network: fit directly on X with per-sample weights.
    d1 = DiscreteDistribution({True: 0.6, False: 0.4})
    d2 = ConditionalProbabilityTable([
        [True, 'A', 0.2],
        [True, 'B', 0.8],
        [False, 'A', 0.3],
        [False, 'B', 0.7]], [d1])
    d3 = ConditionalProbabilityTable([
        ['A', 0, 0.3],
        ['A', 1, 0.7],
        ['B', 0, 0.8],
        ['B', 1, 0.2]], [d2])
    n1, n2, n3 = Node(d1), Node(d2), Node(d3)
    model1 = BayesianNetwork()
    model1.add_nodes(n1, n2, n3)
    model1.add_edge(n1, n2)
    model1.add_edge(n2, n3)
    model1.bake()
    model1.fit(X, weights=weights)

    # Second network: different initial parameters, fit from the generator.
    d1 = DiscreteDistribution({True: 0.2, False: 0.8})
    d2 = ConditionalProbabilityTable([
        [True, 'A', 0.7],
        [True, 'B', 0.2],
        [False, 'A', 0.4],
        [False, 'B', 0.6]], [d1])
    d3 = ConditionalProbabilityTable([
        ['A', 0, 0.9],
        ['A', 1, 0.1],
        ['B', 0, 0.0],
        ['B', 1, 1.0]], [d2])
    n1, n2, n3 = Node(d1), Node(d2), Node(d3)
    model2 = BayesianNetwork()
    model2.add_nodes(n1, n2, n3)
    model2.add_edge(n1, n2)
    model2.add_edge(n2, n3)
    model2.bake()
    model2.fit(data_generator)

    # Both training paths should converge to the same parameters.
    logp1 = model1.log_probability(X)
    logp2 = model2.log_probability(X)
    assert_array_almost_equal(logp1, logp2)
def test_check_input_list_of_dicts():
    """Dict observations: known keys/values pass, unknown ones raise."""
    _check_input([{'guest': 'A'}], monty_network)
    # Missing-value markers are not allowed inside a dict observation.
    for bad_value in ('NaN', None, numpy.nan):
        assert_raises(ValueError, _check_input,
                      [{'guest': bad_value}], monty_network)
    assert_raises(ValueError, _check_input,
                  [{'guest': 'NaN', 'prize': 'B'}], monty_network)
    # Any subset of valid variables may be observed.
    _check_input([{'guest': 'A', 'prize': 'C'}], monty_network)
    _check_input([{'guest': 'A', 'prize': 'C', 'monty': 'C'}], monty_network)
    # Soft evidence as a distribution over known symbols is accepted.
    soft = DiscreteDistribution({'A': 0.25, 'B': 0.25, 'C': 0.50})
    _check_input([{'guest': soft}], monty_network)
    # Unknown variable name.
    assert_raises(ValueError, _check_input,
                  [{'hello': 'A', 'prize': 'B'}], monty_network)

    # A whole batch is valid iff every element is valid.
    batch = [{'guest': 'A'},
             {'guest': 'A', 'prize': 'C'},
             {'guest': 'A', 'prize': 'C', 'monty': 'C'},
             {'guest': DiscreteDistribution({'A': 0.25, 'B': 0.25, 'C': 0.50})}]
    _check_input(batch, monty_network)
    batch.append({'guest': 'NaN', 'prize': 'B'})
    assert_raises(ValueError, _check_input, batch, monty_network)
def update_hmm(self):
    """Rebuild self.hmm from the current emission, start and transition tables."""
    hmm = HiddenMarkovModel('hmm')
    # One discrete emission distribution and one state per hidden state.
    states = []
    for i in range(self.num_states):
        emission = DiscreteDistribution(
            dict(zip(range(self.num_emissions), self.emissions[i])))
        states.append(State(emission, 's' + str(i).zfill(2)))
    hmm.add_states(states)
    # Wire the start probabilities and the full transition matrix.
    for i, source in enumerate(states):
        hmm.add_transition(hmm.start, source, self.start_prob[i])
        for j, target in enumerate(states):
            hmm.add_transition(source, target, self.transitions[i, j])
    self.hmm = hmm
    self.hmm.bake()
def train_model(data: np.ndarray, clusters: int = 5, init_nodes: list = None) -> BayesianNetwork:
    """Learn a Bayesian network over `data` augmented with one latent variable.

    The latent column is seeded by k-means cluster labels, the structure is
    learned with hc_rr, then parameters are fit with pomegranate.

    Fixes: removed the dead `bn = BayesNet()` assignment (immediately
    overwritten by hc_rr), repaired the garbled comment, and dropped the
    redundant parentheses in the return statement.
    """
    # Cluster the initial data in order to fill in a hidden variable based
    # on the distribution of clusters.
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(data)
    labels = kmeans.labels_
    hidden_dist = DiscreteDistribution.from_samples(labels)
    hidden_var = np.array(hidden_dist.sample(data.shape[0]))
    new_data = np.column_stack((data, hidden_var))
    latent = new_data.shape[1] - 1  # index of the appended hidden column

    # Train the network structure on data, taking the hidden variable
    # into account.
    bn = hc_rr(new_data, latent=latent, init_nodes=init_nodes)
    structure = tuple(tuple(bn.F[rv]['parents']) for rv in sorted(bn.nodes()))
    bn = BayesianNetwork.from_structure(new_data, structure)
    bn.bake()

    # Learn the hidden variable: blank it out, impute, and refit.
    hidden_var = np.array([np.nan] * data.shape[0])
    new_data = np.column_stack((data, hidden_var))
    # NOTE(review): predict() returns the imputed samples and its result is
    # discarded here, so fit() may still see the NaN column — confirm intended.
    bn.predict(new_data)
    bn.fit(new_data)
    bn.bake()
    return bn
def make_insert(zone, name):
    """Build the insert-state descriptor for a profile zone.

    Counts symbols across all of the zone's columns, skipping gaps ('-').
    A newly seen symbol starts at 2 (one pseudocount plus its first
    occurrence); counts are then normalised to an emission distribution.
    """
    counts = {}
    total = 0
    for column in zone['columns']:
        for symbol in column.elements:
            if symbol == '-':
                continue
            if symbol in counts:
                counts[symbol] += 1
                total += 1
            else:
                counts[symbol] = 2
                total += 2
    emission = {symbol: count / total for symbol, count in counts.items()}
    return {
        'type': 'insert',
        'emission': emission,
        'zone': zone,
        'insert_state': State(DiscreteDistribution(emission), name='insert ' + name)
    }
def get_insert_dist(self, n_features, initial_seq):
    """Emission distribution for insert states, chosen by feature type."""
    first = initial_seq[0]
    if isinstance(first, int) or np.issubdtype(first, np.integer):
        # Integer features: uniform distribution over all feature indices.
        return DiscreteDistribution.from_samples(range(n_features))
    # Otherwise: Gaussian estimated from the initial sequence.
    return MultivariateGaussianDistribution.from_samples(np.array(initial_seq))
def test_list_of_dicts_predict_proba_parallel():
    """predict_proba over a batch of dict observations with n_jobs=2."""
    batch = [{'guest': 'A', 'monty': 'B'},
             {'guest': 'B', 'prize': 'A'},
             {'monty': 'C', 'prize': 'B'},
             {'monty': 'B'},
             {'prize': 'A'}]
    expected_prize = DiscreteDistribution({'A': 1. / 3, 'B': 0., 'C': 2. / 3})
    posterior = monty_network.predict_proba(batch, n_jobs=2)
    # First observation: guest/monty observed, prize is the Monty posterior.
    assert_equal(posterior[0][0], 'A')
    assert_equal(posterior[0][2], 'B')
    assert_discrete_equal(posterior[0][1], expected_prize)
    # Observed values are passed through for the other observations.
    assert_equal(posterior[1][0], 'B')
    assert_equal(posterior[1][1], 'A')
    assert_equal(posterior[3][2], 'B')
    assert_equal(posterior[4][1], 'A')
def train(self):
    """Fit a Markov chain over assignee tossing paths.

    Reads the tossing-path collection and target histogram from self._c,
    fits a MarkovChain on the assignee paths, and returns self._c.
    """
    logger.info("Building tossing graphs...")
    start_time = time.time()
    tossing_path_collection = self._c['tossing']
    logger.info("Found %d paths" % len(tossing_path_collection))
    target_dict = self._c['target_dict']
    logger.info('length of tossing collection is %d' % len(tossing_path_collection))
    # NOTE(review): train_border is only ever logged — no split is actually
    # performed below; confirm whether a train/test split was intended.
    train_ratio = 0.8
    train_border = int(len(tossing_path_collection) * train_ratio)
    logger.info(
        'taking %d data to train, %d to test' %
        (train_border, len(tossing_path_collection) - train_border))
    # Normalise target counts into a probability distribution, dropping the
    # '__total' bookkeeping key after using it as the denominator.
    total = target_dict['__total']
    distribution = {k: v / total for k, v in target_dict.items()}
    distribution.pop('__total', None)
    paths = [t.get_assignee_path() for t in tossing_path_collection]
    # get discrete distribution
    # NOTE(review): zeroth_dist is built but never used (see the commented
    # hint on the next line) — presumably meant to seed MarkovChain; verify.
    zeroth_dist = DiscreteDistribution(distribution)
    first_chain = MarkovChain.from_samples(paths) # ([zeroth_dist])
    # from_samples already estimates parameters; this refit is on the same data.
    first_chain.fit(paths)
    logger.info('Fitting the paths took {} seconds'.format(time.time() - start_time))
    return self._c
def test_cpd_sampling():
    """Sampling a CPT, both marginally and with an observed parent value."""
    parent = DiscreteDistribution({"A": 0.1, "B": 0.9})
    table = ConditionalProbabilityTable(
        [["A", "A", 0.1],
         ["A", "B", 0.9],
         ["B", "A", 0.7],
         ["B", "B", 0.3]], [parent])

    def empirical(observed=None):
        # Empirical [P(A), P(B)] over 1000 draws.
        if observed is None:
            draws = [0 if table.sample() == "A" else 1 for _ in range(1000)]
        else:
            draws = [0 if table.sample(parent_values=observed) == "A" else 1
                     for _ in range(1000)]
        return numpy.bincount(draws) / 1000.0

    # Marginal: P(A) = 0.1*0.1 + 0.9*0.7 = 0.64, P(B) = 0.36.
    est = empirical()
    assert_almost_equal(est[0], 0.64, 1)
    assert_almost_equal(est[1], 0.36, 1)
    # Parent observed as "A": reduces to the [0.1, 0.9] row.
    est = empirical({parent: "A"})
    assert_almost_equal(est[0], 0.1, 1)
    assert_almost_equal(est[1], 0.9, 1)
    # Parent observed as "B": reduces to the [0.7, 0.3] row.
    est = empirical({parent: "B"})
    assert_almost_equal(est[0], 0.7, 1)
    assert_almost_equal(est[1], 0.3, 1)
def make_main(zone, name):
    """Build the main-state descriptor for a profile zone.

    Same smoothed counting as make_insert: gaps ('-') are skipped and a
    newly seen symbol starts at 2 (one pseudocount plus its first
    occurrence).  A delete state is attached only when the zone allows it.
    """
    counts = {}
    total = 0
    for symbol in zone['column'].elements:
        if symbol == '-':
            continue
        if symbol in counts:
            counts[symbol] += 1
            total += 1
        else:
            counts[symbol] = 2
            total += 2
    emission = {symbol: count / total for symbol, count in counts.items()}
    return {
        'type': 'main',
        'emission': emission,
        'zone': zone,
        'main_state': State(DiscreteDistribution(emission), name='main ' + name),
        'delete_state': State(None, name='none delete ' + name) if zone['delete'] else None
    }
def setup_titanic():
    """Build the small Titanic survival network used by the tests."""
    global titanic_network, passenger, gender, tclass
    # Passengers on the Titanic either survive or perish.
    passenger = DiscreteDistribution({'survive': 0.6, 'perish': 0.4})
    # Gender conditioned on survival.
    gender = ConditionalProbabilityTable(
        [['survive', 'male', 0.0],
         ['survive', 'female', 1.0],
         ['perish', 'male', 1.0],
         ['perish', 'female', 0.0]], [passenger])
    # Class of travel conditioned on survival.
    tclass = ConditionalProbabilityTable(
        [['survive', 'first', 0.0],
         ['survive', 'second', 1.0],
         ['survive', 'third', 0.0],
         ['perish', 'first', 1.0],
         ['perish', 'second', 0.0],
         ['perish', 'third', 0.0]], [passenger])

    # States pair each distribution with a readable name.
    passenger_state = State(passenger, name="passenger")
    gender_state = State(gender, name="gender")
    class_state = State(tclass, name="class")

    titanic_network = BayesianNetwork("Titanic Disaster")
    titanic_network.add_nodes(passenger_state, gender_state, class_state)
    # Gender and class are both conditionally dependent on survival.
    titanic_network.add_edge(passenger_state, gender_state)
    titanic_network.add_edge(passenger_state, class_state)
    titanic_network.bake()
def sequence_state_factory(states_data, name):
    """Create one State per entry, named `name` plus its positional index."""
    return [State(DiscreteDistribution(data.states_distribution),
                  name=name + str(index))
            for index, data in enumerate(states_data)]
def test_single_list_predict_proba():
    """A single observation vector yields one flat result row."""
    expected_prize = DiscreteDistribution({'A': 1. / 3, 'B': 0., 'C': 2. / 3})
    row = monty_network.predict_proba(['A', None, 'B'])
    assert_equal(row[0], 'A')
    assert_equal(row[2], 'B')
    assert_discrete_equal(row[1], expected_prize)
def test_list_of_dicts_predict_proba():
    """A one-element batch of dict observations yields a nested result."""
    expected_prize = DiscreteDistribution({'A': 1. / 3, 'B': 0., 'C': 2. / 3})
    posterior = monty_network.predict_proba([{'guest': 'A', 'monty': 'B'}])
    assert_equal(posterior[0][0], 'A')
    assert_equal(posterior[0][2], 'B')
    assert_discrete_equal(posterior[0][1], expected_prize)
def bake_model(tags_sequence, words_sequence): """ 'tags' are the time-demand labels that generate the emitted demand level. Demand level are represented by 'words' """ # rdemand words = [x for x in chain(*words_sequence)] tag_unigrams = unigram_counts(words) tag_bigrams = bigram_counts(words) # Uniform distribution for starting and ending labels all_labels = list(set(words)) tag_starts = starting_counts(all_labels) tag_ends = ending_counts(all_labels) basic_model = HiddenMarkovModel(name="base-hmm-tagger") # Emission count label_train = tags_sequence rdemand_train = words_sequence emission_count = pair_counts(rdemand_train, label_train) # States with emission probability distributions P(word | tag) states = [] for rdemand, label_dict in emission_count.items(): dist_tag = DiscreteDistribution({ label: cn / tag_unigrams[rdemand] for label, cn in label_dict.items() }) states.append(State(dist_tag, name=rdemand)) basic_model.add_states(states) state_names = [s.name for s in states] state_index = {tag: num for num, tag in enumerate(state_names)} # Start transition total_start = sum(tag_starts.values()) for tag, cn in tag_starts.items(): # sname = state_index[tag] basic_model.add_transition(basic_model.start, states[state_index[tag]], cn / total_start) # End transition total_end = sum(tag_ends.values()) for tag, cn in tag_ends.items(): basic_model.add_transition(states[state_index[tag]], basic_model.end, cn / total_end) # Edges between states for the observed transition frequencies P(tag_i | tag_i-1) for key, value in tag_bigrams.items(): basic_model.add_transition(states[state_index[key[0]]], states[state_index[key[1]]], value / tag_unigrams[key[0]]) # Finalize the model basic_model.bake() return basic_model
def state_sequence_from(emissions, name):
    """Create one indexed State per emission dict.

    Returns (states, weights) where weights is a list of 1s, one per
    consecutive state pair.
    """
    states = []
    for index, emission in enumerate(emissions):
        state_name = name + '_' + str(index)
        print('creado estado', state_name)
        states.append(State(DiscreteDistribution(emission), name=state_name))
    return states, [1] * (len(states) - 1)
def test_guest_monty():
    """Posteriors after observing only the guest's door.

    Fix: the original computed discrete_equality(...) but discarded the
    boolean result, so the checks could never fail (compare test_conditional,
    which asserts on it).  The comparisons are now asserted.
    """
    a = monty_network.predict_proba({'guest': 'A'})
    b = monty_network.predict_proba({'guest': 'B'})
    c = monty_network.predict_proba({'guest': 'C'})
    prize_correct = DiscreteDistribution(
        {'A': 1. / 3, 'B': 1. / 3, 'C': 1. / 3})
    # The guest's choice carries no information about the prize location.
    assert discrete_equality(a[prize_index], b[prize_index])
    assert discrete_equality(a[prize_index], c[prize_index])
    assert discrete_equality(a[prize_index], prize_correct)
    # Monty never opens the guest's door; the other two are equally likely.
    assert discrete_equality(a[monty_index], DiscreteDistribution(
        {'A': 0.0, 'B': 1. / 2, 'C': 1. / 2}))
    assert discrete_equality(b[monty_index], DiscreteDistribution(
        {'A': 1. / 2, 'B': 0.0, 'C': 1. / 2}))
    assert discrete_equality(c[monty_index], DiscreteDistribution(
        {'A': 1. / 2, 'B': 1. / 2, 'C': 0.0}))
def get_match_dist(self, index, n_features, initial_seq):
    """Emission distribution for the match state at `index`.

    Fix: made the integer check consistent with get_insert_dist, which also
    accepts numpy integer scalars via np.issubdtype — previously a numpy
    integer feature would incorrectly fall into the Gaussian branch.
    Dead commented-out code removed.
    """
    if isinstance(initial_seq[index], int) \
            or np.issubdtype(initial_seq[index], np.integer):
        # Integer features: uniform distribution over all feature indices.
        return DiscreteDistribution.from_samples(range(n_features))
    else:
        # Continuous features: Gaussian from the initial sequence, with the
        # position index repeated INITIAL_EMPHASIS times to weight it.
        return MultivariateGaussianDistribution.from_samples(
            np.concatenate(
                (np.tile(index, (INITIAL_EMPHASIS, 1)), np.array(initial_seq))))
def buildHmm(minAmpliconLength, maxGap, windowSize):
    """Build a two-state (background/amplicon) HMM over binary window signals.

    Emission and transition probabilities are derived from the minimum
    amplicon length, the maximum tolerated gap, and the window size.
    """
    b_bkgd_1 = 0.1
    a_interstate = b_bkgd_1 ** (2 * minAmpliconLength / windowSize)
    b_amp_0 = a_interstate ** (0.5 * windowSize / maxGap)

    bkgd_state = State(DiscreteDistribution({0: 1 - b_bkgd_1, 1: b_bkgd_1}),
                       name='background')
    amp_state = State(DiscreteDistribution({0: b_amp_0, 1: 1 - b_amp_0}),
                      name='amplicon')

    hmm = HiddenMarkovModel()
    hmm.add_states(bkgd_state, amp_state)
    stay = 1 - a_interstate
    hmm.add_transition(hmm.start, bkgd_state, stay)
    hmm.add_transition(hmm.start, amp_state, a_interstate)
    hmm.add_transition(bkgd_state, bkgd_state, stay)
    hmm.add_transition(bkgd_state, amp_state, a_interstate)
    hmm.add_transition(amp_state, bkgd_state, a_interstate)
    hmm.add_transition(amp_state, amp_state, stay)
    hmm.bake()
    return hmm
def with_variations(dist, name):
    """Return (match, insert, delete) states for one profile position."""
    uniform_dna = DiscreteDistribution({
        'a': 0.25,
        'c': 0.25,
        'g': 0.25,
        't': 0.25
    })
    match_state = State(dist, name=name)
    insert_state = State(uniform_dna, name='i_' + name)
    delete_state = State(None, name='d_' + name)  # silent state
    return match_state, insert_state, delete_state
def test_discrete():
    """End-to-end checks for DiscreteDistribution: log-probabilities,
    weighted and unweighted fitting, summarize/from_summaries with inertia,
    freezing, from_samples with pseudocounts, and JSON/pickle round-trips.
    """
    d = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})
    # log(0.25) for every symbol of the uniform distribution.
    assert_equal(d.log_probability('C'), -1.3862943611198906)
    assert_equal(d.log_probability('A'), d.log_probability('C'))
    assert_equal(d.log_probability('G'), d.log_probability('T'))
    # Unseen symbols (case-sensitive) have zero probability.
    assert_equal(d.log_probability('a'), float('-inf'))
    seq = "ACGTACGTTGCATGCACGCGCTCTCGCGC"
    d.fit(list(seq))
    assert_equal(d.log_probability('C'), -0.9694005571881036)
    assert_equal(d.log_probability('A'), -1.9810014688665833)
    assert_equal(d.log_probability('T'), -1.575536360758419)
    seq = "ACGTGTG"
    # Zero weight on the leading 'A' drives P(A) to zero.
    d.fit(list(seq), weights=[0., 1., 2., 3., 4., 5., 6.])
    assert_equal(d.log_probability('A'), float('-inf'))
    assert_equal(d.log_probability('C'), -3.044522437723423)
    assert_equal(d.log_probability('G'), -0.5596157879354228)
    # Incremental summaries must match the equivalent single weighted fit.
    d.summarize(list("ACG"), weights=[0., 1., 2.])
    d.summarize(list("TGT"), weights=[3., 4., 5.])
    d.summarize(list("G"), weights=[6.])
    d.from_summaries()
    assert_equal(d.log_probability('A'), float('-inf'))
    assert_equal(round(d.log_probability('C'), 4), -3.0445)
    assert_equal(round(d.log_probability('G'), 4), -0.5596)
    # Inertia blends old parameters with the newly summarised ones.
    d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
    d.summarize(list("ABABABAB"))
    d.summarize(list("ABAB"))
    d.summarize(list("BABABABABABABABABA"))
    d.from_summaries(inertia=0.75)
    assert_equal(d.parameters[0], {'A': 0.125, 'B': 0.875})
    d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
    d.summarize(list("ABABABAB"))
    d.summarize(list("ABAB"))
    d.summarize(list("BABABABABABABABABA"))
    d.from_summaries(inertia=0.5)
    assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})
    # A frozen distribution ignores further fitting.
    d.freeze()
    d.fit(list('ABAABBAAAAAAAAAAAAAAAAAA'))
    assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})
    # from_samples, with and without pseudocount smoothing.
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'])
    assert_equal(d.parameters[0], {'A': 0.75, 'B': 0.25})
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'], pseudocount=0.5)
    assert_equal(d.parameters[0], {'A': 0.70, 'B': 0.30})
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'], pseudocount=6)
    assert_equal(d.parameters[0], {'A': 0.5625, 'B': 0.4375})
    # Serialisation round-trips preserve both type and parameters.
    e = Distribution.from_json(d.to_json())
    assert_equal(e.name, "DiscreteDistribution")
    assert_equal(e.parameters[0], {'A': 0.5625, 'B': 0.4375})
    f = pickle.loads(pickle.dumps(e))
    assert_equal(f.name, "DiscreteDistribution")
    assert_equal(f.parameters[0], {'A': 0.5625, 'B': 0.4375})