def get_insert_dist(self, n_features, initial_seq):
    """Build the insert-state emission distribution from the initial sequence.

    Parameters
    ----------
    n_features : int
        Number of discrete symbols (used for the uniform discrete case).
    initial_seq : sequence
        Observed initial sequence; the type of its first element selects
        the distribution family.

    Returns
    -------
    DiscreteDistribution or MultivariateGaussianDistribution
    """
    # isinstance with (int, np.integer) covers both Python ints and numpy
    # integer scalars. The original `np.issubdtype(initial_seq[0],
    # np.integer)` passed a *value* where a dtype-like is expected, which
    # can raise TypeError for plain floats — exactly the inputs that are
    # supposed to take the Gaussian branch below.
    if isinstance(initial_seq[0], (int, np.integer)):
        # Discrete symbols: equal probability over all n_features symbols.
        return DiscreteDistribution.from_samples(range(n_features))
    # Continuous features: fit a Gaussian to the initial sequence.
    return MultivariateGaussianDistribution.from_samples(np.array(initial_seq))
def train_model(data: np.ndarray, clusters: int = 5, init_nodes: list = None) -> BayesianNetwork:
    """Train a BayesianNetwork over `data` augmented with one hidden variable.

    Parameters
    ----------
    data : np.ndarray
        Observed samples, one row per sample.
    clusters : int
        Number of KMeans clusters used to seed the hidden variable.
    init_nodes : list, optional
        Initial nodes passed through to the `hc_rr` structure search.

    Returns
    -------
    BayesianNetwork
        The fitted pomegranate network (hidden variable is the last column).
    """
    # Cluster the initial data and sample a hidden variable from the
    # empirical distribution of cluster labels.
    # (The original code created a throwaway `BayesNet()` here that was
    # immediately overwritten — removed.)
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(data)
    hidden_dist = DiscreteDistribution.from_samples(kmeans.labels_)
    hidden_var = np.array(hidden_dist.sample(data.shape[0]))
    new_data = np.column_stack((data, hidden_var))
    latent = new_data.shape[1] - 1  # index of the appended hidden column

    # Learn the network structure on the augmented data, treating the last
    # column as latent.
    bn = hc_rr(new_data, latent=latent, init_nodes=init_nodes)
    structure = tuple(
        tuple(bn.F[rv]['parents']) for rv in sorted(bn.nodes()))
    bn = BayesianNetwork.from_structure(new_data, structure)
    bn.bake()

    # Re-learn the hidden variable: mark the hidden column as missing and
    # refit the parameters.
    hidden_var = np.array([np.nan] * data.shape[0])
    new_data = np.column_stack((data, hidden_var))
    # NOTE(review): predict() returns the imputed rows but the result is
    # discarded here, so fit() runs on data whose hidden column is all-NaN.
    # Confirm this is the intended missing-value/EM behavior and not a lost
    # `new_data = np.array(bn.predict(new_data))` assignment.
    bn.predict(new_data)
    bn.fit(new_data)
    bn.bake()
    return bn
def get_match_dist(self, index, n_features, initial_seq):
    """Build the match-state emission distribution for position `index`.

    Parameters
    ----------
    index : int
        Position in `initial_seq` this match state corresponds to.
    n_features : int
        Number of discrete symbols (used for the uniform discrete case).
    initial_seq : sequence
        Observed initial sequence.

    Returns
    -------
    DiscreteDistribution or MultivariateGaussianDistribution
    """
    # Mirror get_insert_dist: accept numpy integer scalars as well as
    # plain ints, so integer numpy arrays take the discrete branch.
    if isinstance(initial_seq[index], (int, np.integer)):
        return DiscreteDistribution.from_samples(range(n_features))
    # Emphasize this position by prepending INITIAL_EMPHASIS repeated rows
    # before fitting the Gaussian.
    # NOTE(review): np.tile repeats `index` itself, not `initial_seq[index]`
    # — verify this is intentional and not a typo.
    samples = np.concatenate(
        (np.tile(index, (INITIAL_EMPHASIS, 1)), np.array(initial_seq)))
    return MultivariateGaussianDistribution.from_samples(samples)
def worker(node: Type[BaseNode]) -> DiscreteParams:
    """Estimate discrete CPD parameters for `node` from the enclosing `data`.

    Returns
    -------
    DiscreteParams
        {'cprob': probabilities (a list for a root node, or a dict mapping
        str(parent-value combination) -> probability list otherwise),
         'vals': sorted string values the node can take}.
    """
    parents = node.disc_parents + node.cont_parents
    # The node's own value set comes from its marginal distribution in both
    # branches — compute it once (the original duplicated this per branch).
    marginal = DiscreteDistribution.from_samples(data[node.name].values)
    vals = sorted(str(x) for x in marginal.parameters[0].keys())
    if not parents:
        # Root node: unconditional probabilities, ordered by value.
        cprob = list(dict(sorted(marginal.items())).values())
    else:
        table = ConditionalProbabilityTable.from_samples(
            data[parents + [node.name]].values)
        params = table.parameters[0]
        cprob = dict()
        # Rows come in runs of len(vals): one run per parent combination,
        # with the probability stored in the last column of each row.
        for i in range(0, len(params), len(vals)):
            probs = [params[j][-1] for j in range(i, i + len(vals))]
            combination = [str(x) for x in params[i][0:len(parents)]]
            cprob[str(combination)] = probs
    return {"cprob": cprob, 'vals': vals}
def test_discrete():
    """Regression test for DiscreteDistribution: fitting, weights,
    summaries, inertia, freezing, pseudocounts, and serialization."""
    # Uniform DNA distribution: every symbol has log-probability ln(1/4).
    d = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})
    assert_equal(d.log_probability('C'), -1.3862943611198906)
    assert_equal(d.log_probability('A'), d.log_probability('C'))
    assert_equal(d.log_probability('G'), d.log_probability('T'))
    # Keys are case-sensitive; unseen symbols get probability zero.
    assert_equal(d.log_probability('a'), float('-inf'))

    # Unweighted fit: probabilities become the empirical frequencies.
    seq = "ACGTACGTTGCATGCACGCGCTCTCGCGC"
    d.fit(list(seq))
    assert_equal(d.log_probability('C'), -0.9694005571881036)
    assert_equal(d.log_probability('A'), -1.9810014688665833)
    assert_equal(d.log_probability('T'), -1.575536360758419)

    # Weighted fit: 'A' carries weight 0, so its probability drops to zero.
    seq = "ACGTGTG"
    d.fit(list(seq), weights=[0., 1., 2., 3., 4., 5., 6.])
    assert_equal(d.log_probability('A'), float('-inf'))
    assert_equal(d.log_probability('C'), -3.044522437723423)
    assert_equal(d.log_probability('G'), -0.5596157879354228)

    # Incremental summarize/from_summaries must match the equivalent
    # single weighted fit above.
    d.summarize(list("ACG"), weights=[0., 1., 2.])
    d.summarize(list("TGT"), weights=[3., 4., 5.])
    d.summarize(list("G"), weights=[6.])
    d.from_summaries()
    assert_equal(d.log_probability('A'), float('-inf'))
    assert_equal(round(d.log_probability('C'), 4), -3.0445)
    assert_equal(round(d.log_probability('G'), 4), -0.5596)

    # Inertia blends old and new parameters:
    # new = inertia * old + (1 - inertia) * estimate (estimate is A=B=0.5).
    d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
    d.summarize(list("ABABABAB"))
    d.summarize(list("ABAB"))
    d.summarize(list("BABABABABABABABABA"))
    d.from_summaries(inertia=0.75)
    assert_equal(d.parameters[0], {'A': 0.125, 'B': 0.875})

    d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
    d.summarize(list("ABABABAB"))
    d.summarize(list("ABAB"))
    d.summarize(list("BABABABABABABABABA"))
    d.from_summaries(inertia=0.5)
    assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})

    # A frozen distribution ignores further fitting.
    d.freeze()
    d.fit(list('ABAABBAAAAAAAAAAAAAAAAAA'))
    assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})

    # from_samples: increasing pseudocounts pull the estimate toward
    # uniform.
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'])
    assert_equal(d.parameters[0], {'A': 0.75, 'B': 0.25})

    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'],
                                          pseudocount=0.5)
    assert_equal(d.parameters[0], {'A': 0.70, 'B': 0.30})

    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'],
                                          pseudocount=6)
    assert_equal(d.parameters[0], {'A': 0.5625, 'B': 0.4375})

    # JSON round-trip preserves the type name and parameters.
    e = Distribution.from_json(d.to_json())
    assert_equal(e.name, "DiscreteDistribution")
    assert_equal(e.parameters[0], {'A': 0.5625, 'B': 0.4375})

    # Pickle round-trip as well.
    f = pickle.loads(pickle.dumps(e))
    assert_equal(f.name, "DiscreteDistribution")
    assert_equal(f.parameters[0], {'A': 0.5625, 'B': 0.4375})
def test_discrete_robust_json_serialization():
    """The module-level from_json reconstructs a DiscreteDistribution
    with its type name and parameters intact."""
    original = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'],
                                                 pseudocount=6)
    serialized = original.to_json()
    restored = from_json(serialized)
    assert_equal(restored.name, "DiscreteDistribution")
    assert_equal(restored.parameters[0], {'A': 0.5625, 'B': 0.4375})