Example #1
    def __init__(self, model_file=None, components=None):

        if model_file and os.path.exists(model_file):  # guard against model_file=None
            self.model = joblib.load(model_file)
        else:
            alu_file = 'Alu_sequence.pkl'
            if os.path.exists(alu_file):
                locis = joblib.load(alu_file)
            else:
                locis = read_sequence('hg19_Alu.bed', 0)
                locis = random.sample(locis, 100000)
                for l in tqdm(locis):
                    l.init_seq()
                    l.decode_seq()
                locis = list(filter(lambda l: l.seq is not None, locis))
                joblib.dump(locis, alu_file)

            print('Alu Loaded')
            locis = locis[0:5000]
            model = MultinomialHMM(n_components=components,
                                   verbose=True,
                                   n_iter=50)
            x = np.concatenate(list(map(attrgetter('seq'), locis)))
            x = np.reshape(x, [x.shape[0], 1])
            length = list(map(attrgetter('length'), locis))
            model.fit(x, length)
            self.model = model
            joblib.dump(self.model, model_file)
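Example #1 stops after persisting the model. A minimal usage sketch; detector and locus are hypothetical stand-ins for an instance of this class and one decoded Alu locus:

seq = locus.seq.reshape(-1, 1)              # hmmlearn expects a column of integer symbols
log_likelihood = detector.model.score(seq)  # how well the Alu model explains the sequence
state_path = detector.model.predict(seq)    # most likely hidden-state path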
Example #2
def main():
    rand_p_matrix = np.random.rand(4, 4)
    rand_b_matrix = np.random.rand(4, 3)

    print("\nGernerating p matrix...............")
    p_matrix = normalization(rand_p_matrix)
    print(p_matrix)

    print("\nGernerating b matrix...............")
    b_matrix = normalization(rand_b_matrix)
    print(b_matrix)

    # Generate 1000 observations
    O, _ = generate_observation(1000, p_matrix, b_matrix)

    # training the selection of number of states
    aic = []
    bic = []
    likelihood = []
    m = 3
    print("\nTraining the HMM for selection of number of states........")
    for n in range(2, 30):
        observations = LabelEncoder().fit_transform(O).reshape(-1, 1)  # hmmlearn expects a column vector
        model = MultinomialHMM(n_components=n, random_state=200263453)
        model.fit(observations)
        logL = model.score(observations)
        p = compute_p(n, m)
        a = AIC(logL, p)
        b = BIC(logL, observations, p)
        likelihood.append(logL)
        aic.append(a)
        bic.append(b)
    plot(aic, 'AIC')
    plot(bic, 'BIC')
    plot(likelihood, 'Log likelihood')
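The helpers AIC, BIC, and compute_p are not shown. A minimal sketch under the standard definitions; the exact free-parameter count below is an assumption, not the original code:

import numpy as np

def compute_p(n, m):
    # Assumed free parameters of an n-state HMM over m symbols:
    # transitions n*(n-1), emissions n*(m-1), start probabilities n-1.
    return n * (n - 1) + n * (m - 1) + (n - 1)

def AIC(logL, p):
    return 2 * p - 2 * logL

def BIC(logL, observations, p):
    return p * np.log(len(observations)) - 2 * logL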
Example #3
def fit_hmm_learn(X, n_states):
    samples = np.concatenate(X)
    lengths = [len(x) for x in X]

    hmm_learn_model = MultinomialHMM(n_components=n_states)
    hmm_learn_model.fit(samples, lengths)

    # Label data using hmmlearn model
    return hmm_learn_model.predict(samples, lengths)
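A toy usage sketch for fit_hmm_learn; the symbol sequences below are made up, not from the original:

import numpy as np

X = [np.array([[0], [1], [2], [1]]),   # each sequence: a column of integer symbols
     np.array([[2], [2], [0]])]
state_labels = fit_hmm_learn(X, n_states=2)  # one hidden-state label per observation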
Example #4
def train_hmm():
    """
    HMM for sequence learning.
    """
    print "Loading training data..."
    train_sequence, num_classes = get_sequence("./train_data/*")

    print "Build HMM..."
    model = MultinomialHMM(n_components=2)

    print "Train HMM..."
    model.fit([train_sequence])
Example #6
class BKT:
    """
    Implements the Bayesian Knowledge Tracing model. This only
    implements the Viterbi and EM algorithms. These may be used
    together to implement an Intelligent Tutoring System.
    """
    def __init__(self, observed):
        """
        Initializes the object and sets the internal state.

        Args:
            observed: array-like, shape (n_samples, n_features)
        """
        self.observed = np.array(observed)

        if len(self.observed.shape) == 1:
            self.observed = self.observed.reshape(-1, 1)
        # TODO: Check other parameters to this constructor
        self.model = MultinomialHMM(n_components=2, n_iter=100)

    def fit(self) -> None:
        """
        Fits the model to the observed states. Uses the EM algorithm
        to estimate model parameters.
        """
        self.model.fit(self.observed)

    def get_model_params(self) -> tuple:
        """
        Returns the model parameters. This must be run only after
        calling the `fit` function.

        Returns:
            (pi, A, B): The start probabilities, the transition
                        probabilities, and the emission probabilities.
        """
        return np.round_(self.model.startprob_, 2), np.round_(self.model.transmat_, 2), \
            np.round_(self.model.emissionprob_, 2)

    def predict(self, sequence) -> np.ndarray:
        """
        Returns the most likely hidden state sequence corresponding to
        `sequence`.

        Args:
            sequence: List of observable states

        Returns:
            state_sequence: Array
        """
        return self.model.predict(np.array(sequence).reshape(-1, 1))
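A short usage sketch for the BKT class above; the toy 0/1 response coding (0 = incorrect, 1 = correct) is an assumption:

observed = [0, 0, 1, 0, 1, 1, 1, 1]        # toy learner responses
bkt = BKT(observed)
bkt.fit()
pi, A, B = bkt.get_model_params()
states = bkt.predict([0, 1, 1])            # most likely knowledge states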
Example #7
def test_DiscreteHMM_decode(cases: str) -> None:
    np.random.seed(12346)
    cases = int(cases)
    i = 1
    N_decimal = 4
    while i < cases:
        tol=1e-3
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        # symbols is the number of unique observation types.
        symbols = np.random.randint(4, 9)
        X = []
        lengths = []
        for _ in range(n_samples):
            # the actual length is seq_length + 1
            seq_length = symbols
            this_x = np.random.choice(range(symbols), size=seq_length, replace=False)
            X.append(this_x)
            lengths.append(seq_length)
        max_iter = 100

        hmm_gold = MultinomialHMM(n_components=hidden_states, n_iter=max_iter, tol=tol)
        X_gold = np.concatenate(X).reshape((-1,1))
        hmm_gold.fit(X_gold, lengths)
        gold_A = hmm_gold.transmat_
        gold_B = hmm_gold.emissionprob_
        gold_pi = hmm_gold.startprob_
        gold_logprob, gold_state_sequence = hmm_gold.decode(X_gold, lengths)
        hmm_mine = DiscreteHMM(hidden_states=hidden_states,
                               symbols=symbols,
                               A=gold_A,
                               B=gold_B,
                               pi=gold_pi)
        mine_logprob_list = []
        mine_state_sequence = []
        for this_x in X:
            this_mine_logprob, this_mine_state_sequence = hmm_mine.decode(this_x)
            mine_logprob_list.append(this_mine_logprob)
            mine_state_sequence.append(this_mine_state_sequence)
        mine_state_sequence = np.concatenate(mine_state_sequence)
        mine_logprob = sum(mine_logprob_list)
        assert_almost_equal(mine_logprob, gold_logprob, decimal=N_decimal)
        assert_almost_equal(mine_state_sequence, gold_state_sequence, decimal=N_decimal)
        i+=1
    print('Successfully tested the decoding function of the discrete HMM!')
Example #8
def main():
    rand_p_matrix = np.random.rand(4, 4)
    rand_b_matrix = np.random.rand(4, 3)

    print("\nGernerating p matrix...............")
    p_matrix = normalization(rand_p_matrix)
    print(p_matrix)

    print("\nGernerating b matrix...............")
    b_matrix = normalization(rand_b_matrix)
    print(b_matrix)

    # Generate 1000 observations
    O, Q = generate_observation(1000, p_matrix, b_matrix)

    O_seq = [1, 2, 3, 3, 1, 2, 3, 3, 1, 2, 3, 3]
    pi = (1, 0, 0, 0)
    print("\nThe Orginal Observation Sequence O: {}".format(O[:12]))
    print("The probability 𝑝(𝑂|𝜆) is {} with O: {}".format(
        forward(O_seq, p_matrix, b_matrix, pi)[-1].sum(), O_seq))

    print("\nThe Orginal Sequence Q: {}".format(Q[:12]))
    print("The Most Probable Sequence Q: {} with O: {}".format(
        list(viterbi(O_seq, p_matrix, b_matrix, pi)), O_seq))

    observations = LabelEncoder().fit_transform(O).reshape(-1, 1)  # column vector for hmmlearn
    model = MultinomialHMM(n_components=4)
    model.fit(observations)
    est_pi = model.startprob_
    est_p = model.transmat_
    est_b = model.emissionprob_
    print("\nThe estimated transition matrix P:\n {}".format(est_p))
    print("\nThe estimated event matrix B:\n {}".format(est_b))
    print("\nThe estimated start probability pi:\n {}".format(est_pi))

    _, p = chisquare(p_matrix, est_p, axis=None)
    print("\np-value of transition matrix P: {}".format(p))

    _, p = chisquare(b_matrix, est_b, axis=None)
    print("p-value of event matrix B: {}".format(p))

    _, p = chisquare(pi, est_pi, axis=None)
    print("p-value of start probability pi: {}".format(p))
Example #9
def run_hmm_model(input_df, n_unique, A_df, Eta, n_iter=10000,
                        tol=1e-2, verbose=False, params='e', init_params=''):
    '''
        Runs the HMM model and returns the predicted results, score and model.

            input_df : The dataframe of keypresses

            n_unique : number of unique chars

            A_df : Dataframe of the transmission matrix

            Eta : Emissions matrix

            n_iter : Max number of iterations for the HMM

            tol : Stop the HMM if the score does not improve by more than this value

            verbose : Whether or not to print progress

            params : Parameters to tune

            init_params : Parameters to initialize
    '''
    # Proportion of characters starting words in English
    char_counts = get_char_counts()

    # Construct model 
    hmm = MultinomialHMM(n_components=n_unique, startprob_prior=np.append(0, char_counts.values), 
               transmat_prior=A_df.values, algorithm='viterbi', 
               random_state=None, n_iter=n_iter, tol=tol, 
               verbose=verbose, params=params, init_params=init_params)
    
    # Set values 
    hmm.emissionprob_ = Eta
    hmm.transmat_ = A_df.values
    hmm.startprob_ = np.append(0, char_counts.values)

    # Feed in the clusters as the expected output
    model_input = input_df['cluster'].values
    
    # Reshape    
    if len(model_input.shape) == 1:
        model_input = model_input.reshape((len(model_input), 1))
    
    # Fit the model
    hmm = hmm.fit(model_input)

    # Score model
    score, results = hmm.decode(model_input)

    return score, results, hmm  
Example #10
    def predict(self, day_to_predict):
        # Get records of 30 days before day_to_predict
        previous_thirty_days = get_previous_month(self.time_series, day_to_predict)
        binary_crime_sequence = previous_thirty_days['Violent Crime Committed?'].values.tolist()

        # Unsupervised HMM can't account for string of identical emissions.
        # If we see such a string, just predict the same emission for the following day.
        if binary_crime_sequence == [1]*30:
            return True
        if binary_crime_sequence == [0]*30:
            return False

        votes = []
        # Train three HMMs. They are initialized randomly, so we take "votes" from each.
        #  An odd number of voters precludes ties, and three is a decent tradeoff
        #  between run time and getting bad results by chance.
        for _ in range(3):
            # Train HMM
            model = MultinomialHMM(n_components=3, n_iter=10000)
            model.fit(np.array(binary_crime_sequence).reshape(-1, 1))

            # Determine the most likely state of the last day in the sequence
            last_state_probs = model.predict_proba(np.array(binary_crime_sequence).reshape(-1, 1))[-1]
            current_state = self.get_most_likely(last_state_probs)

            # Determine the most likely state of the day we're trying to predict
            transition_probs = model.transmat_[current_state]
            next_state = self.get_most_likely(transition_probs)

            # Determine the most likely emission (crime/no crime) from a day in that state
            emissions = model.emissionprob_[next_state]
            vote = self.get_most_likely(emissions)

            # Record this HMM's vote
            votes.append(vote)

        # Votes are 1 for crime, 0 for no crime. Return True if majority votes for crime.
        return sum(votes) > 1
Example #11
def test_HMM():
    np.random.seed(12345)
    np.set_printoptions(precision=5, suppress=True)

    P = default_hmm()
    ls, obs = P["latent_states"], P["obs_types"]

    # generate a new sequence
    O = generate_training_data(P, n_steps=30, n_examples=25)

    tol = 1e-5
    n_runs = 5
    best, best_theirs = (-np.inf, []), (-np.inf, [])
    for _ in range(n_runs):
        hmm = MultinomialHMM()
        A_, B_, pi_ = hmm.fit(O, ls, obs, tol=tol, verbose=True)

        theirs = MHMM(
            tol=tol,
            verbose=True,
            n_iter=int(1e9),
            transmat_prior=1,
            startprob_prior=1,
            algorithm="viterbi",
            n_components=len(ls),
        )

        O_flat = O.reshape(-1, 1)
        theirs = theirs.fit(O_flat, lengths=[O.shape[1]] * O.shape[0])

        hmm2 = MultinomialHMM(A=A_, B=B_, pi=pi_)
        like = np.sum([hmm2.log_likelihood(obs) for obs in O])
        like_theirs = theirs.score(O_flat, lengths=[O.shape[1]] * O.shape[0])

        if like > best[0]:
            best = (like, {"A": A_, "B": B_, "pi": pi_})

        if like_theirs > best_theirs[0]:
            best_theirs = (
                like_theirs,
                {
                    "A": theirs.transmat_,
                    "B": theirs.emissionprob_,
                    "pi": theirs.startprob_,
                },
            )
    print("Final log likelihood of sequence: {:.5f}".format(best[0]))
    print("Final log likelihood of sequence (theirs): {:.5f}".format(
        best_theirs[0]))
    plot_matrices(P, best, best_theirs)
Example #12
class HMM_Learner:
    def __init__(self, M):
        self.con = MultinomialHMM(n_components=M)
        self.incon = MultinomialHMM(n_components=M)
        self.daID = {
            'ass': 0,
            'bck': 1,
            'be.neg': 2,
            'be.pos': 3,
            'el.ass': 4,
            'el.inf': 5,
            'el.sug': 6,
            'el.und': 7,
            'fra': 8,
            'inf': 9,
            'off': 10,
            'oth': 11,
            'stl': 12,
            'sug': 13,
            'und': 14
        }
        self.da_choose_n = itertools.combinations([
            'ass', 'bck', 'be.neg', 'be.pos', 'el.ass', 'el.inf', 'el.sug',
            'el.und', 'fra', 'inf', 'off', 'oth', 'stl', 'sug', 'und'
        ], 4)

    def addRandomAllSequence(self, X, lengths):
        da_keys = list(self.daID.keys())  # dict views cannot be shuffled in place
        random.shuffle(da_keys)
        X1 = [[self.daID[x.lower().strip()]] for x in da_keys]
        X.append(X1)
        lengths.append(len(X1))

    def trainHMMs(self, topics, sequences, labels):
        try:
            self.con = pickle.load(open('HMM_consistent.model', 'rb'))
            self.incon = pickle.load(open('HMM_inconsistet.model', 'rb'))
        except Exception:
            X_con = []
            l_con = []
            X_incon = []
            l_incon = []
            for t in topics:
                if t not in sequences or t not in labels:
                    continue

                if sequences[t]:
                    X1 = [[self.daID[da.lower().strip()]]
                          for da in sequences[t]]
                    if 'weak' in labels[t].lower():
                        X_incon.append(X1)
                        l_incon.append(len(sequences[t]))
                    else:
                        X_con.append(X1)
                        l_con.append(len(sequences[t]))

            # Add two completely random sequences so every symbol is observed (needed by MultinomialHMM)
            self.addRandomAllSequence(X_incon, l_incon)
            self.addRandomAllSequence(X_con, l_con)

            self.con.fit(np.concatenate(X_con), l_con)
            self.incon.fit(np.concatenate(X_incon), l_incon)

            pickle.dump(self.con, open('HMM_consistent.model', 'wb'))
            pickle.dump(self.incon, open('HMM_inconsistet.model', 'wb'))

    def testHMMs(self, topics, sequences):
        prediction = {}
        for t in topics:
            if t not in sequences:
                continue

            if sequences[t]:
                X1 = [[self.daID[da.lower().strip()]] for da in sequences[t]]
                c = self.con.score(np.concatenate([X1]), [len(sequences[t])])
                i = self.incon.score(np.concatenate([X1]), [len(sequences[t])])
                prediction[t] = (c, i)

        return prediction

    def generateLabelSequence(self, sequence):
        MIN_VAL = -10000000
        topics = []
        sequences = {}
        isConsistent = False
        max_score_da_seq = ''
        max_score = MIN_VAL
        for n in range(2, 5):
            da_choose_n = [
                p for p in itertools.product([
                    'ass', 'bck', 'be.neg', 'be.pos', 'el.ass', 'el.inf',
                    'el.sug', 'el.und', 'fra', 'inf', 'off', 'oth', 'stl',
                    'sug', 'und'
                ],
                                             repeat=n)
            ]
            #print da_choose_n
            for s in da_choose_n:
                temp_sequence = copy.deepcopy(sequence)
                temp_sequence.extend(list(s))
                topics.append(str(s))
                sequences[str(s)] = temp_sequence

            scores = self.testHMMs(topics, sequences)
            for t in topics:
                if scores[t][0] > max_score:
                    max_score = scores[t][0]
                    max_score_da_seq = t
                if scores[t][0] > scores[t][1]:
                    isConsistent = True
                    max_score = scores[t][0]
                    max_score_da_seq = t
            if isConsistent:
                break
            print(max_score, max_score_da_seq)

        return max_score_da_seq, isConsistent
Example #13
### preparing the test data as a list of numbers
## task 3

dummy_data = []
for i in s:
    if i == ' ':
        dummy_data.append(26)
    else:
        dummy_data.append(ord(i) - ord('A'))

training_data = np.array(dummy_data)
training_data = training_data.reshape((training_data.shape[0], 1))

### hmm model
hmm_model = MultinomialHMM(n_components=2, n_iter=500, tol=0.01, verbose=False)
hmm_model.fit(training_data)
print(hmm_model.monitor_)

print("Tranisition probalitity of this model is \n")
print(hmm_model.transmat_)
print("\n")
print("Emission probalitity of this model is \n")
print(np.transpose(hmm_model.emissionprob_))
print("\n")
## the seven most probable characters
transition_prob1 = transition_prob
emission_prob1 = np.transpose(hmm_model.emissionprob_)

print("For this trained model, the seven most likely charcters are\n")

print("For state 0, the seven most likely characters are\n")
Example #14
class TOE_HMM_CHARS:

    def __init__(self, N=2, maxIters = 200):
        self._N = N
        self._M = COMPONENTS
        self._maxIters = maxIters
        self._syms = []

    def loadBrownSymsSeq(self, T):
        taggedWordsIter = brown.tagged_words()
        retIdx = 0
        symSequence = []
        for wrd, tag in taggedWordsIter:
            if wrd:
                for c in wrd:
                    val = ord(c)
                    symSequence.append(val)
                    retIdx += 1
            if retIdx >= T:
                break

        self._syms = symSequence
        self._syms = np.concatenate((self._syms, np.arange(256))).tolist()
        return  symSequence

    def textSeqToSymSeq(self, txtSeqArr):
        symSequence  =[]
        for wrd in txtSeqArr:
            if wrd:
                for c in wrd:
                    val = ord(c)
                    symSequence.append(val)
        return symSequence

    def initHMM(self):

        self._hmm = MultinomialHMM(n_components=self._N, n_iter=self._maxIters, 
            verbose=True, params='ste', init_params='ste')
        # n_features  (int) Number of possible symbols emitted by the model (in the samples).
        # monitor_    (ConvergenceMonitor) Monitor object used to check the convergence of EM.
        # transmat_   (array, shape (n_components, n_components)) Matrix of transition probabilities between states.
        # startprob_  (array, shape (n_components, )) Initial state occupation distribution.
        # emissionprob_   (array, shape (n_components, n_features)) Probability of emitting a given symbol when in each state.

    def trainHMM(self):
        self._hmm.fit(np.array(self._syms).reshape(-1, 1))

    def testTxt(self, txtSeqArr):
        testSymsArr = self.textSeqToSymSeq(txtSeqArr)
        score = self._hmm.score(np.array(testSymsArr).reshape(-1, 1))  # score expects a 2-D column
        return score

    def testSyms(self, symsArr):
        score = self._hmm.score(np.array(symsArr).reshape(-1, 1))
        return score

    def trainedLambda(self):
        A = self._hmm.transmat_
        B = self._hmm.emissionprob_
        pi = self._hmm.startprob_
        return (A, B, pi)

    def persistHMM(self, filename):
        import pickle
        s = pickle.dumps(self)
        with open(filename, 'wb') as f:  # pickle.dumps returns bytes
            f.write(s)

    @staticmethod
    def loadHMM(filename):
        import pickle
        with open(filename, 'rb') as f:
            s = f.read()
        model = pickle.loads(s)
        return model
    
    def pickRandomSeq(self, length = 100):
        symSequence = [random.randint(0, 255) for idx in range(length)]
        return symSequence

    def pickOrderedSeq(self, length = 100):
        symSequence = []
        taggedWordsIter = brown.tagged_words()
        maxIdx = len(taggedWordsIter)
        maxMinIdx = maxIdx - length
        minIdx = random.randint(0, maxMinIdx)
        count = 0
        idx = minIdx
        while True:
            (wrd, tag) = taggedWordsIter[idx]
            if wrd:
                for c in wrd:
                    val = ord(c)
                    symSequence.append(val)
                    count += 1
            idx += 1
            if count >= length:
                break

        return symSequence

    def printHMM(self):
        print("A = %s" % str(self._hmm.transmat_))
        print("B = %s" % str(self._hmm.emissionprob_))
        print("PI = %s" % str(self._hmm.startprob_))
        print("Verify A = %s" % np.sum(self._hmm.transmat_, axis=1))
        print("Verify B = %s" % np.sum(self._hmm.emissionprob_, axis=1))
        print("Verify PI = %s" % np.sum(self._hmm.startprob_, axis=0))
    
    def histo(self):
        retHisto = dict((x, self._syms.count(x)) for x in range(256))
        return retHisto
Example #15
n_states = 2
n_emissions = len(possible_emissions)

# Training data
X = [
    random.sample(possible_emissions, len(possible_emissions)), [1, 1, 2, 1],
    [6, 5, 5, 4, 7, 7]
]  # ragged sequences: keep as a plain list, not a NumPy array
lengths = [len(row) for row in X]
X = np.atleast_2d(np.concatenate(X))

# Create randomly initialized model
model = MultinomialHMM(n_components=2, n_iter=100)

# Train on data
model.fit(X.T, lengths=lengths)

# Trained parameters
transition_matrix = model.transmat_
emission_matrix = model.emissionprob_
initial_state_probability = model.startprob_

# Observed sequence
observed = np.array([1, 1, 1])

# Get helmet
helmet = 0

# Sequence emissions
helping_emission = (helmet, 1, 0)
backstabbing_emission = (helmet, 0, 1)
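The snippet defines observed but stops before using it. One plausible continuation, decoding it with the trained model (an assumption about the intent):

hidden_states = model.predict(observed.reshape(-1, 1))  # most likely state per emission
log_prob = model.score(observed.reshape(-1, 1))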
Example #16
    def train(self, data, labels, tp=None):
        labels = np.array(labels)
        for i in range(self.nb_class):
            print "Class", i
            ind = np.where(labels == i)
            digit_data = np.array(data)[ind]

            self.fit_encode_class(digit_data, i)

            sks, lengths = self.transform_encode_class(digit_data, i)

            if not tp:

                model = MultinomialHMM(n_components=self.nb_components,
                                   n_iter=self.max_iter,
                                   tol=self.tol,
                                   verbose=True,
                                   params='ste',
                                   init_params='e')
                init = 1. / self.nb_components
                model.startprob_ = np.full(self.nb_components, init)
                model.transmat_ = np.full((self.nb_components, self.nb_components),
                                        init)

            else:
                model = MultinomialHMM(n_components=self.nb_components,
                                   n_iter=self.max_iter,
                                   tol=self.tol,
                                   verbose=True,
                                   params='ste')

                # Number of distinct centroids
                num_obs = len(np.unique(np.concatenate(sks)))
                model.emissionprob_ = np.zeros((self.nb_components, num_obs))
                hist = {}
                curr = 0
                bucket_len = num_obs // self.nb_components  # integer division: used as a range bound
                for j in range(self.nb_components):
                    if j == self.nb_components - 1 and curr + bucket_len < num_obs:
                        offset = num_obs - curr - bucket_len
                        for k in range(curr, curr + bucket_len + offset):
                            if j not in hist:
                                hist[j] = []
                            hist[j].append(k)
                            model.emissionprob_[j, k] = 1
                        curr += bucket_len + offset
                    else:
                        for k in range(curr, curr + bucket_len):
                            if j not in hist:
                                hist[j] = []
                            hist[j].append(k)
                            model.emissionprob_[j, k] = 1
                        curr += bucket_len


                model.startprob_ = np.zeros(self.nb_components)
                # always ends by penup
                model.startprob_[-1] = 1


                model.transmat_ = np.zeros((self.nb_components, self.nb_components))

                state_occ_count = np.zeros(self.nb_components)
                for example in digit_data:
                    j = 0
                    prevobs = 0
                    for obs in example:
                        le = self.les[i]
                        val = le.transform(obs)
                        if j == 0:
                            prevobs = val
                            j += 1
                            continue
                        prevobs_state = None
                        obs_state = None
                        for k in range(self.nb_components):
                            if prevobs_state is not None and obs_state is not None:
                                break
                            if prevobs in hist[k]:
                                prevobs_state = k
                            if val in hist[k]:
                                obs_state = k
                        state_occ_count[prevobs_state] += 1
                        model.transmat_[prevobs_state, obs_state] += 1
                        prevobs = val
                        j += 1



                for j in range(self.nb_components):
                    for k in range(self.nb_components):
                        model.transmat_[j, k] = model.transmat_[j, k] / state_occ_count[j]


            model.fit(sks, lengths)
            self.models[i] = model
Example #17
def convert(string): # mapping function, map A->0 , B->1 , C->2 ...
    output = []
    for character in string:
        if character is " ":
            number = 26
        else:
            number = ord(character) - 65
        output.append(number)
    return (output)
data2 = convert(data) #Convert the data from a stream of characters to a stream of numbers
DD = np.array(data2)
Data_arr = DD.reshape((DD.shape[0],1))
model = MultinomialHMM(n_components=2,n_iter=200, tol=0.01, verbose=False)
print("Training started")
model.fit(Data_arr)
print("Training Done")
print("Model = ",model.monitor_)
print("The transition prob of this trained model : ")
print(model.transmat_)
emiso = np.transpose(model.emissionprob_)
print("\nThe emmision prob of this trained model : ")
print("  State-0    State-1")
print(emiso)
seven_most_probabe(emiso) #printing the 7 most likely characters
print("Stationary probabilities : ", model.get_stationary_distribution())
print("So seeing the emission probabilities we can say that State 1 is Consonant and State 0 is Vowel")

print("\nTask - 4")
model_nat = MultinomialHMM(n_components=2)
model_nat.transmat_ = trans_prob
Example #18
        if train_data['return'][i] < 0.0 and analysis_data['v'][i] == 2:
            emission_probability[1][2] += 1
        if train_data['return'][i] < 0.0 and analysis_data['v'][i] == 3:
            emission_probability[1][3] += 1
    emission_probability[0] /= sum(1 for e in train_data['return'] if e >= 0.0)
    emission_probability[1] /= sum(1 for e in train_data['return'] if e < 0.0)
    #print(emission_probability)


    hmm = MultinomialHMM(n_components=n_states)
    hmm.startprob_ = start_probability
    hmm.transmat_ = transition_probability
    hmm.emissionprob_ = emission_probability

    bob_says = np.array([[0, 2, 1, 1, 2, 0]]).T
    hmm = hmm.fit(bob_says)

    logprob, alice_hears = hmm.decode(bob_says, algorithm="viterbi")
    print("Bob says:", ", ".join(map(lambda x: observations[x], bob_says)))
    print("Alice hears:", ", ".join(map(lambda x: states[x], alice_hears)))



    '''
    law_data['hmm_states'] = hmm.predict(rets)
    panel = Figure_Util.Figure()
    panel.draw(law_data, title='close', subplots=['hmm_states'], figsize=(20, 10))
    '''

    db.disconnect()
Example #19
def test_DiscreteHMM_fit(cases: str) -> None:
    np.random.seed(12346)
    cases = int(cases)
    i = 1
    N_decimal = 4
    max_iter = 100
    tol=1e-3
    while i < cases:
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        # symbols is the number of unique observation types.
        symbols = np.random.randint(4, 9)
        X = []
        lengths = []
        for _ in range(n_samples):
            # the actual length is seq_length + 1
            seq_length = symbols
            this_x = np.random.choice(range(symbols), size=seq_length, replace=False)
            X.append(this_x)
            lengths.append(seq_length)

        A = np.full((hidden_states, hidden_states),1/hidden_states)

        B = []
        for _ in range(hidden_states):
            this_B = np.random.dirichlet(np.ones(symbols),size=1)[0]
            B.append(this_B)
        B = np.array(B)

        pi = np.ones(hidden_states)
        pi = pi/hidden_states


        hmm_gold = MultinomialHMM(n_components=hidden_states,
                                  startprob_prior=1,
                                  transmat_prior=1,
                                  init_params='',
                                  n_iter=max_iter,
                                  tol=tol)
        hmm_gold.transmat_ = A
        hmm_gold.emissionprob_ = B
        hmm_gold.startprob_ = pi

        X_gold = np.concatenate(X).reshape((-1,1))
        hmm_gold.fit(X_gold, lengths)

        gold_A = hmm_gold.transmat_
        gold_B = hmm_gold.emissionprob_
        gold_pi = hmm_gold.startprob_

        hmm_mine = DiscreteHMM(hidden_states=hidden_states,
                               symbols=symbols,
                               A=A,
                               B=B,
                               pi=pi,
                               tol=tol,
                               max_iter=max_iter)
        hmm_mine.fit(X)
        mine_A = hmm_mine.A
        mine_B = hmm_mine.B
        mine_pi = hmm_mine.pi
        assert_almost_equal(mine_pi, gold_pi, decimal=N_decimal)
        assert_almost_equal(mine_A, gold_A, decimal=N_decimal)
        assert_almost_equal(mine_B, gold_B, decimal=N_decimal)
        i+=1

    print('Successfully tested the parameter-estimation function of the discrete HMM!')
Example #20
def computeHMM(dataset, alphabet, num_matchstates=9):
    num_sequences = len(dataset)
    best_score = None
    best_model = None
    alphabet = list(alphabet)
    residue_mapper = {alphabet[j]: j for j in range(0, len(alphabet))}
    #one begin, one end, num_matchstates + 1 insert states, num_matchstates match states, num_matchstates deletion states.
    num_states = 3 + 3 * num_matchstates
    concat_dataset = np.concatenate([[[residue_mapper[x]] for x in y]
                                     for y in dataset])
    dataset_lengths = [len(x) for x in dataset]
    for x in range(0, 10):
        transition_matrix = np.zeros((num_states, num_states))
        emission_matrix = np.zeros((num_states, len(alphabet)))
        #first num_matchstates + 2 are the matchstates (including beginning and end, though those two are mute)
        #first do B, then M_1,...,M_m
        #B goes to either I_0 or M_1.
        b_row = ProfileHMM.compute_random_row(2)
        transition_matrix[0][1] = b_row[0]
        transition_matrix[0][2] = b_row[1]
        for i in range(1, num_matchstates + 1):
            #go to either match state, insertion state, or delete state.
            m_row = ProfileHMM.compute_random_row(3)
            #next match state
            transition_matrix[i][i + 1] = m_row[0]
            #insert state
            transition_matrix[i][i + num_matchstates + 2] = m_row[1]
            #deletion state
            print('i: %d' % i)
            transition_matrix[i][i + 2 * num_matchstates + 2] = m_row[2]
            emission_matrix[i] = ProfileHMM.compute_random_row(
                len(alphabet))
        #now we do the insertion states.
        for i in range(num_matchstates + 2, 2 * num_matchstates + 3):
            #either go to self, or next match state.
            row = ProfileHMM.compute_random_row(2)
            transition_matrix[i][i] = row[0]
            transition_matrix[i][i - (num_matchstates + 1)] = row[1]
            emission_matrix[i] = ProfileHMM.compute_random_row(
                len(alphabet))
        #now do deletion states. In the loop, do all but the last one
        for i in range(2 * num_matchstates + 3, 3 * num_matchstates + 2):
            row = ProfileHMM.compute_random_row(2)
            transition_matrix[i][i] = row[0]
            transition_matrix[i][i - 2 * num_matchstates - 1] = row[1]
        model = MultinomialHMM(num_states, params="ets")
        model.n_features = len(alphabet)
        start_prob = np.zeros(num_states)
        start_prob[0] = 1.0
        print('start prob array')
        print(start_prob)
        model.startprob_ = start_prob
        model.transmat_ = transition_matrix
        model.emissionprob_ = emission_matrix
        try:
            model.fit(concat_dataset, dataset_lengths)
        except ValueError:
            pdb.set_trace()
        print('model')
        print(model)
        """
        for row in range(0, len(model.emissionprob_)):
            for col in range(0, len(model.emissionprob_[row])):
                count = model.emissionprob_[row][col]*num_sequences
                model.emissionprob_[row][col] = (count + 0.01)/(num_sequences + len(alphabet)*0.01)
        """
        print('emission probabilities')
        print(model.emissionprob_)
        score = model.score(concat_dataset, dataset_lengths)
        if x == 0:
            best_score = score
            best_model = model
        elif score > best_score:
            best_score = score
            best_model = model
    return best_model
Example #21
    model = MultinomialHMM(n_components=3,
                           random_state=42,
                           params='e',
                           init_params='e')
    model.startprob_ = [0.16, 0.04, 0.8]
    model.transmat_ = [[0.67, 0.13, 0.2], [0, 0.5, 0.5], [0, 0, 1]]
    '''
    startprob_:
    V:
    D:
    J:

    transmat_:
    each row gives the probability of moving from that row's state to each
    column's state (rows sum to 1, not columns)
    '''
    model.fit(X, length)
    emission = model.emissionprob_
    model.n_features
    model.transmat_
    model.startprob_
    model.get_stationary_distribution()
    # let's test
    train_score = []
    for i in cdr3_train_index:
        test = string2matrix_plain(cdr3[i]).astype(int)
        score = model.score(test)
        train_score.append(score)
    train_score = np.array(train_score)

    test_score = []
    for i in cdr3_test_index:
Example #22
def main(params):
    DEBUG = params['DEBUG']
    dataset = params['dataset']
    nh_part = params['nh_part']
    nh_chords = params['nh_chords']
    num_gen = params['num_gen']

    ##################################################################
    # DATA PROCESSING
    # Songs indices
    song_indices = [
        43, 85, 133, 183, 225, 265, 309, 349, 413, 471, 519, 560, 590, 628,
        670, 712, 764, 792, 836, 872, 918, 966, 1018, 1049, 1091, 1142, 1174,
        1222, 1266, 1278, 1304, 1340, 1372, 1416, 1456, 1484, 1536, 1576, 1632,
        1683, 1707, 1752, 1805, 1857, 1891, 1911
    ]
    # Chords mapping
    chord_names = [
        'C;Em', 'A#;F', 'Dm;Em', 'Dm;G', 'Dm;C', 'Am;Em', 'F;C', 'F;G', 'Dm;F',
        'C;C', 'C;E', 'Am;G', 'F;Em', 'F;F', 'G;G', 'Am;Am', 'Dm;Dm', 'C;A#',
        'Em;F', 'C;G', 'G#;A#', 'F;Am', 'G#;Fm', 'Am;Gm', 'F;E', 'Dm;Am',
        'Em;Em', 'G#;G#', 'Em;Am', 'C;Am', 'F;Dm', 'G#;G', 'F;A#', 'Am;G#',
        'C;D', 'G;Am', 'Am;C', 'Am;A#', 'A#;G', 'Am;F', 'A#;Am', 'E;Am',
        'Dm;E', 'A;G', 'Am;Dm', 'Em;Dm', 'C;F#m', 'Am;D', 'G#;Em', 'C;Dm',
        'C;F', 'G;C', 'A#;A#', 'Am;Caug', 'Fm;G', 'A;A'
    ]

    # Import .mat file
    dataset_root = os.path.join('data', dataset)
    mat_path = os.path.join(dataset_root, 'data.mat')
    data_mat = sio.loadmat(mat_path)
    chords_per_part = 2
    chords_per_bar = 4
    num_chords = 56
    num_parts = 4
    sub_sampling_ratio_parts = chords_per_bar // chords_per_part  # integer division: used as a slice step

    # Get parts
    parts_data_ = (np.dot(np.transpose(data_mat["feats"][-num_parts:]),
                          np.asarray(range(num_parts))).astype(int)).reshape(
                              -1, 1)
    # Group by bar
    parts_data = parts_data_[::sub_sampling_ratio_parts]
    # Parts with position in bar. Used to condition chords generation
    parts_bar_data = post_processing_parts(parts_data,
                                           sub_sampling_ratio_parts)
    # Get chords transitions
    chords_data = (np.dot(np.transpose(data_mat["feats"][:-num_parts]),
                          np.asarray(range(num_chords))).astype(int)).reshape(
                              -1, 1)

    #################################
    # Group by song
    parts_length = []
    chords_length = []
    start_ind = 0
    for end_ind in song_indices:
        chords_length.append(end_ind - start_ind + 1)
        start_ind = end_ind + 1
    parts_length = [e // 2 for e in chords_length]
    ##################################################################

    ##################################################################
    # PARTS
    # Compute HMM for part modeling
    hmm_part = MultinomialHMM(n_components=nh_part, n_iter=20)
    hmm_part.fit(parts_data, parts_length)

    # def plot_mat(matrix, name):
    #   fig = plt.figure()
    #   ax = fig.add_subplot(1,1,1)
    #   ax.set_aspect('equal')
    #   plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.ocean)
    #   plt.colorbar()
    #   plt.savefig(name, format='pdf')

    # plot_mat(hmm_part.transmat_, 'part_transmat.pdf')
    # plot_mat(np.reshape(hmm_part.startprob_, [-1, 1]), 'part_startprob.pdf')
    # plot_mat(hmm_part.emissionprob_, 'part_emissionprob.pdf')
    ##################################################################

    ##################################################################
    # CHORDS
    hmm_chords = MultinomialHMM_prod(n_components=nh_chords, n_iter=20)
    hmm_chords.fit(chords_data, chords_length)
    # plot_mat(hmm_chords.transmat_, 'chords_transmat.pdf')
    # plot_mat(np.reshape(hmm_chords.startprob_, [-1, 1]), 'chords_startprob.pdf')
    # plot_mat(hmm_chords.emissionprob_, 'chords_emissionprob.pdf')
    ##################################################################

    #################################
    # GENERATION
    # Sample sequence
    for n in range(num_gen):
        gen_part_sequence_, _ = hmm_part.sample(params["gen_seq_length"])
        gen_part_sequence = post_processing_parts(gen_part_sequence_,
                                                  sub_sampling_ratio_parts)
        # Compute conditioning on parts
        p_chords_given_partBar = build_proba(chords_data, parts_bar_data)
        gen_chord_sequence, _ = hmm_chords.sampling_prod_hmm(
            p_chords_given_partBar, gen_part_sequence)
        ######## T E S T  ################
        # Independent HMM ?
        # gen_chord_sequence, _ = hmm_chords.sampling(n_samples=44)
        ##################################
        if params["DEBUG"]:
            with open("results_chords/" + str(n), 'wb') as f:
                for count, (part, chord) in enumerate(
                        zip(gen_part_sequence, gen_chord_sequence)):
                    if count % 2 == 0:
                        f.write(
                            str(part // 2) + " ; " + chord_names[chord[0]] +
                            "\n")
                    else:
                        f.write("  ; " + chord_names[chord[0]] + "\n")
                    if count % 8 == 7:
                        f.write("\n")
    gen_part_sequence = [e // 2 for e in gen_part_sequence]
    return gen_part_sequence, gen_chord_sequence, num_chords, num_parts
Example #23
def train_syllable_hmm(song_corpus, n_iterations=50):
    hmm = MultinomialHMM(3)
    hmm.transmat_ = np.array([[0, 0, 1], [1, 0, 0], [0, .01, .99]])
    hmm.n_iter = n_iterations
    hmm.fit(np.concatenate(song_corpus).reshape(-1, 1))  # column vector, per the current hmmlearn API
    return hmm
Example #24
discrete_obs, delta_hws, delta_fas = [], [], []
for idx in mice:
    d = _data_on_mouse(data, idx, smoothing_time_radius,
                       smoothing_amplitude_radius, smoothing_tolerance,
                       sampling_interval, bins)
    discrete_obs.append(d[0])
    delta_hws.append(d[1])
    delta_fas.append(d[2])

X = np.array(discrete_obs)

model = MultinomialHMM(n_components=n_components)
predictions = []
for i in range(7):
    held_out_X = np.vstack((X[:i], X[i + 1:]))
    model.fit(held_out_X)
    predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1)))

f, axarr = plt.subplots(7, 1)
yranges = np.arange(n_components + 1, dtype=float) / n_components
colors = plt.cm.rainbow(np.linspace(0, 1, n_components))
for i in range(7):
    states, indices = _axvspan_maker(predictions[i][1])
    for s, idxs in zip(states, indices):
        axarr[i].axvspan(idxs[0],
                         idxs[1],
                         ymin=yranges[s],
                         ymax=yranges[s + 1],
                         color=colors[s])
plt.show()
Example #25
target_test = np.hstack([d["label"] for d in dataset[split_pos:]])

# Train scaler
scaler = StandardScaler()
scaler.fit(feature_train)
feature_train = scaler.transform(feature_train)

# Train random forest classifier
clf = RandomForestClassifier()
clf.fit(feature_train, target_train)

# Train HMM
pred_probs = clf.predict_proba(feature_train)[:, 1]
pred_labels = np.array([map_pred(x) for x in pred_probs], dtype=np.int64)
hmm = MultinomialHMM(n_components=2,
                     startprob_prior=np.array([0.5, 0.5]),
                     transmat_prior=np.array([
                         [0.8, 0.2],
                         [0.2, 0.8],
                     ]))
hmm.fit(pred_labels.reshape(-1, 1))

# Evaluation of the entire procedure
predict_results = infer(feature_test, scaler, clf, hmm)
print(classification_report(target_test, predict_results))

# Save models
pickle.dump(scaler, open(path.join(project_dir, "models/scaler.pkl"), "wb"))
pickle.dump(clf, open(path.join(project_dir, "models/clf.pkl"), "wb"))
pickle.dump(hmm, open(path.join(project_dir, "models/hmm.pkl"), "wb"))
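map_pred and infer are referenced but not shown. A minimal sketch of what they might look like given the pipeline above; the 0.5 threshold is an assumption:

def map_pred(prob, threshold=0.5):
    # Assumed: binarize the classifier's positive-class probability.
    return 1 if prob >= threshold else 0

def infer(features, scaler, clf, hmm):
    # Assumed: apply the training-time preprocessing, then smooth the
    # per-sample classifier outputs with the HMM's most likely state path.
    probs = clf.predict_proba(scaler.transform(features))[:, 1]
    labels = np.array([map_pred(p) for p in probs], dtype=np.int64)
    return hmm.predict(labels.reshape(-1, 1))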
Example #26
for idx in mice:
    d = _data_on_mouse(data, idx, smoothing_time_radius,
                       smoothing_amplitude_radius, smoothing_tolerance, 
                       sampling_interval, bins)
    discrete_obs.append(d[0])
    delta_hws.append(d[1])
    delta_fas.append(d[2])

X = np.array(discrete_obs)


model = MultinomialHMM(n_components = n_components)
predictions = []
for i in range(7):
    held_out_X = np.vstack((X[:i], X[i+1:]))
    model.fit(held_out_X)
    predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1)))

f, axarr = plt.subplots(7, 1)
yranges = np.arange(n_components+1, dtype=float)/n_components
colors = plt.cm.rainbow(np.linspace(0, 1, n_components))
for i in range(7):
    states, indices = _axvspan_maker(predictions[i][1])
    for s, idxs in zip(states, indices): 
        axarr[i].axvspan(idxs[0], idxs[1], ymin=yranges[s], ymax=yranges[s+1], color=colors[s])
plt.show()

# healthy_model = MultinomialHMM(n_components = n_components)
# healthy_model.fit(dos)
# hs_preds = healthy_model.predict(dos.reshape(len(dos), 1))
Example #27
def train_on_X(X):
    X_train_hmm, X_train_lengths = transform_X_for_hmm(X)
    clf = MultinomialHMM(n_components=n_components, n_iter=n_iter)
    clf.fit(X_train_hmm, lengths=X_train_lengths)
    return clf
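transform_X_for_hmm is not shown; presumably it flattens a list of integer-coded sequences into the column-vector-plus-lengths form hmmlearn expects. A sketch under that assumption:

import numpy as np

def transform_X_for_hmm(X):
    # X: list of 1-D integer sequences.
    lengths = [len(x) for x in X]
    return np.concatenate(X).reshape(-1, 1), lengths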
Example #28
final_array = []
count_array = []       
for x in input_sequences:
    count = 0
    for y in x:
        count += 1
        final_array.append(y)
    count_array.append(count)
        

data = np.loadtxt('train.csv', delimiter=',')
sample_vector = np.array(final_array).reshape(-1, 1)  # hmmlearn expects a column vector
sequence_lengths = np.array(count_array)
num_components = 3
model = MultinomialHMM(n_components=num_components, n_iter=1000)
model.fit(sample_vector, lengths=sequence_lengths)


#------------------------------------------------------------------------------------------------------------------------
print("Second Phase")
validating_sequences = []
# for idx in train.index:
count = 0
idx = 0  
flag1 = True
flag2 = True   

while flag1:
    temp_list = []
    flag2 = True
    
Example #29
            high = high + 1
        elif percent >= .50:
            highMid = highMid + 1
        elif percent >= .25:
            lowMid = lowMid + 1
        else:
            low = low + 1
    matrix[1, 0] = low / len(wins)
    matrix[1, 1] = lowMid / len(wins)
    matrix[1, 2] = highMid / len(wins)
    matrix[1, 3] = high / len(wins)
    return matrix


# Load Data
filename = 'data.csv'
X = np.loadtxt(filename, delimiter=',')

player1 = X[:, 0]
player2 = X[:, 1]
record = X[:, 2]

print "stateProbs(record)", stateProbs(record)
print "eProbs(player1, record", eProbs(player1, record)
clf = MultinomialHMM(n_components=2)
clf.transmat_ = stateProbs(record)
clf.emissionprob_ = eProbs(player1, record)
print "here"
clf.fit(clf.transmat_, clf.emissionprob_)
clf.predict(player1)
Example #30
class TOE_HMM:

    def __init__(self, N=2, maxIters = 200):
        self._N = N
        self._M = len(WHITE_LIST)
        self._pi = self.randProbMat(1,N)[0]
        self._A = self.equiProbMat(N, N)
        self._B = self.equiProbMat(N, self._M)
        self._maxIters = maxIters
        self._syms = []

    def randProbMat(self, M, N):
        ret = np.random.rand(M,N)
        ret = ret/ret.sum(axis=1)[:,None]
        return ret

    def equiProbMat(self, M, N):
        ret = np.ones((M,N), dtype=float)
        ret = ret/ret.sum(axis=1)[:,None]
        return ret

    def loadBrownSymsSeq(self, T):
        taggedWordsIter = brown.tagged_words()
        retIdx = 0
        iterIdx = 0
        symSequence = []
        for wrd, tag in taggedWordsIter:
            if retIdx >= T:
                break
            if tag in WHITE_LIST:
                val = WHITE_LIST.index(tag)
                symSequence.append(val)
                retIdx += 1
            iterIdx += 1

        self._syms = symSequence
        return  symSequence

    def textSeqToSymSeq(self, txtSeqArr):
        tags = nltk.pos_tag(txtSeqArr) #PerceptronTagger
        tags = [t[1] for t in tags]
        tags = [WHITE_LIST.index(t) for t in tags if t in WHITE_LIST]
        return tags

    def initHMM(self):
        # self._hmm = MultinomialHMM(n_components=self._N, startprob_prior=None, transmat_prior=None, 
        #     algorithm='viterbi', random_state=None, n_iter=self._maxIters, tol=0.01, 
        #     verbose=True, params='ste', init_params='ste')

        self._hmm = MultinomialHMM(n_components=self._N, n_iter=self._maxIters, 
            verbose=True, params='ste', init_params='ste')
        # self._hmm.emissionprob_ = self._B
        # n_features  (int) Number of possible symbols emitted by the model (in the samples).
        # monitor_    (ConvergenceMonitor) Monitor object used to check the convergence of EM.
        # transmat_   (array, shape (n_components, n_components)) Matrix of transition probabilities between states.
        # startprob_  (array, shape (n_components, )) Initial state occupation distribution.
        # emissionprob_   (array, shape (n_components, n_features)) Probability of emitting a given symbol when in each state.

    def trainHMM(self):
        self._hmm.fit(np.array(self._syms).reshape(-1, 1))

    def testTxt(self, txtSeqArr):
        testSymsArr = self.textSeqToSymSeq(txtSeqArr)
        score = self._hmm.score(np.array(testSymsArr).reshape(-1, 1))  # score expects a 2-D column
        return score

    def testSyms(self, symsArr):
        score = self._hmm.score(np.array(symsArr).reshape(-1, 1))
        return score

    def persistHMM(self, filename):
        import pickle
        s = pickle.dumps(self)
        with open(filename, 'wb') as f:  # pickle.dumps returns bytes
            f.write(s)

    @staticmethod
    def loadHMM(filename):
        import pickle
        with open(filename, 'rb') as f:
            s = f.read()
        model = pickle.loads(s)
        return model
    
    def pickRandomSeq(self, length = 100):
        symSequence = []
        taggedWordsIter = brown.tagged_words()
        maxIdx = len(taggedWordsIter)
        import random
        idx = 0
        while idx < length:
            wrdIdx = random.randint(0, maxIdx - 1)  # randint is inclusive on both ends
            (wrd, tag) = taggedWordsIter[wrdIdx]
            if tag in WHITE_LIST:
                val = WHITE_LIST.index(tag)
                symSequence.append(val)
                idx += 1
        return symSequence

    def pickOrderedSeq(self, length = 100):
        symSequence = []
        taggedWordsIter = brown.tagged_words()
        maxIdx = len(taggedWordsIter)
        maxMinIdx = maxIdx - length
        import random
        minIdx = random.randint(0, maxMinIdx)
        count = 0
        for idx in range(minIdx, maxIdx):
            (wrd, tag) = taggedWordsIter[idx]
            if tag in WHITE_LIST:
                val = WHITE_LIST.index(tag)
                symSequence.append(val)
            count += 1  # reassigning idx inside a for loop has no effect, so just count
            if count >= length:
                break
        return symSequence

    def printHMM(self):
        print("A = %s" % str(self._hmm.transmat_))
        print("B = %s" % str(self._hmm.emissionprob_))
        print("PI = %s" % str(self._hmm.startprob_))
        print("Verify A = %s" % np.sum(self._hmm.transmat_, axis=1))
        print("Verify B = %s" % np.sum(self._hmm.emissionprob_, axis=1))
        print("Verify PI = %s" % np.sum(self._hmm.startprob_, axis=0))
    
    def histo(self):
        retHisto = dict((x, self._syms.count(x)) for x in range(len(WHITE_LIST)))
        retHisto = dict((WHITE_LIST[k], val) for k, val in retHisto.items())
        return retHisto