# Example 1
    def select(self):
        """Select the model whose mean cross-validated log-likelihood is best.

        Each candidate state count is built via ``self.base_model`` and scored
        on K-fold test splits of ``self.sequences``; falls back to
        ``self.n_constant`` states when no candidate could be scored.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Implement model selection using CV
        NB_SPLITS = 3
        mean_scores = []
        scored_components = []  # candidates that actually produced a score
        split_method = KFold(random_state=self.random_state, n_splits=NB_SPLITS)
        n_components = range(self.min_n_components, self.max_n_components + 1)

        for n_component in n_components:
            # BUG FIX: the try used to wrap the whole loop, so the first
            # failing candidate silently aborted evaluation of all remaining
            # state counts; it also let argmax index into the wrong candidate
            # when mean_scores was shorter than n_components.
            try:
                model = self.base_model(n_component)
                kfold_scores = []
                for _, test_idx in split_method.split(self.sequences):
                    test_X, test_length = combine_sequences(test_idx, self.sequences)
                    kfold_scores.append(model.score(test_X, test_length))
                mean_scores.append(np.mean(kfold_scores))
                scored_components.append(n_component)
            except Exception:
                continue

        if mean_scores:
            states = scored_components[np.argmax(mean_scores)]
        else:
            states = self.n_constant

        return self.base_model(states)
    def select(self):
        """Select the best model via cross-validation, with a BIC fallback
        for words having fewer than 3 sequences.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        best_model = None
        if len(self.sequences) >= 3:
            n_splits = min(3, len(self.sequences))
            splits = KFold(n_splits)
            for n in range(self.min_n_components, self.max_n_components + 1):
                try:
                    # BUG FIX: reset the accumulators for every candidate n.
                    # The original kept summing across candidates, so the
                    # "average" mixed fold scores of all previously tried
                    # state counts.
                    sum_score = 0.0
                    counter = 0.0
                    for train_index, test_index in splits.split(self.sequences):
                        train_X, train_lengths = combine_sequences(train_index, self.sequences)
                        test_X, test_lengths = combine_sequences(test_index, self.sequences)
                        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(train_X, train_lengths)
                        sum_score += model.score(test_X, test_lengths)
                        counter += 1
                    average_score = sum_score / counter
                    if average_score > best_score:
                        best_score = average_score
                        best_model = model  # model fit on this n's last fold
                except Exception:
                    continue
        # for words with fewer than 3 sequences: no folding possible
        else:
            best_score_1 = float('inf')
            best_model = None
            for n in range(self.min_n_components, self.max_n_components + 1):
                try:
                    model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    logL = model.score(self.X, self.lengths)
                    # BIC: -2 log L + p log N, p = free parameter count.
                    p = n ** 2 + 2 * n * len(self.X[0]) - 1
                    N = len(self.X)
                    score_1 = -2 * logL + p * np.log(N)
                    if score_1 < best_score_1:
                        best_score_1 = score_1
                        best_model = model
                except Exception:
                    continue
        return best_model
# Example 3
    def select(self):
        """Select the state count by mean K-fold test log-likelihood (or by
        full-data self-score when the word cannot be folded) and return a
        model trained on the full data.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        max_score = None
        max_model = None

        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                all_score = 0.0
                qty = 0
                final_model = None
                if len(self.sequences) >= 2:
                    # Generate K folds
                    folds = min(len(self.sequences), 3)
                    split_method = KFold(shuffle=True, n_splits=folds)
                    for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                        X_train, lengths_train = np.asarray(combine_sequences(cv_train_idx, self.sequences))
                        X_test, lengths_test = np.asarray(combine_sequences(cv_test_idx, self.sequences))
                        # Fit on the training fold, score on the test fold.
                        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(X_train, lengths_train)
                        all_score = all_score + model.score(X_test, lengths_test)
                        qty = qty + 1
                    score = all_score / qty
                else:
                    # Can't fold: fit and self-score on the full data.
                    final_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                              random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    # BUG FIX: the original scored the (possibly unbound)
                    # fold variable `model` here, so this branch always
                    # raised and was swallowed by the bare except.
                    score = final_model.score(self.X, self.lengths)
                # Keep model with best score
                if max_score is None or max_score < score:
                    max_score = score
                    if final_model is None:
                        # Best CV candidate: retrain on the full data set.
                        final_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                                  random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    max_model = final_model

            except Exception:
                pass

        return max_model
# Example 4
    def select(self):
        """K-fold CV model selection; returns the constant-state model when
        the word has a single sequence or no candidate can be scored.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        record = float("-inf")

        # Cap the candidate range by the shortest sequence length so the
        # HMM always has enough observations per state.
        min_seq = min(len(seq) for seq in self.sequences)
        self.max_n_components = min(self.max_n_components, min_seq)
        hmm_model = self.base_model(self.n_constant)
        if len(self.sequences) == 1:
            return hmm_model
        elif len(self.sequences) == 2:
            split_method = KFold(n_splits=2)
        else:
            split_method = KFold(n_splits=3, random_state=self.random_state)

        for num in range(self.min_n_components, self.max_n_components + 1):
            logL = 0
            cnt = 0

            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                X, lengths = combine_sequences(cv_train_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=num, n_iter=1000).fit(X, lengths)
                    X, lengths = combine_sequences(cv_test_idx, self.sequences)
                    logL += model.score(X, lengths)
                    # BUG FIX: cnt was never incremented, so the `cnt > 0`
                    # guard below could never pass and this selector always
                    # returned the constant-state model.
                    cnt += 1
                except:
                    continue
            if cnt > 0 and logL / cnt > record:
                # BUG FIX: store the mean, not the raw sum, so candidates
                # with different numbers of successful folds compare fairly.
                record = logL / cnt
                hmm_model = model
        return hmm_model
# Example 5
    def select(self):
        """Pick the state count with the highest mean K-fold test
        log-likelihood; fall back to the constant-state model when every
        candidate fails.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        folds = KFold()

        top_score, top_model = float("-inf"), None

        for state_count in range(self.min_n_components, self.max_n_components + 1):
            fold_scores = []
            candidate = None
            try:
                for train_idx, test_idx in folds.split(self.sequences):
                    train_X, train_len = combine_sequences(train_idx, self.sequences)
                    test_X, test_len = combine_sequences(test_idx, self.sequences)
                    candidate = GaussianHMM(n_components=state_count,
                                            covariance_type="diag",
                                            n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(train_X, train_len)
                    fold_scores.append(candidate.score(test_X, test_len))

                mean_logL = np.mean(fold_scores)
                if mean_logL > top_score:
                    top_score, top_model = mean_logL, candidate
            except:
                pass

        if top_model is None:
            return self.base_model(self.n_constant)
        return top_model
# Example 6
 def select(self):
     """Pick the state count with the best average K-fold test
     log-likelihood, then retrain on the FULL data set via base_model.
     """
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     best_n = None
     best_avg_logL = float("-inf")
     # NOTE(review): using min_n_components as the fold cap looks like it
     # was meant to be the conventional min(3, ...) — confirm before
     # changing; preserved as-is.
     n_splits = min(self.min_n_components, len(self.lengths))
     if n_splits < 2:
         n_splits = 2
     split_method = KFold(n_splits=n_splits)
     # BUG FIX: range() previously stopped at max_n_components - 1, so the
     # largest candidate state count was never evaluated (every sibling
     # implementation in this file uses max_n_components + 1).
     for n in range(self.min_n_components, self.max_n_components + 1):
         try:
             folds = total_logL = 0
             for train_idx, test_idx in split_method.split(self.sequences):
                 train_X, train_lengths = combine_sequences(
                     train_idx, self.sequences)
                 test_X, test_lengths = combine_sequences(
                     test_idx, self.sequences)
                 try:
                     model = GaussianHMM(
                         n, n_iter=1000,
                         random_state=self.random_state).fit(
                             train_X, train_lengths)
                     total_logL += model.score(test_X, test_lengths)
                     # BUG FIX: only count folds that produced a score;
                     # failed folds used to inflate the denominator and
                     # drag down the candidate's average.
                     folds += 1
                 except:
                     pass
             if folds:
                 avg_logL = total_logL / folds
                 if avg_logL > best_avg_logL:
                     best_avg_logL = avg_logL
                     best_n = n
         except:
             pass
     # retrain using best n_components with FULL DATA SET, not split set
     # used above
     if best_n is not None:
         return self.base_model(best_n)
     return None
# Example 7
    def select(self):
        """Cross-validated selection: find the state count that maximises
        the mean fold score, then return a model trained on the full data.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_n, best_score = self.n_constant, float('-inf')
        folds = KFold(random_state=self.random_state)

        for num_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                fold_scores = []
                for train_idx, test_idx in folds.split(self.sequences):
                    X_tr, len_tr = combine_sequences(train_idx, self.sequences)
                    X_te, len_te = combine_sequences(test_idx, self.sequences)

                    fitted = GaussianHMM(n_components=num_states,
                                         covariance_type="diag",
                                         n_iter=1000,
                                         random_state=self.random_state,
                                         verbose=False).fit(X_tr, len_tr)
                    fold_scores.append(fitted.score(X_te, len_te))

                avg = np.mean(fold_scores)
                if avg > best_score:
                    best_score, best_n = avg, num_states
            except:
                if self.verbose:
                    print("CV: failure on {} with {} states".format(
                        self.this_word, num_states))

        return self.base_model(best_n)
    def select(self):
        """Return the model with the highest single-fold test
        log-likelihood across a shuffled 2-fold split.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        split_method = KFold(n_splits=2, shuffle=True)
        highest_logL = float('-inf')
        best_model = None

        for n_components in range(self.min_n_components,
                                  self.max_n_components + 1):
            # BUG FIX: the try used to wrap the entire candidate loop, so
            # the first failing state count silently aborted evaluation of
            # every remaining candidate.
            try:
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):

                    x_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)

                    model = GaussianHMM(n_components=n_components,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            x_train, lengths_train)

                    x_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    logL = model.score(x_test, lengths_test)

                    # Track the best fold-level score (by design this keeps
                    # the single best fold, not the fold mean).
                    if logL > highest_logL:
                        highest_logL = logL
                        best_model = model
            except:
                continue

        # May be None when every candidate failed (was an implicit None).
        return best_model
# Example 9
    def select(self):
        """CV selection: best model is the last fold-fit of the state count
        with the highest mean test log-likelihood.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        score_cv = float('-inf')
        model_cv = None

        for component_n in range(self.min_n_components,
                                 self.max_n_components + 1):
            kf = KFold(n_splits=min(3, len(self.lengths)))
            # BUG FIX: the fold-score list must restart for every candidate;
            # the original shared one list across candidates, so later means
            # were polluted by earlier candidates' scores.
            logL_array = []
            model = None

            for train_n, test_n in kf.split(self.sequences):
                try:
                    x_train, len_train = combine_sequences(
                        train_n, self.sequences)
                    x_test, len_test = combine_sequences(
                        test_n, self.sequences)

                    model = GaussianHMM(n_components=component_n,
                                        n_iter=1000).fit(x_train, len_train)
                    logL_array.append(model.score(x_test, len_test))
                except:
                    pass

            # Skip candidates with no successful folds: np.mean([]) is nan,
            # and `model` would have been unbound in the original.
            if not logL_array:
                continue
            mean_score = np.mean(logL_array)

            if mean_score > score_cv:
                score_cv = mean_score
                model_cv = model

        return model_cv
    def select(self):
        """Choose the state count by mean K-fold log-likelihood; words with
        too few sequences are fit and scored on the full data instead.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        kfold = KFold()
        top_model, top_score = None, float('-inf')

        for states in range(self.min_n_components, self.max_n_components + 1):
            try:
                if len(self.sequences) > 2:
                    total, n_folds = 0, 0
                    for tr_idx, te_idx in kfold.split(self.sequences):
                        X_tr, len_tr = combine_sequences(tr_idx, self.sequences)
                        X_te, len_te = combine_sequences(te_idx, self.sequences)
                        model = GaussianHMM(n_components=states,
                                            covariance_type="diag",
                                            n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(X_tr, len_tr)
                        total += model.score(X_te, len_te)
                        n_folds += 1
                    mean_logL = total / n_folds
                else:
                    # Not enough sequences to split: self-score on all data.
                    model = self.base_model(states)
                    mean_logL = model.score(self.X, self.lengths)
                if mean_logL > top_score:
                    top_score, top_model = mean_logL, model
            except:
                pass
        return top_model
# Example 11
    def select(self):
        """CV selection that retrains the winner on the full data; falls
        back to the constant-state model for words with too few sequences.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        split_method = KFold(n_splits=3)
        model_and_scores = []
        # BUG FIX: best_model could be referenced while unbound (NameError
        # on the return) when the fallback fit below failed.
        best_model = None
        if len(self.sequences) > 2:
            for current_n_components in range(self.min_n_components, self.max_n_components + 1):
                scores = []
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    train_X, train_lenghts = combine_sequences(cv_train_idx, self.sequences)
                    test_X, test_lenghts = combine_sequences(cv_test_idx, self.sequences)

                    try:
                        hmm_model = GaussianHMM(n_components=current_n_components, covariance_type="diag", n_iter=1000,
                                                random_state=self.random_state, verbose=False).fit(train_X, train_lenghts)
                        scores.append(hmm_model.score(test_X, test_lenghts))
                        if self.verbose:
                            print("model created for {} with {} states".format(self.this_word, current_n_components))

                    except:
                        if self.verbose:
                            print("failure on {} with {} states".format(self.this_word, current_n_components))
                # BUG FIX: skip candidates with no scored folds (np.mean([])
                # is nan and poisons the ranking) and guard the full-data
                # refit, which used to raise uncaught.
                if not scores:
                    continue
                try:
                    # Before ranking, train the candidate on the full data.
                    hmm_model = GaussianHMM(n_components=current_n_components, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                except:
                    continue
                model_and_scores.append([hmm_model, np.mean(scores)])

            if model_and_scores:
                best_model, score = max(model_and_scores, key=lambda item: item[1])
        else:
            try:
                best_model = GaussianHMM(n_components=self.n_constant, covariance_type="diag", n_iter=1000,
                                         random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
            except:
                if self.verbose:
                    print("failure on {} altogether".format(self.this_word))

        return best_model
# Example 12
    def select(self):
        """Return the model whose state count gives the best mean 2-fold
        test log-likelihood (None when every candidate fails).
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        winner, winner_score = None, float('-inf')

        for n_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                fold_scores = []
                fitted = None
                for train_idx, test_idx in KFold(n_splits=2).split(self.sequences):
                    X_tr, len_tr = combine_sequences(train_idx, self.sequences)
                    X_te, len_te = combine_sequences(test_idx, self.sequences)
                    fitted = GaussianHMM(n_components=n_states,
                                         covariance_type="diag",
                                         n_iter=1000,
                                         random_state=self.random_state,
                                         verbose=False).fit(X_tr, len_tr)
                    if fitted:
                        fold_scores.append(fitted.score(X_te, len_te))

                mean_logL = np.mean(fold_scores)
                if mean_logL > winner_score:
                    winner_score, winner = mean_logL, fitted
            except:
                pass

        return winner
# Example 13
    def select(self):
        """CV selection driven through ``self.base_model``.

        NOTE(review): ``self.base_model`` appears to read ``self.X`` /
        ``self.lengths``, which is why the training fold is temporarily
        installed on the instance — confirm against the base class.
        """
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        kf = KFold(n_splits=3, shuffle=False, random_state=None)
        cv_scores = []
        orig_X, orig_lengths = self.X, self.lengths

        for states in range(self.min_n_components, self.max_n_components + 1):
            # BUG FIX: per-candidate score list — the original shared one
            # list across candidates AND never appended in the CV branch,
            # so the CV mean was computed over stale/empty data.
            likelihoods = []
            try:
                if len(self.sequences) > 2:
                    for train_index, test_index in kf.split(self.sequences):
                        try:
                            # BUG FIX: the original permanently clobbered
                            # self.X / self.lengths with the training fold,
                            # corrupting the instance for later callers;
                            # restore them after every fold.
                            self.X, self.lengths = combine_sequences(
                                train_index, self.sequences)
                            X_test, lengths_test = combine_sequences(
                                test_index, self.sequences)

                            hmm_model = self.base_model(states)
                            likelihoods.append(
                                hmm_model.score(X_test, lengths_test))
                        finally:
                            self.X, self.lengths = orig_X, orig_lengths
                else:
                    hmm_model = self.base_model(states)
                    likelihoods.append(hmm_model.score(self.X, self.lengths))

                if likelihoods:
                    cv_scores.append((np.mean(likelihoods), hmm_model))

            except Exception:
                pass

        return max(cv_scores, key=lambda x: x[0])[1] if cv_scores else None
# Example 14
    def select(self):
        """Select via mean K-fold test log-likelihood; returns the last
        fold-fit model of the best candidate (None when all fail).
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = -float("Inf")
        best_model = None

        for num_states in range(self.min_n_components,
                                self.max_n_components + 1):

            try:
                split_method = KFold(n_splits=min(3, len(self.sequences)))

                # BUG FIX: the fold-score list must be re-created for each
                # candidate; it used to grow across candidates, so the mean
                # mixed scores from every state count tried so far.
                logs_array = []
                # Cross-validate: fit on the training fold, score the test fold.
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    X_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    X_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    model = GaussianHMM(n_components=num_states,
                                        n_iter=1000).fit(
                                            X_train, lengths_train)
                    logs_array.append(model.score(X_test, lengths_test))
                mean_score = np.mean(logs_array)

                # Update the best model
                if mean_score > best_score:
                    best_score = mean_score
                    best_model = model

            except:
                pass

        return best_model
    def select(self):
        """Rank candidate state counts by their mean 2-fold test
        log-likelihood and return the best-ranked model.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        kfold = KFold(n_splits=2)
        ranked = []  # (mean log-likelihood, fitted model) per candidate

        for n_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                candidate = GaussianHMM(n_components=n_states,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False)
                fold_logs = []
                for train_idx, test_idx in kfold.split(self.sequences):
                    # Fit on the training fold...
                    X_tr, len_tr = combine_sequences(train_idx, self.sequences)
                    candidate.fit(X_tr, len_tr)
                    # ...then score the held-out fold.
                    X_te, len_te = combine_sequences(test_idx, self.sequences)
                    fold_logs.append(candidate.score(X_te, len_te))
                ranked.append((np.mean(fold_logs), candidate))
            except:
                continue

        if not ranked:
            return None
        return max(ranked, key=lambda pair: pair[0])[1]
# Example 16
    def select(self):
        """Mean K-fold CV selection; degenerate words (fewer than two
        sequences) fall back to the constant-state model.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        if len(self.sequences) < 2:
            return self.base_model(self.n_constant)

        champion, champion_score = None, float('-inf')

        for n_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                fold_scores = []
                candidate = self.base_model(n_states)
                kfold = KFold(n_splits=min(3, len(self.lengths)))

                for train_idx, test_idx in kfold.split(self.sequences):
                    X_tr, len_tr = combine_sequences(train_idx, self.sequences)
                    X_te, len_te = combine_sequences(test_idx, self.sequences)
                    candidate.fit(X_tr, len_tr)
                    fold_scores.append(candidate.score(X_te, len_te))

                mean_logL = np.mean(fold_scores)
                if mean_logL > champion_score:
                    champion_score, champion = mean_logL, candidate
            except:
                pass
        return champion
# Example 17
    def select(self):
        """Select by total K-fold test log-likelihood, then retrain the
        winning state count on the full data set.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        n_components = None
        logL = float('-inf')

        try:
            split_method = KFold(n_splits=min(3, len(self.sequences)))
        except:
            return None

        for tmp_n_components in range(self.min_n_components, self.max_n_components + 1):
            try:
                tmp_logL = 0
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    train_x, train_length = combine_sequences(cv_train_idx, self.sequences)
                    test_x, test_length = combine_sequences(cv_test_idx, self.sequences)
                    tmp_hmm_model = GaussianHMM(n_components=tmp_n_components, covariance_type="diag", n_iter=1000,
                                                random_state=self.random_state, verbose=False).fit(train_x, train_length)
                    tmp_logL += tmp_hmm_model.score(test_x, test_length)
                # Note: this selector compares total (not mean) fold scores.
                if tmp_logL > logL:
                    logL = tmp_logL
                    n_components = tmp_n_components
            except:
                pass

        # BUG FIX: the total-failure bailout was gated on self.verbose (and
        # used `== None`); in the non-verbose case the code fell through to
        # fit with n_components=None and relied on the except to return
        # None. Bail out explicitly either way.
        if n_components is None:
            if self.verbose:
                print('Error MIHMRNLR: this_word={}'.format(self.this_word))
            return None

        try:
            return GaussianHMM(n_components=n_components, covariance_type="diag", n_iter=1000,
                               random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
        except:
            return None
# Example 18
    def select(self):
        """Average K-fold CV; returns the model from the last fold of the
        best-scoring state count, or the constant model when the word has
        only one sequence.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        n_splits = min(len(self.lengths), 3)
        if n_splits == 1:
            # A single sequence cannot be folded.
            return self.base_model(self.n_constant)

        kfold = KFold(n_splits)
        winner = None
        winner_score = -float('inf')
        winner_states = self.min_n_components

        for n_states in range(self.min_n_components, self.max_n_components + 1):
            fold_scores = []
            try:
                for train_idx, test_idx in kfold.split(self.sequences):
                    X_tr, len_tr = combine_sequences(train_idx, self.sequences)
                    X_te, len_te = combine_sequences(test_idx, self.sequences)
                    fitted = GaussianHMM(n_components=n_states,
                                         covariance_type='diag',
                                         n_iter=1000,
                                         random_state=self.random_state,
                                         verbose=False).fit(X_tr, len_tr)
                    fold_scores.append(fitted.score(X_te, len_te))
                mean_logL = np.average(fold_scores)
                if mean_logL > winner_score:
                    winner_score = mean_logL
                    winner_states = n_states
                    winner = fitted
            except:
                pass
        return winner
# Example 19
    def select(self):
        """Mean K-fold CV selection; best model is the last fold-fit of the
        winning state count (None when every candidate fails).
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("-inf")
        best_model = None

        for num_states in range(self.min_n_components,
                                self.max_n_components + 1):

            try:
                split_method = KFold(n_splits=min(3, len(self.lengths)))

                # BUG FIX: reset the fold scores per candidate — the list
                # used to accumulate across every state count, corrupting
                # each candidate's mean with earlier candidates' scores.
                scores_array = []
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    sequences_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    sequences_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    model = GaussianHMM(n_components=num_states,
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            sequences_train, lengths_train)
                    scores_array.append(
                        model.score(sequences_test, lengths_test))

                mean_score = np.mean(scores_array)

                if mean_score > best_score:
                    best_score = mean_score
                    best_model = model

            except:
                pass

        return best_model
# Example 20
    def select(self):
        """Refit a base model per fold and keep the state count with the
        best mean test log-likelihood.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        max_score = float("-inf")
        best_model = None

        split_method = KFold(min(len(self.sequences), 3))  # api default = 3
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = self.base_model(n)

                logLs = []
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    try:
                        train_X, train_lengths = combine_sequences(
                            cv_train_idx, self.sequences)
                        test_X, test_lengths = combine_sequences(
                            cv_test_idx, self.sequences)

                        model.fit(train_X, train_lengths)
                        logLs.append(model.score(test_X, test_lengths))

                    except:
                        pass
                # BUG FIX: skip candidates with no scored folds —
                # np.mean([]) returns nan (emitting a RuntimeWarning) and
                # only survived by accident because nan comparisons are
                # always False.
                if not logLs:
                    continue
                mean_score = np.mean(logLs)

                if mean_score > max_score:
                    max_score = mean_score
                    best_model = model
            except:
                pass

        return best_model
# ---- Exemplo n.º 21 ----
    def select(self):
        """Choose the number of HMM states by K-fold cross-validation.

        When the word has a single sequence, CV is impossible, so the
        model is scored on the full data instead. Returns a model built
        with the best state count (``n_constant`` if every candidate
        failed to train or score).
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_logL = float("-inf")
        best_num_components = self.n_constant
        single_sequence = len(self.sequences) == 1
        # KFold needs at least 2 sequences to produce splits.
        split_method = None if single_sequence else KFold(
            n_splits=min(3, len(self.sequences)))

        for components in range(self.min_n_components,
                                self.max_n_components + 1):
            try:
                if single_sequence:
                    # No CV possible: score the model on all the data.
                    model = self.base_model(components)
                    logL = model.score(self.X, self.lengths)
                else:
                    fold_logLs = []
                    for cv_train_idx, cv_test_idx in split_method.split(
                            self.sequences):
                        X_train, lengths_train = combine_sequences(
                            cv_train_idx, self.sequences)
                        X_test, lengths_test = combine_sequences(
                            cv_test_idx, self.sequences)
                        model = self.base_model_CV(components, X_train,
                                                   lengths_train)
                        fold_logLs.append(model.score(X_test, lengths_test))
                    if not fold_logLs:
                        continue
                    # Average once instead of recomputing it twice.
                    logL = np.average(fold_logLs)

                if logL > best_logL:
                    best_logL = logL
                    best_num_components = components
            except Exception:
                # Skip state counts that fail to train or score.
                pass

        return self.base_model(best_num_components)
    def select(self):
        """Select the best model using cross-validated log-likelihood.

        NOTE(review): as in the original, the model is built by
        ``base_model`` (fit on the full data) and only *scored* on each
        test fold — the training fold is never used. Preserved as-is.

        Fixes: ``hmm_model`` could be unbound (NameError) when
        ``base_model`` raised on the first fold, and failed folds were
        scored as ``0``, which beats any genuine (negative) logL.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        best_model = None

        for n in range(self.min_n_components, self.max_n_components + 1):
            hmm_model = None  # guard: base_model may raise before binding
            sum_of_logL = 0.0
            number_of_iterations = 0
            number_of_splits = min(3, len(self.sequences))

            if number_of_splits > 1:
                kfold = KFold(n_splits=number_of_splits)
                for _, cv_testing_set in kfold.split(self.sequences):
                    testing, length_of_testing = combine_sequences(
                        cv_testing_set, self.sequences)
                    try:
                        hmm_model = self.base_model(n)
                        sum_of_logL += hmm_model.score(testing,
                                                       length_of_testing)
                        number_of_iterations += 1
                    except Exception:
                        # Skip failed folds instead of counting them as 0.
                        continue

                if number_of_iterations == 0:
                    score = float('-inf')
                else:
                    score = sum_of_logL / number_of_iterations
            else:
                # Too few sequences to split: score on all the data.
                try:
                    hmm_model = self.base_model(n)
                    score = hmm_model.score(self.X, self.lengths)
                except Exception:
                    hmm_model = None
                    score = float('-inf')

            if hmm_model is not None and score > best_score:
                best_score = score
                best_model = hmm_model

        return best_model
# ---- Exemplo n.º 23 ----
    def select(self):
        """Pick the state count with the best mean CV log-likelihood.

        Each candidate is trained on the K-fold training splits and
        scored on the matching test splits; the winner is refit on all
        the data via ``base_model``.

        Fix: the original initialized ``best_num_components`` to None,
        so ``base_model(None)`` crashed when every candidate failed.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_logL = float('-inf')
        # Safe fallback if no candidate ever produces a score.
        best_num_components = self.n_constant

        for num_states in range(self.min_n_components, self.max_n_components + 1):
            logL_arr = []

            try:
                # At most 3 folds, never more than the number of sequences.
                split_method = KFold(n_splits=min(len(self.lengths), 3))
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    train_X, train_Xlengths = combine_sequences(cv_train_idx, self.sequences)
                    test_X, test_Xlengths = combine_sequences(cv_test_idx, self.sequences)

                    try:
                        model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(train_X, train_Xlengths)
                        logL_arr.append(model.score(test_X, test_Xlengths))
                    except Exception:
                        # Skip folds where training or scoring fails.
                        pass

                if logL_arr:
                    mean_logL = np.mean(logL_arr)
                else:
                    mean_logL = float('-inf')

                if mean_logL > best_logL:
                    best_logL = mean_logL
                    best_num_components = num_states
            except Exception:
                # e.g. KFold with fewer than 2 sequences.
                pass

        return self.base_model(best_num_components)
# ---- Exemplo n.º 24 ----
    def select(self):
        '''
        Select the number of HMM states by K-fold cross-validation.

        Split the dataset into train/test folds, fit on each training
        fold, score on the matching test fold, and average the fold
        scores — that average is the candidate's score. The best
        candidate's state count is refit over all the data and returned.
        '''

        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_component_score = float('-inf')
        best_n = self.min_n_components

        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                # Never request more folds than there are sequences.
                split_method = KFold(n_splits=min(len(self.lengths), 3))
                scores = []
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    x_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    x_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    model = GaussianHMM(n_components=n, n_iter=1000).fit(
                        x_train, lengths_train)
                    scores.append(model.score(x_test, lengths_test))
                component_score = sum(scores) / len(scores)
                if component_score > best_component_score:
                    best_component_score = component_score
                    best_n = n
            except Exception:
                # Narrowed from a bare except (which also swallowed
                # KeyboardInterrupt/SystemExit); skip failed candidates.
                continue

        return self.base_model(best_n)
# ---- Exemplo n.º 25 ----
    def select(self):
        """Cross-validation selector: highest mean test-fold logL wins.

        Falls back to the constant-sized model when there is too little
        data to split or no candidate trained successfully.

        Fix: ``KFold(random_state=...)`` without ``shuffle=True`` is a
        ValueError in scikit-learn >= 0.24 (and was silently ignored
        before), so the argument is dropped — the unshuffled folds are
        identical either way.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("-inf")  # keep highest
        best_model = None
        n_splits = 3

        for num_states in range(self.min_n_components, self.max_n_components + 1):
            # Not enough sequences to build the folds at all.
            if len(self.sequences) < n_splits:
                break

            scores = []
            model = None
            split_method = KFold(n_splits=n_splits)
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                # Split the sequences into training / held-out halves.
                x_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                x_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=num_states,
                                        n_iter=1000,).fit(x_train, lengths_train)
                    scores.append(model.score(x_test, lengths_test))
                except Exception:
                    # Preserve original behaviour: abandon the remaining
                    # folds for this state count on the first failure.
                    break

            avg = np.average(scores) if scores else float("-inf")
            if avg > best_score:
                best_score, best_model = avg, model

        if best_model is None:
            return self.base_model(self.n_constant)
        else:
            return best_model
    def select(self):
        """CV-based selection: best mean test-fold log-likelihood wins.

        ``base_model`` fits on ``self.X``/``self.lengths``, so those are
        temporarily rebound to each training fold and restored afterwards
        — the original permanently left the last fold on the instance,
        so the returned model was trained on a fraction of the data.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        all_n_components = []
        # Pin n_splits=3 (the historical KFold default); newer
        # scikit-learn defaults to 5, which breaks the len > 2 guard.
        split_method = KFold(n_splits=3)
        all_scores = []  # Store each CV value
        full_X, full_lengths = self.X, self.lengths
        try:
            for n_components in range(self.min_n_components,
                                      self.max_n_components + 1):
                try:
                    if len(self.sequences
                           ) > 2:  # Check if there are enough data to split
                        scores = []
                        for cv_train_idx, cv_test_idx in split_method.split(
                                self.sequences):
                            # base_model trains on self.X: point it at
                            # the training fold only.
                            self.X, self.lengths = combine_sequences(
                                cv_train_idx, self.sequences)
                            X_test, lengths_test = combine_sequences(
                                cv_test_idx, self.sequences)
                            model = self.base_model(n_components)
                            scores.append(model.score(X_test, lengths_test))
                        all_scores.append(np.mean(scores))
                    else:
                        # Too little data for CV: score on everything.
                        model = self.base_model(n_components)
                        all_scores.append(model.score(self.X, self.lengths))
                    all_n_components.append(n_components)
                except Exception:
                    # eliminate non-viable models from consideration
                    pass
        finally:
            # Restore the full data so the final model (and any later
            # callers) see a clean instance.
            self.X, self.lengths = full_X, full_lengths

        best_num_components = all_n_components[np.argmax(
            all_scores)] if all_scores else self.n_constant
        return self.base_model(best_num_components)
# ---- Exemplo n.º 27 ----
    def select(self):
        """Cross-validation selector (repaired).

        The original body was syntactically invalid — the averaging and
        best-model bookkeeping sat between ``try`` and ``except`` at the
        wrong indentation — and the function never returned. This
        restores the evident intent: average each candidate's test-fold
        scores and return the best model. Also restores ``self.X`` /
        ``self.lengths`` after the temporary training-fold rebinding.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_cv = float('-Inf')
        best_model = None
        full_X, full_lengths = self.X, self.lengths
        for p in range(self.min_n_components, self.max_n_components + 1):
            splits = min(3, len(self.sequences))
            if splits < 2:
                continue
            split_method = KFold(splits)
            accum_score = 0
            try:
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    # make the model automatically fit to training data
                    self.X, self.lengths = combine_sequences(cv_train_idx, self.sequences)
                    test_x, test_length = combine_sequences(cv_test_idx, self.sequences)
                    model = self.base_model(num_states=p)
                    accum_score += model.score(test_x, test_length)
                cv = accum_score / float(splits)  # normalize over all model splits
                if cv > best_cv:
                    best_cv = cv
                    best_model = model
            except Exception:
                continue
            finally:
                # Undo the temporary rebinding of the training data.
                self.X, self.lengths = full_X, full_lengths
        return best_model
# ---- Exemplo n.º 28 ----
    def select(self):
        """Select via cross-validation on the word's sequences.

        ``base_model`` trains on ``self.X``/``self.lengths``, so they
        are rebound to each training fold and restored afterwards (the
        original left the last fold assigned to the instance).

        Fix: ``KFold()`` now defaults to 5 splits (was 3), which fails
        for words with 3-4 sequences; the fold count is capped instead.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # 2 folds for tiny words (as before), otherwise at most 3.
        n_splits = max(2, min(3, len(self.lengths)))
        split_method = KFold(n_splits=n_splits)
        best_model = None
        highest_score = float('-inf')
        full_X, full_lengths = self.X, self.lengths
        try:
            for n_component in range(self.min_n_components,
                                     self.max_n_components + 1):
                try:
                    test_scores = []
                    for cv_train_idx, cv_test_idx in split_method.split(
                            self.sequences):
                        # Point base_model at the training fold only.
                        self.X, self.lengths = combine_sequences(
                            cv_train_idx, self.sequences)
                        model = self.base_model(n_component)

                        test_X, test_lengths = combine_sequences(
                            cv_test_idx, self.sequences)
                        test_scores.append(model.score(test_X, test_lengths))

                    score = np.mean(test_scores)
                    if score > highest_score:
                        best_model = model
                        highest_score = score
                except Exception:
                    continue
        finally:
            # Restore the full training data on the instance.
            self.X, self.lengths = full_X, full_lengths

        if best_model:
            return best_model
        else:
            return self.base_model(self.n_constant)
	def select(self):
		"""Select the state count with the best mean CV log-likelihood.

		Fixes from review: the original compared ``logL / running_total``
		(a meaningless ratio of negative numbers) against a ``0.0``
		sentinel, so genuine (negative) log-likelihoods could rarely win;
		it also left ``self.X``/``self.lengths`` bound to the last
		training fold. Here each candidate's fold scores are averaged,
		compared against -inf, and the instance data is restored.
		"""
		warnings.filterwarnings("ignore", category=DeprecationWarning)
		best_avg_logL = float('-inf')
		best_num_components = self.n_constant
		split_method = KFold(n_splits = 3, shuffle = False, random_state = None)
		full_X, full_lengths = self.X, self.lengths
		try:
			for n_comp in range(self.min_n_components, self.max_n_components+1):
				try:
					fold_logLs = []
					for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
						Xn, LengthsN = combine_sequences(cv_test_idx,self.sequences)
						# base_model trains on self.X: rebind to the fold.
						self.X, self.lengths = combine_sequences(cv_train_idx,self.sequences)
						tr_model = self.base_model(n_comp)
						fold_logLs.append(tr_model.score(Xn, LengthsN))
					if fold_logLs and np.mean(fold_logLs) > best_avg_logL:
						best_avg_logL = np.mean(fold_logLs)
						best_num_components = n_comp
				except Exception:
					pass
		finally:
			# Restore the full data before building the final model.
			self.X, self.lengths = full_X, full_lengths

		selected = self.base_model(best_num_components)
		return selected
# ---- Exemplo n.º 30 ----
    def select(self):
        """Select the best state count by mean K-fold test log-likelihood.

        Falls back to ``n_constant`` when there are too few sequences to
        split or when no candidate trains successfully.

        Fixes: ``KFold(random_state=...)`` without ``shuffle=True`` is a
        ValueError in modern scikit-learn (dropped; folds unchanged),
        and the old ``(-inf, 0)`` seed tuple could return
        ``base_model(0)`` when every candidate failed.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        n_splits = 3
        # Guard once, before building folds: too little data means no CV.
        if len(self.sequences) < n_splits:
            return self.base_model(self.n_constant)

        split_method = KFold(n_splits=n_splits)
        scores = []  # (mean CV logL, n_components) per viable candidate

        for n_components in range(self.min_n_components, self.max_n_components + 1):
            try:
                cv_scores = []
                for train_index, test_index in split_method.split(self.sequences):
                    X_train, lengths_train = combine_sequences(train_index, self.sequences)
                    X_test,  lengths_test  = combine_sequences(test_index, self.sequences)

                    hmm_model = GaussianHMM(n_components=n_components, covariance_type="diag", n_iter=1000, random_state=self.random_state, verbose=False).fit(X_train, lengths_train)
                    cv_scores.append(hmm_model.score(X_test, lengths_test))

                scores.append((np.mean(cv_scores), n_components))
            except Exception:
                # Candidate failed to train or score; drop it.
                pass

        if not scores:
            return self.base_model(self.n_constant)

        _, best_num_components = max(scores)
        return self.base_model(best_num_components)
# ---- Exemplo n.º 31 ----
    def select(self):
        """Choose the state count with the best average CV score.

        Models are re-fit on each training fold and scored on the
        held-out fold; the fold average ranks the candidates.

        Fixes: magic ``-100000000`` sentinel replaced with -inf; empty
        folds no longer risk ZeroDivisionError; and the function falls
        back to the constant model instead of returning ``None`` (which
        crashed downstream callers).
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        best_model = None
        n_splits = min(3, len(self.sequences))

        for n_components in range(self.min_n_components,
                                  self.max_n_components + 1):
            count = 0
            total_score = 0.0
            try:
                splits = KFold(n_splits)
                current_model = self.base_model(n_components)
                for cv_train_idx, cv_test_idx in splits.split(self.sequences):
                    cv_train_X, cv_train_len = combine_sequences(
                        cv_train_idx, self.sequences)
                    cv_test_X, cv_test_len = combine_sequences(
                        cv_test_idx, self.sequences)

                    # Re-fit on the fold's training half, score the rest.
                    current_model = current_model.fit(cv_train_X, cv_train_len)
                    total_score += current_model.score(cv_test_X, cv_test_len)
                    count += 1

                if count == 0:
                    continue
                avg_score = total_score / count

                if avg_score > best_score:
                    best_score = avg_score
                    best_model = current_model
            except Exception:
                # Candidate failed (KFold too large, fit/score error).
                pass

        if best_model is None:
            return self.base_model(self.n_constant)
        return best_model
    def select(self):
        """K-fold CV selection over the word's sequences.

        Repairs several defects in the original: it split
        ``self.sequences[0]`` (the frames of the first sequence) rather
        than the sequence list, built the *test* fold from the training
        indices, never updated ``max_score`` (so the first model always
        stuck), and the range stopped one short of ``max_n_components``.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_model = None
        max_score = float('-inf')
        if len(self.sequences) < 2:
            # Cannot cross-validate a single sequence.
            return self.base_model(self.n_constant)
        split_method = KFold(min(3, len(self.sequences)))

        for number in range(self.min_n_components, self.max_n_components + 1):
            set_score = []
            for cv_train, cv_test in split_method.split(self.sequences):
                try:
                    X_train, lengths_train = combine_sequences(cv_train, self.sequences)
                    X_test, lengths_test = combine_sequences(cv_test, self.sequences)

                    model = self.base_model(number)
                    model.fit(X_train, lengths_train)
                    set_score.append(model.score(X_test, lengths_test))
                except Exception:
                    # Skip folds that fail to fit or score.
                    pass

            # set_score nonempty implies `model` is bound.
            if set_score and np.mean(set_score) > max_score:
                max_score = np.mean(set_score)
                best_model = model
        return best_model
    def select(self):
        """CV selection; repaired from a version that could never pick.

        Original defects: ``best_score`` started at +inf so no candidate
        could ever beat it; ``self_base_model`` was a NameError silenced
        by the broad except (the function always returned the constant
        model); ``random_state`` without ``shuffle`` is an error in
        modern scikit-learn; and the test fold was combined twice.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("-inf")  # was +inf: nothing could win
        best_model = self.base_model(self.n_constant)

        for n in range(self.min_n_components, self.max_n_components + 1):
            scores = []
            n_splits = 2

            if (len(self.sequences) < n_splits):
                break

            split_method = KFold(n_splits=n_splits)
            try:
                for train_idx, test_idx in split_method.split(self.sequences):
                    X_train, len_train = combine_sequences(
                        train_idx, self.sequences)
                    X_test, len_test = combine_sequences(
                        test_idx, self.sequences)
                    # NOTE(review): base_model fits on the full data;
                    # X_train/len_train are computed but unused, as in
                    # the original design — confirm intent with author.
                    model = self.base_model(n)
                    scores.append(model.score(X_test, len_test))
                score = np.mean(scores)
                if score > best_score:
                    best_score = score
                    best_model = model

            except Exception:
                continue

        return best_model
# ---- Exemplo n.º 34 ----
    def select(self):
        """CV selector; fixes the broken split call and a name error.

        The original called the ``KFold`` object itself instead of its
        ``split`` method (TypeError) and assigned the undefined name
        ``model`` instead of ``train_model`` — both silenced by the bare
        except, so it always fell back to ``n_constant``. It also left
        ``self.X``/``self.lengths`` bound to the last training fold.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_cv = float('-inf')
        best_model = None
        # Cap folds at 3 and floor at 2; KFold() now defaults to 5,
        # which fails for words with few sequences.
        split_method = KFold(n_splits=max(2, min(3, len(self.sequences))))
        full_X, full_lengths = self.X, self.lengths
        try:
            for n in range(self.min_n_components, self.max_n_components + 1):
                try:
                    scores = []
                    for train_index, test_index in split_method.split(self.sequences):
                        # base_model fits on self.X: rebind to the fold.
                        self.X, self.lengths = combine_sequences(train_index, self.sequences)
                        train_model = self.base_model(n)
                        X_test, lengths_test = combine_sequences(test_index, self.sequences)
                        scores.append(train_model.score(X_test, lengths_test))
                    cv = np.mean(scores)
                    if cv > best_cv:
                        best_cv = cv
                        best_model = train_model
                except Exception:
                    continue
        finally:
            # Restore the full data before any final model fitting.
            self.X, self.lengths = full_X, full_lengths
        return best_model if best_model else self.base_model(self.n_constant)