Example No. 1
def pretrain_sda_cg(sda, train_names, read_window, read_algo, read_rank,
                    window_size, pretraining_epochs, corruption_levels):
    ## Pre-train layer-wise
    print '... getting the pretraining functions'
    import numpy
    import scipy.optimize
    from functools import partial  # used to bind da_index into f/fprime below

    for i in xrange(sda.n_layers):
        train_reader = ICHISeqDataReader(train_names)
        n_train_patients = len(train_names)

        for patients in xrange(n_train_patients):
            train_set_x, train_set_y = train_reader.read_next_doc(
                algo=read_algo, window=read_window, rank=read_rank)
            pretraining_fn, pretraining_update = pretraining_functions_sda_cg(
                sda=sda,
                train_set_x=train_set_x,
                window_size=window_size,
                corruption_levels=corruption_levels)
            print '... pre-training the model'
            # using scipy conjugate gradient optimizer
            print("Optimizing using scipy.optimize.fmin_cg...")
            best_w_b = scipy.optimize.fmin_cg(
                f=partial(pretraining_fn, da_index=i),
                x0=numpy.zeros((sda.dA_layers[i].n_visible + 1) *
                               sda.dA_layers[i].n_hidden,
                               dtype=sda.dA_layers[i].input.dtype),
                fprime=partial(pretraining_update, da_index=i),
                maxiter=pretraining_epochs)
    return sda
Example No. 2
def test_hmm(gen_hmm, test_names, read_window, read_algo, read_rank):
    
    test_reader = ICHISeqDataReader(test_names)
    n_test_patients = len(test_names)
    
    error_array = []
    
    for i in xrange(n_test_patients):
        #get data divided on sequences with respect to labels
        test_x, test_y = test_reader.read_next_doc(
            algo = read_algo,
            rank = read_rank,
            window = read_window,
            divide = False
        )
        
        #compute mean error value for one patient in test set
        patient_error = mean_error(
            gen_hmm = gen_hmm,
            obs_seq = test_x.get_value(),
            actual_states = test_y.eval()
        )
        
        error_array.append(patient_error)
        print('%s error for patient %s' % (patient_error, test_names[i]))

        gc.collect()
    return error_array
Example No. 3
def create_hmm(train_data_names, n_hidden, n_visible, read_algo, read_rank, read_window):

    train_reader = ICHISeqDataReader(train_data_names)

    n_train_patients = len(train_data_names)

    pi_values = numpy.zeros((n_hidden,))
    a_values = numpy.zeros((n_hidden, n_hidden))
    b_values = numpy.zeros((n_hidden, n_visible))
    array_from_hidden = numpy.zeros((n_hidden,))

    for train_patient in xrange(n_train_patients):
        # get data divided on sequences with respect to labels
        train_set_x, train_set_y = train_reader.read_next_doc(algo=read_algo, rank=read_rank, window=read_window)

        pi_values, a_values, b_values, array_from_hidden = update_params_on_patient(
            pi_values=pi_values,
            a_values=a_values,
            b_values=b_values,
            array_from_hidden=array_from_hidden,
            hiddens_patient=train_set_y.eval(),
            visibles_patient=train_set_x.eval(),
            n_hidden=n_hidden,
        )

        gc.collect()

    pi_values, a_values, b_values = finish_training(
        pi_values=pi_values,
        a_values=a_values,
        b_values=b_values,
        array_from_hidden=array_from_hidden,
        n_hidden=n_hidden,
        n_patients=n_train_patients,
    )

    # use the standard hmm model
    hmm_model = hmm.MultinomialHMM(n_components=n_hidden)
    hmm_model.startprob_ = pi_values
    hmm_model.transmat_ = a_values
    hmm_model.n_symbols = n_visible
    hmm_model.emissionprob_ = b_values
    gc.collect()

    return hmm_model
Example No. 4
def test(hmm_model, valid_data, read_algo, read_window, read_rank, predict_algo):

    valid_reader = ICHISeqDataReader(valid_data)
    # accumulate one error value per patient instead of returning only the last
    patient_errors = []

    for valid_patient in valid_data:
        # get data divided on sequences with respect to labels
        valid_set_x, valid_set_y = valid_reader.read_next_doc(algo=read_algo, rank=read_rank, window=read_window)

        patient_error = get_error_on_patient(
            model=hmm_model,
            visible_set=valid_set_x.eval(),
            hidden_set=valid_set_y.eval(),
            algo=predict_algo,
            pat=valid_patient,
            all_labels=True,
        )

        patient_errors.append(patient_error)
        gc.collect()
    return patient_errors
Example No. 5
def validate_model(sda,
                   valid_names,
                   read_window,
                   read_algo,
                   read_rank,
                   window_size):
                       
    valid_reader = ICHISeqDataReader(valid_names)
    valid_errors = []
    for i in xrange(len(valid_names)):
        valid_x, valid_y = valid_reader.read_next_doc(
            algo = read_algo,
            rank = read_rank,
            window = read_window,
            divide = False
        )
        valid_x = valid_x.get_value()
        valid_y = valid_y.eval()
        
        n_valid_times = valid_x.shape[0] - window_size + 1
                    
        new_valid_x = numpy.array(
            [sda.get_da_output(
                    valid_x[time: time + window_size]
                ).ravel()
            for time in xrange(n_valid_times)]
        )

        half_window_size = int(window_size/2)
        new_valid_y = valid_y[
            half_window_size: n_valid_times + half_window_size
        ]

        #compute mean error value for patients in validation set
        pat_error = mean_error(
            gen_hmm = sda.hmm1,
            obs_seq = new_valid_x,
            actual_states = new_valid_y
        )
        valid_errors.append(pat_error)
    return numpy.mean(valid_errors)
Example No. 6
def test_da_params(corruption_level):
    learning_rates = [0.001, 0.003, 0.005, 0.007, 0.009, 0.011, 0.013, 0.015]
    window_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    
    train_data = ['p10a','p011','p013','p014','p020','p022','p040','p045','p048']
    valid_data = ['p09b','p023','p035','p038']
    test_data = ['p09a','p033']
    
    train_reader = ICHISeqDataReader(train_data)
    train_set, train_labels = train_reader.read_all()
    
    valid_reader = ICHISeqDataReader(valid_data)
    valid_set, valid_labels = valid_reader.read_all()

    test_reader = ICHISeqDataReader(test_data)
    test_set, test_labels = test_reader.read_all()
    
    output_folder = '[%s], [%s], [%s]' % (
        ",".join(train_data), ",".join(valid_data), ",".join(test_data))
    
    for lr in learning_rates:
        for ws in window_sizes:
            train_dA(learning_rate=lr,
                     training_epochs=1,
                     window_size=ws,
                     corruption_level=corruption_level,
                     n_hidden=ws*2,
                     train_set=train_set,
                     output_folder=output_folder)
Example No. 7
    def train(self,
              train_names,
              valid_names,
              read_window,
              read_algo,
              read_rank,
              train_epochs):
        for epoch in xrange(train_epochs):
            train_reader = ICHISeqDataReader(train_names)
            n_train_patients = len(train_names)
            # train hmms on data of each patient
            for train_patient in xrange(n_train_patients):
                # get data divided on sequences with respect to labels
                train_set = train_reader.read_next_doc(
                    algo=read_algo,
                    rank=read_rank,
                    window=read_window,
                    divide=True
                )
                for label in xrange(self.n_hmms):
                    train_for_fit = train_set[label].eval().reshape(-1, 1)
                    if train_for_fit.shape[0] > self.hmm_models[label].n_components:
                        self.hmm_models[label].fit(train_for_fit)
                        self.isFitted[label] = True

                error_cur_epoch = self.validate_model(
                    valid_names=valid_names,
                    read_window=read_window,
                    read_algo=read_algo,
                    read_rank=read_rank
                )
                self.valid_error_array.append(
                    [epoch, train_patient, error_cur_epoch]
                )

                gc.collect()
Example No. 8
def test_log_reg(test_names,
                 read_algo,
                 read_window,
                 read_rank,                 
                 classifier,
                 window_size=1):
    test_reader = ICHISeqDataReader(test_names)
    
    index = T.lscalar()
    y = T.iscalar('y')
    
    test_error_array = []    
    
    for pat_num in xrange(len(test_names)):
        test_set_x, test_set_y = test_reader.read_next_doc(
            algo = read_algo,
            window = read_window,
            rank = read_rank
        )
        n_test_samples = test_set_x.get_value(borrow=True).shape[0] - window_size + 1
        # compiling a Theano function that computes the mistakes that are made by
        # the model on a row
        test_model = theano.function(
            inputs=[index],
            outputs=[classifier.errors(y), classifier.predict(), y],
            givens={
                classifier.x: test_set_x[index: index + window_size],
                y: test_set_y[index + window_size - 1]
            }
        )
        
        test_result = [test_model(i) for i in xrange(n_test_samples)]
        test_result = numpy.asarray(test_result)
        test_losses = test_result[:,0]
        test_score = float(numpy.mean(test_losses))*100
                            
        test_error_array.append(test_score)
     
    return test_error_array
Example No. 9
def train_separately():
    all_train = ['p002','p003','p005','p007','p08a','p08b','p09a','p09b',
                 'p10a','p011','p012','p013','p014','p15a','p15b','p016',
                 'p017','p018','p019','p020','p021','p022','p023','p025',
                 'p026','p027','p028','p029','p030','p031','p032','p033',
                 'p034','p035','p036','p037','p038','p040','p042','p043',
                 'p044','p045','p047','p048','p049','p050','p051']
    valid_data = all_train
        
    valid_reader = ICHISeqDataReader(valid_data)
    
    for valid_patient in valid_data:
        #get data divided on sequences with respect to labels
        valid_set_x, valid_set_y = valid_reader.read_next_doc()
        
        patient_error = get_error_on_patient(
            hidden_set=valid_set_y.eval()
        )
        
        print('%s error for patient %s' % (patient_error, valid_patient))

        gc.collect()  
Example No. 10
    def validate_model(self,
                       valid_names,
                       read_window,
                       read_algo,
                       read_rank
                       ):
        valid_reader = ICHISeqDataReader(valid_names)
        valid_errors = []
        for i in xrange(len(valid_names)):
            valid_x, valid_y = valid_reader.read_next_doc(
                algo = read_algo,
                rank = read_rank,
                window = read_window,
                divide = False
            )

            #compute mean error value for patients in validation set
            pat_error = mean_error(
                gen_hmm = self,
                obs_seq = valid_x.get_value(),
                actual_states = valid_y.eval()
            )
            valid_errors.append(pat_error)
        return numpy.mean(valid_errors)
Example No. 11
def finetune_hmm1(sda,
                  n_hiddens,
                  n_hmms,
                  train_names,
                  valid_names,
                  global_epochs,
                  read_rank,
                  read_window,
                  read_algo,
                  window_size,
                  posttrain_algo,
                  posttrain_rank,
                  posttrain_window):
                      
    # set hmm1 layer on sda
    sda.set_hmm1(
        hmm1 = GeneralHMM(
            n_hiddens = n_hiddens,
            n_hmms = n_hmms
        )        
    )
    
    for epoch in xrange(global_epochs):
        train_reader = ICHISeqDataReader(train_names)
        n_train_patients = len(train_names)
        
        #train hmms on data of each patient
        for train_patient in xrange(n_train_patients):
            #get data divided on sequences with respect to labels
            train_set = train_reader.read_next_doc(
                algo = read_algo,
                rank = read_rank,
                window = read_window,
                divide = True
            )
            for label in xrange(n_hmms):
                # eval() yields an array; a set would lose ordering and
                # cannot be sliced, and comparing it to [] is always True
                train_for_label = train_set[label].eval()
                if train_for_label.shape[0] > 0:
                    n_train_times = train_for_label.shape[0] - window_size + 1

                    train_after_sda = numpy.array(
                        [sda.get_da_output(
                            train_for_label[time: time + window_size]
                        ).ravel()
                        for time in xrange(n_train_times)]
                    )

                    if train_after_sda.shape[0] > 0:
                        sda.hmm1.hmm_models[label].fit(
                            train_after_sda.reshape((-1, 1))
                        )
                            
            error_cur_epoch = validate_model(
                sda = sda,
                valid_names = valid_names,
                read_window = read_window,
                read_algo = read_algo,
                read_rank = read_rank,
                window_size = window_size
            )
            sda.hmm1.valid_error_array.append([])
            sda.hmm1.valid_error_array[-1].append(
                epoch*n_train_patients + train_patient
            )
            sda.hmm1.valid_error_array[-1].append(error_cur_epoch)
                
            gc.collect()
            
    return sda
Example No. 12
def train_logistic_sgd(
        read_algo,
        read_window,
        read_rank,
        learning_rate,
        n_epochs,
        train_names,
        valid_names,
        classifier,
        output_folder,
        base_folder,
        window_size=1
    ):
                          
    # read the datasets
    train_reader = ICHISeqDataReader(train_names)
    valid_reader = ICHISeqDataReader(valid_names)
    
    # early-stopping parameters    
    patience_increase = 25  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    best_validation_loss = numpy.inf

    done_looping = False
    iter = 0
    classifier.train_cost_array = []
    classifier.train_error_array = []
    classifier.valid_error_array = []
        
    for pat_num in xrange(len(train_names)):
        pat_epoch = 0
        # go through the training set
        train_set_x, train_set_y = train_reader.read_next_doc(
            algo = read_algo,
            window = read_window,
            rank = read_rank
        )
        valid_set_x, valid_set_y = valid_reader.read_next_doc(
            algo = read_algo,
            window = read_window,
            rank = read_rank
        )
        n_train_samples = train_set_x.get_value(borrow=True).shape[0] - window_size + 1
        n_valid_samples = valid_set_x.get_value(borrow=True).shape[0] - window_size + 1
        
        patience = n_train_samples*2  # look as this many examples regardless
        validation_frequency = patience / 4
        
        train_model, validate_model = training_functions_log_reg_sgd(
            classifier = classifier,
            window_size = window_size
        )
        
        done_looping = False
        
        while (pat_epoch < n_epochs) and (not done_looping):
            cur_train_cost = []
            cur_train_error = []
            for index in xrange(n_train_samples):            
                sample_cost, sample_error, cur_pred, cur_actual = train_model(
                    index = index,
                    train_set_x = train_set_x.get_value(borrow=True),
                    train_set_y = train_set_y.eval(),
                    lr = learning_rate
                )
                # iteration number
                iter = pat_epoch * n_train_samples + index
                    
                cur_train_cost.append(sample_cost)
                cur_train_error.append(sample_error)
            
                if (iter + 1) % validation_frequency == 0:
                    # compute zero-one loss on validation set
                    validation_losses = []
                    for i in xrange(n_valid_samples):
                        validation_loss, cur_pred, cur_actual = validate_model(
                            index = i,
                            valid_set_x = valid_set_x.get_value(borrow=True),
                            valid_set_y = valid_set_y.eval()
                        )
                        validation_losses.append(validation_loss)
        
                    this_validation_loss = float(numpy.mean(validation_losses))*100                 
                    classifier.valid_error_array.append([])
                    classifier.valid_error_array[-1].append(classifier.epoch + float(iter)/n_train_samples)
                    classifier.valid_error_array[-1].append(this_validation_loss)
           
                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss *  \
                            improvement_threshold:
                            patience = max(patience, iter * patience_increase)
            
                        best_validation_loss = this_validation_loss

                if patience*4 <= iter:
                    done_looping = True
                    break
                               
            classifier.train_cost_array.append([])
            classifier.train_cost_array[-1].append(classifier.epoch + float(iter)/n_train_samples)
            classifier.train_cost_array[-1].append(float(numpy.mean(cur_train_cost)))
            cur_train_cost = []
           
            classifier.train_error_array.append([])
            classifier.train_error_array[-1].append(classifier.epoch + float(iter)/n_train_samples)
            classifier.train_error_array[-1].append(float(numpy.mean(cur_train_error)*100))
            cur_train_error = []
                    
            classifier.epoch = classifier.epoch + 1
            pat_epoch = pat_epoch + 1
            gc.collect()
                        
    return classifier
Example No. 13
def finetune_log_layer_sgd(
    sda,
    train_names,
    valid_names,
    read_algo,
    read_window,
    read_rank,
    window_size,
    finetune_lr,
    global_epochs,
    pat_epochs,
    output_folder):
    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing functions for the model
    train_fn, validate_model = build_finetune_functions(
        sda=sda,
        window_size=window_size,
        learning_rate=finetune_lr
    )
    
    train_reader = ICHISeqDataReader(train_names)

    # early-stopping parameters
    patience_increase = 25  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant   
                                  
    best_valid = numpy.inf
    cur_train_cost = []
    cur_train_error = []
    iter = 0
    
    for global_epoch in xrange(global_epochs):
        for pat_num in xrange(len(train_names)):
            done_looping = False
            # go through the training set
            train_set_x, train_set_y = train_reader.read_next_doc(
                algo = read_algo,
                window = read_window,
                rank = read_rank
            )
            n_train_samples = train_set_x.get_value(borrow=True).shape[0] - window_size + 1
            
            patience = n_train_samples*2  # look at this many examples regardless
            validation_frequency = patience / 2  # go through this many samples
                                                 # before checking the network
                                                 # on the validation set
            pat_epoch = 0
    
            while (pat_epoch < pat_epochs) and (not done_looping):
                for index in xrange(n_train_samples):          
                    sample_cost, sample_error, cur_pred, cur_actual = train_fn(
                        index = index,
                        train_set_x = train_set_x.get_value(borrow=True),
                        train_set_y = train_set_y.eval()
                    )
                    
                    # iteration number
                    iter = iter + 1
                        
                    cur_train_cost.append(sample_cost)
                    cur_train_error.append(sample_error)
        
                    if (iter + 1) % validation_frequency == 0:
                        valid_reader = ICHISeqDataReader(valid_names)
                        valid_array = []
                        for valid_pat in xrange(len(valid_names)):
                            valid_set_x, valid_set_y = valid_reader.read_next_doc(
                                algo = read_algo,
                                window = read_window,
                                rank = read_rank
                            )
                            n_valid_samples = valid_set_x.get_value(borrow=True).shape[0] - window_size + 1
                            validation_losses = [
                                validate_model(
                                    index = i,
                                    valid_set_x = valid_set_x.get_value(borrow=True),
                                    valid_set_y = valid_set_y.eval()
                                ) for i in xrange(n_valid_samples)
                            ]
                            this_validation_loss = float(numpy.mean(validation_losses))*100                 
                            valid_array.append(this_validation_loss)
                        valid_mean_error = numpy.mean(valid_array)                        
                        sda.logLayer.valid_error_array.append([])
                        sda.logLayer.valid_error_array[-1].append(sda.logLayer.epoch + float(index)/n_train_samples)
                        sda.logLayer.valid_error_array[-1].append(valid_mean_error)
                        
                        # if we got the best validation score until now
                        if valid_mean_error < best_valid:
        
                            #improve patience if loss improvement is good enough
                            if valid_mean_error < best_valid * \
                                improvement_threshold:
                                patience = max(patience, iter * patience_increase)
        
                            best_valid = valid_mean_error
        
                    if patience*4 <= iter:
                        done_looping = True
                        break
                                   
                sda.logLayer.train_cost_array.append([])
                sda.logLayer.train_cost_array[-1].append(sda.logLayer.epoch)
                sda.logLayer.train_cost_array[-1].append(numpy.mean(cur_train_cost))
                cur_train_cost = []
               
                sda.logLayer.train_error_array.append([])
                sda.logLayer.train_error_array[-1].append(sda.logLayer.epoch)
                sda.logLayer.train_error_array[-1].append(numpy.mean(cur_train_error)*100)
                cur_train_error = []
                        
                sda.logLayer.epoch = sda.logLayer.epoch + 1
                pat_epoch = pat_epoch + 1
                gc.collect()
                            
    visualize_finetuning(
        train_cost=sda.logLayer.train_cost_array,
        train_error=sda.logLayer.train_error_array,
        valid_error=sda.logLayer.valid_error_array,
        window_size=window_size,
        learning_rate=0,
        datasets_folder=output_folder,
        base_folder='finetune_log_reg'
    )

    return sda
Example No. 14
def pretrain_sda_sgd(
        sda,
        train_names,
        valid_names,
        read_window,
        read_algo,
        read_rank,
        window_size,
        pretrain_lr,
        corruption_levels,
        global_epochs,
        pat_epochs):
    # compute number of examples given in training set
    n_train_patients = len(train_names)
    
    pretraining_fns, valid_fns = pretraining_functions_sda_sgd(
        sda=sda, window_size=window_size)

    ## Pre-train layer-wise
    for i in xrange(sda.n_layers):
        cur_dA = sda.dA_layers[i]
        cur_dA.train_cost_array = []
        iter = 0
        for global_epoch in xrange(global_epochs):
            train_reader = ICHISeqDataReader(train_names)
            for patients in xrange(n_train_patients):
                # go through the training set
                train_set_x, train_set_y = train_reader.read_next_doc(
                    algo = read_algo,
                    window = read_window,
                    rank = read_rank
                )
                n_train_samples = train_set_x.get_value(borrow=True).shape[0] - window_size + 1
                
                patience = n_train_samples*2  # look at this many examples regardless
                validation_frequency = patience / 2  # go through this many samples
                                                     # before checking the network
                                                     # on the validation set
                # go through pretraining epochs
                for pat_epoch in xrange(pat_epochs):
                    cur_epoch_cost=[]                               
                    for index in xrange(n_train_samples):
                        # iteration number
                        big_epoch = (global_epoch*n_train_patients + patients)*pat_epochs + pat_epoch
                        iter = iter + 1
                    
                        cur_epoch_cost.append(pretraining_fns[i](
                            index=index,
                            train_set=train_set_x.get_value(borrow=True),
                            corruption=corruption_levels[i],
                            lr=pretrain_lr))
                                 
                        # test on valid set        
                        if (iter + 1) % validation_frequency == 0:
                            valid_reader = ICHISeqDataReader(valid_names)
                            valid_array = []
                            for valid_pat in xrange(len(valid_names)):
                                valid_set_x, valid_set_y = valid_reader.read_next_doc(
                                    algo = read_algo,
                                    window = read_window,
                                    rank = read_rank
                                )
                                n_valid_samples = valid_set_x.get_value(borrow=True).shape[0] - window_size + 1
                                validation_losses = [
                                    valid_fns[i](
                                        index = index,
                                        valid_set = valid_set_x.get_value(borrow=True)
                                    ) for index in xrange(n_valid_samples)]
                                this_validation_loss = float(numpy.mean(validation_losses))*100                 
                                valid_array.append(this_validation_loss)
                            valid_mean_error = numpy.mean(valid_array)                        
                            cur_dA.valid_error_array.append([])
                            cur_dA.valid_error_array[-1].append(
                                big_epoch + float(index)/n_train_samples
                            )
                            cur_dA.valid_error_array[-1].append(valid_mean_error)
                                        
                    cur_dA.train_cost_array.append([])
                    cur_dA.train_cost_array[-1].append(big_epoch)
                    cur_dA.train_cost_array[-1].append(numpy.mean(cur_epoch_cost))
                    
                gc.collect()
            
    return sda
Example No. 15
def finetune_hmm2(sda,
                  read_window,
                  read_algo,
                  read_rank,
                  posttrain_rank,
                  posttrain_algo,
                  window_size,
                  train_names):
                     
    n_train_patients = len(train_names)

    n_visible = pow(10, posttrain_rank) + 2 - read_window  # input of sda
    n_visible = n_visible - window_size + 1  # output of sda
    # adjust n_visible before allocating b_values, so the emission
    # matrix matches the final number of visible symbols
    if posttrain_algo in ("avg_disp", "filter+avg_disp"):
        n_visible *= 10
    n_hidden = 7

    posttrain_window = sda.da_layers_output_size

    train_reader = ICHISeqDataReader(train_names)

    # create matrices for params of HMM layer
    pi_values = numpy.zeros((n_hidden,))
    a_values = numpy.zeros((n_hidden, n_hidden))
    b_values = numpy.zeros((n_hidden, n_visible))
    array_from_hidden = numpy.zeros((n_hidden,))
        
    for train_patient in xrange(n_train_patients):
        train_set_x, train_set_y = train_reader.read_next_doc(
            algo = read_algo,
            window = read_window,
            rank = read_rank
        )
        train_set_x = train_set_x.get_value()
        train_set_y = train_set_y.eval()
        
        n_train_times = train_set_x.shape[0] - window_size + 1
        
        train_visible_after_sda = numpy.array(
            [sda.get_da_output(
                train_set_x[time: time + window_size]
            ).ravel()
            for time in xrange(n_train_times)]
        )
             
        new_train_visible = create_labels_after_das(
            da_output_matrix = train_visible_after_sda,
            algo = posttrain_algo,
            rank = posttrain_rank,
            window = posttrain_window
        )
        n_patient_samples = len(new_train_visible)
        half_window_size = int(window_size/2)
        new_train_hidden = train_set_y[half_window_size: n_patient_samples + half_window_size]
        
        pi_values, a_values, b_values, array_from_hidden = update_params_on_patient(
            pi_values=pi_values,
            a_values=a_values,
            b_values=b_values,
            array_from_hidden=array_from_hidden,
            hiddens_patient=new_train_hidden,
            visibles_patient=new_train_visible,
            n_hidden=n_hidden
        )
        
        gc.collect()
        
    pi_values, a_values, b_values = finish_training(
        pi_values=pi_values,
        a_values=a_values,
        b_values=b_values,
        array_from_hidden=array_from_hidden,
        n_hidden=n_hidden,
        n_patients=n_train_patients
    )
    
    hmm_model = hmm.MultinomialHMM(
        n_components=n_hidden,
        startprob=pi_values,
        transmat=a_values
    )
    
    hmm_model.n_symbols = n_visible
    hmm_model.emissionprob_ = b_values
    gc.collect()
    print('MultinomialHMM created')
    
    sda.set_hmm2(
        hmm2 = hmm_model
    )
    
    return sda
Example No. 16
def finetune_hmm1(sda,
                  n_components,
                  n_hmms,
                  train_names,
                  valid_names,
                  global_epochs,
                  read_rank,
                  read_window,
                  read_algo,
                  window_size,
                  posttrain_algo,
                  posttrain_rank,
                  posttrain_window,
                  output_folder):
                      
    # set hmm1 layer on sda
    sda.set_hmm1(
        hmm1 = GeneralHMM(
            n_components = n_components,
            n_hmms = n_hmms
        )        
    )
    
    for epoch in xrange(global_epochs):
        train_reader = ICHISeqDataReader(train_names)
        n_train_patients = len(train_names)
        
        #train hmms on data of each patient
        for train_patient in xrange(n_train_patients):
            #get data divided on sequences with respect to labels
            train_set = train_reader.read_next_doc(
                algo = read_algo,
                rank = read_rank,
                window = read_window,
                divide = True
            )
            for label in xrange(n_hmms):
                train_for_label = train_set[label].eval()
                if train_for_label.shape[0] > 0:
                    n_train_times = train_for_label.shape[0] - window_size + 1
                    
                    train_after_sda = numpy.array(
                        [sda.get_da_output(
                            train_for_label[time: time + window_size]
                        ).ravel()
                        for time in xrange(n_train_times)]
                    )
                    
                    if train_after_sda.shape[0] > sda.hmm1.hmm_models[label].n_components:
                        sda.hmm1.hmm_models[label].fit(
                            train_after_sda.reshape((-1, 1))
                        )
                        sda.hmm1.isFitted[label] = True
                            
            error_cur_epoch = validate_model(
                sda = sda,
                valid_names = valid_names,
                read_window = read_window,
                read_algo = read_algo,
                read_rank = read_rank,
                window_size = window_size
            )
            sda.hmm1.valid_error_array.append([])
            sda.hmm1.valid_error_array[-1].append(
                epoch*n_train_patients + train_patient
            )
            sda.hmm1.valid_error_array[-1].append(error_cur_epoch)
                
            gc.collect()
            
    visualize_validating(
        valid_error=sda.hmm1.valid_error_array,
        window_size=window_size,
        datasets_folder=output_folder,
        base_folder='finetune_hmm1'
    )
            
    return sda
Example No. 17
def train_logistic_cg(read_algo, read_window, read_rank, train_names,
                      valid_names, window_size, n_epochs, classifier):

    # read the datasets
    train_reader = ICHISeqDataReader(train_names)
    valid_reader = ICHISeqDataReader(valid_names)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()

    # generate symbolic variables for input
    x = classifier.x  # data, presented as window with x, y, x for each sample
    y = T.iscalar('y')  # labels, presented as int label

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    for pat_num in xrange(len(train_names)):
        # go through the training set
        train_set_x, train_set_y = train_reader.read_next_doc(
            algo=read_algo, window=read_window, rank=read_rank)
        valid_set_x, valid_set_y = valid_reader.read_next_doc(
            algo=read_algo, window=read_window, rank=read_rank)
        n_train_samples = train_set_x.get_value(
            borrow=True).shape[0] - window_size + 1
        n_valid_samples = valid_set_x.get_value(
            borrow=True).shape[0] - window_size + 1

        validate_model = theano.function(
            [index],
            classifier.errors(y),
            givens={
                x: valid_set_x[index:index + window_size],
                y: valid_set_y[index + window_size - 1]
            },
            name="validate")

        #  compile a theano function that returns the cost
        conj_cost = theano.function(
            inputs=[index],
            outputs=[cost, classifier.errors(y),
                     classifier.predict(), y],
            givens={
                x: train_set_x[index:index + window_size],
                y: train_set_y[index + window_size - 1]
            },
            name="conj_cost")

        # compile a theano function that returns the gradient with respect to theta
        conj_grad = theano.function(
            [index],
            T.grad(cost, classifier.theta),
            givens={
                x: train_set_x[index:index + window_size],
                y: train_set_y[index + window_size - 1]
            },
            name="conj_grad")

        train_confusion_matrix = numpy.zeros((7, 7))

        # creates a function that computes the average cost on the training set
        def train_fn(theta_value):
            classifier.theta.set_value(theta_value, borrow=True)
            cur_train_cost = []
            cur_train_error = []
            for i in xrange(n_train_samples):
                sample_cost, sample_error, cur_pred, cur_actual = conj_cost(i)
                cur_train_cost.append(sample_cost)
                cur_train_error.append(sample_error)
                train_confusion_matrix[cur_actual][cur_pred] += 1

            this_train_loss = float(numpy.mean(cur_train_cost))
            classifier.train_cost_array.append([])
            classifier.train_cost_array[-1].append(classifier.epoch)
            classifier.train_cost_array[-1].append(this_train_loss)

            classifier.train_error_array.append([])
            classifier.train_error_array[-1].append(classifier.epoch)
            classifier.train_error_array[-1].append(
                float(numpy.mean(cur_train_error) * 100))

            classifier.epoch += 1

            return this_train_loss

        # creates a function that computes the average gradient of cost with
        # respect to theta
        def train_fn_grad(theta_value):
            classifier.theta.set_value(theta_value, borrow=True)
            grad = conj_grad(0)
            for i in xrange(1, n_train_samples):
                grad += conj_grad(i)
            return grad / n_train_samples

        # creates the validation function
        def callback(theta_value):
            classifier.theta.set_value(theta_value, borrow=True)
            #compute the validation loss
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_samples)
            ]
            this_validation_loss = float(
                numpy.mean(validation_losses) * 100.)
            print('validation error %f %%' % (this_validation_loss))
            classifier.valid_error_array.append([])
            classifier.valid_error_array[-1].append(classifier.epoch)
            classifier.valid_error_array[-1].append(this_validation_loss)

        ###############
        # TRAIN MODEL #
        ###############

        # using scipy conjugate gradient optimizer
        print("Optimizing using scipy.optimize.fmin_cg...")
        best_theta = scipy.optimize.fmin_cg(
            f=train_fn,
            x0=numpy.zeros((classifier.n_in + 1) * classifier.n_out,
                           dtype=x.dtype),
            fprime=train_fn_grad,
            callback=callback,
            disp=0,
            maxiter=n_epochs)
    return classifier
Example No. 18
def test_sda(
    sda,
    test_names,
    read_window,
    read_algo,
    read_rank,
    window_size,
    posttrain_rank,
    posttrain_algo,
    predict_algo='viterbi'):

    test_reader = ICHISeqDataReader(test_names)
    posttrain_window = sda.da_layers_output_size
    
    index = T.lscalar('index')
    test_set_x = T.vector('test_set_x')
    test_set_y = T.ivector('test_set_y')
    y = T.iscalar('y')  # labels, presented as int label
    
    hmm1_error_array = []
    hmm2_error_array = []
    log_reg_errors = []
    
    test_log_reg = theano.function(
        inputs=[
            index,
            test_set_x,
            test_set_y
        ],
        outputs=[sda.logLayer.errors(y), sda.logLayer.predict(), y],
        givens={
            sda.x: test_set_x[index: index + window_size],
            y: test_set_y[index + window_size - 1]
        }
    )    
    
    for test_patient in test_names:
        test_set_x, test_set_y = test_reader.read_next_doc(
            algo = read_algo,
            window = read_window,
            rank = read_rank
        )
                        
        test_set_x = test_set_x.get_value(borrow=True)
        test_set_y = test_set_y.eval()
        
        n_test_times = test_set_x.shape[0] - window_size + 1
        
        test_result = [test_log_reg(
            index = i,
            test_set_x = test_set_x,
            test_set_y = test_set_y) for i in xrange(n_test_times)
        ]
        test_result = numpy.asarray(test_result)
        test_losses = test_result[:,0]
        test_score = float(numpy.mean(test_losses))*100
                            
        log_reg_errors.append(test_score)
                
        test_visible_after_sda = numpy.array(
            [sda.get_da_output(
                test_set_x[time: time+window_size]
            ).ravel()
            for time in xrange(n_test_times)]
        )
        
        half_window_size = int(window_size/2)
        test_y_after_sda = test_set_y[
            half_window_size : n_test_times + half_window_size
        ]
        
        #compute mean error value for patients in validation set hmm1
        pat_error = mean_error(
            gen_hmm = sda.hmm1,
            obs_seq = test_visible_after_sda,
            actual_states = test_y_after_sda
        )
        hmm1_error_array.append(pat_error)
        
                    
        new_test_visible = create_labels_after_das(
            da_output_matrix = test_visible_after_sda,
            algo = posttrain_algo,
            rank = posttrain_rank,
            window = posttrain_window
        )
        
        n_patient_samples = len(new_test_visible)
        new_test_hidden = test_set_y[half_window_size:n_patient_samples+half_window_size]
        
        patient_error = get_error_on_patient(
            model = sda.hmm2,
            visible_set = new_test_visible,
            hidden_set = new_test_hidden,
            algo = predict_algo,
            pat = test_patient,
            all_labels = True
        )
        
        hmm2_error_array.append(patient_error)
        print('%s error (hmm) for patient %s' % (patient_error, test_patient))
        print('%s error (log_reg) for patient %s' % (test_score, test_patient))
        gc.collect()
        
    return hmm1_error_array, hmm2_error_array, log_reg_errors