Example #1
    def Dataset_Generate(self):
        with open(self.hp.Token_Path) as f:
            token_dict = yaml.load(f, Loader=yaml.Loader)

        train_dataset = Dataset(
            token_dict=token_dict,
            pattern_path=self.hp.Train.Train_Pattern.Path,
            metadata_file=self.hp.Train.Train_Pattern.Metadata_File)
        dev_dataset = Dataset(
            token_dict=token_dict,
            pattern_path=self.hp.Train.Eval_Pattern.Path,
            metadata_file=self.hp.Train.Eval_Pattern.Metadata_File)
        inference_dataset = Inference_Dataset(
            token_dict=token_dict,
            pattern_paths=self.hp.Train.Inference_Pattern_in_Train)

        if self.gpu_id == 0:
            logging.info('The number of train patterns = {}.'.format(
                len(train_dataset)))
            logging.info('The number of development patterns = {}.'.format(
                len(dev_dataset)))
            logging.info('The number of inference patterns = {}.'.format(
                len(inference_dataset)))

        collater = Collater(token_dict=token_dict)
        inference_collater = Inference_Collater(token_dict=token_dict)

        self.dataloader_dict = {}
        self.dataloader_dict['Train'] = torch.utils.data.DataLoader(
            dataset=train_dataset,
            sampler=torch.utils.data.DistributedSampler(train_dataset, shuffle=True)
                    if self.hp.Use_Multi_GPU else
                    torch.utils.data.RandomSampler(train_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch_Size,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Dev'] = torch.utils.data.DataLoader(
            dataset=dev_dataset,
            sampler=torch.utils.data.DistributedSampler(dev_dataset, shuffle=True)
                    if self.num_gpus > 1 else
                    torch.utils.data.RandomSampler(dev_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch_Size,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Inference'] = torch.utils.data.DataLoader(
            dataset=inference_dataset,
            sampler=torch.utils.data.SequentialSampler(inference_dataset),
            collate_fn=inference_collater,
            batch_size=self.hp.Inference_Batch_Size
            or self.hp.Train.Batch_Size,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
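Not part of the snippet, but worth noting when reusing it: if the sampler is a DistributedSampler, its set_epoch method has to be called once per epoch so the shuffle order changes between epochs. A minimal, hypothetical consumption sketch (num_epochs and dataloader_dict are assumed to be in scope; in the snippet the dict lives on self):

# Hypothetical training loop; num_epochs and dataloader_dict are assumed.
train_loader = dataloader_dict['Train']
for epoch in range(num_epochs):
    if isinstance(train_loader.sampler, torch.utils.data.DistributedSampler):
        train_loader.sampler.set_epoch(epoch)  # vary the shuffle order across epochs
    for batch in train_loader:
        ...  # forward/backward pass on the collated batch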
Example #2
    def Dataset_Generate(self):
        train_dataset = Dataset(
            pattern_path=self.hp.Train.Train_Pattern.Path,
            metadata_file=self.hp.Train.Train_Pattern.Metadata_File,
            pattern_per_speaker=self.hp.Train.Batch.Train.Pattern_per_Speaker)
        dev_dataset = Dataset(
            pattern_path=self.hp.Train.Eval_Pattern.Path,
            metadata_file=self.hp.Train.Eval_Pattern.Metadata_File,
            pattern_per_speaker=self.hp.Train.Batch.Eval.Pattern_per_Speaker)
        inference_dataset = Dataset(
            pattern_path=self.hp.Train.Eval_Pattern.Path,
            metadata_file=self.hp.Train.Eval_Pattern.Metadata_File,
            pattern_per_speaker=self.hp.Train.Batch.Eval.Pattern_per_Speaker,
            num_speakers=50,  # Maximum number displayed by TensorBoard.
        )
        logging.info('The number of train speakers = {}.'.format(
            len(train_dataset)))
        logging.info('The number of development speakers = {}.'.format(
            len(dev_dataset)))

        collater = Collater(min_frame_length=self.hp.Train.Frame_Length.Min,
                            max_frame_length=self.hp.Train.Frame_Length.Max)
        inference_collater = Inference_Collater(
            samples=self.hp.Train.Inference.Samples,
            frame_length=self.hp.Train.Inference.Frame_Length,
            overlap_length=self.hp.Train.Inference.Overlap_Length)

        self.dataloader_dict = {}
        self.dataloader_dict['Train'] = torch.utils.data.DataLoader(
            dataset=train_dataset,
            sampler=torch.utils.data.DistributedSampler(train_dataset, shuffle=True)
                    if self.hp.Use_Multi_GPU else
                    torch.utils.data.RandomSampler(train_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch.Train.Speaker,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Dev'] = torch.utils.data.DataLoader(
            dataset=dev_dataset,
            sampler=torch.utils.data.DistributedSampler(dev_dataset, shuffle=True)
                    if self.num_gpus > 1 else
                    torch.utils.data.RandomSampler(dev_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch.Eval.Speaker,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Inference'] = torch.utils.data.DataLoader(
            dataset=inference_dataset,
            shuffle=True,
            collate_fn=inference_collater,
            batch_size=self.hp.Train.Batch.Eval.Speaker,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
Example #3
def distribution_user_view():
    """
    view the distribution of distance
    """

    dataset = Dataset()
    data = dataset.data
    label = dataset.label
    data = data.reshape((len(data), -1))

    data_negative = []
    data_positive = []
    for i in range(len(label)):
        item = data[i]
        if label[i] == 0.0:
            for j in range(len(item)):
                if item[j] != -1.0:
                    data_negative.append(float(item[j]))
        else:
            for j in range(len(item)):
                if item[j] != -1.0:
                    data_positive.append(float(item[j]))
    print(type(data_positive[0]))

    sns.kdeplot(
        data_positive,
        color='r',
    )
    sns.kdeplot(
        data_negative,
        color='b',
    )
    plt.show()
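As a side note, the element-wise filtering above can be written with NumPy boolean masks; a minimal sketch, assuming data and label are NumPy arrays (data already reshaped to (len(data), -1)):

# Keep all non-placeholder (-1.0) entries, split by class, in one pass.
neg_mask = (label == 0.0)[:, None] & (data != -1.0)
pos_mask = (label != 0.0)[:, None] & (data != -1.0)
data_negative = data[neg_mask].astype(float).tolist()
data_positive = data[pos_mask].astype(float).tolist()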
Example #4
def load_ds(query_params):
    if query_params.query_name in os.listdir('../outputs/pickles/'):
        print('loading saved ds')
        with open('../outputs/pickles/' + query_params.query_name, 'rb') as f:
            data = pickle.load(f)
    else:
        print('generating new ds')
        data = Dataset(execute_query(query_params.queries))
        with open('../outputs/pickles/' + query_params.query_name, 'wb') as f:
            pickle.dump(data, f)
    return data
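A small variation on the cache lookup, sketched here as a hypothetical load_ds_cached that reuses the snippet's Dataset and execute_query: building the path once with os.path.join and testing it with os.path.exists avoids re-listing the directory on every call.

import os
import pickle

def load_ds_cached(query_params):
    cache_path = os.path.join('../outputs/pickles/', query_params.query_name)
    if os.path.exists(cache_path):
        print('loading saved ds')
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    print('generating new ds')
    data = Dataset(execute_query(query_params.queries))
    with open(cache_path, 'wb') as f:
        pickle.dump(data, f)
    return data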
Example #5
    def data_init(self):

        print("\nData init")
        #self.dataset = TCGA_Dataset(self.config)
        self.dataset = Dataset(self.config)

        generator = Generator(self.config, self.dataset)
        self.train_generator = generator.generate()

        self.X_val, self.y_val = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['val'],
            self.dataset._partition[1]['val'],
            phase='val',
            size=self.config.sampling_size_val)

        self.X_test, self.y_test = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['test'],
            self.dataset._partition[1]['test'],
            phase='test',
            size=self.config.sampling_size_test)

        self.y_test = self.patch_to_image(self.y_test, proba=False)
Example #6
    def Dataset_Generate(self):
        train_Dataset = Dataset(
            pattern_path=hp.Train.Train_Pattern.Path,
            metadata_file=hp.Train.Train_Pattern.Metadata_File,
            accumulated_dataset_epoch=hp.Train.Train_Pattern.Accumulated_Dataset_Epoch,
            mel_length_min=hp.Train.Train_Pattern.Mel_Length.Min,
            mel_length_max=hp.Train.Train_Pattern.Mel_Length.Max,
            text_length_min=hp.Train.Train_Pattern.Text_Length.Min,
            text_length_max=hp.Train.Train_Pattern.Text_Length.Max,
            use_cache=hp.Train.Use_Pattern_Cache)
        dev_Dataset = Dataset(
            pattern_path=hp.Train.Eval_Pattern.Path,
            metadata_file=hp.Train.Eval_Pattern.Metadata_File,
            mel_length_min=hp.Train.Eval_Pattern.Mel_Length.Min,
            mel_length_max=hp.Train.Eval_Pattern.Mel_Length.Max,
            text_length_min=hp.Train.Eval_Pattern.Text_Length.Min,
            text_length_max=hp.Train.Eval_Pattern.Text_Length.Max,
            use_cache=hp.Train.Use_Pattern_Cache)
        inference_Dataset = Inference_Dataset(
            pattern_path=hp.Train.Inference_Pattern_File_in_Train)
        logging.info('The number of train patterns = {}.'.format(
            len(train_Dataset) //
            hp.Train.Train_Pattern.Accumulated_Dataset_Epoch))
        logging.info('The number of development patterns = {}.'.format(
            len(dev_Dataset)))
        logging.info('The number of inference patterns = {}.'.format(
            len(inference_Dataset)))

        collater = Collater()
        inference_Collater = Inference_Collater()

        self.dataLoader_Dict = {}
        self.dataLoader_Dict['Train'] = torch.utils.data.DataLoader(
            dataset=train_Dataset,
            shuffle=True,
            collate_fn=collater,
            batch_size=hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)
        self.dataLoader_Dict['Dev'] = torch.utils.data.DataLoader(
            dataset=dev_Dataset,
            shuffle=True,
            collate_fn=collater,
            batch_size=hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)
        self.dataLoader_Dict['Inference'] = torch.utils.data.DataLoader(
            dataset=inference_Dataset,
            shuffle=False,
            collate_fn=inference_Collater,
            batch_size=hp.Inference_Batch_Size or hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)

        if hp.Mode in ['PE', 'GR']:
            self.dataLoader_Dict[
                'Prosody_Check'] = torch.utils.data.DataLoader(
                    dataset=Prosody_Check_Dataset(
                        pattern_path=hp.Train.Train_Pattern.Path,
                        metadata_file=hp.Train.Train_Pattern.Metadata_File,
                        mel_length_min=hp.Train.Train_Pattern.Mel_Length.Min,
                        mel_length_max=hp.Train.Train_Pattern.Mel_Length.Max,
                        use_cache=hp.Train.Use_Pattern_Cache),
                    shuffle=False,
                    collate_fn=Prosody_Check_Collater(),
                    batch_size=hp.Train.Batch_Size,
                    num_workers=hp.Train.Num_Workers,
                    pin_memory=True)
Example #7
    print(device)

    kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}

    print('Loading the dataset...')
    path = '/home/john/Desktop/Dissertation/data/labels_PCA'
    with open(path, 'rb') as f:
        labels_dict = pickle.load(f)
    labels_ID = list(labels_dict.keys())

    path = '/home/john/Desktop/Dissertation/data/Dataset_1.npy'
    df_train = np.load(path)
    labels = np.array(list(labels_dict.values()))

    print('Creating DataLoader...')
    train_dataset = Dataset(data=df_train, labels=labels)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    train_loss = []
    model = RICA(df_train.shape[1], n_clusters=256, penalty=1.2).to(device)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)
    for epoch in range(1, args.epochs + 1):
        print('Executing Epoch...', epoch)
        train_loss.append(train(epoch, model, optimizer, scheduler))

    path = '/home/john/Desktop/Dissertation/TrainingError/RICA_ADAM_loss'
    with open(path, 'wb') as f:
Example #8
import pandas as pd
import numpy as np
from Datasets import Dataset
from vectorization import Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from cotraining import CotClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report



# Data paths
train_dir = "train/"
test_dir = "test/"
directory = "/home/goksenin/Desktop/GRADUATION PROJECT/Programming/"
dataset = Dataset(directory)
n_train = 3375
n_test = 1125
X_train, Y_train = dataset.get_set(train_dir)
X_test, Y_test = dataset.get_set(test_dir)

# -1: unlabeled, 0: non-relative, 1: relative
y_train = np.asarray(Y_train)
y_train[n_train//4: ] = -1

####### FEATURE EXTRACTION
# getting related documents for feature extraction
relative_index = [i for i, y_i in enumerate(Y_train) if y_i == 1]
related_data = []
for index in relative_index:
    related_data.append(X_train[index])
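The collection loop above is equivalent to a single list comprehension:

related_data = [X_train[index] for index in relative_index]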
Example #9
    def data_init(self):
        print("\nData init")
        self.dataset = Dataset(self.config)
        self.generator = Generator(self.config, self.dataset)
Example #10
class DRAM(object):
    def __init__(self, config):
        self.config = config
        self.data_init()
        self.model_init()

    def data_init(self):
        print("\nData init")
        self.dataset = Dataset(self.config)
        self.generator = Generator(self.config, self.dataset)

    def model_init(self):

        self.rnn_cell = tf.contrib.rnn
        self.regularizer = tf.contrib.layers.l2_regularizer(
            scale=self.config.regularizer)
        self.initializer = tf.contrib.layers.xavier_initializer()
        self.images_ph = tf.placeholder(
            tf.float32,
            [None, self.config.input_shape, self.config.input_shape, 3])
        self.labels_ph = tf.placeholder(tf.int64, [None])
        self.N = tf.shape(self.images_ph)[0]

        # ------- GlimpseNet / LocNet -------

        with tf.variable_scope('glimpse_net'):
            self.gl = ConvGlimpseNetwork(self.config, self.images_ph)

        with tf.variable_scope('loc_net'):
            self.loc_net = LocNet(self.config)

        self.init_loc = tf.zeros(shape=[self.N, 2], dtype=tf.float32)
        with tf.variable_scope("rnn_decoder/loop_function",
                               reuse=tf.AUTO_REUSE):
            self.init_glimpse = self.gl(self.init_loc)

        self.inputs = [self.init_glimpse]
        self.inputs.extend([0] * (self.config.num_glimpses - 1))

        # ------- Recurrent network -------

        def get_next_input(output, i):

            loc, loc_mean = self.loc_net(output)
            gl_next = self.gl(loc)

            self.loc_mean_arr.append(loc_mean)
            self.sampled_loc_arr.append(loc)
            self.glimpses.append(self.gl.glimpse)

            return gl_next

        def rnn_decoder(decoder_inputs,
                        initial_state,
                        cell,
                        loop_function=None):

            with tf.variable_scope("rnn_decoder"):
                state = initial_state
                outputs = []
                prev = None

                for i, inp in enumerate(decoder_inputs):
                    if loop_function is not None and prev is not None:
                        with tf.variable_scope("loop_function",
                                               reuse=tf.AUTO_REUSE):
                            inp = loop_function(prev, i)

                    if i > 0:
                        tf.get_variable_scope().reuse_variables()

                    output, state = cell(inp, state)
                    outputs.append(output)

                    if loop_function is not None:
                        prev = output

            return outputs, state

        self.loc_mean_arr = [self.init_loc]
        self.sampled_loc_arr = [self.init_loc]
        self.glimpses = [self.gl.glimpse]

        self.lstm_cell = self.rnn_cell.LSTMCell(self.config.cell_size,
                                                state_is_tuple=True,
                                                activation=tf.nn.tanh,
                                                forget_bias=1.)
        self.init_state = self.lstm_cell.zero_state(self.N, tf.float32)
        self.outputs, self.rnn_state = rnn_decoder(
            self.inputs,
            self.init_state,
            self.lstm_cell,
            loop_function=get_next_input)

        # ------- Classification -------

        baselines = []
        for t, output in enumerate(self.outputs):
            with tf.variable_scope('baseline', reuse=tf.AUTO_REUSE):
                baseline_t = tf.layers.dense(
                    inputs=output,
                    units=2,
                    kernel_initializer=self.initializer)
            baseline_t = tf.squeeze(baseline_t)
            baselines.append(baseline_t)

        baselines = tf.stack(baselines)
        self.baselines = tf.transpose(baselines)

        with tf.variable_scope('classification', reuse=tf.AUTO_REUSE):
            self.class_prob_arr = []
            for t, op in enumerate(self.outputs):
                self.glimpse_logit = tf.layers.dense(
                    inputs=op,
                    units=self.config.num_classes,
                    kernel_initializer=self.initializer,
                    name='FCCN',
                    reuse=tf.AUTO_REUSE)
                self.glimpse_logit = tf.stop_gradient(self.glimpse_logit)
                self.glimpse_logit = tf.nn.softmax(self.glimpse_logit)
                self.class_prob_arr.append(self.glimpse_logit)
            self.class_prob_arr = tf.stack(self.class_prob_arr, axis=1)

        self.output = self.outputs[-1]
        with tf.variable_scope('classification', reuse=tf.AUTO_REUSE):
            self.logits = tf.layers.dense(inputs=self.output,
                                          units=self.config.num_classes,
                                          kernel_initializer=self.initializer,
                                          name='FCCN',
                                          reuse=tf.AUTO_REUSE)

            self.softmax = tf.nn.softmax(self.logits)

        self.sampled_locations = tf.concat(self.sampled_loc_arr, axis=0)
        self.mean_locations = tf.concat(self.loc_mean_arr, axis=0)
        self.sampled_locations = tf.reshape(
            self.sampled_locations, (self.config.num_glimpses, self.N, 2))
        self.sampled_locations = tf.transpose(self.sampled_locations,
                                              [1, 0, 2])
        self.mean_locations = tf.reshape(self.mean_locations,
                                         (self.config.num_glimpses, self.N, 2))
        self.mean_locations = tf.transpose(self.mean_locations, [1, 0, 2])
        prefix = tf.expand_dims(self.init_loc, 1)
        self.sampled_locations = tf.concat([prefix, self.sampled_locations],
                                           axis=1)
        self.mean_locations = tf.concat([prefix, self.mean_locations], axis=1)
        self.glimpses = tf.stack(self.glimpses, axis=1)

        # Losses/reward

        def loglikelihood(mean_arr, sampled_arr, sigma):
            mu = tf.stack(mean_arr)
            sampled = tf.stack(sampled_arr)
            gaussian = tf.contrib.distributions.Normal(mu, sigma)
            logll = gaussian.log_prob(sampled)
            logll = tf.reduce_sum(logll, 2)
            logll = tf.transpose(logll)
            return logll

        self.xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.labels_ph)
        self.xent = tf.reduce_mean(self.xent)

        self.pred_labels = tf.argmax(self.logits, 1)
        self.reward = tf.cast(tf.equal(self.pred_labels, self.labels_ph),
                              tf.float32)
        self.rewards = tf.expand_dims(self.reward, 1)
        self.rewards = tf.tile(self.rewards, [1, self.config.num_glimpses])
        self.logll = loglikelihood(self.loc_mean_arr, self.sampled_loc_arr,
                                   self.config.loc_std)
        self.advs = self.rewards - tf.stop_gradient(self.baselines)
        self.logllratio = tf.reduce_mean(self.logll * self.advs)

        self.reward = tf.reduce_mean(self.reward)

        self.baselines_mse = tf.reduce_mean(
            tf.square((self.rewards - self.baselines)))
        self.var_list = tf.trainable_variables()

        self.loss = -self.logllratio + self.xent + self.baselines_mse
        self.grads = tf.gradients(self.loss, self.var_list)
        self.grads, _ = tf.clip_by_global_norm(self.grads,
                                               self.config.max_grad_norm)

        self.setup_optimization()

        # session
        self.session_config = tf.ConfigProto()
        self.session_config.gpu_options.visible_device_list = self.config.gpu
        self.session_config.gpu_options.allow_growth = True
        self.session = tf.Session(config=self.session_config)
        self.session.run(tf.global_variables_initializer())

    def setup_optimization(self):

        # learning rate
        self.global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        self.training_steps_per_epoch = int(
            len(self.generator.training_ids) // self.config.batch_size)
        print('Training Step Per Epoch:', self.training_steps_per_epoch)

        self.starter_learning_rate = self.config.lr_start
        self.learning_rate = tf.train.exponential_decay(
            self.starter_learning_rate,
            self.global_step,
            self.training_steps_per_epoch,
            0.70,
            staircase=False)
        self.learning_rate = tf.maximum(self.learning_rate, self.config.lr_min)
        self.optimizer = tf.train.MomentumOptimizer(self.learning_rate,
                                                    momentum=0.90,
                                                    use_nesterov=True)
        #self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = self.optimizer.apply_gradients(
            zip(self.grads, self.var_list), global_step=self.global_step)

    def setup_logger(self):
        """Creates log directory and initializes logger."""

        self.summary_ops = {
            'reward': tf.summary.scalar('reward', self.reward),
            'hybrid_loss': tf.summary.scalar('hybrid_loss', self.loss),
            'cross_entropy': tf.summary.scalar('cross_entropy', self.xent),
            'baseline_mse': tf.summary.scalar('baseline_mse',
                                              self.baselines_mse),
            'logllratio': tf.summary.scalar('logllratio', self.logllratio),
            'lr': tf.summary.scalar('lr', self.learning_rate)
        }
        # 'glimpses': tf.summary.image('glimpses',tf.reshape(self.glimpses,[-1,self.config.glimpse_size,
        #                                                                  self.config.glimpse_size,
        #                                                                 3]),max_outputs=8)}

        self.eval_ops = {
            'labels': self.labels_ph,
            'pred_labels': self.pred_labels,
            'reward': self.reward,
            'hybrid_loss': self.loss,
            'cross_entropy': self.xent,
            'baseline_mse': self.baselines_mse,
            'logllratio': self.logllratio,
            'lr': self.learning_rate
        }

        self.logger = Logger(self.config.logdir,
                             sess=self.session,
                             summary_ops=self.summary_ops,
                             global_step=self.global_step,
                             eval_ops=self.eval_ops,
                             n_verbose=self.config.n_verbose,
                             var_list=self.var_list)

    def train(self):

        print('\n\n\n------------ Starting training ------------  \nT -- %s x %s \n' \
              'Model:  %s glimpses, glimpse size %s x %s \n\n\n' % (
                  self.config.input_shape, self.config.input_shape, self.config.num_glimpses, self.config.glimpse_size,
                  self.config.glimpse_size))

        self.setup_logger()

        for i in range(self.config.steps + 1):

            loc_dir_name = self.config.logdir + '/image/locations'
            traj_dir_name = self.config.logdir + '/image/trajectories'
            ROCs_dir_name = self.config.logdir + '/metrics/ROCs_AUCs/'
            PRs_dir_name = self.config.logdir + '/metrics/PRs/'

            if i == 0:
                # Recreate the output directories from scratch on the first step.
                for dir_name in (loc_dir_name, traj_dir_name, ROCs_dir_name,
                                 PRs_dir_name):
                    if os.path.exists(dir_name):
                        shutil.rmtree(dir_name)
                    os.makedirs(dir_name)

            self.logger.step = i

            images, labels = self.generator.generate()
            images = images.reshape(
                (-1, self.config.input_shape, self.config.input_shape, 3))
            labels = labels[0]
            feed_dict = {self.images_ph: images, self.labels_ph: labels}

            fetches = [
                self.output, self.rewards, self.reward, self.labels_ph,
                self.pred_labels, self.logits, self.train_op, self.loss,
                self.xent, self.baselines_mse, self.logllratio,
                self.learning_rate, self.loc_mean_arr
            ]
            output, rewards, reward, real_labels, pred_labels, logits, _, hybrid_loss, cross_entropy, baselines_mse, logllratio, lr, locations = self.session.run(
                fetches, feed_dict)

            if i % 1 == 0:

                print('\n------ Step %s ------' % (i))
                print('reward', reward)
                print('labels', real_labels)
                print('pred_labels', pred_labels)
                print('hybrid_loss', hybrid_loss)
                print('cross_entropy', cross_entropy)
                print('baseline_mse', baselines_mse)
                print('logllratio', logllratio)
                print('lr', lr)
                print('locations', locations[-1])
                print('logits', logits)
                self.logger.log('train', feed_dict=feed_dict)

            #if  i > 0 and i % 100 == 0:

            #   self.eval(i)
            #  self.logger.log('val', feed_dict=feed_dict)

            if i == self.config.steps:

                self.test(i)

            #if i == self.config.steps:
        # if i > 0 and i % 100 == 0:

        #    glimpse_images = self.session.run(self.glimpses, feed_dict)
        #   mean_locations = self.session.run(self.mean_locations, feed_dict)
        #  probs = self.session.run(self.class_prob_arr, feed_dict)

        # plot_glimpses(config=self.config, glimpse_images=glimpse_images, pred_labels=pred_labels, probs=probs,
        #   sampled_loc=mean_locations, X=images, labels=real_labels, file_name=loc_dir_name, step=i)

        #plot_trajectories(config=self.config, locations=mean_locations, X=images, labels=real_labels,
        #   pred_labels=pred_labels, file_name=traj_dir_name, step=i)

        #self.logger.save()

    def eval(self, step):
        return self.evaluate(self.session, self.images_ph, self.labels_ph,
                             self.softmax, step)

    def evaluate(self, sess, images_ph, labels_ph, softmax, step):
        print('Evaluating (%s x %s) using %s glimpses' %
              (self.config.input_shape, self.config.input_shape,
               self.config.num_glimpses))
        self.X_val, self.y_val = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['val'],
            size=self.config.sampling_size_val)
        print('Validation set has %s patients' % len(self.y_val))

        X_val, y_val = self.X_val, self.y_val

        _num_examples = X_val.shape[0]
        steps_per_epoch = _num_examples // self.config.eval_batch_size

        y_scores = []
        y_trues = []

        for i in tqdm(iter(range(steps_per_epoch))):

            images, labels_val = self.dataset.next_batch(
                X_val, y_val[0], self.config.eval_batch_size, i)
            #images = images.reshape((-1, self.config.input_shape, self.config.input_shape, 3))

            softmax_val = sess.run(softmax,
                                   feed_dict={
                                       images_ph: images,
                                       labels_ph: labels_val
                                   })
            y_trues.extend(labels_val)
            y_scores.extend(softmax_val)

        y_preds = np.argmax(y_scores, 1)
        y_scores = np.array(y_scores)

        self.metrics_ROCs(y_trues, y_preds, y_scores, step)
        self.metrics(y_trues, y_preds, step)
        return

    def count_params(self):
        return self.count_parameters(self.session)

    def count_parameters(self, sess):
        variables_names = [v.name for v in tf.trainable_variables()]
        values = sess.run(variables_names)
        n_params = 0

        for k, v in zip(variables_names, values):
            print('-'.center(140, '-'))
            print('%s \t Shape: %s \t %s parameters' % (k, v.shape, v.size))
            n_params += v.size

        print('-'.center(140, '-'))
        print('Total # parameters:\t\t %s \n\n' % (n_params))
        return n_params

    def metrics_ROCs(self, y_trues, y_preds, y_scores, step, stage=None):

        y_trues_binary = label_binarize(
            y_trues, classes=list(self.dataset.le_name_mapping.values()))
        y_preds_binary = label_binarize(
            y_preds, classes=list(self.dataset.le_name_mapping.values()))
        n_classes = y_preds_binary.shape[1]
        if stage == 'test':
            fpr, tpr, _ = roc_curve(y_trues, y_scores)
        else:
            fpr, tpr, _ = roc_curve(y_trues, y_scores[:, 1])

        roc_auc = auc(fpr, tpr)

        plt.figure()

        plt.plot(fpr,
                 tpr,
                 label='ROC curve (AUC = {0:0.2f})'
                 ''.format(roc_auc),
                 color='navy',
                 linestyle=':',
                 linewidth=4)

        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiving Operating Characteristic Curves')
        plt.legend(loc="lower right")
        plt.savefig(self.config.logdir + '/metrics/ROCs_AUCs/%i' % step)
        return

    def metrics(self, y_trues, y_preds, step):
        #        y_trues_binary= label_binarize(y_trues, classes=list(self.dataset.le_name_mapping.values()))
        #       y_preds_binary= label_binarize(y_preds, classes=list(self.dataset.le_name_mapping.values()))

        accuracy = accuracy_score(y_trues, y_preds)
        f1score = f1_score(y_trues, y_preds)
        recall = recall_score(y_trues, y_preds)
        precision = precision_score(y_trues, y_preds)
        names = ['accuracy', 'f1_score', 'recall', 'precision']
        pd.DataFrame(data=np.array([accuracy, f1score, recall, precision]),
                     index=names).to_csv(self.config.logdir +
                                         '/metrics/metrics_%i.csv' % step)
        return

    def load(self, checkpoint_dir):
        folder = os.path.join(checkpoint_dir, 'checkpoints')
        print('\nLoading model from <<{}>>.\n'.format(folder))

        self.saver = tf.train.Saver(self.var_list)
        ckpt = tf.train.get_checkpoint_state(folder)

        if ckpt and ckpt.model_checkpoint_path:
            print(ckpt)
            self.saver.restore(self.session, ckpt.model_checkpoint_path)

    def patch_to_image(self, y_patches, proba=True):

        if proba == True:
            y_image = np.array([
                np.mean(y_patches[i * self.config.sampling_size_test:(i + 1) *
                                  self.config.sampling_size_test],
                        axis=0)
                for i in range(
                    int(len(y_patches) / self.config.sampling_size_test))
            ])

        else:
            y_image = np.array([
                np.mean(y_patches[i * self.config.sampling_size_test:(i + 1) *
                                  self.config.sampling_size_test]) > 0.5
                for i in range(
                    int(len(y_patches) / self.config.sampling_size_test))
            ]).reshape((-1, 1)).astype(int)
            y_image = np.asarray(y_image.flatten())
        return y_image

    def test(self, step):
        return self.testing(self.session, self.images_ph, self.labels_ph,
                            self.softmax, step)

    def testing(self, sess, images_ph, labels_ph, softmax, step):
        print('Testing (%s x %s) using %s glimpses' %
              (self.config.input_shape, self.config.input_shape,
               self.config.num_glimpses))
        print(self.dataset._partition[0]['test'])
        self.X_test, self.y_test = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['test'],
            size=self.config.sampling_size_test)
        X_test, y_test = self.X_test, self.y_test
        print('y_test', y_test)
        _num_examples = X_test.shape[0]
        steps_per_epoch = _num_examples // self.config.test_batch_size

        y_scores = []
        y_trues = []

        for i in tqdm(iter(range(steps_per_epoch))):

            images, labels_test = self.dataset.next_batch(
                X_test, y_test[0], self.config.test_batch_size, i)

            print(labels_test)
            #images = images.reshape((-1, self.config.input_shape, self.config.input_shape, 3))

            softmax_test = sess.run(softmax,
                                    feed_dict={
                                        images_ph: images,
                                        labels_ph: labels_test
                                    })
            y_trues.extend(labels_test)
            y_scores.extend(softmax_test)

        y_trues = self.patch_to_image(y_trues, proba=False)
        y_scores = self.patch_to_image(y_scores, proba=True)

        y_preds = np.argmax(y_scores, 1)

        print('Test Set', self.dataset._partition[0]['test'])
        print(y_trues)
        print(y_preds)

        self.metrics_ROCs(y_trues, y_preds, y_scores, step)
        self.metrics(y_trues, y_preds, step)
        return
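Not shown in the snippet: a hypothetical driver for this class. config is assumed to provide the attributes referenced above (input_shape, num_glimpses, glimpse_size, batch_size, logdir, steps, gpu, and so on).

dram = DRAM(config)
dram.count_params()  # print a per-variable parameter summary
dram.train()         # run the training/logging loop defined above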
Example #11
class Model(object):
    def __init__(self, config):

        self.config = config
        self.data_init()
        self.model_init()

    def data_init(self):

        print("\nData init")
        #self.dataset = TCGA_Dataset(self.config)
        self.dataset = Dataset(self.config)

        generator = Generator(self.config, self.dataset)
        self.train_generator = generator.generate()

        self.X_val, self.y_val = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['val'],
            self.dataset._partition[1]['val'],
            phase='val',
            size=self.config.sampling_size_val)

        self.X_test, self.y_test = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['test'],
            self.dataset._partition[1]['test'],
            phase='test',
            size=self.config.sampling_size_test)

        self.y_test = self.patch_to_image(self.y_test, proba=False)

    def plot_ROCs(self, y_scores):

        fig = plt.figure(figsize=(10, 10))
        y_true = self.y_test
        y_score = y_scores
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc = roc_auc_score(y_true, y_score)
        plt.plot(fpr,
                 tpr,
                 lw=2,
                 c='r',
                 alpha=0.8,
                 label='ROC curve (AUC = %0.2f)' % auc)
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=2,
                 color='black',
                 label='Luck',
                 alpha=.8)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title("ROC curve")
        plt.legend(loc="lower right")
        fig.savefig("output/ROC_curve")
        plt.close()

    def plot_PRs(self, y_scores):

        fig = plt.figure(figsize=(10, 10))

        y_true = self.y_test
        y_score = y_scores

        precision, recall, _ = precision_recall_curve(y_true, y_score)
        # average_precision_score is assumed to be imported from sklearn.metrics
        # alongside precision_recall_curve.
        average_precision = average_precision_score(y_true, y_score)
        plt.plot(recall,
                 precision,
                 lw=2,
                 c='b',
                 alpha=0.8,
                 label='PR curve (AP = %0.2f)' % average_precision)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title("PR curve")
        plt.legend(loc="lower right")
        fig.savefig("output/PR_curve")
        plt.close()

    def model_init(self):

        print("\nModel init")
        self.base_model = DenseNet169(include_top=False,
                                      weights='imagenet',
                                      input_shape=(224, 224, 3),
                                      pooling=None)
        x = self.base_model.output
        x = GlobalAveragePooling2D()(x)
        x = Dense(2048, activation='relu', kernel_regularizer=l2(0.1))(x)
        x = Dropout(0.30)(x, training=True)
        x = Dense(100, activation='relu', kernel_regularizer=l2(0.1))(x)
        x = Dropout(0.30)(x)
        output = Dense(1, activation='sigmoid')(x)
        self.model = keras.models.Model(inputs=self.base_model.input,
                                        outputs=output)

    def set_trainable(self, from_idx=0):

        print("\nTraining")
        #for layer in self.base_model.layers:
        #   layer.trainable = False
        for layer in self.model.layers[from_idx:]:
            layer.trainable = True

    def train(self, lr=1e-4, epochs=10, from_idx=0):

        self.set_trainable()
        optimizer = Adam(lr=lr,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         decay=self.config.lr_decay)
        self.model.compile(optimizer=optimizer,
                           loss='binary_crossentropy',
                           metrics=['accuracy'])
        train_steps = len(
            self.dataset._partition[0]['train']) / self.config.batch_size
        early_stopping = EarlyStopping(monitor='val_loss',
                                       min_delta=0,
                                       patience=5,
                                       verbose=0,
                                       mode='auto')
        self.history = custom_fit_generator(model=self.model,
                                            generator=self.train_generator,
                                            steps_per_epoch=train_steps,
                                            epochs=epochs,
                                            verbose=1,
                                            validation_data=(self.X_val,
                                                             self.y_val),
                                            shuffle=True,
                                            max_queue_size=30,
                                            workers=30,
                                            use_multiprocessing=True,
                                            callbacks=[early_stopping])

    def predict(self):

        df = self.dataset.get_binarized_data()
        ids = df.index
        labels = df.values

        print("\nPredicting")
        intermediate_layer_model = keras.models.Model(
            inputs=self.base_model.input, outputs=self.model.layers[-1].output)

        self.X_feat, self.y_feat = self.dataset.convert_to_arrays(
            list(ids),
            labels,
            phase='train',
            size=self.config.sample_size_feat)

        ids = np.asarray(ids)
        ids = np.repeat(ids, self.config.sample_size_feat)

        for i in range(10):
            intermediate_output = intermediate_layer_model.predict(self.X_feat)
            features = pd.DataFrame(data=intermediate_output, index=ids)
            features["ids"] = ids
            features = features.groupby(["ids"]).mean()
            features.to_csv("pathology_scores_%s.csv" % i)

#       intermediate_output = intermediate_layer_model.predict(self.X_test, batch_size= self.config.batch_size)
#  print(len(intermediate_output))
#  print(len(intermediate_output[1]))
#       print('intermediate_output',intermediate_output.shape)

#  y_scores = self.model.predict(self.X_test, batch_size= self.config.batch_size)

#   y_scores = self.patch_to_image(y_scores, proba=True)
#  print('y_scores', y_scores)
#  y_preds = np.array([(y_score>0.5).astype(int) for y_score in y_scores]).flatten()
#  pd.DataFrame(data = y_preds, index =self.dataset._partition[0]['test'] ).to_csv('Results.csv')
# print(self.dataset._partition[0]['test'], y_preds)
# return y_scores, y_preds

    def train_predict(self):

        self.train(self.config.lr, self.config.epochs, self.config.from_idx)
        # self.plot_loss()
        # y_scores, y_preds = self.predict()
        self.predict()

    #  np.save("output/y_scores", y_scores)
    #  np.save("output/y_preds", y_preds)

    # return y_scores, y_preds

    def patch_to_image(self, y_patches, proba=True):

        if proba == True:
            y_image = np.array([
                np.mean(y_patches[i * self.config.sampling_size_test:(i + 1) *
                                  self.config.sampling_size_test])
                for i in range(
                    int(len(y_patches) / self.config.sampling_size_test))
            ]).reshape((-1, 1))
        else:
            y_image = np.array([
                np.mean(y_patches[i * self.config.sampling_size_test:(i + 1) *
                                  self.config.sampling_size_test]) > 0.5
                for i in range(
                    int(len(y_patches) / self.config.sampling_size_test))
            ]).reshape((-1, 1)).astype(int)
        y_image = np.asarray(y_image.flatten())
        return y_image

    def plot_loss(self):

        keys = list(self.history.history.keys())
        val_acc_keys = [
            key for key in keys if key[0:3] == "val" and key[-3:] == "acc"
        ]
        acc_keys = [
            key for key in keys if key[0:3] != "val" and key[-3:] == "acc"
        ]
        val_acc = np.mean([self.history.history[key] for key in val_acc_keys],
                          axis=0)
        acc = np.mean([self.history.history[key] for key in acc_keys], axis=0)
        loss = self.history.history["loss"]
        val_loss = self.history.history["val_loss"]

        fig = plt.figure(figsize=(10, 10))
        ax1 = plt.subplot(121)
        ax1.tick_params(labelsize=10)
        plt.plot(acc)
        plt.plot(val_acc)
        plt.title('Mean accuracy', size=14)
        plt.ylabel('accuracy', size=12)
        plt.xlabel('epoch', size=12)
        plt.legend(['train', 'test'], loc='upper left', fontsize=12)
        ax2 = plt.subplot(122)
        ax2.tick_params(labelsize=10)
        plt.plot(loss)
        plt.plot(val_loss)
        plt.title('Mean loss', size=14)
        plt.ylabel('loss', size=12)
        plt.xlabel('epoch', size=12)
        plt.legend(['train', 'test'], loc='upper left', fontsize=12)
        plt.show()
        fig.savefig("output/learning_curve")
        plt.close()

    def plot(self):

        print("\nPlotting model")
        plot_model(self.model, to_file='output/model.png')

    def get_metrics(self, y_scores, y_preds):
        list_of_metrics = [
            "accuracy", "precision", "recall", "f1score", "AUC", "AP"
        ]
        self.metrics = pd.DataFrame(data=np.zeros((1, len(list_of_metrics))),
                                    columns=list_of_metrics)

        #      y_true = self.y_test
        y_pred = y_preds
        y_score = y_scores

        #     print('y_true',y_true)
        print('y_score', y_score)
        print('y_pred', y_pred)