Пример #1
0
    def _data_generator(self, mode=DataGeneratorMode.training):
        """Yield ``(x, y)`` batches endlessly for the requested data split.

        For the training and validation modes, the training set returned by
        ``load(self.data_mode)`` is partitioned once and cached on ``self``:
        the leading ``validation_split`` fraction becomes
        ``validation_x`` / ``validation_y`` and the remainder becomes
        ``train_x`` / ``train_y``.  Training mode then iterates the training
        partition and validation mode the validation partition.  Any other
        mode iterates the test set returned by ``load``.

        Every batch is built from exactly ``self.batch_size`` consecutive
        samples.  ``x`` is the padded array of ``self.vocab_index`` entries
        of the playlist songs; ``y`` stacks the ``self.embeddings_matrix``
        entries of the target songs.  A batch that contains a song missing
        from ``self.vocab_index`` is skipped as a whole, and a trailing
        partial batch is dropped.  When the data is exhausted the generator
        restarts from the first sample.

        Args:
            mode: ``DataGeneratorMode`` member selecting the split.

        Yields:
            tuple: ``(x, y)`` for the next batch.  If no batch can be built
            (or an unexpected error occurs) the previously built batch is
            yielded again; before any batch was built this is ``([], [])``.
        """
        def split_training_data():
            """Partition the loaded training set and cache both parts on self."""
            (x_train, y_train), (_, _), _ = load(self.data_mode)
            pairs = list(zip(x_train, y_train))
            pos = int(len(x_train) * self.validation_split)
            val_pairs, train_pairs = pairs[:pos], pairs[pos:]
            # ``zip(*[])`` produces nothing to unpack, so an empty partition
            # (e.g. validation_split == 0) needs an explicit fallback.
            self.validation_x, self.validation_y = (
                zip(*val_pairs) if val_pairs else ((), ()))
            self.train_x, self.train_y = (
                zip(*train_pairs) if train_pairs else ((), ()))

        if mode == DataGeneratorMode.training or mode == DataGeneratorMode.validation:
            if len(self.train_x) == 0 and len(self.validation_x) == 0:
                split_training_data()
            # Serve each mode its own partition so validation samples never
            # leak into training and training never receives validation data.
            if mode == DataGeneratorMode.training:
                inputs, outputs = self.train_x, self.train_y
            else:
                inputs, outputs = self.validation_x, self.validation_y
        else:
            (_, _), (x_test, y_test), _ = load(self.data_mode)
            inputs, outputs = x_test, y_test

        def get_batch(start):
            """Build the first usable batch at or after ``start``.

            Returns ``(x, y, start)`` where ``start`` is the offset the batch
            was actually taken from.  Raises ``IndexError`` once fewer than
            ``self.batch_size`` samples remain.
            """
            # Iterative on purpose: a long run of batches with unknown songs
            # must not exhaust the recursion limit.
            while True:
                try:
                    vectorized_inp = []
                    vectorized_out = []
                    for offset in range(self.batch_size):
                        playlist = inputs[start + offset]
                        vectorized_inp.append(
                            [self.vocab_index[str(song)] for song in playlist])
                        vectorized_out.append(
                            self.embeddings_matrix[self.vocab_index[str(
                                outputs[start + offset])]])
                except KeyError:
                    # Unknown song: skip this whole batch and try the next.
                    start += self.batch_size
                    continue

                x = np.asarray(
                    pad_sequences(vectorized_inp, maxlen=self.max_length))
                y = np.asarray(vectorized_out)
                return x, y, start

        index = 0
        last_batch_x, last_batch_y = [], []
        while True:
            try:
                try:
                    batch = get_batch(index)
                except IndexError:
                    if index == 0:
                        # Not even one full batch is available from the start.
                        raise
                    # Ran off the end: wrap around instead of repeating the
                    # previous batch.
                    batch = get_batch(0)
                last_batch_x, last_batch_y, index = batch
                index += self.batch_size
            except IndexError:
                index = 0
            except Exception as exp:
                logging.error('Data generator error: %s', exp)
                index = 0

            yield last_batch_x, last_batch_y
Пример #2
0
        def gen_training_data():
            """Load the training set and cache its two partitions on ``self``.

            The leading ``self.validation_split`` fraction of the
            ``(input, target)`` pairs is stored as ``self.validation_x`` /
            ``self.validation_y``; the remaining pairs are stored as
            ``self.train_x`` / ``self.train_y``.

            Returns:
                tuple: the complete, unsplit ``(x_train, y_train)`` as loaded.
            """
            (x_train, y_train), (_, _), _ = load(self.data_mode)

            samples = list(zip(x_train, y_train))
            split_at = int(len(x_train) * self.validation_split)

            # Un-zip each partition back into parallel input/target tuples.
            self.validation_x, self.validation_y = zip(*samples[:split_at])
            self.train_x, self.train_y = zip(*samples[split_at:])

            return x_train, y_train
    def __init__(self,
                 mode=DatasetMode.small,
                 model_name=ModelName.simple_gru):
        """Set hyper-parameters, read dataset statistics and build the model.

        Args:
            mode: Dataset selector forwarded to ``load``.
            model_name: ``ModelName`` member choosing which architecture is
                built and which weight / log locations are used.

        Raises:
            Exception: If ``model_name`` is not one of the handled members.
        """
        # --- training configuration ---
        self.optimizer = Adam(
            lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        self.activation = 'softmax'
        self.loss = 'categorical_crossentropy'
        self.metrics = top_k_accuracy_func_list([50, 100, 200, 300, 400, 500])

        self.epochs = 15
        self.batch_size = 512
        self.validation_split = 0.1
        self.data_mode = mode

        # Index of songs in x_train (or test) starts from 1 because of zero
        # padding.  Index of songs in y_train (or test) starts from zero like
        # the song hash.  For instance: if the index of songA in the dataset
        # is 21, songA's index is 22 in x_train (or test) and 21 in y_train
        # (or test).  The goal is for the neural network to be able to ignore
        # zero-paddings.
        (x_train, y_train), (x_test, y_test), songs = load(mode)

        # --- dataset statistics ---
        self.max_length = max(len(playlist) for playlist in x_train)
        self.song_hash = songs

        self.train_len = len(x_train)
        self.test_len = len(x_test)

        # Filled in later, once the training data has been split.
        self.validation_x, self.validation_y = [], []
        self.train_x, self.train_y = [], []

        # --- model selection ---
        if model_name is ModelName.simple_gru:
            weights_file = 'simple_gru_weights.best.hdf5'
            logs_dir = 'simple_gru_logs'
            build_model = self.create_simple_gru_model
        elif model_name is ModelName.bi_directional_lstm:
            weights_file = 'bi_lstm_weights.best.hdf5'
            logs_dir = 'bi_lstm_logs'
            build_model = self.create_bi_lstm_model
        elif model_name is ModelName.attention_bilstm:
            weights_file = 'ablstm_model_weights.best.hdf5'
            logs_dir = 'ablstm_model_logs'
            build_model = self.create_ablstm_model
        else:
            raise Exception("Unknown Model Name; ", model_name)

        # The paths are assigned before the model is built, matching the
        # order in which the attributes become available on the instance.
        self.weights_loc = TRAINED_MODELS_BASE_PATH + weights_file
        self.tb_logs_path = LOGS_BASE_PATH + logs_dir
        self.model = build_model()

        self.callbacks = self._get_callback_functions()
Пример #4
0
    def __init__(self,
                 mode=DatasetMode.small,
                 model_name=ModelName.simple_gru):
        """Set hyper-parameters, load embeddings and dataset statistics, build the model.

        Args:
            mode: Dataset selector forwarded to ``load``.
            model_name: ``ModelName`` member choosing which architecture is
                built and which weight / log locations are used.

        Raises:
            Exception: If ``model_name`` is not one of the handled members.
        """
        # --- training configuration ---
        self.optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
        self.activation = 'tanh'
        self.loss = 'cosine'
        self.metrics = [
            'mae',
            'mse',
            'acc',
            'mape',
            'cosine',
            'categorical_crossentropy',
        ]
        self.epochs = 50
        self.batch_size = 512
        self.validation_split = 0.1

        self.data_mode = mode

        # --- song embeddings ---
        vocab_index, embeddings = SongEmbeddings().get_embeddings_matrix()
        self.embeddings_matrix = embeddings
        self.vocab_index = vocab_index

        # Filled in later, once the training data has been split.
        self.validation_x, self.validation_y = [], []
        self.train_x, self.train_y = [], []

        # Index of songs in x_train (or test) starts from 1 because of zero
        # padding.  Index of songs in y_train (or test) starts from zero like
        # the song hash.  For instance: if the index of songA in the dataset
        # is 21, songA's index is 22 in x_train (or test) and 21 in y_train
        # (or test).  The goal is for the neural network to be able to ignore
        # zero-paddings.
        (x_train, y_train), (x_test, _), songs = load(mode)

        # --- dataset statistics ---
        self.max_length = max(len(playlist) for playlist in x_train)
        self.song_hash = songs
        self.train_len = len(x_train)
        self.test_len = len(x_test)

        # --- model selection ---
        if model_name is ModelName.simple_gru:
            weights_file = 'e_simple_gru_weights.best.hdf5'
            logs_dir = 'e_simple_gru_logs'
            build_model = self._create_simple_gru_model
        elif model_name is ModelName.bi_directional_lstm:
            weights_file = 'e_bi_lstm_weights.best.hdf5'
            logs_dir = 'e_bi_lstm_logs'
            build_model = self._create_bi_lstm_model
        elif model_name is ModelName.attention_bilstm:
            weights_file = 'e_ablstm_model_weights.best.hdf5'
            logs_dir = 'e_ablstm_model_logs'
            build_model = self._create_ablstm_model
        else:
            raise Exception("Unknown Model Name; ", model_name)

        # The paths are assigned before the model is built, matching the
        # order in which the attributes become available on the instance.
        self.weights_loc = TRAINED_MODELS_BASE_PATH + weights_file
        self.tb_logs_path = LOGS_BASE_PATH + logs_dir
        self.model = build_model()

        self.callbacks = self._get_callback_functions()
    def _data_generator(self, mode=DataGeneratorMode.training):
        """Yield ``(x, y)`` batches endlessly for the requested data split.

        For the training and validation modes, the training set returned by
        ``load(self.data_mode)`` is partitioned once and cached on ``self``:
        the leading ``validation_split`` fraction becomes
        ``validation_x`` / ``validation_y`` and the remainder becomes
        ``train_x`` / ``train_y``.  Training mode then iterates the training
        partition and validation mode the validation partition.  Any other
        mode iterates the test set returned by ``load``.

        ``x`` is the slice of inputs padded to ``self.max_length`` as an
        ``int64`` array; ``y`` is ``to_categorical`` applied to the matching
        targets with ``len(self.song_hash) + 1`` classes (zero is included).
        The final batch of a pass may hold fewer than ``self.batch_size``
        samples.  Once the data is exhausted the generator restarts from the
        first sample.

        Args:
            mode: ``DataGeneratorMode`` member selecting the split.

        Yields:
            tuple: ``(x, y)`` for the next batch.  If building a batch fails,
            the error is logged, iteration restarts from the first sample and
            the previously built batch is yielded again (``([], [])`` if none
            was built yet).
        """
        def split_training_data():
            """Partition the loaded training set and cache both parts on self."""
            (x_train, y_train), (_, _), _ = load(self.data_mode)
            pairs = list(zip(x_train, y_train))
            pos = int(len(x_train) * self.validation_split)
            val_pairs, train_pairs = pairs[:pos], pairs[pos:]
            # ``zip(*[])`` produces nothing to unpack, so an empty partition
            # (e.g. validation_split == 0) needs an explicit fallback.
            self.validation_x, self.validation_y = (
                zip(*val_pairs) if val_pairs else ((), ()))
            self.train_x, self.train_y = (
                zip(*train_pairs) if train_pairs else ((), ()))

        if mode == DataGeneratorMode.training or mode == DataGeneratorMode.validation:
            if len(self.train_x) == 0 and len(self.validation_x) == 0:
                split_training_data()
            # Serve each mode its own partition so validation samples never
            # leak into training and training never receives validation data.
            if mode == DataGeneratorMode.training:
                inputs, outputs = self.train_x, self.train_y
            else:
                inputs, outputs = self.validation_x, self.validation_y
        else:
            (_, _), (x_test, y_test), _ = load(self.data_mode)
            inputs, outputs = x_test, y_test

        total = len(inputs)
        index = 0
        last_batch_x, last_batch_y = [], []

        while True:
            if index >= total:
                # Slicing past the end returns an empty slice rather than
                # raising IndexError, so the wrap-around has to be explicit.
                index = 0
            try:
                inp = inputs[index:index + self.batch_size]
                out = outputs[index:index + self.batch_size]

                last_batch_x = np.asarray(sequence.pad_sequences(
                    inp, maxlen=self.max_length),
                                          dtype="int64")
                last_batch_y = to_categorical(out,
                                              len(self.song_hash) +
                                              1)  # Zero is included
                index += self.batch_size
            except Exception as exp:
                logging.error('Data generator error: %s', exp)
                index = 0
            yield last_batch_x, last_batch_y