    def fit(self, X_train, y_train):

        def save_graph(self, sess):

            if not os.path.exists(self.save_folder):
                os.makedirs(self.save_folder)

            if not os.path.exists(self.save_folder + str(self.random_file_ext_) + "/"):
                os.makedirs(self.save_folder + str(self.random_file_ext_) + "/")
            permanent_saver = tf.train.Saver()
            permanent_saver.save(sess, self.save_folder + str(self.random_file_ext_) + "/" + "model")
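            # Restoring the checkpoint later should look roughly like this
            # (standard TF 1.x Saver API; the path mirrors the one used above):
            #   saver = tf.train.import_meta_graph(path + "model.meta")
            #   saver.restore(sess, path + "model")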

        def convert_data_to_one_hot(y_train):
            #y_test_temp = np.zeros((y_test.size, y_test.max() + 1), dtype=np.int)
            #y_test_temp[np.arange(y_test.size), y_test] = 1

            # Other option: tf.one_hot. y_train would then be a tensor, but feed_dict only
            # accepts numpy arrays, so it would have to be materialised via sess.run(y_train); see
            #   http://stackoverflow.com/questions/34410654/tensorflow-valueerror-setting-an-array-element-with-a-sequence
            # return tf.one_hot(y_train, 4), tf.one_hot(y_test, 4)
            y_train_temp = np.zeros((y_train.size, y_train.max() + 1), dtype=int)
            y_train_temp[np.arange(y_train.size), y_train] = 1
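            # e.g. y_train = np.array([0, 2, 1]) -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]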

            return y_train_temp

        y_train_conv = convert_data_to_one_hot(y_train)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.prediction, self.prob = self.neural_network_model(
                X_train)


            # weight the loss to counter class imbalance, following
            # https://stackoverflow.com/questions/35155655/loss-function-for-class-imbalanced-binary-classifier-in-tensor-flow#answer-38912982
            if self.use_class_weights:
                class_weights = calculate_class_weight(y_train)  # doesn't work particularly well
                class_weight_mod = tf.constant(
                    [
                        [class_weights[0], class_weights[1]]
                    ])
                weight_per_label = tf.transpose(tf.matmul(self.y, tf.transpose(class_weight_mod)))
                xent = tf.multiply(weight_per_label,
                                   tf.nn.softmax_cross_entropy_with_logits(logits=self.prediction, labels=self.y))
            else:
                xent = tf.nn.softmax_cross_entropy_with_logits(logits=self.prediction, labels=self.y)

            cost = tf.reduce_mean(xent)
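            # In the weighted branch (which assumes two classes, matching the two
            # weights above): self.y is one-hot [batch, 2] and class_weight_mod is
            # [1, 2], so the matmul picks each example's true-class weight; after the
            # transpose and broadcasted multiply, every example's cross-entropy is
            # scaled by that weight before the mean is taken.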

            # simple explanations to some optimizers:
            #   http://stackoverflow.com/questions/33919948/how-to-set-adaptive-learning-rate-for-gradientdescentoptimizer
            #   http://cs231n.github.io/neural-networks-3/
            #   http://sebastianruder.com/optimizing-gradient-descent/index.html#adam
            # Parameters: http://tflearn.org/optimizers
            if self.optimizer == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
            elif self.optimizer == 'adadelta':
                optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
            elif self.optimizer == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
            elif self.optimizer == 'graddesc':
                optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
            elif self.optimizer == 'momentum':
                optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate_tensor,
                                                       momentum=self.momentum).minimize(cost)
            elif self.optimizer == 'nesterov_momentum':
                optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate_tensor, momentum=self.momentum,
                                                       use_nesterov=True).minimize(cost)
            elif self.optimizer == 'proxada':
                optimizer = tf.train.ProximalAdagradOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
            elif self.optimizer == 'rms':
                optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
            else:
                optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_tensor).minimize(cost)
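            # (any unrecognised self.optimizer string falls back to Adam above)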

        with tf.Session(graph=self.graph, config=self.config) as sess:
            sess.run(tf.global_variables_initializer())

            momentum_start = 0.5
            momentum_end = 0.99
            calc_learning_rate = self.learning_rate

            for epoch in range(self.hm_epochs):
                epoch_loss = 0

                i = 0  # batch index within this epoch

                # increase momentum steadily with the epochs
                calc_momentum = momentum_start + (
                    float((momentum_end - momentum_start) / self.hm_epochs) * epoch)

                # step-decay the learning rate by a factor of 10 at fixed epochs
                if self.step_decay_LR and epoch in (20, 35, 45):
                    calc_learning_rate = float(calc_learning_rate / 10.0)
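                # e.g. starting from learning_rate = 1e-3 this gives 1e-4 from epoch 20,
                # 1e-5 from epoch 35 and 1e-6 from epoch 45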

                while i < len(X_train):
                    start = i
                    end = i + self.batch_size
                    batch_x = np.array(X_train[start:end])
                    batch_y = np.array(y_train_conv[start:end])

                    _, c = sess.run([optimizer, cost], feed_dict={self.x: batch_x,
                                                                  self.y: batch_y,
                                                                  self.keep_prob: self.keep_prob_const,
                                                                  self.momentum: calc_momentum,
                                                                  self.learning_rate_tensor: calc_learning_rate
                                                                  })
                    epoch_loss += c
                    i += self.batch_size

                self.learning_rate_output += str(epoch_loss) + "\n"
                print('Epoch', epoch + 1, 'completed out of', self.hm_epochs, 'loss:', epoch_loss, 'LR=',
                      calc_learning_rate)
                # get second weight matrix
                # test = sess.run(self.graph.get_tensor_by_name("weight1:0"))
                # print(test)

            # save the graph permanently
            save_graph(self, sess)

            # correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
            # accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            # print('Accuracy by TensorFlow:', accuracy.eval({x: X_test, y: y_test, keep_prob: keep_prob_const}))
            # value = sess.run(tf.argmax(self.prediction, 1), feed_dict={self.x: X_test, self.keep_prob: self.keep_prob_const})
        # return value
        return self

    def fit(self, X_train, y_train, X_test, y_test, test_fold, loss_filename):
        self.test_fold = test_fold
        self.y_test = y_test

        # set session config with GPU memory growth
        self.sess = tf.Session(
            config=self.config
        )  # see https://github.com/fchollet/keras/issues/1538

        K.set_session(self.sess)

        # convert y_train to one-hot vector
        y_train_one_hot = convert_data_to_one_hot(y_train)
        y_test_one_hot = convert_data_to_one_hot(self.y_test)

        # load feature dict for LSTM_1000_GloVe
        with open(self.FEATURES_DIR + self.PARAM_DICT_FILENAME, "rb") as f:
            param_dict = pickle.load(f)

        # load parameters needed for embedding layer
        EMBEDDING_DIM = param_dict["EMBEDDING_DIM"]  # e.g. 50
        self.MAX_SEQ_LENGTH = param_dict["MAX_SEQ_LENGTH"]  # e.g. 100

        X_train_LSTM, X_train_MLP = split_X(X_train, self.MAX_SEQ_LENGTH)
        X_test_LSTM, X_test_MLP = split_X(X_test, self.MAX_SEQ_LENGTH)
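        # split_X is assumed to separate the first MAX_SEQ_LENGTH columns (word
        # indices for the embedding/LSTM branch) from the remaining dense MLP
        # features; the exact split lives in that helper.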

        # load embeddings
        EMBEDDING_FILE = np.load(self.FEATURES_DIR +
                                 param_dict["EMBEDDING_FILE"])

        print("EMBEDDING_FILE.shape = " + str(EMBEDDING_FILE.shape))

        # calculate class weights
        class_weights = calculate_class_weight(y_train, no_classes=4)

        ################
        # CLAIMS LAYER #
        ################
        lstm_input = Input(
            shape=(self.MAX_SEQ_LENGTH, ), dtype='int32', name='lstm_input'
        )  # receives sequences of MAX_SEQ_LENGTH integers
        embedding = Embedding(
            input_dim=len(EMBEDDING_FILE),  # lookup table size
            output_dim=EMBEDDING_DIM,  # output dim for each number in a sequence
            weights=[EMBEDDING_FILE],
            input_length=self.MAX_SEQ_LENGTH,  # receives sequences of MAX_SEQ_LENGTH integers
            mask_zero=False,
            trainable=True)(lstm_input)
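        # The embedding turns each (batch, MAX_SEQ_LENGTH) batch of word indices
        # into (batch, MAX_SEQ_LENGTH, EMBEDDING_DIM) vectors; the lookup table is
        # initialised from EMBEDDING_FILE and fine-tuned because trainable=True.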

        att_vec = GlobalMaxPooling1D()(embedding)

        orig_docs_att = InnerAttentionLayer(att_vec,
                                            EMBEDDING_DIM,
                                            return_att_weights=True,
                                            return_sequence=True,
                                            name='lstm_attention')

        data_LSTM = LSTM(
            100,
            return_sequences=True,
            stateful=False,
            dropout=0.2,
            batch_input_shape=(self.batch_size, self.MAX_SEQ_LENGTH,
                               EMBEDDING_DIM),
            input_shape=(self.MAX_SEQ_LENGTH, EMBEDDING_DIM),
            implementation=self.LSTM_implementation)(orig_docs_att[0])
        data_LSTM = LSTM(100,
                         return_sequences=False,
                         stateful=False,
                         dropout=0.2,
                         batch_input_shape=(self.batch_size,
                                            self.MAX_SEQ_LENGTH,
                                            EMBEDDING_DIM),
                         input_shape=(self.MAX_SEQ_LENGTH, EMBEDDING_DIM),
                         implementation=self.LSTM_implementation)(data_LSTM)
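        # Assuming the attention layer keeps the time dimension (return_sequence=True),
        # the first LSTM returns the full sequence (batch, MAX_SEQ_LENGTH, 100) and the
        # second collapses it to one 100-dimensional vector per example.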

        ###############################
        # MLP (NON-TIMESTEP) FEATURES #
        ###############################
        mlp_input = Input(shape=(len(X_train_MLP[0]), ),
                          dtype='float32',
                          name='mlp_input')

        ###############
        # MERGE LAYER #
        ###############
        merged = concatenate([data_LSTM, mlp_input])
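        # merged concatenates the 100-dim LSTM summary with the MLP feature vector,
        # i.e. (batch, 100 + len(X_train_MLP[0])).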

        dense_mid = Dense(600,
                          kernel_regularizer=self.regularizer,
                          kernel_initializer=self.kernel_initializer,
                          activity_regularizer=self.dense_activity_regularizer,
                          activation='relu')(merged)
        dense_mid = Dense(600,
                          kernel_regularizer=self.regularizer,
                          kernel_initializer=self.kernel_initializer,
                          activity_regularizer=self.dense_activity_regularizer,
                          activation='relu')(dense_mid)
        dense_mid = Dense(600,
                          kernel_regularizer=self.regularizer,
                          kernel_initializer=self.kernel_initializer,
                          activity_regularizer=self.dense_activity_regularizer,
                          activation='relu')(dense_mid)
        dense_out = Dense(4, activation='softmax', name='dense_out')(dense_mid)

        # build model
        self.model = Model(inputs=[lstm_input, mlp_input], outputs=[dense_out])

        # print summary
        self.model.summary()

        # optimizers
        if self.optimizer_name == "adagrad":
            optimizer = optimizers.Adagrad(lr=self.lr)
            print("Used optimizer: adagrad, lr=" + str(self.lr))
        elif self.optimizer_name == "adamax":
            optimizer = optimizers.Adamax(
                lr=self.lr
            )  # recommended for sparse stuff like with embeddings
            print("Used optimizer: adamax, lr=" + str(self.lr))
        elif self.optimizer_name == "nadam":
            optimizer = optimizers.Nadam(
                lr=self.lr)  # recommended to leave at default params
            print("Used optimizer: nadam, lr=" + str(self.lr))
        elif self.optimizer_name == "rms":
            optimizer = optimizers.RMSprop(lr=self.lr)  # recommended for RNNs
            print("Used optimizer: rms, lr=" + str(self.lr))
        elif self.optimizer_name == "SGD":
            optimizer = optimizers.SGD(lr=self.lr)  # recommended for RNNs
            print("Used optimizer: SGD, lr=" + str(self.lr))
        elif self.optimizer_name == "adadelta":
            optimizer = optimizers.Adadelta(self.lr)  # recommended for RNNs
            print("Used optimizer: adadelta, lr=" + str(self.lr))
        else:
            optimizer = optimizers.Adam(lr=self.lr)
            print("Used optimizer: Adam, lr=" + str(self.lr))

        # compile model: for loss fcts see https://github.com/fchollet/keras/blob/master/keras/losses.py
        self.model.compile(
            optimizer,
            'kullback_leibler_divergence',  # alternative: categorical_crossentropy
            metrics=['accuracy'])
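        # With one-hot targets the KL divergence reduces to categorical
        # cross-entropy (the target entropy term is zero), so either loss yields
        # the same gradients here.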
        if self.use_class_weights:
            self.model.fit([X_train_LSTM, X_train_MLP],
                           y_train_one_hot,
                           validation_data=([X_test_LSTM,
                                             X_test_MLP], y_test_one_hot),
                           batch_size=self.batch_size,
                           epochs=self.epochs,
                           verbose=1,
                           class_weight=class_weights,
                           callbacks=[
                               EarlyStoppingOnF1(self.epochs,
                                                 X_test_LSTM,
                                                 X_test_MLP,
                                                 self.y_test,
                                                 loss_filename,
                                                 epsilon=0.0,
                                                 min_epoch=self.min_epoch),
                           ])
        else:
            self.model.fit([X_train_LSTM, X_train_MLP],
                           y_train_one_hot,
                           validation_data=([X_test_LSTM,
                                             X_test_MLP], y_test_one_hot),
                           batch_size=self.batch_size,
                           epochs=self.epochs,
                           verbose=1,
                           callbacks=[
                               EarlyStoppingOnF1(self.epochs,
                                                 X_test_LSTM,
                                                 X_test_MLP,
                                                 self.y_test,
                                                 loss_filename,
                                                 epsilon=0.0,
                                                 min_epoch=self.min_epoch),
                           ])
        self.model.save(self.save_folder + "save.h5")
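        # Reloading the saved model later requires Keras to know the custom layers
        # used above; a sketch, assuming the attention helper exposes a layer class
        # of the same name:
        #   from keras.models import load_model
        #   model = load_model(save_folder + "save.h5",
        #                      custom_objects={"InnerAttentionLayer": InnerAttentionLayer})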
        return self