Пример #1
0
def main():
    """Run the three-input RNN in ``train``, ``test`` or ``semi`` mode.

    Everything is driven by module-level names: the parsed ``args``
    namespace and the ``train_path`` / ``semi_path`` / ``test_path`` data
    locations.  ``args.action`` selects the mode:

    * ``train`` -- fit on the training split and save the model to
      ``<save_dir>/<model>/model.h5``.
    * ``test``  -- load weights from ``<save_dir>/<load_model>/model.h5``
      and print MSE and micro/macro F1 for the test data, then for the
      training data.
    * ``semi``  -- ten rounds of pseudo-labelling ``semi_data`` followed by
      two epochs of training, reloading the best checkpoint each round.

    Raises:
        Exception: if ``args.action`` is not train, semi, or test.
        ValueError: if a weights file cannot be found, or if ``test`` is
            requested without ``--load_model``.
    """

    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    # Stays None when --load_model was not given, so later stages can test
    # it instead of tripping over an unbound local.
    load_path = None
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        # the test action also reports scores on the training data
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action except for train, semi, and test')

    # prepare tokenizer
    print('get Tokenizer...')
    if load_path is not None:
        # reuse the tokenizer stored next to the loaded model
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    token_path = os.path.join(save_path, 'token.pk')
    if not os.path.exists(token_path):
        dm.save_tokenizer(token_path)

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    def load_weights(weights_file):
        """Load ``weights_file`` into ``model``; raise if it is missing."""
        if not os.path.exists(weights_file):
            raise ValueError("Can't find the file %s" % weights_file)
        print('load model from %s' % weights_file)
        model.load_weights(weights_file)

    def split_inputs(stacked):
        """Return the [tweets, snippets, targets] rows of ``stacked``."""
        return [stacked[0, :], stacked[1, :], stacked[2, :]]

    def report(label, data_name):
        """Print MSE and micro/macro F1 of ``model`` on ``data_name``."""
        (X, Y), _ = dm.split_data(data_name, args.val_ratio)
        inputs = split_inputs(X)
        # Flatten so predictions line up element-wise with Y (the original
        # assigned the reshape to a misspelled name and lost it).
        predictions = model.predict(inputs).reshape(-1)
        scores = model.evaluate(inputs, Y)
        print("%s data mse by keras = %f" % (label, scores[1]))
        print("%s data mse by sklearn = %f" %
              (label, mean_squared_error(Y, predictions)))
        # Collapse regression values to the classes -1 / 0 / 1 for F1.
        pred_cls = np.sign(predictions)
        true_cls = np.sign(Y)
        print("%s data micro f1 score by sklearn = %f" %
              (label, f1_score(true_cls, pred_cls, average='micro')))
        print("%s data macro f1 score by sklearn = %f" %
              (label, f1_score(true_cls, pred_cls, average='macro')))

    print("args.load_model =", args.load_model)
    if load_path is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        load_weights(os.path.join(load_path, 'model.h5'))
    elif args.action == 'test':
        # There is no directory to load weights from in this case; fail
        # with an actionable message instead of an unbound-variable error.
        raise ValueError('--load_model is required when action is test')

    weights_path = os.path.join(save_path, 'model.h5')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        tweets, snippets, targets = split_inputs(X)
        print("tweets's shape = ", tweets.shape)
        print("snippets's shape = ", snippets.shape)
        print("targets's shape = ", targets.shape)
        print("Y's shape = ", Y.shape)
        model.fit([tweets, snippets, targets],
                  Y,
                  validation_data=(split_inputs(X_val), Y_val),
                  epochs=args.nb_epoch,
                  batch_size=args.batch_size)
        model.save(weights_path)

    # testing
    elif args.action == 'test':
        # ask split_data for a zero validation ratio
        # (presumably everything then lands in the first pair -- confirm)
        args.val_ratio = 0
        report('test', 'test_data')
        report('train', 'train_data')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        # val_loss improves when it goes DOWN, hence mode='min'.
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=1,
                                      mode='min')
        checkpoint = ModelCheckpoint(filepath=weights_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=args.batch_size,
                      callbacks=[checkpoint, earlystopping])

            # continue the next round from the best checkpoint so far
            load_weights(weights_path)
Пример #2
0
Файл: hw4.py Проект: dajuguan/ml
def main():
    """Run the sentiment RNN in ``train``, ``semi`` or test mode.

    Everything is driven by module-level names: the parsed ``args``
    namespace and the ``train_path`` / ``semi_path`` / ``test_path`` data
    locations.  ``args.action`` selects the mode:

    * ``train`` -- fit with checkpointing and early stopping on
      ``val_acc``, then plot the recorded loss history.
    * ``test``  -- predict on ``test_data``, threshold at 0.5 and write
      ``submission.csv`` plus ``submission.zip`` to the working directory.
    * ``semi``  -- ten rounds of pseudo-labelling ``semi_data`` followed by
      two epochs of training, reloading the best checkpoint each round.

    Any action other than ``train`` or ``semi`` loads the test data.

    Raises:
        ValueError: if a weights file that should exist cannot be found.
    """

    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    # ---- data handling: read data ----
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read the existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    token_path = os.path.join(save_path, 'token.pk')
    if not os.path.exists(token_path):
        dm.save_tokenizer(token_path)

    # convert to sequences
    dm.to_sequence(args.max_length)

    # ---- initialise the model ----
    print('initial model...')
    model = simpleRNN(args)
    # summary() prints by itself; wrapping it in print() only adds "None"
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if not os.path.exists(path):
            raise ValueError("Can't find the file %s" % path)
        print('load model from %s' % path)
        model.load_weights(path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    weights_path = os.path.join(save_path, 'model.h5')

    # ---- training ----
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath=weights_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # callback instance that records the training history
        history = LossHistory()
        model.fit(X,
                  Y,
                  validation_data=(X_val, Y_val),
                  epochs=args.nb_epoch,
                  batch_size=args.batch_size,
                  callbacks=[checkpoint, earlystopping, history])
        # draw the acc/loss curves
        history.loss_plot('epoch')

    # ---- testing ----
    elif args.action == 'test':
        sample_ids = dm.data['test_data'][1]
        out = np.squeeze(model.predict(dm.data['test_data'][0]))
        # scores above 0.5 become class 1, everything else class 0
        out = (out > 0.5).astype(int)
        print("pred shape:", np.array(out).shape)
        print("id shape:", np.array(sample_ids).shape)
        result = pd.concat(
            [pd.DataFrame({'id': sample_ids}),
             pd.DataFrame({'sentiment': out})],
            axis=1)
        result.to_csv("submission.csv", index=False)
        # the context manager closes the archive even if write() fails
        with zipfile.ZipFile('submission.zip', 'w') as archive:
            archive.write('submission.csv',
                          compress_type=zipfile.ZIP_DEFLATED)

    # ---- semi-supervised training ----
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath=weights_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            history = LossHistory()

            # train
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=args.batch_size,
                      callbacks=[checkpoint, earlystopping, history])
            history.loss_plot('epoch')

            # continue the next round from the best checkpoint so far
            if not os.path.exists(weights_path):
                # report the checkpoint path; ``path`` may not even be
                # bound here when no --load_model was given
                raise ValueError("Can't find the file %s" % weights_path)
            print('load model from %s' % weights_path)
            model.load_weights(weights_path)
Пример #3
0
def main():
    """Run the GloVe-initialised RNN in ``train``, ``test`` or ``semi`` mode.

    Everything is driven by module-level names: the parsed ``args``
    namespace and the ``train_path`` / ``semi_path`` / ``test_path`` data
    locations.  ``args.action`` selects the mode:

    * ``train`` -- fit on the training split and save the model to
      ``<save_dir>/<model>/model.h5``.
    * ``test``  -- load weights from ``<save_dir>/<load_model>/model.h5``
      and print the Keras score plus MSE and micro/macro F1 computed by
      ``evaluation``.
    * ``semi``  -- ten rounds of pseudo-labelling ``semi_data`` followed by
      two epochs of training, reloading the best checkpoint each round.

    Raises:
        Exception: if ``args.action`` is not train, semi, or test.
        ValueError: if a weights file cannot be found, or if ``test`` is
            requested without ``--load_model``.
    """

    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    # Stays None when --load_model was not given, so later stages can test
    # it instead of tripping over an unbound local.
    load_path = None
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action except for train, semi, and test')

    # prepare tokenizer
    print('get Tokenizer...')
    if load_path is not None:
        # reuse the tokenizer stored next to the loaded model
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    token_path = os.path.join(save_path, 'token.pk')
    if not os.path.exists(token_path):
        dm.save_tokenizer(token_path)

    # convert to sequences
    dm.to_sequence(args.max_length)

    # prepare glove embedding
    embedding_matrix = preEB(dm)

    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_matrix, dm.tokenizer.word_index)
    model.summary()

    def load_weights(weights_file):
        """Load ``weights_file`` into ``model``; raise if it is missing."""
        if not os.path.exists(weights_file):
            raise ValueError("Can't find the file %s" % weights_file)
        print('load model from %s' % weights_file)
        model.load_weights(weights_file)

    print("args.load_model =", args.load_model)
    if load_path is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        load_weights(os.path.join(load_path, 'model.h5'))
    elif args.action == 'test':
        # There is no directory to load weights from in this case; fail
        # with an actionable message instead of an unbound-variable error.
        raise ValueError('--load_model is required when action is test')

    weights_path = os.path.join(save_path, 'model.h5')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        model.fit(X,
                  Y,
                  validation_data=(X_val, Y_val),
                  epochs=args.nb_epoch,
                  batch_size=args.batch_size)
        model.save(weights_path)

    # testing
    elif args.action == 'test':
        # ask split_data for a zero validation ratio
        # (presumably everything then lands in the first pair -- confirm)
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        pred = model.predict(X)
        scores = model.evaluate(X, Y)
        print("test data scores(loss = mse) = %f" % scores[1])
        print("mse: ", evaluation(pred, Y, 'mse'))
        print("micro: ", evaluation(pred, Y, 'f1_micro'))
        print("macro: ", evaluation(pred, Y, 'f1_macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        # val_loss improves when it goes DOWN, hence mode='min'.
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=1,
                                      mode='min')
        checkpoint = ModelCheckpoint(filepath=weights_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=args.batch_size,
                      callbacks=[checkpoint, earlystopping])

            # continue the next round from the best checkpoint so far
            load_weights(weights_path)
Пример #4
0
def main():
    """Run the word2vec-fed RNN in ``train``, ``test`` or ``semi`` mode.

    Everything is driven by module-level names: the parsed ``args``
    namespace and the ``train_path`` / ``semi_path`` / ``test_path`` data
    locations.  ``args.action`` selects the mode:

    * ``train`` -- build and save a word2vec model, fit the network and
      save it to ``<save_dir>/<model>/model.h5``.
    * ``test``  -- load word2vec and network weights from
      ``<save_dir>/<load_model>/`` and print MSE and micro/macro F1 for the
      test data, then for the training data.
    * ``semi``  -- ten rounds of pseudo-labelling ``semi_data`` followed by
      two epochs of training, reloading the best checkpoint each round.

    Raises:
        Exception: if ``args.action`` is not train, semi, or test.
        ValueError: if a model file cannot be found, or if ``test`` is
            requested without ``--load_model``.
    """

    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    # Stays None when --load_model was not given, so later stages can test
    # it instead of tripping over an unbound local.
    load_path = None
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action except for train, semi, and test')

    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    if args.action == 'test' and load_path is None:
        # Both the word2vec model and the network weights are read from the
        # --load_model directory; fail with an actionable message instead
        # of an unbound-variable error.
        raise ValueError('--load_model is required when action is test')

    # convert to sequences
    token_corpus = dm.to_token_corpus(args.max_length)
    if load_path is not None and args.action != 'train':
        # test / semi on an existing model: reuse its word2vec model
        path = os.path.join(load_path, 'word2vec.model')
        if not os.path.exists(path):
            raise ValueError("Can't find the file %s" % path)
        print('load model from %s' % path)
        word2vec = Word2Vec.load(path)
    else:
        # train (or semi starting from scratch): build a fresh word2vec
        # model from the corpus and keep it next to the network weights
        word2vec = to_word2vec(token_corpus)
        word2vec.save(os.path.join(save_path, 'word2vec.model'))

    # only the keyed vectors are needed from here on
    word2vec = word2vec.wv

    # padding sentence
    dm.padding_sent(args.max_length)
    dm.sent_to_word2vec(word2vec)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    def load_weights(weights_file):
        """Load ``weights_file`` into ``model``; raise if it is missing."""
        if not os.path.exists(weights_file):
            raise ValueError("Can't find the file %s" % weights_file)
        print('load model from %s' % weights_file)
        model.load_weights(weights_file)

    def report(label, data_name):
        """Print MSE and micro/macro F1 of ``model`` on ``data_name``."""
        (X, Y), _ = dm.split_data(data_name, args.val_ratio)
        predictions = model.predict(X).reshape(-1)
        scores = model.evaluate(X, Y)
        print("%s data mse by keras = %f" % (label, scores[1]))
        print("%s data mse by sklearn = %f" %
              (label, mean_squared_error(Y, predictions)))
        # Collapse regression values to the classes -1 / 0 / 1 for F1.
        pred_cls = np.sign(predictions)
        true_cls = np.sign(Y)
        print("%s data micro f1 score by sklearn = %f" %
              (label, f1_score(true_cls, pred_cls, average='micro')))
        print("%s data macro f1 score by sklearn = %f" %
              (label, f1_score(true_cls, pred_cls, average='macro')))

    print("args.load_model =", args.load_model)
    if load_path is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        load_weights(os.path.join(load_path, 'model.h5'))

    weights_path = os.path.join(save_path, 'model.h5')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        model.fit(X,
                  Y,
                  validation_data=(X_val, Y_val),
                  epochs=args.nb_epoch,
                  batch_size=args.batch_size)
        model.save(weights_path)

    # testing
    elif args.action == 'test':
        # ask split_data for a zero validation ratio
        # (presumably everything then lands in the first pair -- confirm)
        args.val_ratio = 0
        report('test', 'test_data')
        report('train', 'train_data')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        # val_loss improves when it goes DOWN, hence mode='min'.
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=1,
                                      mode='min')
        checkpoint = ModelCheckpoint(filepath=weights_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=args.batch_size,
                      callbacks=[checkpoint, earlystopping])

            # continue the next round from the best checkpoint so far
            load_weights(weights_path)
def main():
    """Train or evaluate the TensorFlow RNN used for text OHCA recognition.

    Parses the command-line arguments, loads the data through
    ``DataManager``, builds the ``simpleRNN`` graph and then, depending on
    the ``action`` argument:

    * ``train`` -- runs mini-batch training with periodic validation,
      checkpointing and patience-based early stopping, and finally saves
      the ``Sentimen_rnn_final`` checkpoint under ``--save_dir``;
    * ``test`` -- restores a saved model and writes one
      ``label<TAB>prediction`` line per sample to ``--testout``.

    Raises:
        ValueError: ``--load_model`` was given but the checkpoint meta file
            does not exist.
        Exception: ``action`` is ``test`` and no model was requested.
    """

    def str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` treats every non-empty string (including ``"False"``)
        as ``True``; this converter accepts the usual textual spellings.
        """
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in ('1', 'true', 't', 'yes', 'y'):
            return True
        if text in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    parser = argparse.ArgumentParser(description='Text OHCA recognition')
    parser.add_argument('model')
    parser.add_argument('action', choices=['train', 'test'])

    # training argument
    # batch_size is used as a tensor dimension and with ``//``: must be int.
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--nb_epoch', default=2000, type=int)
    parser.add_argument('--val_ratio', default=0.1, type=float)
    parser.add_argument('--gpu_fraction', default=0.6, type=float)
    parser.add_argument('--vocab_size', default=50000, type=int)
    parser.add_argument('--max_length', default=400, type=int)
    parser.add_argument('--patience', default=30, type=int)

    # model parameter
    parser.add_argument('--loss_function', default='binary_crossentropy')
    parser.add_argument('--cell', default='LSTM', choices=['LSTM', 'GRU'])
    parser.add_argument('-num_lay', '--num_layers', default=2, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('-hid_siz', '--hidden_size', default=400, type=int)
    parser.add_argument('--pretrain_emb', default=True, type=str2bool)
    parser.add_argument('--emb_matrix', default='cbowemb.npz')
    parser.add_argument('--keep_prob', default=1.0, type=float)
    parser.add_argument('-lr', '--learning_rate', default=0.013, type=float)
    parser.add_argument('--threshold', default=0.5, type=float)
    # output path for your prediction
    parser.add_argument('--result_path', default='result.csv')

    # put model in the same directory
    parser.add_argument('--load_model', default=None)
    parser.add_argument('--load_token', default=True, type=str2bool)
    parser.add_argument('--save_dir', default='model/')
    # log dir for tensorboard
    parser.add_argument('--log_dir', default='log_dir/')
    # testing output
    parser.add_argument('--testfile', default='data/ohca_scripts.txt')
    parser.add_argument('--testout', default='data/script_test.txt')

    args = parser.parse_args()

    train_path = 'data/ohca_scripts.txt'
    test_path = args.testfile

    # directory holding the tokenizer and the pre-trained embedding matrix
    save_path = 'token/'
    token_file = os.path.join(save_path, 'token.pk')

    def get_session(gpu_fraction):
        """Return a session limited to ``gpu_fraction`` of the GPU memory."""
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, with_label=True)
    else:
        dm.add_data('test_data', test_path,
                    with_label=True)  # now the test will have label

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token:
        # read exist tokenizer
        dm.load_tokenizer(token_file)
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(token_file):
        dm.save_tokenizer(token_file)

    # convert to sequences
    dm.to_sequence(args.max_length)

    # Create the graph object
    tf.reset_default_graph()
    # initial model
    print('initial model...')
    rnnmodel = simpleRNN(args)

    with tf.name_scope('inputs'):
        # create placeholder for training (testing) data
        X_ = tf.placeholder(tf.int32, [None, args.max_length], name='X')
        y_ = tf.placeholder(tf.int32, [
            args.batch_size,
        ], name='y_')
        keep_prob = tf.placeholder_with_default(1.0,
                                                shape=(),
                                                name="keep_prob")

    y_predict = rnnmodel.model(args, X_, keep_prob)

    # register the tensors needed to evaluate a re-imported graph
    for tensor in (X_, y_, keep_prob, y_predict):
        tf.add_to_collection('train_var', tensor)

    # loss (MSE)
    mse = rnnmodel.loss(y_, y_predict)

    # optimizers
    train_op = rnnmodel.optimizer(args, mse)

    # accuracy for validation
    accuracy = rnnmodel.accuracy(y_, y_predict)

    # initial state of LSTM
    init_state = rnnmodel.initial_state

    # merge the write out histogram plots (tensorboard)
    merged = tf.summary.merge_all()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load a exist model variables and keep training')
        path = os.path.join(args.save_dir, 'Sentimen_rnn_final')
        if os.path.exists(path + ".meta"):
            print('load model from %s' % path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')
        raise Exception('Not loading model for testing...')

    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        print("Shape of X is {}, and y is {}".format(
            np.array(X).shape,
            np.array(Y).shape))
    elif args.action == 'test':
        (X, Y) = dm.get_labeldata('test_data')
        print("Load test data (shape {})".format(X.shape))

    init = tf.global_variables_initializer()

    # prepare to save model
    save_vars = tf.trainable_variables()
    saver = tf.train.Saver(save_vars,
                           max_to_keep=7,
                           keep_checkpoint_every_n_hours=1)

    best_val_loss = float('inf')

    # The session is created only now, after the graph has been built, so
    # that it is bound to this graph and honours the GPU memory limit.
    with get_session(args.gpu_fraction) as sess:
        init.run()

        # if pre-trained, load embedding matrix
        if args.pretrain_emb:
            emb_npfn = os.path.join(save_path, args.emb_matrix)
            emb_matrix = np.load(emb_npfn)['embed_m']
            if (emb_matrix.shape[0] != args.vocab_size
                    or emb_matrix.shape[1] != args.embedding_dim):
                print(
                    "Import embedding matrix shape {} does not match shape of ({},{})..."
                    .format(emb_matrix.shape, args.vocab_size,
                            args.embedding_dim))
                sys.exit(1)
            print("Loading embedding matrix.....")
            sess.run(rnnmodel.embedding_mat.assign(emb_matrix))

        train_writer = tf.summary.FileWriter(args.log_dir + 'train',
                                             sess.graph)
        valid_writer = tf.summary.FileWriter(args.log_dir + 'valid',
                                             sess.graph)
        try:
            # load variables in graphs if assigned
            if args.load_model is not None:
                saver.restore(sess, path)

            if args.action == 'train':
                early_stop_counter = 0
                generation_num = 0
                # hard copy of the training split
                X_train = X.copy()
                Y_train = Y.copy()
                n_batches = len(X) // args.batch_size

                # repeat nb_epoch times
                for e in range(args.nb_epoch):
                    # reset initial LSTM state every epoch
                    state = sess.run([init_state])

                    for ix, (X_batch, y_batch) in enumerate(
                            get_batches(X_train, Y_train, args.batch_size),
                            1):
                        generation_num += 1
                        train_dict = {
                            X_: X_batch,
                            y_: y_batch,
                            keep_prob: args.keep_prob,
                            init_state: state
                        }

                        _, summary, mse_train, _ = sess.run(
                            [train_op, merged, mse, accuracy],
                            feed_dict=train_dict)

                        train_writer.add_summary(summary, generation_num)
                        if ix == 1:
                            print(X_batch.shape)

                        if generation_num % 10 == 0:
                            print("Epoch: {}/{}".format(e, args.nb_epoch),
                                  "Iteration: {}".format(generation_num),
                                  "Train loss: {:.3f}".format(mse_train))

                        # validation every 50 generations or at the end of
                        # each epoch
                        if generation_num % 50 == 0 or ix == n_batches:
                            val_acc = []
                            val_loss = []
                            val_summary = None
                            val_state = sess.run([init_state])
                            for X_vbatch, y_vbatch in get_batches(
                                    X_val, Y_val, args.batch_size):
                                val_dict = {
                                    X_: X_vbatch,
                                    y_: y_vbatch,
                                    keep_prob: 1,
                                    init_state: val_state
                                }

                                # one forward pass yields metrics and
                                # predictions together
                                (val_summary, batch_acc, batch_loss,
                                 val_predict) = sess.run(
                                     [merged, accuracy, mse, y_predict],
                                     feed_dict=val_dict)

                                # print the last few answers of each batch
                                # for checking
                                for y_true, y_pre in zip(
                                        y_vbatch[-9:], val_predict[-9:]):
                                    print("y_true: {}, y_predict: {}".format(
                                        y_true, y_pre))

                                val_loss.append(batch_loss)
                                val_acc.append(batch_acc)

                                sys.stdout.flush()

                            loss_val_avg = np.mean(val_loss)
                            print("Iteration: {}".format(generation_num),
                                  "Val acc: {:.3f}".format(np.mean(val_acc)),
                                  "Val mse: {:.3f}".format(loss_val_avg))

                            if val_summary is not None:
                                valid_writer.add_summary(
                                    val_summary, generation_num)
                            # save variables at every validation point
                            saver.save(sess,
                                       os.path.join(args.save_dir,
                                                    "Sentimen_rnn"),
                                       global_step=generation_num)

                            if ix == n_batches:
                                # early stop count, once per epoch
                                if best_val_loss > loss_val_avg:
                                    best_val_loss = loss_val_avg
                                    early_stop_counter = 0
                                else:
                                    early_stop_counter += 1

                    if (early_stop_counter >= args.patience
                            or e == (args.nb_epoch - 1)):
                        # save the final model and the evaluation meta graph
                        saver.save(
                            sess,
                            os.path.join(args.save_dir, "Sentimen_rnn_final"))
                        saver.export_meta_graph(
                            os.path.join(args.save_dir,
                                         "Sentimen_rnn_final.meta"),
                            collection_list=['train_var'])
                        break

                print("End of training.....")

            elif args.action == 'test':
                state = sess.run([init_state])
                with open(args.testout, 'w+') as outfile:
                    for X_batch, y_batch in get_batches(
                            X, Y, args.batch_size):
                        test_dict = {
                            X_: X_batch,
                            y_: y_batch,
                            # no dropout at inference time
                            keep_prob: 1.0,
                            init_state: state
                        }
                        # Inference only: train_op must NOT be run here,
                        # otherwise the model is updated on the test data.
                        y_prebatch = sess.run(y_predict,
                                              feed_dict=test_dict)

                        for y_true, y_pre in zip(y_batch, y_prebatch):
                            outfile.write("%d\t%f\n" % (y_true, y_pre))
                print("Testing finish, write out file {}".format(
                    args.testout))
        finally:
            # flush pending TensorBoard events
            train_writer.close()
            valid_writer.close()