Example #1
def main():
    """ 
        Main function of test.py
        Arguments:
            modelname: String, name of the model
            datapath: The testing file
            subtask: String, "A" or "B" or "C"
        Outputs:
            subtask + [subtask]/result/[modelname]/res.pred
    """
    modelname = args.modelname
    datapath = args.datapath
    subtask = args.subtask
    dm = DataManager(subtask)
    dm.load_tokenizer(
        os.path.join("subtask" + subtask, "models", modelname, "word2idx.pkl"),
        os.path.join("subtask" + subtask, "models", modelname, "idx2word.pkl"))
    dm.add_data("test", datapath)
    dm.to_sequence(40, 40)
    (test_Q, test_C), qidlist = dm.get_data("test")
    print("test_Q", test_Q[0:2])
    print("test_C", test_C[0:2])
    print("qidlist", qidlist[0:2])
    model = load_model(
        os.path.join("subtask" + subtask, "models", modelname, "model.h5"))
    result = model.predict([test_Q, test_C], batch_size=128, verbose=1)
    print("result", result[0:2])
    if subtask == "A":
        outputA(qidlist, result, modelname)
    elif subtask == "B":
        outputB(qidlist, result, modelname)
    elif subtask == "C":
        outputC(qidlist, result, modelname)
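Example #1 reads modelname, datapath and subtask from a module-level args object that is not shown. A minimal sketch of the argument parsing it assumes, with flag names taken from the docstring (the actual flags in test.py may differ):

import argparse

# Hypothetical CLI setup matching the docstring; the flag names are assumptions.
parser = argparse.ArgumentParser(description='Predict with a trained model for one subtask')
parser.add_argument('--modelname', required=True, help='name of the model')
parser.add_argument('--datapath', required=True, help='path to the testing file')
parser.add_argument('--subtask', required=True, choices=['A', 'B', 'C'])
args = parser.parse_args()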
Example #2
def new_process_xy(tokenpath, path2x, path2y):
    dm = DataManager()
    dm.add_data('seed', '0samples.csv')
    dm.add_data('truth', '0samples.csv')
    dm.tokenize(230000)  # vocab size
    dm.save_tokenizer(tokenpath)
    dm.to_sequence(1)  # max length
    dm.save_sequence(path2x)
    dm.tosave_label(path2y)
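new_process_xy is only defined here, never called. A minimal usage sketch, assuming placeholder file names (the real project layout is not shown in the example):

# Hypothetical paths, for illustration only.
new_process_xy('token.pkl', 'x_sequences.pkl', 'y_labels.pkl')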
Example #3
def main():
    path_pfx = ''
    max_len = 37

    dm = DataManager()
    dm.add_data('test', sys.argv[1], False, True)
    print(len(dm.data['test'][0]))
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_tokenizer(os.path.join(path_pfx, 'token.pkl'))
    dm.to_sequence(max_len, use_pretrain=True)
    result = predict(dm.data['test'][0], path_pfx)
    write(sys.argv[2], result)
    print('finished')
Example #4
def main(argv):
    filename = argv[1]
    output_path = argv[2]
    output_path = output_path.replace('\r\n', '').replace('\r', '')  # strip stray carriage returns / newlines
    dm = DataManager()
    dm.add_data('test_data', filename, False)
    dm.load_tokenizer('./model/token_25k.pk')
    dm.to_sequence(40)

    model = load_model('./model/00017-0.82720.h5')
    model.summary()

    val_proba = model.predict(dm.data['test_data'])
    val_classes = [1 if value > 0.5 else 0 for value in val_proba]

    out = pd.DataFrame(val_classes, columns=['label'])
    out.to_csv(output_path, index_label='id')
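Example #4 expects the test file and the output CSV path as positional command-line arguments. A hedged invocation sketch (the script and file names are placeholders):

import sys

if __name__ == '__main__':
    # e.g. python predict.py testing_data.txt result.csv
    main(sys.argv)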
Example #5
def main():
    voc_size = None
    max_len = 39
    path_pfx = ''
    dm = DataManager()
    dm.add_data('train', sys.argv[1])
    #dm.add_data('semi', os.path.join(path_pfx, 'training_nolabel.txt'), False)
    #dm.add_data('test', os.path.join(path_pfx, 'testing_data.txt'), False, True)
    dm.preprocessing()

    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_embedding_matrix(os.path.join(path_pfx, 'word2vec.wv.vectors.npy'))
    dm.to_sequence(max_len, use_pretrain=True)
    #dm.to_bow()

    print(max_len)

    #emb_mat =  dm.get_embedding_matrix()
    emb_mat = None

    train(dm, voc_size=voc_size, max_len=max_len, emb_mat=emb_mat)
Example #6
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir,args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir,args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception ('Implement your testing parser')


    # prepare tokenizer
    print ('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path,'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')

    embedding_w = dm.get_vec_model('emb_1.npy',args.embedding_dim)
    dm.to_sequence(args.max_length)
    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path,'model.h5')
        if os.path.exists(path):
            print ('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" %path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train_corpus':
        (X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience = 3, verbose=1, mode='max')

        checkpoint = ModelCheckpoint(filepath='./model/'+'{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max' )

        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle= True,
                            callbacks=[checkpoint, earlystopping] )
        # plot_figure(history)

    # semi-supervised training
    elif args.action == 'semi':

        earlystopping = EarlyStopping(monitor='val_acc', patience = 10, verbose=1, mode='max')


        checkpoint = ModelCheckpoint(filepath='./model/semi/'+'{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max' )

        # repeat 10 times
        (X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data',test_path, False)
        dm.to_sequence(args.max_length)
        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred, args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print ('-- semi_data size: %d' %(len(semi_X)))

        model = simpleRNN(args,embedding_w)
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping] )

        plot_figure(history)
Example #7
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()

    else:
        # read exist tokenizer
        dm.load_tokenizer(args.token)
    # else:
    #     # create tokenizer on new data
    #     dm.tokenize()

    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())
        if args.load_model is not None:
            if args.action == 'train':
                print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=11,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            # a.append(predict[i][0])  # test
            # a.append(predict[i])
            result.append(a)
        with open(args.result_path, 'w', newline='') as f:
            csv.writer(f).writerows(result)
        # implement after ensuring the output format


    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=11,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # label the semi-data (single pass; the 10x repeat loop is commented out)
        # for i in range(10):
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        #print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
        # train
        history = model.fit(semi_X,
                            semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % save_path)
Example #8
def main():
	# limit gpu memory usage
	train_path = argv[1]
	semi_path = argv[2]

	#K.set_session(get_session(gpu_fraction))

	#####read data#####

	dm = DataManager()
	print ('Loading data...')
	if action == 'train':
		dm.add_data('train_data', train_path, True)
		#dm.add_data('semi_data', semi_path, False)
	elif action == 'semi':
		dm.add_data('train_data', train_path, True)
		dm.add_data('semi_data', semi_path, False)
	else:
		raise Exception ('Implement your testing parser')

	# prepare tokenizer
	print ('get Tokenizer...')
	if not os.path.exists(tokenizer_save_path):
		dm.tokenize(20000)
		dm.save_tokenizer(tokenizer_save_path)
	else:
		dm.load_tokenizer(tokenizer_save_path)

	
	# Word2Vec
	print ('get Word2Vec...')
	data_dic = dm.get_data()
	tokenizer = dm.get_tokenizer()
	#vocab_size = len(tokenizer.word_index)+1
	#data_list = data_dic['train_data'][2]+data_dic['semi_data'][1]
	#data_list = data_dic['train_data']
	#w2v_model = Word2Vec(data_list, size=256, min_count=5,iter=16,workers=16)
	#w2v_model.save(word2vec_save_path)
	#w2v_model = Word2Vec.load(word2vec_save_path)
	w2v_model=pk.load(open('emb.pkl','rb'))

	# convert to sequences
	dm.to_sequence(max_length)
	#dm.to_bow()

	# initial model
	print ('initial model...')
	model = simpleRNN()    
	print (model.summary())
	labelnum = [] 

	# training
	if action == 'train':
		(X,Y),(X_val,Y_val) = dm.split_data('train_data', val_ratio)
		X = embedding_vector(X, w2v_model, tokenizer)
		X_val = embedding_vector(X_val, w2v_model, tokenizer)

		earlystopping = EarlyStopping(monitor='val_acc', patience = 15, verbose=1, mode='max')
		checkpoint = ModelCheckpoint(filepath=model_save_path,verbose=1,save_best_only=True,monitor='val_acc',mode='max' )
		history = model.fit(X, Y, validation_data=(X_val, Y_val), epochs=nb_epoch, batch_size=batch_size, callbacks=[checkpoint, earlystopping])
	# semi-supervised training
	elif action == 'semi':

		(X,Y),(X_val,Y_val) = dm.split_data('train_data', val_ratio)
		semi_all_X = dm.get_data()['semi_data'][0]
		X = embedding_vector(X, w2v_model, tokenizer)
		X_val = embedding_vector(X_val, w2v_model, tokenizer)
		semi_all_X = embedding_vector(semi_all_X,w2v_model,tokenizer)

		X = np.array(X)
		X_val = np.array(X_val)
		semi_all_X = np.array(semi_all_X)

		earlystopping = EarlyStopping(monitor='val_acc', patience = 5, verbose=1, mode='max')
		checkpoint = ModelCheckpoint(filepath=model_save_path,verbose=1,save_best_only=True,monitor='val_acc',mode='max')
		# repeat 10 times
		for i in range(10):
			# label the semi-data
			semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
			semi_X, semi_Y = getsemidata(semi_all_X,semi_pred,threshold)
			labelnum.append(semi_X.shape)
			semi_X = np.concatenate((semi_X, X),axis=0)
			semi_Y = np.concatenate((semi_Y, Y),axis=0)
			print ('-- iteration %d  semi_data size: %d' %(i+1,len(semi_X)))
			# train
			history = model.fit(semi_X, semi_Y,validation_data=(X_val, Y_val),epochs=2,batch_size=batch_size,callbacks=[checkpoint, earlystopping] )

			if os.path.exists(model_save_path):
				print('load model from %s' % model_save_path)
				model.load_weights(model_save_path)
			else:
				raise ValueError("Can't find the file %s" % model_save_path)
	
Example #9
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception ('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception ('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
Example #10
File: hw4.py  Project: chenuxian/ML2017FALL
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 16 times
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
Example #11
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')

    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=30,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        model.fit(X,
                  Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        for i in range(10):
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
Example #12
# load data
dm = DataManager()
dm.add_data('test_data',test_path,False)


if mode=='private':
  # tokenizer
  dm.load_tokenizer('./token/token.pk')
  # load model
  model = load_model('./model/model1.hdf5')
elif mode=='public':
  # tokenizer
  dm.load_tokenizer('./token/token_filter.pk')
  # load model
  model = load_model('./model/model2.hdf5')

dm.to_sequence(40,'test')
test_all_x = dm.get_data('test_data')

print(model.summary())
predict = model.predict(test_all_x,batch_size = 1024, verbose=1)
predict[predict <=  0.5] = 0
predict[predict  >  0.5] = 1

with open(output_path, 'w') as f:
  f.write('id,label\n')
  for i in range(len(predict)):
    f.write(str(i) + ',' + str(int(predict[i])) + '\n')