Example #1
		,emb_size       = 128
		,hidden_size    = 128
		,nb_filter      = 250
		,filter_length  = 3
		,pool_length    = 2
		,init_type      = 'random'
		,emb_file       = "../data/unlabeled_corpus.vec"
		,tune_emb       = True
	)

	options,args = parser.parse_args(sys.argv)


	print('Loading data...')
	(X_train, y_train), (X_test, y_test), (X_dev, y_dev), max_features, E, label_id = aidr.load_and_numberize_data(
		path=options.data_dir, nb_words=options.max_features, init_type=options.init_type,
		embfile=options.emb_file, dev_train_merge=0, map_labels_to_five_class=0)

	print("Padding sequences....")
	X_train = sequence.pad_sequences(X_train, maxlen=options.maxlen)
	X_test  = sequence.pad_sequences(X_test,  maxlen=options.maxlen)
	X_dev   = sequence.pad_sequences(X_dev,   maxlen=options.maxlen)


	#build model...
	nb_classes = np.max(y_train) + 1

	print('............................')
	print(len(X_train), 'train tweets')
	print(len(X_dev),   'dev   tweets')
	print(max_features - 3, 'vocabulary size')
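
The fragment ends before the model is built. For orientation, here is a minimal sketch of a CNN text classifier wired up from the defaults above (emb_size, nb_filter, filter_length, pool_length); the layer choices and the Sequential API usage are assumptions, not the project's actual code:

    # Hypothetical sketch -- not the project's actual model code.
    from keras.models import Sequential
    from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

    model = Sequential()
    # E is the embedding matrix returned by load_and_numberize_data
    model.add(Embedding(max_features, options.emb_size, weights=[E],
                        input_length=options.maxlen, trainable=options.tune_emb))
    model.add(Conv1D(options.nb_filter, options.filter_length, activation='relu'))
    model.add(MaxPooling1D(pool_size=options.pool_length))
    model.add(Flatten())
    model.add(Dense(options.hidden_size, activation='relu'))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])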
Example #2
		,nb_filter      = 250
		,init_type      = 'random'
		,filter_length  = 3
		,pool_length    = 2
		,add_feat       = 0
		,maxlen         = 80
		,tune_emb       = True
		,max_features   = 80
		,map_class      = 0
	)

	options,args = parser.parse_args(sys.argv)

	print('Loading data...')
	(X_train, y_train), (X_test, y_test), (X_dev, y_dev), vocab_size, E, label_id = aidr.load_and_numberize_data(
		path=options.data_dir, seed=113, nb_words=options.max_features, init_type=options.init_type,
		embfile=options.emb_file)
	X_train_f, X_test_f, X_dev_f = aidr_feat.load_tfidf_vectors(path=options.data_dir, seed=113) # load features

	assert len(X_train) == X_train_f.shape[0] and len(X_test) == X_test_f.shape[0]

	print("Padding sequences....")
	X_train = sequence.pad_sequences(X_train, maxlen=options.maxlen)
	X_test  = sequence.pad_sequences(X_test,  maxlen=options.maxlen)
	X_dev   = sequence.pad_sequences(X_dev,   maxlen=options.maxlen)


	#build model...
	nb_classes = len(label_id)
	max_features = X_train_f.shape[1]
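
This variant loads tf-idf feature vectors alongside the word sequences (add_feat), so the model needs two inputs. A minimal sketch of one plausible two-branch combination using the Keras functional API; the branch structure and the embedding size (128) are assumptions, not the project's actual code:

    # Hypothetical sketch -- one plausible way to combine both inputs.
    from keras.models import Model
    from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, concatenate

    seq_in  = Input(shape=(options.maxlen,), dtype='int32')   # padded word ids
    feat_in = Input(shape=(max_features,))                    # tf-idf vector

    x = Embedding(vocab_size, 128, weights=[E], trainable=options.tune_emb)(seq_in)
    x = Conv1D(options.nb_filter, options.filter_length, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)

    merged = concatenate([x, feat_in])
    out = Dense(nb_classes, activation='softmax')(merged)
    model = Model(inputs=[seq_in, feat_in], outputs=out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Training would then pass both arrays, e.g. model.fit([X_train, X_train_f], ...).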
Example #3
File: xx_cnn_aidr.py  Project: datnt88/AIDR
		,emb_size       = 128
		,hidden_size    = 128
		,nb_filter      = 250
		,filter_length  = 3
		,pool_length    = 2
		,init_type      = 'random'
		,emb_file       = "../data/unlabeled_corpus.vec"
		,tune_emb       = True
	)

	options,args = parser.parse_args(sys.argv)


	print('Loading data...')
	(X_train, y_train), (X_test, y_test), (X_dev, y_dev), max_features, E, label_id = aidr.load_and_numberize_data(
		path=options.data_dir, nb_words=options.max_features, init_type=options.init_type,
		embfile=options.emb_file, dev_train_merge=0, map_labels_to_five_class=0)

	print("Padding sequences....")
	X_train = sequence.pad_sequences(X_train, maxlen=options.maxlen)
	X_test  = sequence.pad_sequences(X_test,  maxlen=options.maxlen)
	X_dev   = sequence.pad_sequences(X_dev,   maxlen=options.maxlen)


	#build model...
	nb_classes = np.max(y_train) + 1

	print('............................')
	print(len(X_train), 'train tweets')
	print(len(X_test),  'test  tweets')
	print(len(X_dev),   'dev   tweets')
Example #4
File: MLP_aidr.py  Project: datnt88/AIDR
		,learn_alg      = "adam"   # sgd, adagrad, rmsprop, adadelta, adam (default)
		,loss           = "hinge"  # hinge, squared_hinge, binary_crossentropy (default)
		,minibatch_size = 32
		,dropout_ratio  = 0.2
		,epochs         = 25
		,hidden_size    = 128
		,nb_layers      = 1
		,model_type     = 'mlp'
		,add_feat       = 0
		,map_class      = 0
	)

	options,args = parser.parse_args(sys.argv)

	print('Loading data...')
	(X_train, y_train), (X_test, y_test), (X_dev, y_dev), max_features, E, label_id = aidr.load_and_numberize_data(path=options.data_dir, seed=113)
	X_train_f, X_test_f, X_dev_f = aidr_feat.load_tfidf_vectors(path=options.data_dir, seed=113) # load features

	assert len(X_train) == X_train_f.shape[0] and len(X_test) == X_test_f.shape[0]

	#build model...
	nb_classes = len(label_id)
	max_features = X_train_f.shape[1]
	print('............................')
	print(len(X_train), 'train tweets')
	print(len(X_test),  'test  tweets')
	print(len(X_dev),   'dev   tweets')
	print(max_features, 'features')
	print(nb_classes, 'different classes')
	print('............................')
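
The MLP itself is not shown in the fragment. A minimal sketch of what an MLP over the tf-idf vectors could look like with the defaults above; everything below is an assumption, not the file's actual code:

    # Hypothetical sketch -- not the file's actual model code.
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.utils import np_utils

    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_dev   = np_utils.to_categorical(y_dev, nb_classes)

    model = Sequential()
    model.add(Dense(options.hidden_size, activation='relu', input_shape=(max_features,)))
    for _ in range(options.nb_layers - 1):   # nb_layers hidden layers in total
        model.add(Dense(options.hidden_size, activation='relu'))
    model.add(Dropout(options.dropout_ratio))
    # note: Keras 'hinge' expects +/-1 targets; categorical_hinge is the usual multiclass choice
    model.add(Dense(nb_classes))             # linear output for a hinge-style loss
    model.compile(optimizer=options.learn_alg, loss=options.loss, metrics=['accuracy'])
    model.fit(X_train_f, Y_train, batch_size=options.minibatch_size,
              epochs=options.epochs, validation_data=(X_dev_f, Y_dev))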
Example #5
        recur_type='lstm',       # gru, simplernn, lstm (default)
        init_type='conv_glove',  # 'random', 'word2vec', 'glove', 'conv_word2vec', 'conv_glove', 'meta_conv', 'meta_orig'
        emb_file="../data/unlabeled_corpus.vec",
        tune_emb=True,
        map_class=1,
        numClasses=5,
        evalMinibatches=100)
    options, args = parser.parse_args(sys.argv)

    (X_train, y_train), (X_test, y_test), (X_dev, y_dev), max_features, E, label_id, sequence_len = \
        aidr.load_and_numberize_data(path=options.data_dir, nb_words=options.max_features, maxlen=options.maxlen,
                                     init_type=options.init_type,
                                     dev_train_merge=1, embfile=None, map_labels_to_five_class=options.map_class)

    # Placeholders
    input_data = tf.placeholder(tf.int32, [None, options.maxlen],
                                name="input_data")
    sequence_lengths = tf.placeholder(tf.int32,
                                      shape=[None],
                                      name="sequence_lengths")
    y_values = tf.placeholder(tf.int32, [None])
    labels = tf.one_hot(y_values, options.numClasses)

    prediction = forward_propagation_bidirectional(input_data,
                                                   sequence_lengths, E)
    #prediction = forward_propagation_unidirectional(input_data, sequence_lengths, E)
    #prediction = forward_propagation_averaging(input_data, sequence_lengths, E)
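
forward_propagation_bidirectional is defined elsewhere in the project. A minimal sketch of what such a function could look like with the TF1 API used here (the cell type, state handling, and output projection are assumptions):

    # Hypothetical sketch of forward_propagation_bidirectional -- assumptions throughout.
    def forward_propagation_bidirectional(input_data, sequence_lengths, E,
                                          hidden_size=128, num_classes=5):
        # look up embeddings for the word ids (use a tf.Variable instead if tune_emb)
        embedded = tf.nn.embedding_lookup(tf.constant(E, dtype=tf.float32), input_data)
        cell_fw = tf.nn.rnn_cell.LSTMCell(hidden_size)
        cell_bw = tf.nn.rnn_cell.LSTMCell(hidden_size)
        _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, embedded,
            sequence_length=sequence_lengths, dtype=tf.float32)
        # concatenate the final hidden states of both directions
        final = tf.concat([state_fw.h, state_bw.h], axis=1)
        return tf.layers.dense(final, num_classes)   # unnormalized logits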
Example #6
        pool_length=2,
        init_type='random',
        emb_file="../data/unlabeled_corpus.vec",
        tune_emb=True)

    options, args = parser.parse_args(sys.argv)

    print('LOADING DATA...')
    print('----------------------------------------------------------------')
    (X_train, y_train), (X_test, y_test), (X_validate, y_validate), (
        X_newinput,
        y_newinput), max_features, E, label_id = aidr.load_and_numberize_data(
            path=options.data_dir,
            nb_words=options.max_features,
            init_type=options.init_type,
            embfile=options.emb_file,
            validate_train_merge=0,
            map_labels_to_five_class=0)

    print("Padding sequences....")
    X_train = sequence.pad_sequences(X_train, maxlen=options.maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=options.maxlen)
    X_validate = sequence.pad_sequences(X_validate, maxlen=options.maxlen)

    X_newinput = sequence.pad_sequences(
        X_newinput, maxlen=options.maxlen)  # added by Quan
    # print(X_train[0])  # added by Quan

    #build model...
    nb_classes = np.max(y_train) + 1
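
Beyond train/test/validate, this variant carries an extra newinput split. Once a model is trained, scoring that split might look like the following sketch (the trained model object and the label_id layout, label to index, are assumptions):

    # Hypothetical sketch -- scoring the extra split with a trained Keras model.
    probs = model.predict(X_newinput)
    pred_ids = probs.argmax(axis=-1)
    id_to_label = {idx: lab for lab, idx in label_id.items()}   # assumes a label -> id dict
    for pid, gold in zip(pred_ids[:10], y_newinput[:10]):
        print('predicted:', id_to_label.get(pid), ' gold id:', gold)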