def prepare_data(self, data_fields, wv_size=600):
    test_data = Data(self.file_name, self.file_path)
    test_df = test_data.csv_df(data_fields)
    # keep a copy of the original tweets for later use
    original_df = test_df.copy()

    # pre-process the data (the same way as during training)
    test_data.pre_process(test_df)

    # then convert to vectors using word2vec
    model = test_data.build_wordvec(size=wv_size, verbose=False)
    # check the max_len of the test set, although the conversion still uses
    # max_len from training
    max_len_test = test_data.max_len(test_df)
    data = test_data.convert2vec(test_df,
                                 self.max_len_train,
                                 model,
                                 name='test_' + self.file_name)
    test_data.save_vec(data, name='test_' + self.file_name)

    self.data = data
    self.test_data = test_data
    self.test_df = test_df
    self.original_df = original_df
    print(">>>Done preparing data.<<<\n")
label_file = file_path + 'data/labels.npy'

# load the saved tweet vectors and their labels
data = np.load(data_file)
label = np.load(label_file)

# load original tweets
# ---------------------------------------------------------------------------------
sports_dic = {
    'basketball': 1,
    'hockey': 2,
    'baseball': 3,
    'tennis': 4,
    'volleyball': 5
}
sp_data = Data(sports_dic, file_path)
sp_df = sp_data.csv_df(['text'])  # load data
rm_hashtags = ['#' + s for s in sports_dic.keys()]  # sport hashtags to strip out
sp_data.pre_process(sp_df, rm_list=rm_hashtags)  # pre-process data
sp_df.drop(['tokenized'], axis=1, inplace=True)  # the tokenized column is no longer needed
# ---------------------------------------------------------------------------------
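
# The rm_list presumably strips the sport hashtags themselves so the class
# label isn't leaked back into the text. A rough sketch of that step (an
# assumption for illustration, not the project's actual pre_process):
import re

def strip_hashtags(text, rm_list):
    """Remove the given hashtags (case-insensitive) and tidy whitespace."""
    for tag in rm_list:
        text = re.sub(re.escape(tag), '', text, flags=re.IGNORECASE)
    return ' '.join(text.split())

print(strip_hashtags('Great #Hockey game tonight! #hockey', ['#hockey']))
# -> 'Great game tonight!'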

# set up lstm structure
n_classes = 5
hm_epochs = 20
batch_size = 50
chunk_size = data.shape[2]  # word-vector dimension (one chunk per word)
n_chunks = data.shape[1]    # words per tweet (time steps), i.e. max_len
rnn_size = 300

# input: [batch, words per tweet, word-vector dimension]
x = tf.placeholder('float', [None, n_chunks, chunk_size])
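
# A minimal TF 1.x sketch of how a placeholder with this shape is commonly fed
# through an LSTM cell and a dense output layer. The cell type, dynamic_rnn and
# the variable names below are assumptions for illustration, not necessarily
# how this project defines its graph:
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
outputs, _states = tf.nn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)  # (batch, n_chunks, rnn_size)
last_output = outputs[:, -1, :]                                       # last time step of each tweet
W = tf.Variable(tf.random_normal([rnn_size, n_classes]))
b = tf.Variable(tf.random_normal([n_classes]))
logits = tf.matmul(last_output, W) + b                                # (batch, n_classes) class scores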