# Tokenize the train and test comment texts (list_sentences_train, test_cl,
# max_features and maxlen are defined earlier in the script).
import numpy as np
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

list_sentences_test = test_cl.comment_text
print("....start....pretrain")
print("....At....Tokenizer")

# String used as the out-of-vocabulary token.
punctuate = r'([\.\!\?\:\,])'
tokenizer = Tokenizer(num_words=max_features, oov_token=punctuate)
tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Basic length statistics of the tokenized training comments.
totalNumWords = [len(one_comment) for one_comment in list_tokenized_train]
print("mean length: " + str(np.mean(totalNumWords)))
print("max length: " + str(max(totalNumWords)))
print("std length: " + str(np.std(totalNumWords)))
print("maxlen is: " + str(maxlen))
print("number of different words: " + str(len(tokenizer.word_index)))

# Shrink max_features if the vocabulary is smaller than the requested size.
if len(tokenizer.word_index) < max_features:
    max_features = len(tokenizer.word_index)
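The snippet imports keras.preprocessing.sequence but stops before the padding step. A minimal sketch of that step, assuming the maxlen computed above; the names X_t and X_te are placeholders, not the original variables:

X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
print("padded train shape:", X_t.shape)
print("padded test shape:", X_te.shape)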
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# F1 metric; the def line is reconstructed from the fragment, and the helper
# functions precision() and recall() are assumed to be defined elsewhere.
# The local names p and r avoid shadowing the helper functions.
def f1(y_true, y_prediction):
    r = recall(y_true, y_prediction)
    p = precision(y_true, y_prediction)
    return 2 * ((p * r) / (p + r + K.epsilon()))

# Tokenizing the words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
#X_test = tokenizer.texts_to_sequences(X_test)

# Sequence padding for the model.
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
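The f1 metric above calls precision() and recall() helpers that are not shown. One common Keras-backend implementation it appears to assume; a sketch, not necessarily the original code:

from tensorflow.python.keras import backend as K

def recall(y_true, y_pred):
    # Fraction of actual positives that were predicted positive.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def precision(y_true, y_pred):
    # Fraction of predicted positives that are actually positive.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())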
# Tail of a commented-out block that flattened one-hot rows of label_arr into
# integer class labels (the opening quotes appear above this excerpt).
        labels.append(0)
    elif (label_arr[i][1] == 1):
        labels.append(1)
    elif (label_arr[i][2] == 1):
        labels.append(2)
    elif (label_arr[i][3] == 1):
        labels.append(3)
'''
#(unique, counts) = np.unique(data_arr.flatten(), return_counts=True)
#vocab_size = len(unique)
#labels_a = np.array(labels)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from sklearn.model_selection import train_test_split

# Tokenize and pad the text data.
token = Tokenizer()
token.fit_on_texts(data_arr)
index = token.word_index
index_len = len(index)
new_data = token.texts_to_sequences(data_arr)
new_data = pad_sequences(new_data)
print(new_data.shape)

train, test, train_lab, test_lab = train_test_split(new_data, label_arr)
print(train.shape, train_lab.shape)
print(test.shape, test_lab.shape)
#embed = Word2Vec(train, min_count=1)
print(new_data.shape[1])

# Embedding + LSTM classifier; note that the commented-out code above suggests
# label_arr has four columns, so the output layer size may need to match.
model = Sequential()
model.add(Embedding(index_len * 2, 100, input_length=new_data.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
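The compile call above is cut off in the source. A minimal, hedged completion and training call; the optimizer, metrics, epochs, and batch size are assumptions, not the original settings:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train, train_lab, validation_data=(test, test_lab), epochs=5, batch_size=64)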
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Load the training and evaluation tweets (note: the read_test_data /
# read_train_data names look swapped relative to how their outputs are used).
tweets, y = read_test_data(file_dir + 'AraSenti_all.xlsx')
testTweets, ytest = read_train_data(file_dir + 'KKAISA_tweets.xlsx')

# Tweet preprocessing.
tweets = [tweet_preprocessing(t) for t in tweets]
testTweets = [tweet_preprocessing(t) for t in testTweets]
max_tweet_length = max([len(x.split()) for x in (tweets + testTweets)])

## Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets + testTweets)
sequences = tokenizer.texts_to_sequences(tweets)
x_train = pad_sequences(sequences, maxlen=max_tweet_length)
sequences = tokenizer.texts_to_sequences(testTweets)
x_test = pad_sequences(sequences, maxlen=max_tweet_length)
vocab_size = len(tokenizer.word_index) + 1  ## in my dataset

# Create one-hot vectors for the labels.
y_train = to_categorical(y, classes)
y_test = to_categorical(ytest, classes)

# Load the pre-trained embeddings.
embeddings_index = load_embedding()

# Map our data to the word embeddings.
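The snippet ends where the embedding lookup would be built. A common pattern for mapping the tokenizer's vocabulary onto the loaded vectors, as a sketch; the embedding dimension (300 here) and the dict-like shape of embeddings_index are assumptions:

import numpy as np

embedding_dim = 300  # assumed dimensionality of the pre-trained vectors
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        # Words without a pre-trained vector keep the all-zeros row.
        embedding_matrix[i] = vector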
from nltk.tokenize import word_tokenize
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Continued from a loop over the review texts that collects each review's
# token count in r_len.
word = word_tokenize(text)
l = len(word)
r_len.append(l)

MAX_REVIEW_LEN = np.max(r_len)

max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes = 5

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)

# Sequence padding.
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Stacked-LSTM classifier.
model1 = Sequential()
model1.add(Embedding(max_features, 100, mask_zero=True))
model1.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
model1.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
model1.add(Dense(num_classes, activation='softmax'))
model1.compile(loss='categorical_crossentropy',
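This compile call is also truncated. A hedged completion and training call; the optimizer, the metric, and the label names y_train / y_val are assumptions:

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.fit(X_train, y_train,
           validation_data=(X_val, y_val),
           batch_size=batch_size, epochs=epochs)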
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, GlobalMaxPool1D, Dropout, Dense

# One-hot label vectors for the 11 entity classes (EVENT, GPE and LANGUAGE are
# built the same way earlier in the script).
LOC_labels     = [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] for _ in LOC]      # 3
MONEY_labels   = [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] for _ in MONEY]    # 4
NUMBER_labels  = [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] for _ in NUMBER]   # 5
ORG_labels     = [[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] for _ in ORG]      # 6
OTHER_labels   = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] for _ in OTHER]    # 7
PERCENT_labels = [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0] for _ in PERCENT]  # 8
PERSON_labels  = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0] for _ in PERSON]   # 9
TIME_labels    = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] for _ in TIME]     # 10

y = np.concatenate([EVENT_labels, GPE_labels, LANGUAGE_labels, LOC_labels,
                    MONEY_labels, NUMBER_labels, ORG_labels, OTHER_labels,
                    PERCENT_labels, PERSON_labels, TIME_labels], 0)

# Tokenize and pad the input texts.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_text))
list_tokenized_train = tokenizer.texts_to_sequences(x_text)
maxlen = 100
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# LSTM classifier over the padded sequences, with global max pooling and an
# 11-way softmax output.
inp = Input(shape=(maxlen,))
embed_size = 128
x = Embedding(max_features, embed_size)(inp)
x = LSTM(60, return_sequences=True, name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(11, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
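The snippet stops after building the model. A minimal sketch of compiling and training it on the padded inputs X_t and the one-hot labels y defined above; the optimizer, batch size, epochs, and validation split are assumptions:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)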