def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(
            num_words=max_words, start_char=None, oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length,
                                         padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length,
                                        padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
def backToWords(comment):
    # Map every integer index back to its word.
    word_index = imdb.get_word_index()
    # Invert the dictionary: key=integer index, value=word.
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    # Offset by 3 because 0, 1 and 2 are reserved for padding/start/unknown
    # under the default index_from=3.
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in comment])
    return decoded_review
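# Usage sketch (assumption): decode the first raw training review. This only
# lines up with backToWords' i - 3 offset when the data is loaded with the
# default index_from=3.
from keras.datasets import imdb

(x_train, y_train), _ = imdb.load_data(num_words=10000)
print(backToWords(x_train[0]))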
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
def get_vectors_from_text(dataset_list, word_to_ind=imdb.get_word_index(),
                          start_char=1, index_from=3, maxlen=400,
                          num_words=5000, oov_char=2, skip_top=0):
    '''
    Gets the vectors mapped according to the word-to-index dictionary.

    @param
        dataset_list = list of review texts in unicode format
        word_to_ind = word-to-index dictionary
    hyperparameters:
        start_char --> character marking the start of each sentence.
        index_from --> indices below this will not be encoded.
        maxlen --> maximum length of the sequence to be considered.
        num_words --> number of words to keep, ranked by frequency of occurrence.
        oov_char --> out-of-vocabulary character.
        skip_top --> number of top-ranked words to be skipped.
    @returns:
        x_train: final list of vectors (as lists) of the review texts
    '''
    x_train = []
    for review_string in dataset_list:
        review_string_list = text_to_word_sequence(review_string)
        x_predict = []
        for word in review_string_list:
            if word not in word_to_ind:
                continue
            x_predict.append(word_to_ind[word])
        x_train.append(x_predict)

    # add the start char and also take care of index_from
    if start_char is not None:
        x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
    elif index_from:
        x_train = [[w + index_from for w in x] for x in x_train]

    # maxlen is our only cut-off criterion
    x_train = [ele[:maxlen] for ele in x_train]

    # if num_words is not given, infer it from the data
    if not num_words:
        num_words = max([max(x) for x in x_train])

    # by convention, use 2 as the OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        x_train = [[w if (skip_top <= w < num_words) else oov_char for w in x]
                   for x in x_train]
    else:
        x_train = [[w for w in x if (skip_top <= w < num_words)]
                   for x in x_train]

    # pad the sequences
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)

    # return the vector form of the text
    return x_train
def imdb_word_dic() -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    forms the dictionaries of word2index and index2word
    """
    # A dictionary mapping words to their integer index
    word2index = imdb.get_word_index()
    # The first indices are reserved
    word2index = {k: (v + 2) for k, v in word2index.items()}
    word2index["<PAD>"] = 0
    word2index["<START>"] = 1
    word2index["<UNK>"] = 2  # unknown
    # index2word
    index2word = dict([(value, key) for (key, value) in word2index.items()])
    return word2index, index2word
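# Usage sketch: round-trip a token through the two dictionaries. Note the
# +2 shift above differs from the more common +3 convention; it lines up
# with sequences loaded via imdb.load_data(index_from=2).
word2index, index2word = imdb_word_dic()
idx = word2index["the"]        # most frequent word: raw index 1, shifted to 3
print(idx, index2word[idx])    # -> 3 the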
def main(epochs, batch_size=32, unique_word_count=5000, max_word_count=400,
         seed=124):
    # Load imdb data
    (words_train, labels_train), (words_test, labels_test) = load_data(
        unique_word_count, max_word_count)

    # Create model
    model = create_model_cnn("first-cnn", unique_word_count, max_word_count)

    # Create map from words to their equivalent embedding vectors
    embeddings = model.layers[0].get_weights()[0]
    word_to_token = imdb.get_word_index()
    word_to_embedding = {
        word: embeddings[token]
        for word, token in word_to_token.items()
        if token < embeddings.shape[0]
    }

    # Train model on all data `epochs` times
    train(model, epochs, batch_size)

    # Test model
    positive_test = vectorize_word_list(
        ["basically", "getting", "action", "right", "from", "the", "start"],
        word_to_embedding)
    negative_test = vectorize_word_list(["poor", "terrible", "awful"],
                                        word_to_embedding)
    print(positive_test)
    positive_test = sequence.pad_sequences(positive_test,
                                           maxlen=max_word_count,
                                           dtype=np.float32)
    negative_test = sequence.pad_sequences(negative_test,
                                           maxlen=max_word_count,
                                           dtype=np.float32)
    print(positive_test)
    print("Positive: ", model.predict(positive_test))
    print("Negative: ", model.predict(negative_test))

    imdb_test = words_test[0:5, :]
    print(imdb_test.shape)
    print("IMDB[0]:", vector_to_word_list(imdb_test, word_to_embedding),
          model.predict(imdb_test))
    return
def load_text(filename, idim):
    data = []
    print("\n", filename, ":")
    with open(filename, 'r') as file:
        for line in file.readlines():
            print(line)
            data += [
                w.strip(''.join(['.', ',', ':', ';', '!', '?', '(', ')'])).lower()
                for w in line.strip().split()
            ]
    index = imdb.get_word_index()
    x_test = []
    for w in data:
        if w in index and index[w] < idim:
            x_test.append(index[w])
    x_test = vectorize([np.array(x_test)], idim)
    return x_test
def load_file(filename):
    # use a context manager so the file handle is always closed
    with open(filename, 'r') as file:
        text = file.read().lower()
    text = re.sub(r"[^a-z0-9' ]", "", text)
    text = text.split(" ")
    index = imdb.get_word_index()
    coded = [1]  # 1 is the conventional start-of-sequence marker
    for word in text:
        num = index.get(word, 0)
        if num != 0:
            num += 3  # offset past the reserved indices 0/1/2
        if num > top_words:
            num = 2  # out-of-vocabulary marker
        coded.append(num)
    return coded
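# Usage sketch (hypothetical file name and model): pad the encoded review to
# the length the model was trained on, then score it. `top_words` is assumed
# to be a module-level constant matching the training vocabulary cutoff.
from keras.preprocessing import sequence

coded = load_file('review.txt')
padded = sequence.pad_sequences([coded], maxlen=500)
# score = model.predict(padded)[0][0]   # assumes a trained Keras `model`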
def load_data():
    """
    Load data if data have been created.
    Create data otherwise.
    """
    if 'data' not in os.listdir('.'):
        os.mkdir('data')
    if 'id_to_word.pkl' not in os.listdir('data'):
        print('Loading data...')
        (x_train, y_train), (x_val, y_val) = imdb.load_data(
            num_words=max_features, index_from=3)
        word_to_id = imdb.get_word_index()
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        id_to_word = {value: key for key, value in word_to_id.items()}
        print(len(x_train), 'train sequences')
        print(len(x_val), 'test sequences')

        print('Pad sequences (samples x time)')
        x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
        x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
        y_train = np.eye(2)[y_train]
        y_val = np.eye(2)[y_val]

        np.save('./data/x_train.npy', x_train)
        np.save('./data/y_train.npy', y_train)
        np.save('./data/x_val.npy', x_val)
        np.save('./data/y_val.npy', y_val)
        with open('data/id_to_word.pkl', 'wb') as f:
            pickle.dump(id_to_word, f)
    else:
        x_train, y_train, x_val, y_val = (np.load('data/x_train.npy'),
                                          np.load('data/y_train.npy'),
                                          np.load('data/x_val.npy'),
                                          np.load('data/y_val.npy'))
        with open('data/id_to_word.pkl', 'rb') as f:
            id_to_word = pickle.load(f)
    return x_train, y_train, x_val, y_val, id_to_word
def load_data(data_source):
    # global sequence_length
    assert data_source in ["keras_data_set", "local_dir", "pickle"], \
        "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(
            num_words=max_words, start_char=None, oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length,
                                         padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length,
                                        padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    # elif data_source == "pickle":
    #     vocabulary_inv = pickle.load(open(".models/vocabulary.p", "rb"))
    #     return "", "", "", "", vocabulary_inv
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {
            key: value
            for key, value in enumerate(vocabulary_inv_list)
        }
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
def sent_anly_prediction():
    if request.method == 'POST':
        text = request.form['text']
        sentiment = ''
        max_review_length = 500
        word_to_id = imdb.get_word_index()
        strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
        text = text.lower().replace("<br />", " ")
        text = re.sub(strip_special_chars, "", text.lower())
        words = text.split()  # split string into a list
        x_test = [[
            word_to_id[word]
            if (word in word_to_id and word_to_id[word] <= 20000) else 0
            for word in words
        ]]
        # maxlen should match what was used for the training data
        x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
        vector = np.array([x_test.flatten()])
        with graph.as_default():
            # perform the prediction
            probability = model.predict(array([vector][0]))[0][0]
            class1 = model.predict_classes(array([vector][0]))[0][0]
        if class1 == 0:
            sentiment = 'Negative'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Sad_Emoji.png')
        else:
            sentiment = 'Positive'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Smiling_Emoji.png')
        return render_template('home.html', text=text, sentiment=sentiment,
                               probability=probability, image=img_filename)
def main():
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
        num_words=10000)
    print(train_data[0])

    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    # decoded_review = ' '.join(
    #     [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # print(decoded_review)

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]
    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=20,
                        batch_size=512,
                        validation_data=(x_val, y_val))

    predictions = model.predict(x_test)
    print(predictions)
    return
def encode_review(rev):
    # Fetch the word index once; calling imdb.get_word_index() inside the
    # loop would re-parse the index file for every word.
    word_index = imdb.get_word_index()
    res = []
    for i, el in enumerate(rev):
        el = el.lower()
        delete_el = [',', '!', '.', '?']
        for d_el in delete_el:
            el = el.replace(d_el, '')
        el = el.split()
        for j, word in enumerate(el):
            code = word_index.get(word)
            if code is None or code >= 10000:
                code = 0
            el[j] = code
        res.append(el)
    for i, r in enumerate(res):
        res[i] = sequence.pad_sequences([r], maxlen=MAX_REVIEW_LENGTH)
    res = np.array(res)
    return res.reshape((res.shape[0], res.shape[2]))
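# Usage sketch: encode_review takes a list of raw review strings and returns
# an array of shape (num_reviews, MAX_REVIEW_LENGTH) ready for model.predict.
encoded = encode_review(["What a great movie!", "Dull, predictable plot."])
print(encoded.shape)  # -> (2, MAX_REVIEW_LENGTH)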
def preinfo():
    # A label of 1 means a positive review, 0 means a negative one.
    print(train_data[0])
    print(train_labels[0])
    # The word-to-index mapping is stored in the dict word_index:
    # keys are words, values are their integer indices (ranked by frequency).
    word_index = imdb.get_word_index()
    # Invert the mapping so that keys are indices and values are words.
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    '''
    In train_data, the values 0, 1 and 2 do not correspond to words; they
    carry special meanings: 0 is "padding", 1 is "start of sequence" and
    2 is "unknown". Real words are offset by 3, so a value of 4 stands for
    the most frequent word, which is why we decode with i - 3.
    '''
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    print(decoded_review)
def saved_model(path, inpVal, MaxLen):
    model = load_model(path)
    wordIndex = imdb.get_word_index()
    words = inpVal.split()
    review = []
    for word in words:
        if word not in wordIndex:
            review.append(2)  # out-of-vocabulary marker
        else:
            review.append(wordIndex[word] + 3)  # offset past reserved indices
    review = sequence.pad_sequences([review], maxlen=MaxLen)
    result = model.predict(review)
    print('Prediction (0 = negative, 1 = positive) = ', end=" ")
    print("%0.4f" % result[0][0])
    del model
    bkn.clear_session()
    return result
def classify_review(review):
    maxlen = 100
    model = load_model('model.h5')
    d = imdb.get_word_index()
    words = review.split()
    review = []
    for word in words:
        if word not in d:
            review.append(2)
        else:
            review.append(d[word] + 3)
    review = sequence.pad_sequences([review], truncating='pre', padding='pre',
                                    maxlen=maxlen)
    prediction = model.predict(review)
    return prediction[0][0]
def explore():
    print(train_data.shape)
    print(train_labels.shape)
    print(test_data.shape)
    print(test_labels.shape)
    print(train_data[0])
    print(train_labels[0])
    print(max(max(sequence) for sequence in train_data))

    # word_index is a dictionary mapping: word -> int indices
    word_index = imdb.get_word_index()
    # reversing it, mapping becomes int indices -> word
    reversed_word_index = dict([(value, key)
                                for (key, value) in word_index.items()])
    decoded_review = ' '.join(
        reversed_word_index.get(i - 3, '?') for i in train_data[0])
    print(decoded_review)
def Preparing_string(text_string, dimension=40):
    text_string = text_string.lower()
    table = str.maketrans(dict.fromkeys(string.punctuation))
    text_string = text_string.translate(table)
    word2index = imdb.get_word_index()
    test = []
    for word in word_tokenize(text_string):
        # use .get() so words missing from the index do not raise KeyError;
        # 2 is the conventional out-of-vocabulary marker
        test.append(word2index.get(word, 2))
    print(text_string)
    print(test)
    out = np.zeros(dimension)
    for sequence in test:
        if sequence < dimension:
            out[sequence] = 1
    print("\nOutput:", out)
    return out
def predict_score(model, review_text, word_to_ind=imdb.get_word_index()):
    '''
    Predict and produce the sentiment score of the review text.

    @param
        model: Sequential model which we trained the data on
        review_text: review text to be predicted on
        word_to_ind: dictionary mapping of words to indices
    @returns
        sentiment score on the review text.
    '''
    # convert review text into a vector
    x_predict = get_vectors_from_text([review_text], word_to_ind)[0]
    # reshape x_predict to a batch of size 1
    x_predict = np.reshape(x_predict, (1, len(x_predict)))
    return model.predict(x_predict)[0][0]
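# Usage sketch (assumes `model` was trained on sequences produced by
# get_vectors_from_text with the same hyperparameters).
score = predict_score(model, u"one of the best films i have ever seen")
print("positive" if score > 0.5 else "negative", score)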
def encodeData(self, x_test):
    print("Encoding data...")
    word_indices = imdb.get_word_index()
    reviews = []
    for doc in x_test:
        review = []
        for word in doc:
            if word not in word_indices:
                review.append(2)
            else:
                review.append(word_indices[word] + 3)
        # note: this discards word order, keeping indices sorted descending
        review.sort(reverse=True)
        reviews.append(review)
    print("Encoding done...!!!")
    return reviews
def load_data(size=0.2):
    # load the dataset
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    X = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    X, y = shuffle(X, y, random_state=42)
    vocab_size = len(imdb.get_word_index())
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=size,
                                                        random_state=42)
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    return x_train, x_test, y_train, y_test
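# Usage sketch: by default 20% of the combined 50k reviews is held out for
# testing; `max_len` is assumed to be a module-level constant.
x_train, x_test, y_train, y_test = load_data(size=0.2)
print(x_train.shape, x_test.shape)  # e.g. (40000, max_len), (10000, max_len)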
def multi_dataset_test():
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000

        (x_train, y_train), (x_test, y_test) = boston_housing.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)

        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000

        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)

        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        cifarDefaultTrainLength = 50000
        cifarDefaultTestLength = 10000
        assert len(x_train) == len(y_train) == cifarDefaultTrainLength
        assert len(x_test) == len(y_test) == cifarDefaultTestLength

        (x_train, y_train), (x_test, y_test) = cifar100.load_data('fine')
        cifarFineTrainLength = 50000
        cifarFineTestLength = 10000
        assert len(x_train) == len(y_train) == cifarFineTrainLength
        assert len(x_test) == len(y_test) == cifarFineTestLength

        (x_train, y_train), (x_test, y_test) = cifar100.load_data('coarse')
        cifarCoarseTrainLength = 50000
        cifarCoarseTestLength = 10000
        assert len(x_train) == len(y_train) == cifarCoarseTrainLength
        assert len(x_test) == len(y_test) == cifarCoarseTestLength
def text_load():
    dictionary = dict(imdb.get_word_index())
    test_x = []
    test_y = np.array(answers)
    for string in strings:
        words = string.replace(',', ' ').replace('.', ' ').replace(
            '?', ' ').replace('\n', ' ').split()
        num_words = []
        for word in words:
            word = dictionary.get(word)
            if word is not None and word < 10000:
                num_words.append(word)
        test_x.append(num_words)
    test_x = [vectorize(test_x)]
    model = build_model(10000)
    (train_x, train_y), (s1, s2) = prepare_data(10000)
    model.fit(train_x, train_y, epochs=2, batch_size=500)
    predictions = model.predict(test_x)
    print(predictions)
def my_form_post():
    tb._SYMBOLIC_SCOPE.value = True  # resolves a Keras/TensorFlow threading error
    itext = request.form['text']
    words = re.sub(r"[^\w]", " ", itext).split()
    INDEX_FROM = 3  # word index offset
    # disable HTTPS certificate verification so the word index can download
    ssl._create_default_https_context = ssl._create_unverified_context
    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    id_to_word = {value: key for key, value in word_to_id.items()}

    x_ireview = [[word_to_id.get(i, 2) for i in words]]
    tokenizer = Tokenizer(num_words=1000)
    x_predict = tokenizer.sequences_to_matrix(x_ireview, mode='binary')
    ynew = sentiment_predict.sentiment_predict(x_predict)
    if ynew[0, 1] < 0.5:
        isentiment = ':('
    else:
        isentiment = ':)'
    processed_text = ynew
    return render_template('my-form.html', nnoutcome=processed_text,
                           isentiment=isentiment, itext=itext)
def main():
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
        num_words=10000)
    word_index = imdb.get_word_index()
    reverse_words_index = dict([(v, k) for k, v in word_index.items()])
    decoded_review = ' '.join(
        [reverse_words_index.get(i - 3, '?') for i in train_data[0]])
    print(decoded_review)

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(16, activation='relu'))
    # input_shape is only needed on the first layer, so the output layer
    # omits it
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]
    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=4,
                        batch_size=512,
                        validation_data=(x_val, y_val))

    test_loss, test_acc = model.evaluate(x_test, y_test)
    print('test_acc:', test_acc)

    history_dict = history.history
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    plot_loss(loss_values, val_loss_values)
    acc_values = history_dict['acc']
    val_acc_values = history_dict['val_acc']
    plot_acc(acc_values, val_acc_values)
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(
            num_words=max_words, start_char=None, oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length,
                                         padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length,
                                        padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
        print("length of the vocab inv is ", len(vocabulary_inv))
    else:
        # the local_dir branch is not implemented; fail loudly instead of
        # returning undefined names
        raise NotImplementedError("no data in the keras dataset")
    return x_train, y_train, x_test, y_test, vocabulary_inv
def train():
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
        num_words=10000)

    # word_index is a dictionary mapping words to an integer index
    word_index = imdb.get_word_index()
    # We reverse it, mapping integer indices to words
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    # We decode the review; note that our indices were offset by 3
    # because 0, 1 and 2 are reserved indices for "padding",
    # "start of sequence", and "unknown".
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])

    # Our vectorized training data
    x_train = vectorize_sequences(train_data)
    # Our vectorized test data
    x_test = vectorize_sequences(test_data)
    # Our vectorized labels
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]
    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                  loss=losses.binary_crossentropy,
                  metrics=[metrics.binary_accuracy])

    return model.fit(partial_x_train,
                     partial_y_train,
                     epochs=20,
                     batch_size=512,
                     validation_data=(x_val, y_val))
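# Usage sketch: train() returns the Keras History object, so the loss curves
# can be plotted directly (the matplotlib import is an assumption here).
import matplotlib.pyplot as plt

history = train()
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
plt.show()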
def sent_anly_prediction():
    if request.method == 'POST':
        text = request.form['text']
        sentiment = ''
        max_review_length = 500
        word_to_id = imdb.get_word_index()
        strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
        text = text.lower().replace("<br />", " ")
        text = re.sub(strip_special_chars, "", text.lower())
        words = text.split()  # split string into a list
        x_test = [[
            word_to_id[word]
            if (word in word_to_id and word_to_id[word] <= 20000) else 0
            for word in words
        ]]
        # maxlen should match what was used for the training data
        x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
        vector = np.array([x_test.flatten()])
        graph = tf.compat.v1.get_default_graph()
        with graph.as_default():
            model = load_model('sentimental_analysis_model_new.h5')
            probability = model.predict(array([vector][0]))[0][0]
            print("Probability is ", probability)
            class1 = model.predict_classes(array([vector][0]))[0][0]
            print("Class is ", class1)
        if class1 == 0:
            sentiment = 'Negative'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Sad_Emoji.png')
        else:
            sentiment = 'Positive'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Smiling_Emoji.png')
        return render_template('home1.html', text=text, sentiment=sentiment,
                               probability=probability, image=img_filename)
def main():
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=None,
                                                          skip_top=0,
                                                          maxlen=None,
                                                          seed=113,
                                                          start_char=2,
                                                          oov_char=1,
                                                          index_from=0)
    i2w = {w_id: w for w, w_id in imdb.get_word_index().items()}
    with open('train.csv', mode='w', encoding='utf-8') as f:
        for i in range(x_train.shape[0]):
            line = ' '.join([i2w[w_id] for w_id in x_train[i]][1:])
            f.write('{0}, {1}\n'.format(line, y_train[i]))
    with open('test.csv', mode='w', encoding='utf-8') as f:
        for i in range(x_test.shape[0]):
            line = ' '.join([i2w[w_id] for w_id in x_test[i]][1:])
            f.write('{0}, {1}\n'.format(line, y_test[i]))
def Preparing_string(text_string, dimension=TOP_WORDS):
    text_string = text_string.lower()
    table = str.maketrans(dict.fromkeys(string.punctuation))
    text_string = text_string.translate(table)
    word2index = imdb.get_word_index()
    test = []
    for word in word_tokenize(text_string):
        # .get() avoids a KeyError on words missing from the index;
        # 2 is the conventional out-of-vocabulary marker
        test.append(word2index.get(word, 2))
    results = np.zeros(dimension)
    for sequence in test:
        if sequence < dimension:
            results[sequence] = 1
    print("\nOriginal string:", text_string, "\n")
    print("\nIndex conversion:", test, "\n")
    results = np.reshape(results, (1, TOP_WORDS))
    print("\nConvert to vectors:", results, "\n")
    return results
def _prepare_index_to_word(preview=0, top_words=5000):
    word_to_index = imdb.get_word_index()
    index_to_word = dict()

    for word, index in word_to_index.items():
        if top_words is not None and index > top_words:
            continue
        index_to_word[index + len(SpecialConstants)] = word

    index_to_word[SpecialConstants.PADDING.value] = SpecialConstants.PADDING
    index_to_word[SpecialConstants.START.value] = SpecialConstants.START
    index_to_word[SpecialConstants.OUT_OF_VOCABULARY.value] = \
        SpecialConstants.OUT_OF_VOCABULARY

    assert top_words is None or \
        len(index_to_word) == top_words + len(SpecialConstants)

    for index, word in list(index_to_word.items())[:preview]:
        print(index, ':', word)

    return index_to_word
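# Sketch (assumption): a SpecialConstants enum compatible with the function
# above -- three members, so len(SpecialConstants) == 3 and real word indices
# are shifted past the reserved slots 0/1/2.
from enum import Enum

class SpecialConstants(Enum):
    PADDING = 0
    START = 1
    OUT_OF_VOCABULARY = 2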
def decode_sentence(sentence):
    word_index = imdb.get_word_index()
    # Split the text into a sequence of word tokens.
    sentence = text.text_to_word_sequence(
        sentence,
        filters='!"#$%&()*+,-./:;?@[\\]^_`{|}~\t\n',
        lower=True)
    # Convert the word list into a numpy array of word indices,
    # using 0 for unknown words.
    sentence = np.array(
        [word_index[word] if word in word_index else 0 for word in sentence])
    print("sentence", sentence)
    # Replace indices beyond the 5000-word vocabulary with the OOV marker.
    sentence[sentence > 5000] = 2
    # Left-pad with zeros up to length 500 and shape as a batch of one.
    l = 500 - len(sentence)
    sentence = np.pad(sentence, (l, 0), 'constant')
    sentence = sentence.reshape(1, -1)
    return sentence
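# Usage sketch: the returned array is zero-padded to length 500, matching a
# model trained with maxlen=500 and a 5000-word vocabulary.
encoded = decode_sentence("this movie was a wonderful surprise")
print(encoded.shape)  # -> (1, 500)
# prediction = model.predict(encoded)  # assumes a compatible trained model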
def lecture_du_jeu_de_imdbkeras():
    max_words = 20000
    x_train_imdb = []
    y_train_imdb = []
    (x_imdb, y_imdb), (x_test_imdb, y_test_imdb) = imdb.load_data(
        num_words=max_words, maxlen=300, seed=113)
    # rebuild the review texts from their word indices
    wordDict = {y: x for x, y in imdb.get_word_index().items()}
    for doc in x_imdb:
        sequence = ""
        for index in doc:
            sequence += " " + wordDict.get(index)
        x_train_imdb.append(sequence)
    for i in y_imdb:
        y_train_imdb.append(str(i))
    return x_train_imdb, y_train_imdb
def get_imdb_corpus():
    # num_words = vocabulary_size
    (x_train, y_train), (x_test, y_test) = imdb.load_data()
    word2id = {
        word: (i + INDEX_FROM)
        for word, i in imdb.get_word_index().items()
    }
    id2word = {i: word for word, i in word2id.items()}
    train_samples = [[id2word.get(i, '<<UNKNOWN>>') for i in sample if i >= 2]
                     for sample in x_train]
    train_tags = y_train
    test_samples = [[id2word.get(i, '<<UNKNOWN>>') for i in sample if i >= 2]
                    for sample in x_test]
    test_tags = y_test
    return (train_samples, train_tags), (test_samples, test_tags)
def highlight_attention_words(model, in_seq):
    aux = imdb.get_word_index()
    words = {}
    for x in aux:
        words[aux[x]] = x
    sentence = ''
    for x in in_seq:
        if x >= 3 and x - 3 in words:
            sentence += words[x - 3] + ' '
    print(sentence)
    print("")
    print("Top words:")
    x = model.predict(np.expand_dims(in_seq, 0))[0]
    sol = []
    for i in range(0, len(in_seq)):
        if in_seq[i] >= 3:
            sol.append((x[i], in_seq[i]))
    sol.sort(reverse=True)
    for i in range(0, 10):
        score = sol[i][0]
        x = sol[i][1]
        print(words[x - 3], score)
import keras
# print(keras.__version__)

# Load the IMDB dataset
from keras.datasets import imdb

# Keep only the 10,000 most frequent words; rarer words are discarded
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000)

# The maximum index value does not exceed 10000
print(max([max(sequence) for sequence in train_data]))

# Decode a review back into English words
# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding",
# "start of sequence", and "unknown".
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_review)

# preparing the data
import numpy as np

# multi-hot (one-hot) encoding of the integer sequences
def vectorize_sequences(sequences, dimension=10000):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.  # set the positions of sequence's indices to 1
    return results
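# Quick check (sketch): the multi-hot encoding turns each review into a
# 10000-dimensional 0/1 vector.
x_train = vectorize_sequences(train_data)
print(x_train.shape)  # -> (25000, 10000)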
def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads the dataset. Keras imdb dataset for binary classification.
    Keras reuters dataset for multiclass classification.

    Args:
        type: {bool} 0 for binary classification (imdb dataset),
              1 for multiclass classification (reuters dataset)
        max_len: {int} timesteps used for padding
        vocab_size: {int} size of the vocabulary
        batch_size: batch_size
    Returns:
        train_loader: {torch.DataLoader} train dataloader
        x_test_pad: padded, tokenized test data for cross-validation
        y_test: y_test
        word_to_id: {dict} words mapped to indices
    """
    INDEX_FROM = 3  # word index offset

    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top vocab_size words
        train_set, test_set = imdb.load_data(num_words=NUM_WORDS,
                                             index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]

        word_to_id = imdb.get_word_index()
        word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        id_to_word = {value: key for key, value in word_to_id.items()}

        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train + n_valid]
        y_test = y[n_train:n_train + n_valid]

        # embeddings = load_glove_embeddings(
        #     "../../GloVe/glove.6B.50d.txt", word_to_id, 50)
        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id
    else:
        from keras.datasets import reuters
        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]

        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
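# Usage sketch (binary branch): pull one batch from the returned DataLoader.
train_loader, x_test_pad, y_test, word_to_id = load_data_set(
    type=0, max_len=400, vocab_size=5000, batch_size=32)
for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)  # torch.Size([32, 400]), torch.Size([32])
    break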
from keras.datasets import imdb

vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(
    len(X_train), len(X_test)))

print('---review---')
# a review is stored as a sequence of integers: word IDs pre-assigned to
# individual words
print(X_train[6])
print('---label---')
# a label is an integer (0 for negative, 1 for positive)
print(y_train[6])

word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

print('Maximum review length: {}'.format(
    len(max((X_train + X_test), key=len))))
print('Minimum review length: {}'.format(
    len(min((X_train + X_test), key=len))))

from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size = 32
model = Sequential()
def section3pt4():
    print('\n##############################')
    print('starting: section3pt4()')

    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
        num_words=10000)
    print('\ntrain_data.shape')
    print(train_data.shape)
    print('\ntrain_labels.shape')
    print(train_labels.shape)
    print('\ntest_data.shape')
    print(test_data.shape)
    print('\ntest_labels.shape')
    print(test_labels.shape)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    max_train_data = max([max(sequence) for sequence in train_data])
    print('\nmax_train_data')
    print(max_train_data)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # Note that the indices are offset by 3 because 0, 1, and 2 are reserved
    # indices for "padding", "start of sequence", and "unknown".
    print('\ndecoded_review')
    print(decoded_review)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')
    print('\nx_train.shape')
    print(x_train.shape)
    print('\ny_train.shape')
    print(y_train.shape)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print('\n### starting: fitting model with 4 epochs ...')
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    print('\nmodel.summary()')
    print(model.summary())
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=4, batch_size=512)
    results = model.evaluate(x_test, y_test)
    print('\nresults (4 epochs)')
    print(results)
    predictions = model.predict(x_test)
    print('\npredictions (4 epochs)')
    print(predictions)
    print('\n### finished: fitting model with 4 epochs')
    print('\n')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # return( None )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(
        # optimizer = optimizers.RMSprop(lr=0.001),
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy']
        # metrics = [metrics.binary_accuracy]
    )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]
    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]
    fitting_history = model.fit(partial_x_train,
                                partial_y_train,
                                verbose=2,
                                epochs=20,
                                batch_size=512,
                                validation_data=(x_val, y_val))

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    fitting_history_dict = fitting_history.history
    print('\nfitting_history_dict.keys()')
    print(fitting_history_dict.keys())

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    loss_values = fitting_history_dict['loss']
    val_loss_values = fitting_history_dict['val_loss']
    epochs = range(1, len(loss_values) + 1)

    outputFILE = 'plot-train-validation-loss.png'
    plt.plot(epochs, loss_values, 'bo', label='Training loss')
    plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(fname=outputFILE, dpi=600, bbox_inches='tight', pad_inches=0.2)
    plt.clf()

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    acc_values = fitting_history_dict['acc']
    val_acc_values = fitting_history_dict['val_acc']

    outputFILE = 'plot-train-validation-accuracy.png'
    plt.plot(epochs, acc_values, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc_values, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(fname=outputFILE, dpi=600, bbox_inches='tight', pad_inches=0.2)
    plt.clf()

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print('\nexiting: section3pt4()')
    print('##############################')
    return None