def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
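A minimal usage sketch for the loader above, assuming the Keras imports it relies on are in scope; the module-level max_words and sequence_length values shown here are illustrative assumptions, not taken from the original module:

max_words = 5000          # assumed vocabulary size used by load_data above
sequence_length = 400     # assumed padded review length

x_train, y_train, x_test, y_test, vocabulary_inv = load_data("keras_data_set")
print(x_train.shape, y_train.shape)   # e.g. (25000, 400) (25000,)
print(vocabulary_inv[1])              # index 1 is the most frequent word ("the")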
Example No. 2
def backToWords(comment):
	word_index = imdb.get_word_index()  # mapping from every word to its integer index
	reverse_word_index = dict(
		[(value, key) for (key, value) in word_index.items()]  # invert it: key=integer index, value=word
	)
	decoded_review = ' '.join(
		[reverse_word_index.get(i - 3, '?') for i in comment]  # offset by 3 for the reserved pad/start/unknown indices
	)
	return decoded_review
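A quick check of the decoder above, assuming the reviews come from imdb.load_data() with its default index_from=3 offset (which is why the lookup uses i - 3):

(x_train, y_train), _ = imdb.load_data(num_words=10000)
print(backToWords(x_train[0]))   # prints the first training review as plain text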
Example No. 3
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
Example No. 4
def get_vectors_from_text(dataset_list, word_to_ind=imdb.get_word_index(),
                          start_char=1,
                          index_from=3,
                          maxlen=400,
                          num_words=5000,
                          oov_char=2, skip_top=0):
    '''
    Gets the list of vectors mapped according to the word-to-indices dictionary.

    @param
        dataset_list = list of review texts in unicode format
        word_to_ind = word to indices dictionary
        hyperparameters: start_char --> index used to mark the start of a sequence.
                         index_from --> word indices are offset by this amount; indices below it are reserved.
                         maxlen --> maximum length of the sequence to be considered.
                         num_words --> number of words to keep, by frequency rank (rank is
                                       assigned according to frequency of occurrence).
                         oov_char --> out-of-vocabulary character.
                         skip_top --> number of top-ranked words to be skipped.
    @returns:
        x_train: final list of vectors (as lists) of the review texts
    '''
    x_train = []
    for review_string in dataset_list:
        review_string_list = text_to_word_sequence(review_string)

        x_predict = []
        for word in review_string_list:
            if word not in word_to_ind:
                continue
            x_predict.append(word_to_ind[word])
        x_train.append(x_predict)
    # add the start char and apply the index_from offset
    if start_char is not None:
        x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
    elif index_from:
        x_train = [[w + index_from for w in x] for x in x_train]
    # truncate each sequence to maxlen
    x_train = [ele[:maxlen] for ele in x_train]
    # if num_words is not given, infer it from the data
    if not num_words:
        num_words = max([max(x) for x in x_train])
    # by convention, use 2 as the OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        x_train = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in x_train]
    else:
        x_train = [[w for w in x if (skip_top <= w < num_words)] for x in x_train]
    # pad the sequences
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    # return the vector form of the text
    return x_train
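A small usage sketch, assuming text_to_word_sequence and sequence are imported from keras.preprocessing as the function above expects; the review strings are made up for illustration:

reviews = [u"this movie was a pleasant surprise", u"dull plot and terrible acting"]
vectors = get_vectors_from_text(reviews, maxlen=400, num_words=5000)
print(vectors.shape)   # (2, 400) padded index sequences, ready for model.predict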
Example No. 5
def imdb_word_dic() -> (Dict[str, int], Dict[int, str]):
    """
    forms the dictionary of word2index and index2word
    """
    # A dictionary mapping words2index
    word2index = imdb.get_word_index()

    # The first indices are reserved
    word2index = {k: (v + 2) for k, v in word2index.items()}
    word2index["<PAD>"] = 0
    word2index["<START>"] = 1
    word2index["<UNK>"] = 2  # unknown

    # index2word
    index2word = dict([(value, key) for (key, value) in word2index.items()])

    return word2index, index2word
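A round-trip sketch for the dictionaries above. Note that the +2 offset only matches reviews loaded with imdb.load_data(index_from=2); with the Keras default index_from=3 every decoded word would be off by one rank:

word2index, index2word = imdb_word_dic()
(x_train, _), _ = imdb.load_data(num_words=10000, index_from=2)  # match the +2 offset above
print(' '.join(index2word.get(i, "<UNK>") for i in x_train[0][:20]))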
Example No. 6
def main(epochs,
         batch_size=32,
         unique_word_count=5000,
         max_word_count=400,
         seed=124):
    # Load imdb data
    (words_train, labels_train), (words_test, labels_test) = load_data(
        unique_word_count, max_word_count)

    # Create model
    model = create_model_cnn("first-cnn", unique_word_count, max_word_count)

    # Create map from words to their equivalent vectors
    embeddings = model.layers[0].get_weights()[0]
    word_to_token = imdb.get_word_index()
    word_to_embedding = {
        word: embeddings[token]
        for word, token in word_to_token.items() if token < embeddings.shape[0]
    }

    # Train model on all data `epochs` times
    train(model, epochs, batch_size)

    # Test model
    positive_test = vectorize_word_list(
        ["basically", "getting", "action", "right", "from", "the", "start"],
        word_to_embedding)
    negative_test = vectorize_word_list(["poor", "terrible", "awful"],
                                        word_to_embedding)
    print(positive_test)
    positive_test = sequence.pad_sequences(positive_test,
                                           maxlen=max_word_count,
                                           dtype=np.float32)
    negative_test = sequence.pad_sequences(negative_test,
                                           maxlen=max_word_count,
                                           dtype=np.float32)
    print(positive_test)
    print("Positive: ", model.predict(positive_test))
    print("Negative: ", model.predict(negative_test))

    imdb_test = words_test[0:5, :]
    print(imdb_test.shape)
    print("IMDB[0]:", vector_to_word_list(imdb_test, word_to_embedding),
          model.predict(imdb_test))
    return
Example No. 7
def load_text(filename, idim):
    data = []
    print("\n", filename, ":")
    with open(filename, 'r') as file:
        for line in file.readlines():
            print(line)
            data += [
                w.strip(''.join(['.', ',', ':', ';', '!', '?', '(',
                                 ')'])).lower() for w in line.strip().split()
            ]
    index = imdb.get_word_index()
    x_test = []
    for w in data:
        if w in index and index[w] < idim:
            x_test.append(index[w])
    x_test = vectorize([np.array(x_test)], idim)
    return x_test
Example No. 8
def load_file(filename):
    with open(filename, 'r') as file:
        text = file.read().lower()
    text = re.sub(r"[^a-z0-9' ]", "", text)
    text = text.split(" ")

    index = imdb.get_word_index()
    coded = [1]  # 1 is the reserved start-of-sequence index
    for word in text:
        num = index.get(word, 0)
        if num != 0:
            num += 3  # offset for the reserved pad/start/unknown indices
        if num > top_words:
            num = 2  # out-of-vocabulary index
        coded.append(num)

    return coded
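A sketch of how the encoder above might feed a trained classifier; top_words, the padding length, and model are assumptions standing in for whatever the surrounding training script defined:

from keras.preprocessing import sequence

top_words = 5000          # assumed vocabulary limit (the global used by load_file)
coded = load_file('review.txt')
padded = sequence.pad_sequences([coded], maxlen=500)   # same maxlen as at training time
# print(model.predict(padded))   # model would be the previously trained Keras model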
Example No. 9
def load_data():
    """
    Load data if data have been created.
    Create data otherwise.

    """
    if 'data' not in os.listdir('.'):
        os.mkdir('data')

    if 'id_to_word.pkl' not in os.listdir('data'):
        print('Loading data...')
        (x_train, y_train), (x_val,
                             y_val) = imdb.load_data(num_words=max_features,
                                                     index_from=3)
        word_to_id = imdb.get_word_index()
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        id_to_word = {value: key for key, value in word_to_id.items()}

        print(len(x_train), 'train sequences')
        print(len(x_val), 'test sequences')

        print('Pad sequences (samples x time)')
        x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
        x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
        y_train = np.eye(2)[y_train]
        y_val = np.eye(2)[y_val]

        np.save('./data/x_train.npy', x_train)
        np.save('./data/y_train.npy', y_train)
        np.save('./data/x_val.npy', x_val)
        np.save('./data/y_val.npy', y_val)
        with open('data/id_to_word.pkl', 'wb') as f:
            pickle.dump(id_to_word, f)

    else:
        x_train, y_train, x_val, y_val = (np.load('data/x_train.npy'),
                                          np.load('data/y_train.npy'),
                                          np.load('data/x_val.npy'),
                                          np.load('data/y_val.npy'))
        with open('data/id_to_word.pkl', 'rb') as f:
            id_to_word = pickle.load(f)

    return x_train, y_train, x_val, y_val, id_to_word
Example No. 10
def load_data(data_source):
    # global sequence_length
    assert data_source in ["keras_data_set", "local_dir",
                           "pickle"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test,
                             y_test) = imdb.load_data(num_words=max_words,
                                                      start_char=None,
                                                      oov_char=None,
                                                      index_from=None)
        x_train = sequence.pad_sequences(x_train,
                                         maxlen=sequence_length,
                                         padding="post",
                                         truncating="post")
        x_test = sequence.pad_sequences(x_test,
                                        maxlen=sequence_length,
                                        padding="post",
                                        truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"

    # elif data_source == "pickle":
    #     vocabulary_inv = pickle.load(open(".models/vocabulary.p","rb"))
    #     return "","","","",vocabulary_inv
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {
            key: value
            for key, value in enumerate(vocabulary_inv_list)
        }
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
Example No. 11
def sent_anly_prediction():
    if request.method == 'POST':
        text = request.form['text']
        sentiment = ''
        max_review_length = 500
        word_to_id = imdb.get_word_index()
        strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
        text = text.lower().replace("<br />", " ")
        text = re.sub(strip_special_chars, "", text.lower())

        words = text.split()  #split string into a list
        x_test = [[
            word_to_id[word] if
            (word in word_to_id and word_to_id[word] <= 20000) else 0
            for word in words
        ]]
        x_test = sequence.pad_sequences(
            x_test,
            maxlen=500)  # Should be same which you used for training data
        vector = np.array([x_test.flatten()])
        with graph.as_default():
            # perform the prediction
            probability = model.predict(array([vector][0]))[0][0]
            class1 = model.predict_classes(array([vector][0]))[0][0]
        if class1 == 0:
            sentiment = 'Negative'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Sad_Emoji.png')
        else:
            sentiment = 'Positive'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Smiling_Emoji.png')
    return render_template('home.html',
                           text=text,
                           sentiment=sentiment,
                           probability=probability,
                           image=img_filename)
Example No. 12
def main():
    (train_data, train_labels), (test_data,
                                 test_labels) = imdb.load_data(num_words=10000)

    print(train_data[0])

    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])

    #decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    #print(decoded_review)

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)

    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]

    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=20,
                        batch_size=512,
                        validation_data=(x_val, y_val))

    predictions = model.predict(x_test)

    print(predictions)

    return
Example No. 13
def encode_review(rev):
    word_index = imdb.get_word_index()  # fetch the word-to-index mapping once, not once per word
    res = []
    for i, el in enumerate(rev):
        el = el.lower()
        delete_el = [',', '!', '.', '?']
        for d_el in delete_el:
            el = el.replace(d_el, '')
        el = el.split()
        for j, word in enumerate(el):
            code = word_index.get(word)
            if code is None or code >= 10000:
                code = 0
            el[j] = code
        res.append(el)
    for i, r in enumerate(res):
        res[i] = sequence.pad_sequences([r], maxlen=MAX_REVIEW_LENGTH)
    res = np.array(res)
    return res.reshape((res.shape[0], res.shape[2]))
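Usage sketch, assuming MAX_REVIEW_LENGTH is the module-level constant the function relies on (the value 500 is illustrative):

MAX_REVIEW_LENGTH = 500   # assumed padding length
batch = encode_review(["What a wonderful film!", "Awful, I want my money back."])
print(batch.shape)        # (2, 500), ready for model.predict(batch)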
Example No. 14
def preinfo():
    # a label of 1 means a positive review, 0 means a negative one
    print(train_data[0])
    print(train_labels[0])

    # word_index is a hash table mapping each word (key) to its frequency rank (value)
    word_index = imdb.get_word_index()
    # invert the mapping so the key is the rank and the value is the word
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    '''
    In train_data the values 0, 1 and 2 do not correspond to words; they are reserved for
    "padding", "start of sequence" and "unknown", and the word indices are offset by 3.
    Real words therefore start at index 4, which is why we subtract 3 below: an index of 4
    stands for the most frequent word.
    '''
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    print(decoded_review)
def saved_model(path, inpVal, MaxLen):
    model = load_model(path)
    wordIndex = imdb.get_word_index()
    words = inpVal.split()
    review = []
    for word in words:
        if word not in wordIndex:
            review.append(2)
        else:
            review.append(wordIndex[word] + 3)

    review = sequence.pad_sequences([review], maxlen=MaxLen)
    result = model.predict(review)
    print('Prediction (0 = negative, 1 = positive) = ', end=" ")
    print("%0.4f" % result[0][0])
    del model
    bkn.clear_session()
    return result
Example No. 16
def classify_review(review):
    maxlen = 100
    model = load_model('model.h5')
    d = imdb.get_word_index()
    words = review.split()
    review = []
    for word in words:
        if word not in d:
            review.append(2)
        else:
            review.append(d[word] + 3)

    review = sequence.pad_sequences([review],
                                    truncating='pre',
                                    padding='pre',
                                    maxlen=maxlen)
    prediction = model.predict(review)
    return prediction[0][0]
Example No. 17
def explore():
    print(train_data.shape)
    print(train_labels.shape)

    print(test_data.shape)
    print(test_labels.shape)

    print(train_data[0])
    print(train_labels[0])

    print(max(max(sequence) for sequence in train_data))

    # word_index is a dictionary mapping: word -> int indices
    word_index = imdb.get_word_index()
    # reversing it, mapping becomes int indices -> word
    reversed_word_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded_review = ' '.join(reversed_word_index.get(i-3, '?') for i in train_data[0])
    print(decoded_review)
Example No. 18
def Preparing_string(text_string, dimension=40):

    text_string = text_string.lower()
    table = str.maketrans(dict.fromkeys(string.punctuation))
    text_string = text_string.translate(table)

    word2index = imdb.get_word_index()
    test = []
    for word in word_tokenize(text_string):
        if word in word2index:  # skip words that the IMDB index does not know
            test.append(word2index[word])
    print(text_string)
    print(test)

    out = np.zeros(dimension)
    for index in test:
        if index < dimension:
            out[index] = 1
    print("\nOutput:", out)
    return out
Example No. 19
def predict_score(model, review_text, word_to_ind=imdb.get_word_index()):
    '''
    Predict the sentiment score for the review text.

    @param
        model:SequentialModel which we trained the data on
        review_text:Review text to be predicted on  
        word_to_ind: dictionary mapping of words to indices
    @returns
        sentiment score on the review text.
    '''
    # convert review text into vector
    x_predict = get_vectors_from_text([review_text], word_to_ind)[0]

    # reshape x_predict
    x_predict = np.reshape(x_predict, (1, len(x_predict)))

    return model.predict(x_predict)[0][0]
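Example call, assuming trained_model is a Keras model fitted on sequences produced by get_vectors_from_text with the same maxlen=400 default:

score = predict_score(trained_model, u"a surprisingly warm and funny film")
print('positive' if score > 0.5 else 'negative', score)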
Example No. 20
    def encodeData(self, x_test):

        print("Encoding data...")

        word_indices = imdb.get_word_index()
        reviews = []
        for doc in x_test:
            review = []
            for word in doc:
                if word not in word_indices:
                    review.append(2)
                else:
                    review.append(word_indices[word] + 3)
            review.sort(reverse=True)
            reviews.append(review)

        print("Encoding done...!!!")
        return reviews
Example No. 21
def load_data(size=0.2):
    # load the dataset
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    X = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    X, y = shuffle(X, y, random_state=42)

    vocab_size = len(imdb.get_word_index())

    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=size,
                                                        random_state=42)

    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)

    return x_train, x_test, y_train, y_test
Example No. 22
def multi_dataset_test():
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = boston_housing.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        cifarDefaultTrainLength = 50000
        cifarDefaultTestLength = 10000
        assert len(x_train) == len(y_train) == cifarDefaultTrainLength
        assert len(x_test) == len(y_test) == cifarDefaultTestLength

        (x_train, y_train), (x_test, y_test) = cifar100.load_data('fine')
        cifarFineTrainLength = 50000
        cifarFineTestLength = 10000
        assert len(x_train) == len(y_train) == cifarFineTrainLength
        assert len(x_test) == len(y_test) == cifarFineTestLength

        (x_train, y_train), (x_test, y_test) = cifar100.load_data('coarse')
        cifarCoarseTrainLength = 50000
        cifarCoarseTestLength = 10000
        assert len(x_train) == len(y_train) == cifarCoarseTrainLength
        assert len(x_test) == len(y_test) == cifarCoarseTestLength
Example No. 23
def text_load():
    dictionary = dict(imdb.get_word_index())
    test_x = []
    test_y = np.array(answers)
    for string in strings:
        words = string.replace(',', ' ').replace('.', ' ').replace(
            '?', ' ').replace('\n', ' ').split()
        num_words = []
        for word in words:
            word = dictionary.get(word)
            if word is not None and word < 10000:
                num_words.append(word)
        test_x.append(num_words)
    test_x = [vectorize(test_x)]
    model = build_model(10000)
    (train_x, train_y), (s1, s2) = prepare_data(10000)
    model.fit(train_x, train_y, epochs=2, batch_size=500)
    predictions = model.predict(test_x)
    print(predictions)
Example No. 24
def my_form_post():
    tb._SYMBOLIC_SCOPE.value = True  # resolved error

    itext = request.form['text']

    words = re.sub(r"[^\w]", " ", itext).split()

    INDEX_FROM = 3  # word index offset

    # import os, ssl
    # if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
    #         getattr(ssl, '_create_unverified_context',
    #                 None)): ssl._create_default_https_context = ssl._create_unverified_contex

    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    id_to_word = {value: key for key, value in word_to_id.items()}

    x_ireview = [[word_to_id.get(i, 2) for i in words]]

    tokenizer = Tokenizer(num_words=1000)

    x_predict = tokenizer.sequences_to_matrix(x_ireview, mode='binary')

    # ynew =loaded_model.predict_proba(x_predict)
    ynew = sentiment_predict.sentiment_predict(x_predict)
    #
    if ynew[0, 1] < 0.5:
        isentiment = ':('
    else:
        isentiment = ':)'

    # return ynew, prednr
    processed_text = ynew

    # return processed_text
    return render_template('my-form.html',
                           nnoutcome=processed_text,
                           isentiment=isentiment,
                           itext=itext)
Example No. 25
def main():
    (train_data, train_labels), (test_data,
                                 test_labels) = imdb.load_data(num_words=10000)
    word_index = imdb.get_word_index()
    reverse_words_index = dict([(v, k) for k, v in word_index.items()])
    decoded_review = ' '.join(
        [reverse_words_index.get(i - 3, '?') for i in train_data[0]])
    print(decoded_review)

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]
    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]
    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=4,
                        batch_size=512,
                        validation_data=(x_val, y_val))
    test_loss, test_acc = model.evaluate(x_test, y_test)
    print('test_acc:', test_acc)

    history_dict = history.history
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']

    plot_loss(loss_values, val_loss_values)

    acc_values = history_dict['acc']
    val_acc_values = history_dict['val_acc']
    plot_acc(acc_values, val_acc_values)
Example No. 26
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"

        print("length of the vocab inv is ",len(vocabulary_inv))

    else:
        # only the Keras dataset branch is implemented in this example
        raise ValueError("no data available for data source: " + data_source)

    return x_train, y_train, x_test, y_test, vocabulary_inv
Example No. 27
def train():
    (train_data, train_labels), (test_data,
                                 test_labels) = imdb.load_data(num_words=10000)

    # word_index is a dictionary mapping words to an integer index
    word_index = imdb.get_word_index()
    # We reverse it, mapping integer indices to words
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    # We decode the review; note that our indices were offset by 3
    # because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])

    # Our vectorized training data
    x_train = vectorize_sequences(train_data)
    # Our vectorized test data
    x_test = vectorize_sequences(test_data)

    # Our vectorized labels
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    x_val = x_train[:10000]
    partial_x_train = x_train[10000:]

    y_val = y_train[:10000]
    partial_y_train = y_train[10000:]

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                  loss=losses.binary_crossentropy,
                  metrics=[metrics.binary_accuracy])

    return model.fit(partial_x_train,
                     partial_y_train,
                     epochs=20,
                     batch_size=512,
                     validation_data=(x_val, y_val))
Example No. 28
def sent_anly_prediction():
    if (request.method == 'POST'):
        text = request.form['text']
        sentiment = ''
        max_review_length = 500
        word_to_id = imdb.get_word_index()
        strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
        text = text.lower().replace("<br />", " ")
        text = re.sub(strip_special_chars, "", text.lower())

        words = text.split()  #split string into a list
        x_test = [[
            word_to_id[word] if
            (word in word_to_id and word_to_id[word] <= 20000) else 0
            for word in words
        ]]
        x_test = sequence.pad_sequences(
            x_test,
            maxlen=500)  # Should be same which you used for training data
        vector = np.array([x_test.flatten()])
        graph = tf.compat.v1.get_default_graph()

        with graph.as_default():
            model = load_model('sentimental_analysis_model_new.h5')
            probability = model.predict(array([vector][0]))[0][0]
            print("Probability is ", probability)
            class1 = model.predict_classes(array([vector][0]))[0][0]
            print("Class is ", class1)

        if class1 == 0:
            sentiment = 'Negative'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Sad_Emoji.png')
        else:
            sentiment = 'Positive'
            img_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                        'Smiling_Emoji.png')

    return render_template('home1.html',
                           text=text,
                           sentiment=sentiment,
                           probability=probability,
                           image=img_filename)
Example No. 29
def main():
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         seed=113,
                                                         start_char=2,
                                                         oov_char=1,
                                                         index_from=0)

    i2w = {w_id: w for w, w_id in imdb.get_word_index().items()}

    with open('train.csv', mode='w', encoding='utf-8') as f:
        for i in range(x_train.shape[0]):
            line = ' '.join([i2w[w_id] for w_id in x_train[i]][1:])
            f.write('{0}, {1}\n'.format(line, y_train[i]))

    with open('test.csv', mode='w', encoding='utf-8') as f:
        for i in range(x_test.shape[0]):
            line = ' '.join([i2w[w_id] for w_id in x_test[i]][1:])
            f.write('{0}, {1}\n'.format(line, y_test[i]))
def Preparing_string(text_string, dimension=TOP_WORDS):
    text_string = text_string.lower()
    table = str.maketrans(dict.fromkeys(string.punctuation))
    text_string = text_string.translate(table)

    word2index = imdb.get_word_index()
    test = []
    for word in word_tokenize(text_string):
        if word in word2index:  # skip words missing from the IMDB index
            test.append(word2index[word])

    results = np.zeros(dimension)
    for _, sequence in enumerate(test):
        if sequence < dimension:
            results[sequence] = 1

    print("\nOriginal string:", text_string, "\n")
    print("\nIndex conversion:", test, "\n")
    results = np.reshape(results, (1, TOP_WORDS))
    print("\nConvert to vectors:", results, "\n")
    return results
def _prepare_index_to_word(preview=0, top_words=5000):
    word_to_index = imdb.get_word_index()
    index_to_word = dict()
    for word, index in word_to_index.items():
        if top_words is not None and index > top_words:
            continue
        index_to_word[index + len(SpecialConstants)] = word

    index_to_word[SpecialConstants.PADDING.value] = SpecialConstants.PADDING
    index_to_word[SpecialConstants.START.value] = SpecialConstants.START
    index_to_word[SpecialConstants.OUT_OF_VOCABULARY.
                  value] = SpecialConstants.OUT_OF_VOCABULARY

    assert top_words is None or len(
        index_to_word) == top_words + len(SpecialConstants)

    for index, word in list(index_to_word.items())[:preview]:
        print(index, ':', word)

    return index_to_word
def decode_sentence(sentence):
    word_index = imdb.get_word_index()
    '''
        Convert the text into a sequence of word tokens.
    '''
    sentence = text.text_to_word_sequence(
        sentence, filters='!"#$%&()*+,-./:;?@[\\]^_`{|}~\t\n', lower=True)
    '''
        Convert the word list into a numpy array of word indexes,
        with 0 for words unknown to the IMDB index.
    '''
    sentence = np.array(
        [word_index[word] if word in word_index else 0 for word in sentence])
    print("sentence", sentence)
    sentence[sentence > 5000] = 2
    l = 500 - len(sentence)
    sentence = np.pad(sentence, (l, 0), 'constant')
    sentence = sentence.reshape(1, -1)
    return sentence
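Usage sketch for the preprocessor above (despite its name it encodes text rather than decoding it), assuming the Keras preprocessing imports the function relies on; model is an assumed classifier trained on 500-step, 5000-word IMDB sequences:

encoded = decode_sentence("the film was slow but the ending made it worth watching")
print(encoded.shape)             # (1, 500)
# print(model.predict(encoded))  # model is the assumed, previously trained classifier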
Example No. 33
def lecture_du_jeu_de_imdbkeras():
    max_words = 20000
    x_train_imdb = []
    y_train_imdb = []
    (x_imdb, y_imdb), (x_test_imdb,
                       y_test_imdb) = imdb.load_data(num_words=max_words,
                                                     maxlen=300,
                                                     seed=113)

    #reconstruction
    wordDict = {y: x for x, y in imdb.get_word_index().items()}
    for doc in x_imdb:
        sequence = ""
        for index in doc:
            sequence += " " + wordDict.get(index)
        x_train_imdb.append(sequence)
    for i in y_imdb:
        y_train_imdb.append(str(i))

    return x_train_imdb, y_train_imdb
Example No. 34
def get_imdb_corpus():
    (x_train,
     y_train), (x_test,
                y_test) = imdb.load_data()  # num_words = vocabulary_size

    word2id = {
        word: (i + INDEX_FROM)
        for word, i in imdb.get_word_index().items()
    }
    id2word = {i: word for word, i in word2id.items()}

    train_samples = [[id2word.get(i, '<<UNKNOWN>>') for i in sample if i >= 2]
                     for sample in x_train]
    train_tags = y_train

    test_samples = [[id2word.get(i, '<<UNKNOWN>>') for i in sample if i >= 2]
                    for sample in x_test]
    test_tags = y_test

    return (train_samples, train_tags), (test_samples, test_tags)
def highlight_attention_words(model, in_seq):
    aux = imdb.get_word_index()
    words = {}
    for x in aux:
        words[aux[x]] = x
    sentence = ''
    for x in in_seq:
        if x >= 3 and x - 3 in words:
            sentence += words[x - 3] + ' '
    print(sentence)
    print("")
    print("Top worlds:")
    x = model.predict(np.expand_dims(in_seq, 0))[0]
    sol = []
    for i in range(0, len(in_seq)):
        if in_seq[i] >= 3:
            sol.append((x[i], in_seq[i]))
    sol.sort(reverse=True)
    for i in range(0, 10):
        score = sol[i][0]
        x = sol[i][1]
        print(words[x - 3], score)
Example No. 36

import keras
# print(keras.__version__)
# load the IMDB dataset
from keras.datasets import imdb

# keep only the 10,000 most frequent words; rarer words are discarded
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# the maximum word index does not exceed 10000
print(max([max(sequence) for sequence in train_data]))

# decode a review back into English words
# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_review)


# preparing the data
import numpy as np
# multi-hot encoding of the integer sequences
def vectorize_sequences(sequences, dimension=10000):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.  # set the positions of the words present in this review to 1
    return results
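With the helper completed, the reviews can be turned into fixed-size multi-hot vectors, as the other examples on this page do:

x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
print(x_train.shape)   # (25000, 10000)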
def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads the dataset: the Keras IMDB dataset for binary classification,
    or the Keras Reuters dataset for multiclass classification.

    Args:
        type      : {bool} 0 for binary classification (IMDB), 1 for multiclass classification (Reuters)
        max_len   : {int} timesteps used for padding
        vocab_size: {int} size of the vocabulary
        batch_size: batch size
    Returns:
        train_loader: {torch.DataLoader} train dataloader
        x_test_pad  : padded, tokenized test data for cross validation
        y_test      : test labels
        word_to_id  : {dict} words mapped to indices
    """

    INDEX_FROM = 3  # word index offset
    if not bool(type):
        NUM_WORDS = vocab_size  # only keep the top `vocab_size` words

        train_set, test_set = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = imdb.get_word_index()
        word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
 
        id_to_word = {value:key for key,value in word_to_id.items()}
        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
 
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train+n_valid]
        y_test = y[n_train:n_train+n_valid]
 
 
        #embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt",word_to_id,50)
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,x_test_pad,y_test,word_to_id
       
    else:
        from keras.datasets import reuters
 
        train_set,test_set = reuters.load_data(path="reuters.npz",num_words=vocab_size,skip_top=0,index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k:(v+3) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value:key for key,value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,train_set,test_set,x_test_pad,word_to_id
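A hedged usage sketch for the binary branch above, assuming torch, torch.utils.data as data_utils, pad_sequences and the Keras imdb module are imported as the function expects; the sizes are illustrative:

train_loader, x_test_pad, y_test, word_to_id = load_data_set(
    type=0, max_len=400, vocab_size=10000, batch_size=32)
for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)   # torch.Size([32, 400]) torch.Size([32])
    break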
Example No. 38
from keras.datasets import imdb
vocabulary_size = 5000
(X_train, y_train),(X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

print('---review---')
print(X_train[6])  # review is stored as a sequence of integers. These are word IDs that have been pre-assigned to individual words
print('---label---')
print(y_train[6]) #label is an integer (0 for negative, 1 for positive).

word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

print('Maximum review length: {}'.format(len(max(list(X_train) + list(X_test), key=len))))

print('Minimum review length: {}'.format(len(min(list(X_train) + list(X_test), key=len))))


from keras.preprocessing import sequence
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
embedding_size=32
model=Sequential()
Example No. 39
def section3pt4():

    print('\n##############################')
    print('starting: section3pt4()')

    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

    print('\ntrain_data.shape')
    print(   train_data.shape )

    print('\ntrain_labels.shape')
    print(   train_labels.shape )

    print('\ntest_data.shape')
    print(   test_data.shape )

    print('\ntest_labels.shape')
    print(   test_labels.shape )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    max_train_data = max([max(sequence) for sequence in train_data])
    print('\nmax_train_data')
    print(   max_train_data )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # Note that the indices are offset by 3 because 0, 1, and 2 are reserved
    # indices for “padding,” “start of sequence,” and “unknown.”
    print('\ndecoded_review')
    print(   decoded_review )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    x_train = vectorize_sequences(train_data)
    x_test  = vectorize_sequences( test_data)

    y_train = np.asarray(train_labels).astype('float32')
    y_test  = np.asarray( test_labels).astype('float32')

    print('\nx_train.shape')
    print(   x_train.shape )

    print('\ny_train.shape')
    print(   y_train.shape )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print('\n### starting: fitting model with 4 epochs ...')

    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense( 1, activation='sigmoid'))

    print('\nmodel.summary()')
    print(   model.summary() )

    model.compile(
        optimizer = 'rmsprop',
        loss      = 'binary_crossentropy',
        metrics   = ['accuracy']
        )

    model.fit(x_train, y_train, epochs = 4, batch_size = 512)

    results = model.evaluate(x_test, y_test)
    print('\nresults (4 epochs)')
    print(   results )

    predictions = model.predict(x_test)
    print('\npredictions (4 epochs)')
    print(   predictions )

    print('\n### finished: fitting model with 4 epochs')
    print('\n')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # return( None )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    model = models.Sequential()

    model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense( 1, activation='sigmoid'))

    model.compile(
        #optimizer = optimizers.RMSprop(lr=0.001),
        optimizer  = 'rmsprop',
        loss       = 'binary_crossentropy',
        metrics    = ['accuracy']
        # metrics  = [metrics.binary_accuracy]
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    x_val           = x_train[:10000]
    partial_x_train = x_train[10000:]

    y_val           = y_train[:10000]
    partial_y_train = y_train[10000:]

    fitting_history = model.fit(
        partial_x_train,
        partial_y_train,
        verbose         =   2,
        epochs          =  20,
        batch_size      = 512,
        validation_data = (x_val, y_val)
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    fitting_history_dict = fitting_history.history
    print('\nfitting_history_dict.keys()')
    print(   fitting_history_dict.keys() )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    loss_values     = fitting_history_dict['loss']
    val_loss_values = fitting_history_dict['val_loss']

    epochs = range(1, len(loss_values) + 1)

    outputFILE = 'plot-train-validation-loss.png'
    plt.plot(epochs,     loss_values, 'bo', label='Training loss'  )
    plt.plot(epochs, val_loss_values, 'b',  label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(fname = outputFILE, dpi = 600, bbox_inches = 'tight', pad_inches = 0.2)
    plt.clf()

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    acc_values     = fitting_history_dict['acc']
    val_acc_values = fitting_history_dict['val_acc']

    outputFILE = 'plot-train-validation-accuracy.png'
    plt.plot(epochs,     acc_values, 'bo', label='Training accuracy'  )
    plt.plot(epochs, val_acc_values, 'b',  label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(fname = outputFILE, dpi = 600, bbox_inches = 'tight', pad_inches = 0.2)
    plt.clf()

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print('\nexiting: section3pt4()')
    print('##############################')

    return( None )