Example #1
def get_data():
    # Load the raw IMDB texts; the first 25,000 documents form the training split.
    with open(data_dir + 'texts.pkl', 'rb') as f:
        texts = pickle.load(f)
    # Fit the tokenizer on the training split only, keeping punctuation (no filters).
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[:25000])
    sequences = tokenizer.texts_to_sequences(texts)

    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)

    x = pad_sequences(sequences, maxlen=max_len)
    x_train = x[:25000]
    x_test = x[25000:]
    # In each 25k split, the first 12,500 examples are labeled 0 and the last 12,500 are labeled 1.
    y_train = np.zeros((25000, ), dtype=np.float32)
    y_test = np.zeros((25000, ), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500, ), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500, ), dtype=np.float32)

    return x_train, y_train, x_test, y_test
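Example #1 (like most of the snippets below) relies on module-level names such as data_dir, num_words and max_len, plus imports, that are defined elsewhere in the original project. A minimal sketch of the setup it appears to assume follows; the concrete values are illustrative guesses, only the names come from the snippet itself.

import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Assumed module-level configuration (values are illustrative, not from the source)
data_dir = '../temp/imdb/'   # directory that contains texts.pkl
num_words = 30000            # vocabulary size kept by the Tokenizer
max_len = 500                # length that sequences are padded/truncated to

x_train, y_train, x_test, y_test = get_data()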
Example #2
def get_input():
    with open(root_dir + "temp/ag/extract_data/texts.pkl", 'rb') as f:
        texts = pickle.load(f)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[:num_train])
    print('there are %d words' % (len(tokenizer.word_index)))
    sequences = tokenizer.texts_to_sequences(texts)
    # print('average length is %d'%(np.sum([len(s) for s in sequences])/len(sequences)))
    new_text = []
    for seq in sequences:
        t = []
        for i in range(len(seq)):
            t.append('%d' % (seq[i]))
        for i in range(len(seq) - 1):
            t.append('%d_%d' % (seq[i], seq[i + 1]))
        # for i in range(len(seq) - 2):
        #     t.append('%d_%d_%d' % (seq[i], seq[i + 1], seq[i + 2]))
        new_text.append(' '.join(t))

    tokenizer2 = Tokenizer(num_words=num_ngram)
    tokenizer2.filters = ''
    tokenizer2.fit_on_texts(new_text[:num_train])
    sequences2 = tokenizer2.texts_to_sequences(new_text)
    print('there are %d ngrams' % (len(tokenizer2.word_index)))

    x = pad_sequences(sequences2, maxlen=max_len)
    x_train = x[:num_train]
    x_test = x[num_train:]
    y = np.load(root_dir + 'temp/ag/extract_data/label.npy')
    y_train = to_categorical(y[:num_train])
    y_test = to_categorical(y[num_train:])

    return x_train, y_train, x_test, y_test
Example #3
def prepare_train():
    print("prepare training data")
    f = codecs.open('../../../temp/imdb/keras_code/utils/texts.pkl', 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = codecs.open('../../../temp/imdb/keras_code/utils/texts_unsup.pkl',
                    'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)
    # Language-model style pairs: x_1 holds tokens 0..L-1 and y_ holds tokens 1..L,
    # so each target is the next token of its input.
    x_1 = sequence_pad[:, :-1]

    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_
Example #4
def prepare_train():
    print("prepare training data")
    f = codecs.open('../temp/texts.pkl', 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = codecs.open('../temp/texts_unsup.pkl', 'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')

    x_1 = sequence_pad[:, :-1]
    x_2 = np.arange(num_train)
    x_2 = np.reshape(x_2, (num_train, 1))
    y_ = sequence_pad[:, 1:]
    return x_1, x_2, y_
Example #5
def preserve_index_word():
    num_words = 30000
    max_len = 500
    num_train = 25000
    num_test = 25000
    f = codecs.open('../temp/texts.pkl', 'rb')
    texts = pickle.load(f)
    f.close()
    newText = []
    for sentence in texts:
        t = []
        words = WordPunctTokenizer().tokenize(sentence)
        for word in words:
            if word.isalpha():
                t.append(word)
        newText.append(' '.join(t))

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(newText[:num_train])
    sequences = tokenizer.texts_to_sequences(newText)
    word_index = tokenizer.word_index
    index_word = {}
    for word, index in word_index.items():
        index_word[index] = word
    # f = codecs.open('../temp/index_word.pkl', 'wb')
    # pickle.dump(index_word, f, 1)
    # f.close()
    return index_word, sequences
Example #6
def prepare_data():
    global num_ngram
    print("prepare training data")
    with open('../../../temp/imdb/keras_code/utils/texts.pkl', 'rb') as f:
        texts = pickle.load(f)[:25000]
    with open('../../../temp/imdb/keras_code/utils/texts_unsup.pkl', 'rb') as f:
        texts += pickle.load(f)

    with open('/home/yan/my_datasets/glove/words.pkl', 'rb') as f:
        glove_words = pickle.load(f)
        glove_words_set = set(glove_words)
    for i, s in enumerate(texts):
        new_s = []
        for token in s.split(' '):
            if token in glove_words_set:
                new_s.append(token)
        texts[i] = ' '.join(new_s)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index

    embeddings_index = {}
    wordX = np.load('/home/yan/my_datasets/glove/embedding.300d.npy')
    for i in range(len(glove_words)):
        embeddings_index[glove_words[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    all_ngram = set()
    for seq in sequences:
        for i in range(len(seq) - 2):
            all_ngram.add(tuple(seq[i:i + 3]))
    num_ngram = len(all_ngram)
    all_ngram = list(all_ngram)
    print('there are %d ngrams' % (num_ngram))

    x = np.zeros((num_ngram * 3, 3), dtype=np.int32)
    y = np.zeros((num_ngram * 3, 1), dtype=np.int32)
    index = 0
    for i, seq in enumerate(all_ngram):
        for id in seq:
            x[index] = seq
            y[index] = id
            index += 1
    indice = np.arange(num_ngram * 3)
    np.random.shuffle(indice)
    x = x[indice]
    y = y[indice]

    print(x.shape, y.shape)

    return x, y, embedding_matrix
Example #7
def prepare_data():
    print("prepare training data")
    global embedding_matrix
    f = codecs.open('../../../temp/imdb/keras_code/utils/texts.pkl', 'rb')
    text = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(text)
    sequence = tokenizer.texts_to_sequences(text)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    # prepare pretrained word2vec vectors
    # word_index = tokenizer.word_index
    # embeddings_index = {}
    # wordX = np.load('/media/yan/winD/docker/word2vec/words/word2vec.npy')
    # f = codecs.open('/media/yan/winD/docker/word2vec/words/wordsInWord2vec.pkl', 'rb')
    # allwords = pickle.load(f)
    # f.close()
    # for i in range(3000000):
    #     embeddings_index[''.join(allwords[i])] = wordX[i, :]
    # embedding_matrix = np.zeros((vocab_size, 300))
    # for word, i in word_index.items():
    #     embedding_vector = embeddings_index.get(word)
    #     if embedding_vector is not None and i < vocab_size:
    #         embedding_matrix[i] = embedding_vector

    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)

    x_train = sequence_pad[:25000]
    x_test = sequence_pad[25000:]
    x_train_len = np.asarray(seq_len[:25000])
    x_test_len = np.asarray(seq_len[25000:])
    y_train = np.zeros((25000, ), dtype=np.int8)
    y_test = np.zeros((25000, ), dtype=np.int8)
    y_train[12500:25000] = np.ones((12500, ), dtype=np.int8)
    y_test[12500:25000] = np.ones((12500, ), dtype=np.int8)

    indice = np.arange(25000)
    np.random.shuffle(indice)
    x_train = x_train[indice]
    x_test = x_test[indice]
    y_train = y_train[indice]
    y_test = y_test[indice]
    x_train_len = x_train_len[indice]
    x_test_len = x_test_len[indice]

    return x_train, x_test, y_train, y_test, x_train_len, x_test_len
Example #8
def prepare_data():
    print("prepare training data")
    with open('../../../temp/imdb/keras_code/utils/texts.pkl', 'rb') as f:
        text1 = pickle.load(f)
    with open('../../../temp/imdb/keras_code/utils/texts_unsup.pkl',
              'rb') as f:
        texts = text1[:25000] + pickle.load(f)

    with open('/home/yan/my_datasets/glove/words.pkl', 'rb') as f:
        glove_words = pickle.load(f)
        glove_words_set = set(glove_words)
    for i, s in enumerate(texts):
        new_s = []
        for token in s.split(' '):
            if token in glove_words_set:
                new_s.append(token)
        texts[i] = ' '.join(new_s)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(text1)
    sequences = pad_sequences(sequences, maxlen=max_len)
    word_index = tokenizer.word_index

    embeddings_index = {}
    wordX = np.load('/home/yan/my_datasets/glove/embedding.300d.npy')
    for i in range(len(glove_words)):
        embeddings_index[glove_words[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    # Lay each overlapping trigram out in three consecutive columns, which is
    # what the (max_len - 2) * 3 width allocated here is sized for (cf. Example #14).
    x = np.zeros((50000, (max_len - 2) * 3), dtype=np.int32)
    for i, seq in enumerate(sequences):
        for j in range(len(seq) - 2):
            x[i, j * 3:j * 3 + 3] = seq[j:j + 3]

    x_train = x[:25000]
    x_test = x[25000:]
    y_train = np.zeros((25000, ), dtype=np.float32)
    y_test = np.zeros((25000, ), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500, ), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500, ), dtype=np.float32)

    indice = np.arange(25000)
    np.random.shuffle(indice)
    x_train = x_train[indice]
    x_test = x_test[indice]
    y_train = y_train[indice]
    y_test = y_test[indice]

    return x_train, x_test, y_train, y_test, embedding_matrix
Example #9
    def load_data(path, vocab_size, maxlen, filters, ids=False, tknzr=None):
        '''
        Read corpus file, tokenize words and encode to sequences
        '''
        # Read the corpus file and append begin- and end-of-sentence tags
        print(' Reading data file')
        f = open(path, 'r')
        text = []
        if ids: # Open ids file
            # Change file extension to .ids
            name = '.'.join(path.split('.')[:-1]) + '.ids'
            idfile = open(name)
            idname = ''

        for line in f:
            text.append(BOS + ' ' + line[:-1] + ' ' + EOS)
            if ids: # Add context separator
                read = idfile.readline()
                if read != idname:
                    idname = read
                    text.append('<EOC>')
        f.close()

        # Create vocabulary
        if tknzr is None:
            print(' Generating vocabulary')
            tknzr = Tokenizer(num_words=vocab_size, lower=False, oov_token=OOV)
            if not filters:
                tknzr.filters = ''
            else:
                tknzr.filters = tknzr.filters.replace('<', '')  # need to keep the < and > tags
                tknzr.filters = tknzr.filters.replace('>', '')
            tknzr.fit_on_texts(text)
            print(' Word2idx len:',len(tknzr.word_index))

        # Create one_hot vectors
        print(' Creating one-hot vectors')
        data = tokenize(tknzr, text, maxlen)

        return data, tknzr
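The tokenize(tknzr, text, maxlen) helper called at the end of load_data is not shown in this snippet, and the surrounding prints suggest it may also build one-hot vectors. A plausible minimal version, assuming it only has to turn the tagged sentences into padded index sequences, might look like this (hypothetical, for illustration only):

def tokenize(tknzr, text, maxlen):
    # Hypothetical helper: encode the texts as integer index sequences and
    # pad/truncate them to a fixed length. The project's real helper may differ,
    # e.g. it may additionally one-hot encode the sequences.
    from keras.preprocessing.sequence import pad_sequences
    sequences = tknzr.texts_to_sequences(text)
    return pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')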
Example #10
def get_input():
    with open(root_dir + "temp/imdb/keras_code/utils/texts.pkl", 'rb') as f:
        texts = pickle.load(f)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[:25000])
    print('there are %d words' % (len(tokenizer.word_index)))
    # word_index=tokenizer.word_index
    # index_word=dict([(index,word) for word, index in word_index.items()])
    sequences = tokenizer.texts_to_sequences(texts)
    new_text = []
    for seq in sequences:
        t = []
        for i in range(len(seq)):
            t.append('%d' % (seq[i]))
        for i in range(len(seq) - 1):
            t.append('%d_%d' % (seq[i], seq[i + 1]))
        for i in range(len(seq) - 2):
            t.append('%d_%d_%d' % (seq[i], seq[i + 1], seq[i + 2]))
        new_text.append(' '.join(t))

    tokenizer2 = Tokenizer(num_words=num_ngram)
    tokenizer2.filters = ''
    tokenizer2.fit_on_texts(new_text[:25000])
    sequences2 = tokenizer2.texts_to_sequences(new_text)
    print('there are %d ngrams' % (len(tokenizer2.word_index)))

    x = pad_sequences(sequences2, maxlen=max_len)
    x_train = x[:25000]
    x_test = x[25000:]
    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)

    # print(x_train[0])
    return x_train, y_train, x_test, y_test
Example #11
def tokenize(x_train,
             y_train,
             x_test,
             y_test,
             max_nb_words=400000,
             max_sequence_length=26):
    """This function does the following:
        Apply tokenization on headlines
        Apply tokenization on holdout headlines 
        (input for both: MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)
    """

    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import np_utils
    import numpy as np

    print("Initializing Keras tokenizer, max. unique words allowed = %d" %
          max_nb_words)
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.filters = '#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n'
    tokenizer.fit_on_texts(x_train)
    print("Applying tokenizer on training corpus...")
    x_train = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    print('Padding training sequences to max length %d' % max_sequence_length)
    x_train = pad_sequences(x_train, maxlen=max_sequence_length)

    y_train = np_utils.to_categorical(np.asarray(y_train))
    print('Shape of training x_train tensor:', x_train.shape)
    print('Shape of training y_train tensor:', y_train.shape)
    """ Here starts a section dedicated to tokenizing new headlines on which the model has
    not been trained and which may thus contain words that are not in the vocabulary.
    These words will be removed from the new headline and a tokenization will be made
    on the sentence as if it does not contain those words.
    """
    print("Applying tokenizer on holdout corpus...")
    x_test = tokenizer.texts_to_sequences(x_test)
    print('Padding holdout sequences to max length %d' % max_sequence_length)
    x_test = pad_sequences(x_test, maxlen=max_sequence_length)
    y_test = np_utils.to_categorical(np.asarray(y_test))
    print('Shape of holdout x_test tensor:', x_test.shape)
    print('Shape of holdout y_test tensor:', y_test.shape)
    return (x_train, y_train, x_test, y_test, tokenizer)
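A minimal usage sketch for the function above, with made-up headlines and integer class labels (all data here is purely illustrative):

headlines_train = ['stocks rally after earnings beat', 'team clinches title in overtime']
labels_train = [0, 1]
headlines_test = ['markets slip on rate fears', 'striker signs record transfer deal']
labels_test = [0, 1]

x_tr, y_tr, x_te, y_te, tok = tokenize(headlines_train, labels_train,
                                       headlines_test, labels_test,
                                       max_nb_words=10000,
                                       max_sequence_length=26)
print(x_tr.shape, y_tr.shape)  # (2, 26) and (2, 2)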
Example #12
    def __init__(self,
                 num_words,
                 max_len,
                 filters='',
                 truncing='post',
                 padding='pre'):
        self.num_words = num_words
        self.max_len = max_len
        self.truncing = truncing
        self.padding = padding
        f = codecs.open('../../../temp/imdb/keras_code/utils/texts.pkl', 'rb')
        self.texts = pickle.load(f)
        f.close()
        tokenizer = Tokenizer(num_words=num_words)
        if filters is not None:
            tokenizer.filters = filters
        tokenizer.fit_on_texts(self.texts[:25000])
        self.tokenizer = tokenizer
Example #13
import codecs
import pickle

import numpy as np
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Conv1D, GlobalMaxPooling1D, Input, Embedding, \
    GlobalAveragePooling1D, MaxPooling2D, AveragePooling1D

num_words = 30000
max_len = 600
f = codecs.open('../temp/texts.pkl', 'rb')
texts = pickle.load(f)
f.close()

tokenizer = Tokenizer(num_words=num_words)
tokenizer.filters = ''
tokenizer.fit_on_texts(texts[0:25000])
# sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = []
for i in range(50000):
    t = []
    tokens = texts[i].lower().split(' ')
    for j in range(len(tokens)):
        index = word_index.get(tokens[j], 0)
        if index < num_words:
            t.append(index)
        else:
            t.append(0)
    sequences.append(t)
Example #14
def main():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[0:25000])
    # print(texts[0])
    # sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)

    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000, ), dtype=np.float32)
    Ytest = np.zeros((25000, ), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500, ), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500, ), dtype=np.float32)

    Xtrain1 = np.zeros((25000, (max_len - 2) * 3), dtype=np.int)
    Xtest1 = np.zeros((25000, (max_len - 2) * 3), dtype=np.int)
    for i in range(25000):
        for j in range(max_len - 2):
            Xtrain1[i, j * 3] = data1[i, j]
            Xtrain1[i, j * 3 + 1] = data1[i][j + 1] + num_words
            Xtrain1[i, j * 3 + 2] = data1[i][j + 2] + num_words * 2
    for i in range(25000):
        for j in range(max_len - 2):
            Xtest1[i, j * 3] = data2[i, j]
            Xtest1[i, j * 3 + 1] = data2[i][j + 1] + num_words
            Xtest1[i, j * 3 + 2] = data2[i][j + 2] + num_words * 2

    Xtrain2 = np.zeros((25000, (max_len - 1) * 2), dtype=np.int)
    Xtest2 = np.zeros((25000, (max_len - 1) * 2), dtype=np.int)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain2[i, j * 2] = data1[i, j]
            Xtrain2[i, j * 2 + 1] = data1[i][j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest2[i, j * 2] = data2[i, j]
            Xtest2[i, j * 2 + 1] = data2[i][j + 1] + num_words

    indice1 = np.arange(25000)
    np.random.shuffle(indice1)
    Xtrain1 = Xtrain1[indice1]
    Xtrain2 = Xtrain2[indice1]
    Ytrain = Ytrain[indice1]

    indice2 = np.arange(25000)
    np.random.shuffle(indice2)
    Xtest1 = Xtest1[indice2]
    Xtest2 = Xtest2[indice2]
    Ytest = Ytest[indice2]
    print('begin to build model ...')
    input1 = Input(shape=((max_len - 2) * 3, ))
    embedding1 = Embedding(num_words * 3,
                           embedding_dimension,
                           input_length=(max_len - 2) * 3,
                           init='orthogonal')(input1)
    x = AveragePooling1D(pool_length=3)(embedding1)
    x = GlobalMaxPooling1D()(x)

    input2 = Input(shape=((max_len - 1) * 2, ))
    embedding2 = Embedding(num_words * 2,
                           embedding_dimension,
                           input_length=(max_len - 1) * 2,
                           init='orthogonal')(input2)
    y = AveragePooling1D(pool_length=2, stride=2)(embedding2)
    y = GlobalMaxPooling1D()(y)
    z = merge([x, y], mode='concat')
    # model.add(Dropout(0.5))
    output = Dense(1, activation='sigmoid')(z)

    model = Model(input=[input1, input2], output=output)
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    model.fit([Xtrain1, Xtrain2],
              Ytrain,
              batch_size=32,
              nb_epoch=20,
              verbose=2,
              validation_data=([Xtest1, Xtest2], Ytest))
Example #15
def prepare_data():
    global tokenizer
    print("prepare training data")
    with FileIO(os.path.join(FLAGS.buckets, 'imdb/texts.pkl'), 'r+') as f:
        texts = pickle.load(f)[:25000]
    with FileIO(os.path.join(FLAGS.buckets, 'imdb/texts_unsup.pkl'),
                'r+') as f:
        texts += pickle.load(f)

    with FileIO(os.path.join(FLAGS.buckets, 'glove/words.pkl'), 'r+') as f:
        glove_words = pickle.load(f)
        glove_words_set = set(glove_words)
    for i, s in enumerate(texts):
        new_s = []
        for token in s.split(' '):
            if token in glove_words_set:
                new_s.append(token)
        texts[i] = ' '.join(new_s)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index

    embeddings_index = {}
    wordX = np.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"),
               mode='r+'))
    for i in range(len(glove_words)):
        embeddings_index[glove_words[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    ngram_text = []
    for seq in sequences:
        s = [str(i) for i in seq]
        ngram_sentence = []
        for i in range(len(seq) - 2):
            ngram_sentence.append('_'.join(s[i:i + 3]))
        ngram_text.append(' '.join(ngram_sentence))
    tokenizer2 = Tokenizer(num_words=num_ngram)
    tokenizer2.filters = ''
    tokenizer2.fit_on_texts(ngram_text)
    ngram_index = tokenizer2.word_index
    frequent_ngram = sorted(ngram_index.items(),
                            key=lambda k: k[1])[:num_ngram]
    print('there are %d ngrams' % (len(ngram_index)))

    x = np.zeros((num_ngram * 3, 1), dtype=np.int32)
    y = np.zeros((num_ngram * 3, 1), dtype=np.int32)
    index = 0
    for i, seq in enumerate(frequent_ngram):
        word_ids = seq[0].split('_')
        for id in word_ids:
            x[index] = i
            y[index] = int(id)
            index += 1
    indice = np.arange(num_ngram * 3)
    np.random.shuffle(indice)
    x = x[indice]
    y = y[indice]

    pickle.dump(
        word_index,
        FileIO(os.path.join(FLAGS.buckets, "word_index.pkl"), mode='w+'))
    pickle.dump(
        ngram_index,
        FileIO(os.path.join(FLAGS.buckets, "ngram_index.pkl"), mode='w+'))
    return x, y, embedding_matrix
Example #16
def main():
    global ngram
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)

    # print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - ngram + 1) * ngram), dtype=np.int)
    Xtest = np.zeros((25000, (max_len - ngram + 1) * ngram), dtype=np.int)

    id_range = np.arange(max_len - ngram + 1)
    for i in range(ngram):
        Xtrain[:, id_range * ngram + i] = data1[:, id_range + i] + num_words * i
        Xtest[:, id_range * ngram + i] = data2[:, id_range + i] + num_words * i

    print('begin to build model ...')
    main_input = Input(shape=((max_len - ngram + 1) * ngram,))
    # embedding1 = Embedding(num_words * ngram, word_dim, embeddings_initializer=keras.initializers.Orthogonal())(main_input)
    embedding1 = Embedding(num_words * ngram, word_dim)(main_input)
    x = AveragePooling1D(pool_size=ngram)(embedding1)
    x = GlobalMaxPooling1D()(x)

    weight = np.ones((word_dim, 1), dtype=np.float)
    weight[int(word_dim / 2):] = -1 * np.ones([int(word_dim / 2), 1], dtype=np.float)
    output = Dense(1,
                   weights=[weight, np.zeros([1])],
                   trainable=False,
                   activation='sigmoid')(x)

    model = Model(input=main_input, output=output)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    model.fit([Xtrain], Ytrain,
              batch_size=32,
              shuffle=True,
              nb_epoch=15,
              verbose=2,
              validation_data=([Xtest], Ytest))